In [1]:
from IPython.display import Image
import collections
import csv,re,sys,os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pandas.io import wb
import locale
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
locale.setlocale(locale.LC_ALL, 'en_US.utf8')

def replaceThousands(n):
    '''Format large numbers'''
    return locale.format("%d",n, grouping=True)

In [3]:
sns.set_context('paper')

In [4]:
def setFigSize(dim=(16.5,5.5)):
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(dim[0],dim[1])

###Grab CAIDA AS Data

In [12]:
if not os.path.exists('../data'):
    os.mkdir('../data')

if not os.path.exists('../data/midar-iff.nodes.geo'):
    !wget http://data.caida.org/datasets/topology/ark/ipv4/itdk/2013-04/kapar-midar-iff.nodes.geo.gz && \
        gunzip ../data/midar-iff.nodes.geo.gz


In [13]:
if False:
    !grep -v '#' midar-iff.nodes.geo | awk -F '\t' '{print $1"\t"$3}' > midar-iff.nodes.geo_reduced


###Read in Geolocated AS Data

In [18]:
df=pd.read_csv('../data/midar-iff.nodes.geo_reduced',quotechar='#',delimiter='\t',header=False,names=['id','country'])

In [6]:
df.shape

(31788985, 2)

###Get Country Names and ISO Codes

In [7]:
iso=wb.get_countries()
# 'name','iso2c'
iso.incomeLevel=iso.incomeLevel.apply(lambda x: re.sub(r'High income: nonOECD|High income: OECD','High income',x))
# Collpase OECD and non OECD high income countries

In [8]:
iso.incomeLevel.value_counts()

High income            80
Upper middle income    53
Lower middle income    51
Aggregates             49
Low income             31
dtype: int64

In [9]:
isoHash={}

for r in iso.iterrows():
#    print r[1][4],r[1][8]
    isoHash[r[1][4]]=r[1][8]

In [19]:
df.head()

Unnamed: 0,id,country
0,node.geo N2:,SG
1,node.geo N3:,SG
2,node.geo N4:,SG
3,node.geo N5:,IL
4,node.geo N6:,FR


In [20]:
%%time
replaceRe=re.compile(r'node.geo|:')
df['idClean']=df.id.apply(lambda x:re.sub(replaceRe,'',x))
#df['idClean']=df['idClean'].apply(lambda x:re.sub(':','',x))

CPU times: user 3min 14s, sys: 1.75 s, total: 3min 15s
Wall time: 3min 13s


In [21]:
df.head()

Unnamed: 0,id,country,idClean
0,node.geo N2:,SG,N2
1,node.geo N3:,SG,N3
2,node.geo N4:,SG,N4
3,node.geo N5:,IL,N5
4,node.geo N6:,FR,N6


###Join the Dataframes

In [22]:
iso.index=iso.iso2c

In [23]:
%time df=df.join(iso,on='country')

CPU times: user 7.4 s, sys: 1.44 s, total: 8.84 s
Wall time: 8.84 s


###Clean out Aggregates

In [25]:
df[df['incomeLevel']=='Aggregates']=None
iso[iso['incomeLevel']=='Aggregates']=None

In [26]:
df.head()

Unnamed: 0,id,country,idClean,adminregion,capitalCity,iso3c,incomeLevel,iso2c,latitude,lendingType,longitude,name,region
0,node.geo N2:,SG,N2,,Singapore,SGP,High income,SG,1.28941,Not classified,103.85,Singapore,East Asia & Pacific (all income levels)
1,node.geo N3:,SG,N3,,Singapore,SGP,High income,SG,1.28941,Not classified,103.85,Singapore,East Asia & Pacific (all income levels)
2,node.geo N4:,SG,N4,,Singapore,SGP,High income,SG,1.28941,Not classified,103.85,Singapore,East Asia & Pacific (all income levels)
3,node.geo N5:,IL,N5,,,ISR,High income,IL,31.7717,Not classified,35.2035,Israel,Middle East & North Africa (all income levels)
4,node.geo N6:,FR,N6,,Paris,FRA,High income,FR,48.8566,Not classified,2.35097,France,Europe & Central Asia (all income levels)


In [27]:
#iso.index=iso.iso2c
iso.head()

Unnamed: 0_level_0,adminregion,capitalCity,iso3c,incomeLevel,iso2c,latitude,lendingType,longitude,name,region
iso2c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AW,,Oranjestad,ABW,High income,AW,12.5167,Not classified,-70.0167,Aruba,Latin America & Caribbean (all income levels)
AF,South Asia,Kabul,AFG,Low income,AF,34.5228,IDA,69.1761,Afghanistan,South Asia
,,,,,,,,,,
AO,Sub-Saharan Africa (developing only),Luanda,AGO,Upper middle income,AO,-8.81155,IBRD,13.242,Angola,Sub-Saharan Africa (all income levels)
AL,Europe & Central Asia (developing only),Tirane,ALB,Upper middle income,AL,41.3317,IBRD,19.8172,Albania,Europe & Central Asia (all income levels)


In [None]:
'''
%%time
nodeGeoHash=collections.OrderedDict()
# Node #X=>country Y

for r in df.iterrows():
#    print r[1][1],r[1][2]
    nodeGeoHash[r[1][2]]=r[1][1]
#    sys.exit(1)
'''

In [36]:
levelHash={}

for r in iso.iterrows():
#    print r[1][8]
#    print r[1][8],r[1][3]
    levelHash[r[1][8]]=r[1][3]

In [None]:
df['fullName']=df['country'].apply(lambda x:isoHash.get(x))
df['level']=df['country'].apply(lambda x:levelHash.get(x))

In [28]:
iso[iso.iso2c=='GB'].incomeLevel.values[0]

u'High income'

In [16]:
df.head()

Unnamed: 0,id,country,idClean,fullName,level
0,node.geo N2:,SG,N2,Singapore,
1,node.geo N3:,SG,N3,Singapore,
2,node.geo N4:,SG,N4,Singapore,
3,node.geo N5:,IL,N5,Israel,
4,node.geo N6:,FR,N6,France,


In [17]:
df.idClean.values[0]

'N2'

In [29]:
levelValues=iso.incomeLevel.values
levels=iso.incomeLevel.unique()

In [30]:
iso.incomeLevel.value_counts()

High income            80
Upper middle income    53
Lower middle income    51
Low income             31
dtype: int64

In [38]:
colourHash={}

colours=sns.color_palette("dark",n_colors=len(levels))

for level, colour in zip(levels,colours):
    if level:
        colourHash[level]=colour

In [39]:
print levels

[u'High income' u'Low income' None u'Upper middle income'
 u'Lower middle income']


In [40]:
counts=df.name.value_counts()

In [None]:
countsHash=counts.to_dict()

In [None]:
setFigSize(dim=(15,35))
barList=plt.barh(range(len(counts)),counts.values[::-1],0.8,log=True)
trash=plt.yticks([r+0.5 for r in range(len(counts))],[c+' ('+replaceThousands(countsHash[c])+')' for n,c in enumerate(counts.index[::-1])])

for n,bar in enumerate(barList):
    bar.set_color(colourHash[levelHash[counts.index[::-1][n]]])
#    print counts.index[::-1][n],levelHash[counts.index[::-1][n]]
#sys.exit(1)
for n,(k,v) in enumerate(colourHash.items()):
#    print k,v
    plt.annotate(k,xy=(0.05,0.91-(n*0.013)),xycoords='figure fraction',fontsize=24,color=v)

plt.savefig('../charts/as_count.png',dpi=200)

###Now look at edges between countries

In [43]:
edgeFile=csv.reader(open('../data/midar-iff.links'),delimiter=' ')

In [26]:
!sed -n 1530,1540p ../data/midar-iff.links

link L1:  N8888118:1.0.0.1 N120707 N120708 
link L2:  N8888119:1.0.0.2 N120709 N120710 N106378 N120711 N120712 
link L3:  N8888120:1.0.0.6 N106378 N120711 
link L4:  N8888121:1.0.0.13 N120713 
link L5:  N8888122:1.0.0.164 N120714 
link L6:  N8888123:1.0.4.10 N120715 
link L7:  N8888124:1.0.8.250 N120716 N120717 
link L8:  N8888125:1.0.20.46 N120718 
link L9:  N120718:1.0.24.22 N120719 
link L10:  N8888126:1.0.24.150 N120720 
link L11:  N699985:1.1.0.1 N120721 N120722 


In [56]:
edgeCounter=collections.Counter()
ipCounter=collections.Counter()

for line in edgeFile:
    if not re.search(r'#',line[0]):
#        print line
        source=line[3].partition(':')[0]
#        print source
#        print nodeGeoHash.get(source)
#        print line[4:-1]
        
        for dest in line[4:-1]:
            edgeCounter[(nodeGeoHash.get(source),nodeGeoHash.get(dest))]+=1

        ipCounter[source]+=1
        # Count number of IPs associated with each AS
        
#        sys.exit(1)

In [54]:
for k,v in edgeCounter.most_common():
    if not k[0]==k[1] and k[0] and k[1]:
        print k,v

('US', 'EG') 71707
('US', 'EU') 29796
('US', 'DE') 28630
('DE', 'US') 25837
('ES', 'PE') 18148
('CA', 'US') 16871
('EU', 'DE') 14785
('SE', 'NO') 14190
('GB', 'US') 14071
('BR', 'US') 13912
('US', 'CA') 12679
('US', 'AU') 11041
('DE', 'EU') 10549
('US', 'GB') 10487
('US', 'BR') 10244
('NO', 'SE') 9760
('US', 'JP') 9749
('US', 'NL') 8937
('NL', 'US') 8776
('EG', 'US') 8458
('GB', 'DE') 8341
('US', 'IN') 8319
('EU', 'US') 7772
('IN', 'US') 7612
('AR', 'US') 7280
('DK', 'US') 7198
('RU', 'US') 7115
('DE', 'GB') 7093
('NL', 'SE') 6890
('FR', 'US') 6434
('RU', 'DE') 6091
('MX', 'US') 6002
('IT', 'US') 5708
('US', 'AR') 5239
('US', 'SG') 5191
('FR', 'DE') 4902
('EU', 'GB') 4803
('DE', 'NL') 4695
('JP', 'US') 4541
('HR', 'SE') 4317
('SE', 'US') 4258
('US', 'TH') 4230
('RU', 'GB') 4198
('LT', 'SE') 4142
('DE', 'FR') 4046
('HK', 'US') 4019
('DK', 'DE') 3953
('GB', 'EU') 3827
('SE', 'NL') 3513
('BR', 'ES') 3486
('CR', 'US') 3442
('FR', 'GB') 3411
('US', 'RU') 3406
('US', 'MX') 3211
('GB', 'NL') 

In [28]:
!head ../data/midar-iff.links

# version: $Id: kapar.cc,v 1.162 2012/03/14 00:24:54 kkeys Exp $
# version: $Id: PathLoader.cc,v 1.19 2012/01/04 20:44:46 kkeys Exp $
# start time: 1367051656 (2013-04-27 01:34:16 PDT)
# command line: /home/kkeys/WIP/topology/archipelago/aliasResolution/midar/../kapar/kapar -il -py -r31 -sir -c0.5 -nv -adms -d1 -mn -lb -1a -oals -O /scratch/alias-resolution/run-20130422/kapar/midar-iff -z 24
#   -B /scratch/alias-resolution/run-20130422/bogon-bn-agg-20130422.txt
#   -A /scratch/alias-resolution/run-20130422/kapar/midar-iff.pairs
#   -P /san5/topology/ark/data/team-probing/list-7.allpref24/team-1/daily/2013/cycle-20130407/daily.l7.t1.c002462.20130407.ams3-nl.warts.gz
#   -P /san5/topology/ark/data/team-probing/list-7.allpref24/team-1/daily/2013/cycle-20130407/daily.l7.t1.c002462.20130407.amw-us.warts.gz
#   -P /san5/topology/ark/data/team-probing/list-7.allpref24/team-1/daily/2013/cycle-20130407/daily.l7.t1.c002462.20130407.bcn-es.warts.gz
#   -P /san5/topology/ark/data/team-pr

In [70]:
nodesDf=pd.read_csv('../data/midar-iff.nodes_count',delimiter='  ',header=False,names=['id','n'])

In [71]:
nodesDf.head()

Unnamed: 0,id,n
0,N2,697
1,N3,681
2,N4,630
3,N5,548
4,N6,359


In [72]:
nodesDf['country']=nodesDf['id'].apply(lambda x:nodeGeoHash.get(x))

In [73]:
nodesDf.head()

Unnamed: 0,id,n,country
0,N2,697,SG
1,N3,681,SG
2,N4,630,SG
3,N5,548,IL
4,N6,359,FR


In [78]:
groups=nodesDf.groupby('country')

In [84]:
aggregated=groups.agg(np.sum)

In [86]:
aggregated.head()

Unnamed: 0_level_0,n
country,Unnamed: 1_level_1
A1,11397
A2,9724
AD,1026
AE,32211
AF,3220


In [88]:
aggregated.sort('n',ascending=False)

Unnamed: 0_level_0,n
country,Unnamed: 1_level_1
US,23600900
CN,18204066
KR,6239463
DE,4978297
BR,3633394
FR,3503206
JP,2819883
GB,2417173
ES,2182683
MX,1954976
