In [101]:
import numpy as np
import pandas as pd
from math import log
import json

In [79]:
# Load reference datasets
# The ' , ' separator is longer than one character, so pandas treats it as a
# regular expression, which only the python parsing engine supports. Asking for
# that engine explicitly avoids the implicit fallback and its ParserWarning.
countries = pd.read_csv('../raw_data/CountriesList.txt', delimiter=' , ',
                        engine='python', index_col='Country Code')

latlong = pd.read_csv('../raw_data/LatLong.csv', index_col='Alpha-3',
                      usecols=['Alpha-3', 'lat', 'long'])

# Codes present in both the country list and the lat/long table.
# Index.intersection is the explicit set operation; `&` between two Index
# objects is deprecated for this purpose and is element-wise in pandas 2.x.
good_indices = countries.index.intersection(latlong.index)

# ISO code table, indexed by its 'country-code' column.
iso_codes = pd.read_csv('../raw_data/ISO_codes.csv', index_col='country-code')


def num_to_alpha3(num_code):
    """Return the 'alpha-3' entry of iso_codes for the row labelled num_code."""
    alpha3 = iso_codes.loc[num_code, 'alpha-3']
    return alpha3

# Load population dataset
# Label columns, followed by one column per five-year step from 1990 to 2015.
label_cols = ['Country Name', 'Country Code']
year_cols = [str(y) for y in range(1990, 2016, 5)]
keep_cols = [*label_cols, *year_cols]
pop = pd.read_csv('../raw_data/Population.csv',
                  index_col='Country Code', usecols=keep_cols)
# Keep only the codes that also appear in good_indices. Index.intersection is
# used instead of `&`, which is deprecated as a set operation on Index objects
# and is element-wise in pandas 2.x.
pop = pop.reindex(pop.index.intersection(good_indices))

  from ipykernel import kernelapp as app


In [94]:
# Columns that get_migrate_df removes from each table before returning it.
bad_cols = [
    'Destination',
    'Numeric',
    'Data Type',
    'Total',
    'Other North',
    'Other South',
]

def get_migrate_df(year):
    """Load one year's migration spreadsheet as a table keyed by alpha-3 codes.

    Reads '../raw_data/Migrate_<year>.xlsx', keeps the rows whose 'Numeric'
    value is below 900 and different from 830, indexes them by the alpha-3
    code returned by num_to_alpha3, and restricts them to the codes that have
    a non-null value in pop[year]. Columns from position 6 onward are kept
    only when their header equals the 'Destination' value of a surviving row,
    and are renamed to that row's code. Finally the bad_cols columns are
    dropped.

    Parameters
    ----------
    year : str
        Used both in the spreadsheet file name and as the column label
        looked up in pop.

    Returns
    -------
    pandas.DataFrame
        Rows and the remaining columns are both labelled with alpha-3 codes.
    """
    raw = pd.read_excel('../raw_data/Migrate_' + year + '.xlsx', header=0)

    # NOTE(review): presumably codes >= 900 and 830 are aggregate or
    # non-country entries -- confirm against the data source.
    # .copy() gives an independent frame, so the column assignment below does
    # not raise SettingWithCopyWarning.
    df = raw.query('Numeric < 900 and Numeric != 830').copy()
    df['Country Code'] = df['Numeric'].apply(num_to_alpha3)
    df = df.set_index('Country Code')

    # Index.intersection replaces the deprecated `&` set operation on Index.
    df = df.reindex(df.index.intersection(pop[year].dropna().index))

    # Map each surviving destination name to the code of its first row.
    code_by_name = {}
    for code, name in df['Destination'].items():
        code_by_name.setdefault(name, code)

    # NOTE(review): assumes the first six columns are the label columns listed
    # in bad_cols, so everything after them is a per-country column -- confirm.
    country_cols = df.columns[6:]
    unmatched = [col for col in country_cols if col not in code_by_name]
    renames = {col: code_by_name[col] for col in country_cols if col in code_by_name}

    df = df.drop(columns=unmatched).rename(columns=renames)
    return df.drop(bad_cols, axis=1)

In [119]:
# Per-year tables: immigrants[year] comes from get_migrate_df, emigrants[year]
# is its transpose, and logmax[year] is the natural log of its largest entry.
immigrants = {}
emigrants = {}
logmax = {}

for year in year_cols:
    print("Getting im_ & em_ df's for: ", year)

    immigrants[year] = get_migrate_df(year)
    peak = immigrants[year].max().max()
    logmax[year] = log(peak)
    emigrants[year] = immigrants[year].T

    print(immigrants[year].shape, emigrants[year].shape, peak, logmax[year])

Getting to/from df's for:  1990


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(168, 168) (168, 168) 5018098.0 15.428561535410907
Getting to/from df's for:  1995
(168, 168) (168, 168) 6602801.0 15.70300451090641
Getting to/from df's for:  2000
(168, 168) (168, 168) 9177487.0 16.03226397784436
Getting to/from df's for:  2005
(168, 168) (168, 168) 10309054.0 16.148533096206233
Getting to/from df's for:  2010
(168, 168) (168, 168) 11566960.0 16.26366331615796
Getting to/from df's for:  2015
(167, 167) (167, 167) 12050031.0 16.304577790511736


In [100]:
# Sanity check: log base 2 of the largest single entry in the 1995 table.
# `migrate` is not defined anywhere in this notebook, so the check reads from
# `immigrants`; get_migrate_df has already dropped bad_cols, so no further
# drop is needed. `log` comes from the import cell at the top.
test = immigrants['1995']
log(test.max().max(), 2)

22.6546467349417

In [120]:
def single_arc(origin, destination, strokeWidth):
    """Describe one arc between two rows of the latlong table.

    origin and destination are row labels of latlong. The returned dict has
    the keys 'origin' and 'destination', each a dict holding that row's
    'lat' and 'long' values under 'latitude' and 'longitude', plus
    'strokeWidth' passed through unchanged.
    """
    def endpoint(code):
        coords = latlong.loc[code]
        return {'latitude': coords['lat'], 'longitude': coords['long']}

    return {
        'origin': endpoint(origin),
        'destination': endpoint(destination),
        'strokeWidth': strokeWidth,
    }

def row_arcs(row_name, row, status):
    """Build one arc per (label, value) entry of a table row.

    Parameters
    ----------
    row_name
        Label of the row; it is the fixed endpoint shared by every arc.
    row
        Object whose .items() yields (label, value) pairs. The natural log of
        each value is taken, so values must be positive.
    status : str
        'im' - row_name is the destination and each label is an origin.
        'em' - row_name is the origin and each label is a destination.

    Returns
    -------
    list
        One single_arc(...) dict per entry, with strokeWidth = log(value).

    Raises
    ------
    ValueError
        If status is neither 'im' nor 'em'. Previously this path failed with
        an UnboundLocalError on the return statement.
    """
    if status not in ('im', 'em'):
        raise ValueError("status must be 'im' or 'em', got %r" % (status,))

    if status == 'im':
        # for immigration, row_name = destination
        return [single_arc(origin, row_name, log(value))
                for origin, value in row.items()]

    # for emigration, row_name = origin
    return [single_arc(row_name, destination, log(value))
            for destination, value in row.items()]

# threshold: we only count arcs where (num_people > threshold)
def df_arcs(df, status, threshold):
    """Build the arcs for every row of a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose row labels and column labels are both passed on to
        row_arcs as arc endpoints.
    status : str
        'im' or 'em'; forwarded to row_arcs to decide arc direction.
    threshold
        Only entries strictly greater than this value produce an arc.

    Returns
    -------
    dict
        Maps each row label to the list returned by row_arcs for that row.

    Raises
    ------
    ValueError
        If status is neither 'im' nor 'em'. This replaces an `assert`, which
        is skipped under `python -O` and would then silently return {}.
    """
    if status not in ('im', 'em'):
        raise ValueError("status must be 'im' or 'em', got %r" % (status,))

    # Both directions share the same traversal; row_arcs handles direction.
    return {code: row_arcs(code, row[row > threshold], status)
            for code, row in df.iterrows()}

In [117]:
# Natural log of every entry of test_row that exceeds 100.
# NOTE(review): test_row is not defined in the visible cells -- presumably one
# row of a migration table; define it before this cell so Run All works.
test_row.loc[test_row > 100].apply(log)

BEL     6.431331
COD    10.632171
FRA     5.736572
KEN     5.375278
RWA    11.984160
UGA     7.343426
TZA     9.188095
Name: BDI, dtype: float64

In [122]:
# Arcs for every year, in both directions, keeping entries above 100.
im_arcs = {}
em_arcs = {}

for year in year_cols:
    im_table, em_table = immigrants[year], emigrants[year]
    im_arcs[year] = df_arcs(im_table, 'im', 100)
    em_arcs[year] = df_arcs(em_table, 'em', 100)

In [123]:

# Write each arc dictionary to its own JSON file in the working directory.
for filename, arcs in (('immigrant_arcs.json', im_arcs),
                       ('emigrant_arcs.json', em_arcs)):
    with open(filename, 'w') as out:
        json.dump(arcs, out, indent=2)