In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit
import math

In [2]:
#creating whole dataset
seventeen = pd.read_json("https://missingmigrants.iom.int/global-figures/2017/json")
sixteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2016/json")
fifteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2015/json")
fourteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2014/json")

frames = [seventeen,sixteen,fifteen,fourteen]



In [3]:
df = pd.concat(frames)
#cleaning data
df = df.replace('', 0,regex=True)
df = df.replace('nan', 0, regex=True)
df.drop(['Information Source'])
df = df[df['Web ID'] != 42424] #drops row with -1 as number of survivors


df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])
df['percent female'] = (pd.to_numeric(df['Number of Female'])/ df['total people']).fillna(0)
df['percent male'] =  (pd.to_numeric(df['Number of Male'])/ df['total people']).fillna(0)
df['percent kids'] =  (pd.to_numeric(df['Number of Children']) / df['total people']).fillna(0)
df['month'] = df['Reported Date'].dt.month
df['day'] = df['Reported Date'].dt.day

R = 6371
df['x'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.cos(float(latLong.split(", ")[1])))
df['y'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.sin(float(latLong.split(", ")[1])))
df['z'] = df['Location'].apply(lambda latLong: R*math.sin(float(latLong.split(", ")[0])))

# the index will help us get back from number to category later
df['UNSD Geographical Grouping'], geoIndex = pd.Series(df['UNSD Geographical Grouping']).factorize()
#df['Migrant Route'], migrantIndex = pd.Series(df['Migrant Route']).factorize()
#df['Region of Incident'], regionIndex = pd.Series(df['Region of Incident']).factorize()
# one hot coding
ohc_UNSD = pd.get_dummies(df['UNSD Geographical Grouping'],prefix='UNSD')
ohc_Route = pd.get_dummies(df['Migrant Route'],prefix='Route')
ohc_Region = pd.get_dummies(df['Region of Incident'],prefix='Region')

df = df.join(ohc_UNSD)
df = df.join(ohc_Region)
df = df.join(ohc_Route)
# df

In [4]:
#separating data
fourteen = df[(df['Reported Date'] >= '2014-01-01') & (df['Reported Date'] < '2015-01-01')]
fifteen = df[(df['Reported Date'] >= '2015-01-01') & (df['Reported Date'] < '2016-01-01')]
sixteen = df[(df['Reported Date'] >= '2016-01-01') & (df['Reported Date'] < '2017-01-01')]
seventeen = df[(df['Reported Date'] >= '2017-01-01') & (df['Reported Date'] < '2018-01-01')]

In [27]:
def initTest(df):
    y = np.array((pd.to_numeric(df['Number of survivors']) / df['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)


    x = np.array(df[['percent female', 'percent kids', 'percent male', 'day','month']])
    x = np.column_stack((x,np.array(df.loc[:,'UNSD_0':])))
    
    x = np.nan_to_num(x)
    x[:,0] = (100*x[:,0]).astype(int)
    x[:,1] = (100*x[:,1]).astype(int)
    x[:,2] = (100*x[:,2]).astype(int)
    
    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index],y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
        print(regr.coef_)
        print()
    print()
initTest(sixteen)

train:  0.116304628372
test:  0.103028043632
[ -1.40149394e-01   2.38733457e-02  -1.32286811e-01   9.10707121e-03
  -3.47441045e-01   1.47455951e+00  -1.51877285e+00  -2.53361631e+00
  -1.24908888e+00   3.08771507e+00  -2.08666862e+00   3.95012134e+00
  -6.45324248e+00  -4.16267447e+00  -6.00257206e-01  -6.32334090e+00
  -1.65148514e+01  -1.27501632e+00   3.11484148e+00   3.07913398e+01
   1.12078274e+01  -1.15723285e+00  -9.75164231e+00   7.58805425e+00
  -7.85246284e-01   2.02657014e+01   6.86475485e-01  -5.51238123e+00
   2.09380657e+00  -1.00158633e+00  -6.14156851e+00  -1.89199379e+00
  -3.25554108e+00  -1.24886721e+01  -6.92094682e-01   3.26370719e+00
  -2.12866092e+00  -1.56808854e+00  -2.31257048e+00  -6.60673515e+00
  -9.14175813e+00   3.99650850e+00  -3.91572551e+00  -2.11344377e+01
   3.23356894e+00  -1.19345074e+01   1.45839367e+01  -3.87185458e+00
  -1.33385919e+01   5.51694594e+01  -4.50202798e+00   2.38732834e+00
  -1.04450441e+00]

train:  0.113125764802
test:  0.112806

In [6]:
onlyUS = df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Northern America')]

In [7]:
initTest(onlyUS)
initTest(df)
initTest(df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Central America')])

train:  0.0437878326222
test:  0.0385038904718
[ -3.70036684e-02   5.28201779e-03  -3.67308709e-02  -5.97266447e-02
   2.56539094e-01   3.00522059e-01   1.07295902e-02  -1.76310511e-01
  -6.99272945e-01   1.86865659e+00   3.26024652e+00  -1.37431698e+00
  -1.87789167e+00   1.74397862e+00  -2.30309309e+00  -1.68378333e-01
  -6.66133815e-15  -5.73147286e-01   1.28180111e-01   1.57548613e+00
  -8.80439396e-01  -5.72861672e-01  -2.62087731e-01  -1.83445529e+00
  -1.10948751e+00  -1.63743271e+00   2.98845735e-01  -1.21534058e+00
   6.08852822e-01  -2.09779719e-01  -8.30819421e-02   1.02083030e+00
   4.19727478e-01  -3.23169528e+00   2.89606668e+00   3.51659311e+00
   5.60356918e-01   1.38246962e+00   1.47786832e-01  -5.03932666e+00
   8.88324715e-01   9.26208465e-01  -6.94178236e-01  -1.11022302e-16
   1.32646271e+00   0.00000000e+00   1.70681196e+00   0.00000000e+00
  -3.80147814e+00   0.00000000e+00  -5.23449378e-02  -4.44760324e-02
   3.25373971e+00]

train:  0.0407023872536
test:  0.046

  del sys.path[0]
  


train:  9.93153800078e-05
test:  3.88887919244e-05
[  5.61427103e-19   5.61427103e-19   2.24503235e-36  -3.06888159e-38
  -1.66887640e-37  -2.74481031e-38   1.39459640e-38   6.51793276e-39
   1.54790729e-39   8.94560190e-40  -1.54944855e-38   7.42137196e-40
   2.45652591e-40   3.28449353e-39   1.18682758e-39   1.17757861e-38
   1.08767864e-40   1.67295334e-39   2.88567803e-40   6.28929827e-41
   2.78948876e-40   1.68701177e-40   2.20495398e-40   1.03736426e-39
   3.96743733e-39   7.76913316e-41   3.06695780e-39   3.12985079e-39
  -2.97419868e-38   1.60192127e-39   1.16566594e-38   1.62781838e-41
   1.86459196e-40   1.59822168e-40  -1.42586838e-38   2.21827250e-39
   1.68819564e-38   2.71589350e-38   1.00332805e-39   9.17497630e-41
   1.79799939e-40  -5.22475252e-39   9.91489375e-41   2.81168629e-41
  -2.66874122e-38   5.17942211e-42   1.27265800e-40   1.33925057e-40
   1.40584314e-41   3.03366152e-41   2.21235316e-40   6.60746277e-40
   2.15833918e-39]

train:  0.000102690121929
test: 

In [19]:
from sklearn import svm
from sklearn.metrics import confusion_matrix

In [28]:
def suppVector(dataframe):

    y = np.array((pd.to_numeric(dataframe['Number of survivors']) / dataframe['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    y = (100*y).astype(int)

    x = np.array(dataframe[['percent female', 'percent kids', 'percent male', 'day','month']])
    x = np.column_stack((x,np.array(dataframe.loc[:,'UNSD_0':])))
    x = np.nan_to_num(x)
    x[:,0] = (100*x[:,0]).astype(int)
    x[:,1] = (100*x[:,1]).astype(int)
    x[:,2] = (100*x[:,2]).astype(int)

    ntr = len(y)//4 *3
    x_tr = x[:ntr,:]
    y_tr = y[:ntr]
    x_ts = x[ntr:,:]
    y_ts = y[ntr:]
    
    svc = svm.SVC(probability = False, kernel = "rbf", C = 1.0, gamma = 1/ntr, verbose = 10)
    
    print(x_ts.shape,y_ts.shape)
    print(x_tr.shape,y_tr.shape)
    print(ntr)
    svc.fit(x_tr,y_tr)
    
    yhat = svc.predict(x_ts)
    acc = np.mean(yhat == y_ts)
    print('Accuracy = {0:f}'.format(acc))

In [29]:
suppVector(df)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


(27442, 53) (27442,)
(82317, 53) (82317,)
82317
[LibSVM]Accuracy = 0.850703
