In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit

In [44]:
#creating whole dataset
# One JSON feed per year from the IOM Missing Migrants "global figures" endpoint.
GLOBAL_FIGURES_URL = "https://missingmigrants.iom.int/global-figures/{year}/json"

seventeen = pd.read_json(GLOBAL_FIGURES_URL.format(year=2017))
sixteen = pd.read_json(GLOBAL_FIGURES_URL.format(year=2016))
fifteen = pd.read_json(GLOBAL_FIGURES_URL.format(year=2015))
fourteen = pd.read_json(GLOBAL_FIGURES_URL.format(year=2014))

frames = [seventeen, sixteen, fifteen, fourteen]

# ignore_index=True relabels the stacked rows 0..n-1 so that row labels coming
# from the separate yearly frames cannot collide in the combined frame.
df = pd.concat(frames, ignore_index=True)

In [45]:
#cleaning data
# Blank strings and the literal text 'nan' both stand for "missing"; map them to 0.
# The match is on the whole cell (no regex), so text that merely contains "nan"
# somewhere inside it is left untouched.
df = df.replace(['', 'nan'], 0)

# 'Information Source' is deliberately left in the frame. The earlier
# `df.drop(['Information Source'])` threw its result away (and targets row labels,
# not columns), so it never removed anything; a later cell indexes the columns of
# np.array(df) by position, so actually dropping the column would shift them.

# .copy() makes the filtered frame independent, so the column assignments below
# write to it directly rather than to a slice of the unfiltered frame.
df = df[df['Web ID'] != 42424].copy() #drops row with -1 as number of survivors


df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])

# Share of each group among everyone involved in the incident. A zero total gives
# NaN (0/0) or +/-inf (n/0); both are mapped to 0 so the columns stay finite.
for share_col, count_col in (
    ('percent female', 'Number of Female'),
    ('percent male', 'Number of Male'),
    ('percent kids', 'Number of Children'),
):
    share = pd.to_numeric(df[count_col]) / df['total people']
    df[share_col] = share.replace([np.inf, -np.inf], np.nan).fillna(0)

df['month'] = df['Reported Date'].dt.month
df['day'] = df['Reported Date'].dt.day

# the index will help us get back from number to category later
df['UNSD Geographical Grouping'], geoIndex = pd.Series(df['UNSD Geographical Grouping']).factorize()
df['Migrant Route'], migrantIndex = pd.Series(df['Migrant Route']).factorize()
df['Region of Incident'], regionIndex = pd.Series(df['Region of Incident']).factorize()

In [46]:
#separating data
# One frame per calendar year, selected on the year component of the report date.
report_year = df['Reported Date'].dt.year
fourteen = df[report_year == 2014]
fifteen = df[report_year == 2015]
sixteen = df[report_year == 2016]
seventeen = df[report_year == 2017]

In [47]:
# Stack the per-year frames into one 2-D array (rows = incidents, columns = df columns).
total_data = np.concatenate([np.array(frame) for frame in (fourteen, fifteen, sixteen, seventeen)])

# Shuffle rows with a fixed seed so the train/test split built from this array
# is the same on every run.
np.random.RandomState(0).shuffle(total_data)

# y is what we predict -- total survivors over total survivors + missing and dead,
# expressed as a whole-number percentage.
# NOTE(review): columns are picked by position; per the formula, 11 is taken to be
# the survivor count and 14 the dead-and-missing count -- confirm against df.columns.
survivors = total_data[:,11].astype(int)
dead_and_missing = total_data[:,14].astype(int)
with np.errstate(divide='ignore', invalid='ignore'):
    y = survivors / (dead_and_missing + survivors)
# Rows with nobody recorded divide by zero; treat them as 0% rather than letting
# NaN/inf flow into the integer cast below.
y[~np.isfinite(y)] = 0
y = (100*y).astype(int)

# x is the part of the data we are using to predict: three columns, again by position.
# NOTE(review): 5, 12 and 15 presumably select the factorized route / region /
# geographical-grouping columns -- confirm against df.columns.
x = np.column_stack((total_data[:,5], total_data[:,12], total_data[:,15]))
x[1]

  This is separate from the ipykernel package so we can avoid doing imports until


array([1, 4, 2], dtype=object)

In [48]:
from sklearn import linear_model

# The first `train` rows are used for fitting; every remaining row is held out for testing.
train = 2000
x_tr, x_ts = x[:train, :], x[train:, :]
y_tr, y_ts = y[:train], y[train:]

In [49]:
# Fit a linear regression on the training rows; fit() returns the estimator itself,
# so the trailing expression displays the fitted model.
regr = linear_model.LinearRegression().fit(x_tr, y_tr)
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [50]:
# Fitted coefficients, one per column of x (same order as the columns were stacked).
regr.coef_

array([ 2.33136254,  0.06412425, -1.38129398])

In [51]:
#training data
y_pred_tr = regr.predict(x_tr)

# Mean squared error of the predictions, scaled by the variance of the targets.
mse_tr = np.mean(np.square(y_pred_tr - y_tr))
RSS = mse_tr / np.std(y_tr) ** 2

print("RSS: = {0:f}".format(RSS))
print("R^2: = {0:f}".format(1 - RSS))
# Same quantity as reported by the estimator itself, printed for comparison.
print("R^2= {0:f}".format(regr.score(x_tr, y_tr)))

RSS: = 0.926494
R^2: = 0.073506
R^2= 0.073506


In [52]:
#test data
y_pred_ts = regr.predict(x_ts)

# Mean squared error on the held-out rows, scaled by the variance of their targets.
mse_ts = np.mean(np.square(y_pred_ts - y_ts))
RSS = mse_ts / np.std(y_ts) ** 2

print("RSS: = {0:f}".format(RSS))
print("R^2: = {0:f}".format(1 - RSS))

RSS: = 0.962636
R^2: = 0.037364


In [56]:
def initTest(df):
    """Fit and score a linear regression of survival percentage on incident features.

    The target is 100 * 'Number of survivors' / 'total people', truncated to an
    integer. The features are the three factorized category columns, the three
    percentage columns (scaled to whole-number percentages), and the day and
    month of the report.

    For each of three random 75/25 splits (fixed random_state, so the splits are
    repeatable) a fresh LinearRegression is fit on the training rows, and its
    score on the training rows, its score on the test rows, and its coefficients
    are printed.

    Args:
        df: DataFrame containing 'Number of survivors', 'total people' and the
            eight feature columns listed in ``feature_cols`` below.

    Returns:
        None. All results are printed.
    """
    survival = pd.to_numeric(df['Number of survivors']) / df['total people']
    y = np.array(survival, dtype=float)
    # Zero out NaN (0/0) and +/-inf (n/0) before scaling. np.nan_to_num turns inf
    # into the largest finite float, so an `== inf` check placed after it can never
    # match and the x100 below would overflow in the integer cast.
    y[~np.isfinite(y)] = 0

    y = (100*y).astype(int)

    feature_cols = ['UNSD Geographical Grouping', 'Migrant Route', 'Region of Incident',
                    'percent female', 'percent kids', 'percent male', 'day', 'month']
    x = np.array(df[feature_cols], dtype=float)
    # Same treatment for the features: any non-finite entry becomes 0.
    x[~np.isfinite(x)] = 0

    # Columns 3-5 are the percent columns; store them as whole-number percentages
    # (truncated toward zero, as an int cast would).
    x[:, 3:6] = np.trunc(100 * x[:, 3:6])

    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index], y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
        print(regr.coef_)
        print()
    print()

In [57]:
# Keep only the rows whose geographical-grouping code is the one that factorize
# assigned to the label 'Northern America'.
northern_america_code = geoIndex.get_loc('Northern America')
onlyUS = df[df['UNSD Geographical Grouping'] == northern_america_code]

In [58]:
# Run the same experiment on three slices, in order: the Northern America rows,
# the full dataset, and the Central America rows.
central_america = df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Central America')]
for subset in (onlyUS, df, central_america):
    initTest(subset)

train:  0.0424244972933
test:  -0.0037503123579
[ 0.         -0.27017474  0.01892827 -0.05200004 -0.00093992 -0.05091294
 -0.02089713  0.11393549]

train:  0.0458488044058
test:  -0.160132944365
[  0.00000000e+00  -4.85722573e-17   3.16414433e-02  -5.49076145e-02
   2.99037036e-03  -5.43578601e-02  -5.13860140e-02   2.23140405e-01]

train:  0.0358732900304
test:  0.0312031628027
[ 0.         -0.28562755  0.03790224 -0.04170855  0.00181967 -0.04050936
 -0.05433346  0.21099163]


train:  0.190071394152
test:  0.173655623067
[-0.75021194  1.80405348 -1.39362357 -0.10863548  0.1086355  -0.21016226
 -0.04699752 -0.02482061]

train:  0.18231732413
test:  0.192084887746
[-0.67130365  1.48688289 -1.41716563 -0.11286576  0.11286578 -0.19951491
 -0.07270437 -0.03475173]

train:  0.181808425782
test:  0.200602121232
[-0.7851339   1.84463063 -1.22928795 -0.09865012  0.09865014 -0.20556224
 -0.02495267 -0.1336195 ]


train:  0.33001319662
test:  0.414727379975
[ 0.         -0.71021626  1.06505759 -

  if sys.path[0] == '':
  del sys.path[0]
