In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit
import math

In [2]:
#creating whole dataset
seventeen = pd.read_json("https://missingmigrants.iom.int/global-figures/2017/json")
sixteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2016/json")
fifteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2015/json")
fourteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2014/json")

frames = [seventeen,sixteen,fifteen,fourteen]



In [4]:
df = pd.concat(frames)
#cleaning data
df = df.replace('', 0,regex=True)
df = df.replace('nan', 0, regex=True)
df.drop(['Information Source'])
df = df[df['Web ID'] != 42424] #drops row with -1 as number of survivors


df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])
df['percent female'] = (pd.to_numeric(df['Number of Female'])/ df['total people']).fillna(0)
df['percent male'] =  (pd.to_numeric(df['Number of Male'])/ df['total people']).fillna(0)
df['percent kids'] =  (pd.to_numeric(df['Number of Children']) / df['total people']).fillna(0)
df['month'] = df['Reported Date'].dt.month
df['day'] = df['Reported Date'].dt.day

R = 6371
df['x'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.cos(float(latLong.split(", ")[1])))
df['y'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.sin(float(latLong.split(", ")[1])))
df['z'] = df['Location'].apply(lambda latLong: R*math.sin(float(latLong.split(", ")[0])))

# the index will help us get back from number to category later
##df['UNSD Geographical Grouping'], geoIndex = pd.Series(df['UNSD Geographical Grouping']).factorize()
#df['Migrant Route'], migrantIndex = pd.Series(df['Migrant Route']).factorize()
#df['Region of Incident'], regionIndex = pd.Series(df['Region of Incident']).factorize()
# one hot coding
ohc_UNSD = pd.get_dummies(df['UNSD Geographical Grouping'],prefix='UNSD')
ohc_Route = pd.get_dummies(df['Migrant Route'],prefix='Route')
ohc_Region = pd.get_dummies(df['Region of Incident'],prefix='Region')

df = df.join(ohc_UNSD)
df = df.join(ohc_Region)
df = df.join(ohc_Route)
# df

In [5]:
#separating data
fourteen = df[(df['Reported Date'] >= '2014-01-01') & (df['Reported Date'] < '2015-01-01')]
fifteen = df[(df['Reported Date'] >= '2015-01-01') & (df['Reported Date'] < '2016-01-01')]
sixteen = df[(df['Reported Date'] >= '2016-01-01') & (df['Reported Date'] < '2017-01-01')]
seventeen = df[(df['Reported Date'] >= '2017-01-01') & (df['Reported Date'] < '2018-01-01')]

In [7]:
# total_data = np.concatenate((np.array(fourteen),np.array(fifteen),np.array(sixteen),np.array(seventeen)))
# np.random.shuffle(total_data)
# y = total_data[:,11].astype(int) / (total_data[:,14].astype(int) + total_data[:,11].astype(int)) 
# # y is what we predict -- total sruvivors over total survivors+missing and dead 
# y = np.nan_to_num(y)
# y = (100*y).astype(int)
# # x is the parts of the data we are using to predict Location, route, region, and Geographical Grouping
# x[1]

In [8]:
# train = 2000
# x_tr = x[:train,:]
# y_tr = y[:train]
# x_ts = x[train:,:]
# y_ts = y[train:]
# from sklearn import linear_model

In [9]:
def initTest(df):
    y = np.array((pd.to_numeric(df['Number of survivors']) / df['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)


    x = np.array(df[['percent female', 'percent kids', 'percent male', 'day','month']])
    x = np.column_stack((x,np.array(df.loc[:,'UNSD_0':])))
    
    x = np.nan_to_num(x)
    x[:,0] = (100*x[:,0]).astype(int)
    x[:,1] = (100*x[:,1]).astype(int)
    x[:,2] = (100*x[:,2]).astype(int)
    
    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index],y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
        print(regr.coef_)
        print()
    print()
initTest(sixteen)

train:  0.116304628372
test:  0.103028043632
[ -1.40149394e-01   2.38733457e-02  -1.32286811e-01   9.10707121e-03
  -3.47441045e-01   1.12078274e+01   3.95012134e+00  -2.53361631e+00
  -4.16267447e+00   3.07913398e+01   3.11484148e+00  -6.32334090e+00
  -1.51877285e+00  -9.75164231e+00  -1.15723285e+00  -2.08666862e+00
  -1.65148514e+01  -6.45324248e+00   3.08771507e+00   1.47455951e+00
  -6.00257206e-01  -1.24908888e+00  -1.27501632e+00   7.58805425e+00
  -7.85246284e-01   2.02657014e+01   6.86475485e-01  -5.51238123e+00
   2.09380657e+00  -1.00158633e+00  -6.14156851e+00  -1.89199379e+00
  -3.25554108e+00  -1.24886721e+01  -6.92094682e-01   3.26370719e+00
  -2.12866092e+00  -1.56808854e+00  -2.31257048e+00  -6.60673515e+00
  -9.14175813e+00   3.99650850e+00  -3.91572551e+00  -2.11344377e+01
   3.23356894e+00  -1.19345074e+01   1.45839367e+01  -3.87185458e+00
  -1.33385919e+01   5.51694594e+01  -4.50202798e+00   2.38732834e+00
  -1.04450441e+00]

train:  0.113125764802
test:  0.112806

In [None]:
onlyUS = df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Northern America')]

In [None]:
initTest(onlyUS)
initTest(df)
initTest(df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Central America')])