In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit

In [21]:
#creating whole dataset
seventeen = pd.read_json("https://missingmigrants.iom.int/global-figures/2017/json")
sixteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2016/json")
fifteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2015/json")
fourteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2014/json")

frames = [seventeen,sixteen,fifteen,fourteen]

df = pd.concat(frames)

In [54]:
#cleaning data
df = df.replace('', 0,regex=True)
df = df.replace('nan', 0, regex=True)
df.drop(['Information Source'])
df = df[df['Web ID'] != 42424] #drops row with -1 as number of survivors


df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])
df['percent female'] = (pd.to_numeric(df['Number of Female'])/ df['total people']).fillna(0)
df['percent male'] =  (pd.to_numeric(df['Number of Male'])/ df['total people']).fillna(0)
df['percent kids'] =  (pd.to_numeric(df['Number of Children']) / df['total people']).fillna(0)
df['month'] = df['Reported Date'].dt.month
df['day'] = df['Reported Date'].dt.day

# the index will help us get back from number to category later
##df['UNSD Geographical Grouping'], geoIndex = pd.Series(df['UNSD Geographical Grouping']).factorize()
#df['Migrant Route'], migrantIndex = pd.Series(df['Migrant Route']).factorize()
#df['Region of Incident'], regionIndex = pd.Series(df['Region of Incident']).factorize()
# one hot coding
ohc_UNSD = pd.get_dummies(df['UNSD Geographical Grouping'])
ohc_Route = pd.get_dummies(df['Migrant Route'])
ohc_Region = pd.get_dummies(df['Reg'])

Unnamed: 0,Caribbean,Cause of death,Central America,East Asia,Europe,Horn of Africa,Information Reliability,Information Source,Location,Location Description,...,UNSD Geographical Grouping,URL,US-Mexico Border,Web ID,day,month,percent female,percent kids,percent male,total people
0,,Presumed drowning,,,,,Verified,0,"35.882833113479, -5.803099736328",Unspecified location off the coast of Morocco,...,Uncategorized,"http://bit.ly/2zjWYzz, http://bit.ly/2zjX9uJ",,44036.0,16.0,12.0,0.000000,0.000000,0.000000,23.0
1,,Hypothermia,,,,,Partially verified,The Monitor,"27.071795621620, -98.265021009375","Ranch near Falfurrias, Brooks County, Texas",...,Uncategorized,http://bit.ly/2CyYtsz,,44037.0,14.0,12.0,0.000000,0.000000,0.000000,1.0
2,,Drowning,,,,,Verified,Salvamento Marítimo,"35.938908600000, -3.180596302148","5 nautical miles west of Alboran Island, betwe...",...,Uncategorized,http://bit.ly/2C5bx8y,,44027.0,14.0,12.0,0.000000,0.000000,0.000000,69.0
3,,Presumed drowning,,,,,Verified,Salvamento Marítimo,"36.214401400000, -3.419552700000","Unspecified location in the Alboran Sea, betwe...",...,Uncategorized,http://bit.ly/2jWbwxp,,44026.0,13.0,12.0,0.000000,0.000000,0.000000,33.0
4,,Hypothermia,,,,,Partially verified,KGNS,"27.817448700000, -99.012892600000","Various locations in Texas, USA",...,Northern America,http://bit.ly/2BlQWQN,,44025.0,13.0,12.0,0.200000,0.000000,0.800000,10.0
5,,Hypothermia,,,,,Verified,U.S. Border Patrol,"27.252321571872, -97.863335577246","Paloma Ranch, near Eagle Pass, Maverick County...",...,Northern America,"http://bit.ly/2BT4Czx, http://bit.ly/2AqCeZe, ...",,44017.0,11.0,12.0,0.000000,0.000000,0.066667,15.0
6,,Hypothermia,,,,,Partially verified,"El Diaro de Sonora, InfoNogales","31.329820649800, -110.920176967090","Colonia Pima 2, northeast Nogales, Sonora, Mexico",...,Central America,"http://bit.ly/2Av0cCl, http://bit.ly/2BagwrF",,44024.0,9.0,12.0,0.000000,0.000000,1.000000,1.0
7,,Drowning,,,,,Partially verified,Le Quotidien d'Oran,"36.949195213437, 8.448509527539","Off Plage de Messida, El Kala, wilaya of El Ta...",...,Uncategorized,"http://bit.ly/2javfZd, http://bit.ly/2Cz4wNx",,44035.0,8.0,12.0,0.000000,0.000000,0.250000,12.0
8,,Hypothermia,,,,,Verified,U.S. Border Patrol,"29.127486900000, -103.242537900000","Big Bend area, near Marfa, Presidio County, Te...",...,Northern America,"http://bit.ly/2iUG8OU, http://bit.ly/2l1uhCP, ...",,44023.0,8.0,12.0,0.000000,0.000000,0.066667,15.0
9,,Unknown,,,,,Verified,U.S. Border Patrol,"27.151251312948, -98.203222913672","Ranch near Falfurrias, Brooks County, Texas",...,Northern America,http://bit.ly/2jz49Pk,,44021.0,7.0,12.0,0.000000,0.000000,0.000000,1.0


In [32]:
#separating data
fourteen = df[(df['Reported Date'] >= '2014-01-01') & (df['Reported Date'] < '2015-01-01')]
fifteen = df[(df['Reported Date'] >= '2015-01-01') & (df['Reported Date'] < '2016-01-01')]
sixteen = df[(df['Reported Date'] >= '2016-01-01') & (df['Reported Date'] < '2017-01-01')]
seventeen = df[(df['Reported Date'] >= '2017-01-01') & (df['Reported Date'] < '2018-01-01')]

In [39]:
total_data = np.concatenate((np.array(fourteen),np.array(fifteen),np.array(sixteen),np.array(seventeen)))
np.random.shuffle(total_data)
y = total_data[:,11].astype(int) / (total_data[:,14].astype(int) + total_data[:,11].astype(int)) 
# y is what we predict -- total sruvivors over total survivors+missing and dead 
y = np.nan_to_num(y)
y = (100*y).astype(int)
# x is the parts of the data we are using to predict Location, route, region, and Geographical Grouping
x[1]

  This is separate from the ipykernel package so we can avoid doing imports until


array([0, 'US-Mexico Border', 'Northern America'], dtype=object)

In [40]:
train = 2000
x_tr = x[:train,:]
y_tr = y[:train]
x_ts = x[train:,:]
y_ts = y[train:]
from sklearn import linear_model

In [53]:
def initTest(df):
    y = np.array((pd.to_numeric(df['Number of survivors']) / df['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)


    x = np.array(df[['percent female', 'percent kids', 'percent male', 'day','month']])
    
    x = np.nan_to_num(x)
    print(x.shape)
    print(np.array(ohc_Region).shape)
    x[:,3] = (100*x[:,3]).astype(int)
    x[:,4] = (100*x[:,4]).astype(int)
    x[:,5] = (100*x[:,5]).astype(int)


    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index],y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
        print(regr.coef_)
        print()
    print()
initTest(sixteen)

(1239, 5)
(3191, 14)


IndexError: index 5 is out of bounds for axis 1 with size 5

In [29]:
onlyUS = df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Northern America')]

In [13]:
initTest(onlyUS)
initTest(df)
initTest(df[df['UNSD Geographical Grouping'] == geoIndex.get_loc('Central America')])

train:  0.0424244972933
test:  -0.0037503123579
[ 0.         -0.27017474  0.01892827 -0.05200004 -0.00093992 -0.05091294
 -0.02089713  0.11393549]

train:  0.0458488044058
test:  -0.160132944365
[  0.00000000e+00   2.08166817e-17   3.16414433e-02  -5.49076145e-02
   2.99037036e-03  -5.43578601e-02  -5.13860140e-02   2.23140405e-01]

train:  0.0358732900304
test:  0.0312031628027
[ 0.         -0.28562755  0.03790224 -0.04170855  0.00181967 -0.04050936
 -0.05433346  0.21099163]


train:  5.85401037247e-05
test:  -0.000834101585802
[  1.54921797e-09  -9.16960237e-10   1.44737299e-09   8.76251527e-06
  -8.76251527e-06  -2.88830341e-08   1.69732413e-10   5.80434177e-11]

train:  5.49241846519e-05
test:  -0.00504737383895
[  5.86868581e-34   0.00000000e+00   4.27613726e-34   4.90610548e-19
   4.90610548e-19  -1.58167733e-33  -1.09892281e-34   1.56697995e-36]

train:  5.90736787328e-05
test:  -0.00100962002004
[  4.09976594e-11  -3.17995807e-11   3.69700888e-11   4.34078474e-07
  -4.34078474e

  if sys.path[0] == '':
  del sys.path[0]
