In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [51]:
#creating whole dataset
seventeen = pd.read_json("https://missingmigrants.iom.int/global-figures/2017/json")
sixteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2016/json")
fifteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2015/json")
fourteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2014/json")

In [52]:
frames = [seventeen,sixteen,fifteen,fourteen]
df = pd.concat(frames)

In [53]:
#cleaning data
df = df.replace('', 0,regex=True)
df = df.replace('nan', 0, regex=True)
df.drop(['Information Source'])
df = df[df['Web ID'] != 42424] #drops row with -1 as number of survivors

df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])
df['percent female'] = (pd.to_numeric(df['Number of Female'])/ df['total people']).fillna(0)
df['percent male'] =  (pd.to_numeric(df['Number of Male'])/ df['total people']).fillna(0)
df['percent kids'] =  (pd.to_numeric(df['Number of Children']) / df['total people']).fillna(0)
df['month'] = df['Reported Date'].dt.month
df['day'] = df['Reported Date'].dt.day

#latitude and longitude to cartesian
R = 6371
df['x'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.cos(float(latLong.split(", ")[1])))
df['y'] = df['Location'].apply(lambda latLong: R*math.cos(float(latLong.split(", ")[0]))*math.sin(float(latLong.split(", ")[1])))
df['z'] = df['Location'].apply(lambda latLong: R*math.sin(float(latLong.split(", ")[0])))

In [54]:
#separating data
# fourteen = df[(df['Reported Date'] >= '2014-01-01') & (df['Reported Date'] < '2015-01-01')]
# fifteen = df[(df['Reported Date'] >= '2015-01-01') & (df['Reported Date'] < '2016-01-01')]
# sixteen = df[(df['Reported Date'] >= '2016-01-01') & (df['Reported Date'] < '2017-01-01')]
# seventeen = df[(df['Reported Date'] >= '2017-01-01') & (df['Reported Date'] < '2018-01-01')]

In [55]:
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit

In [56]:
def linear(dataframe):
    # the index will help us get back from number to category later
    dataframe['UNSD Geographical Grouping'], geoIndex = pd.Series(dataframe['UNSD Geographical Grouping']).factorize()
    dataframe['Migrant Route'], migrantIndex = pd.Series(dataframe['Migrant Route']).factorize()
    dataframe['Region of Incident'], regionIndex = pd.Series(dataframe['Region of Incident']).factorize()

    y = np.array((pd.to_numeric(dataframe['Number of survivors']) / dataframe['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)


    x = np.array(dataframe[['UNSD Geographical Grouping', 'Migrant Route', 'Region of Incident', 'percent female',
                     'percent kids', 'percent male', 'month', 'day']])
    x = np.nan_to_num(x)

    x[:,3] = (100*x[:,3]).astype(int)
    x[:,4] = (100*x[:,4]).astype(int)
    x[:,2] = (100*x[:,2]).astype(int)


    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index],y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
    print()

In [57]:
linearFrame = df.copy(deep=True) #as to keep same df for later use

linear(linearFrame)
linear(linearFrame[linearFrame['UNSD Geographical Grouping'] == geoIndex.get_loc('Central America')])



train:  0.189575694258
test:  0.173185902101
train:  0.181848515751
test:  0.191545532187
train:  0.181317381178
test:  0.200121181546

train:  0.328144743217
test:  0.414092450658
train:  0.391303557173
test:  0.160383453985
train:  0.366127206034
test:  0.318608173157



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [58]:
import keras
from keras.models import Model,Sequential
from keras.layers import Dense,Activation
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras import optimizers

In [59]:
def neural(dataframe):
    ohc_UNSD = pd.get_dummies(dataframe['UNSD Geographical Grouping'],prefix='UNSD')
    ohc_Route = pd.get_dummies(dataframe['Migrant Route'],prefix='Route')
    ohc_Region = pd.get_dummies(dataframe['Region of Incident'],prefix='Region')

    #makes a copy of dataframe,  does not alter it
    frame = dataframe.join(ohc_UNSD)
    frame = frame.join(ohc_Region)
    frame = frame.join(ohc_Route)

    y = np.array((pd.to_numeric(frame['Number of survivors']) / frame['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)


    x = np.array(frame[['percent female', 'percent kids', 'percent male', 'day','month','x','y','z']])
    x = np.column_stack((x,np.array(frame.loc[:,'UNSD_0':])))
    
    x = np.nan_to_num(x)
    x[:,0] = (100*x[:,0]).astype(int)
    x[:,1] = (100*x[:,1]).astype(int)
    x[:,2] = (100*x[:,2]).astype(int)
    
    ntr = len(y)//4 *3
    x_tr = x[:ntr,:]
    y_tr = y[:ntr]
    x_ts = x[ntr:,:]
    y_ts = y[ntr:]
    
    K.clear_session()
    
    nin = x.shape[1]
    nout = int(np.max(y)+1) 
    model = Sequential()
    model.add(Dense(100, input_shape=(nin,), activation='sigmoid', name='hidden'))
    model.add(Dense(nout, activation='softmax', name='output'))

    model.summary()
    
    opt = optimizers.Adam(lr=0.001) # beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=opt,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(x_tr, y_tr, epochs=10, batch_size=100, validation_data=(x_ts,y_ts))

In [60]:
neural(df)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden (Dense)               (None, 100)               5700      
_________________________________________________________________
output (Dense)               (None, 100)               10100     
Total params: 15,800
Trainable params: 15,800
Non-trainable params: 0
_________________________________________________________________
Train on 82317 samples, validate on 27442 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
