In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import ShuffleSplit

In [2]:
#creating whole dataset
seventeen = pd.read_json("https://missingmigrants.iom.int/global-figures/2017/json")
sixteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2016/json")
fifteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2015/json")
fourteen = pd.read_json("https://missingmigrants.iom.int/global-figures/2014/json")

frames = [seventeen,sixteen,fifteen,fourteen]

df = pd.concat(frames)

In [11]:
#cleaning data
df = df.replace('', 0,regex=True)
df = df.replace('nan', 0, regex=True)
df.drop(['Information Source'])

df['Reported Date'] = pd.to_datetime(df['Reported Date'])
df['month'] = df['Reported Date'].dt.month
# the index will help us get back from number to category later
df['UNSD Geographical Grouping'], geoIndex = pd.Series(df['UNSD Geographical Grouping']).factorize()
df['Migrant Route'], migrantIndex = pd.Series(df['Migrant Route']).factorize()
df['Region of Incident'], regionIndex = pd.Series(df['Region of Incident']).factorize()

In [12]:
#separating data
fourteen = df[(df['Reported Date'] >= '2014-01-01') & (df['Reported Date'] < '2015-01-01')]
fifteen = df[(df['Reported Date'] >= '2015-01-01') & (df['Reported Date'] < '2016-01-01')]
sixteen = df[(df['Reported Date'] >= '2016-01-01') & (df['Reported Date'] < '2017-01-01')]
seventeen = df[(df['Reported Date'] >= '2017-01-01') & (df['Reported Date'] < '2018-01-01')]

# print(fourteen.shape)
print(fourteen.columns.values)

['Cause of death' 'Information Reliability' 'Information Source' 'Location'
 'Location Description' 'Migrant Route' 'Number Dead' 'Number Missing'
 'Number of Children' 'Number of Female' 'Number of Male'
 'Number of survivors' 'Region of Incident' 'Reported Date'
 'Total Dead and Missing' 'UNSD Geographical Grouping' 'URL' 'Web ID'
 'month']


In [13]:
total_data = np.concatenate((np.array(fourteen),np.array(fifteen),np.array(sixteen),np.array(seventeen)))
np.random.shuffle(total_data)
y = total_data[:,11].astype(int) / (total_data[:,14].astype(int) + total_data[:,11].astype(int)) 
# y is what we predict -- total sruvivors over total survivors+missing and dead 
y = np.nan_to_num(y)
y = (100*y).astype(int)
x = np.column_stack((total_data[:,5], total_data[:,12],total_data[:,15]))
# x is the parts of the data we are using to predict Location, route, region, and Geographical Grouping
x[1]

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  


array([1, 9, 10], dtype=object)

In [14]:
train = 2000
x_tr = x[:train,:]
y_tr = y[:train]
x_ts = x[train:,:]
y_ts = y[train:]
from sklearn import linear_model


In [15]:
regr = linear_model.LinearRegression()
regr.fit(x_tr,y_tr)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
regr.coef_

array([ -1.41180276e+15,  -8.77802167e+15,   7.31027913e+15])

In [17]:
#training data
y_pred_tr = regr.predict(x_tr)
RSS = np.mean((y_pred_tr - y_tr) **2)/(np.std(y_tr)**2)
print("RSS: = {0:f}".format(RSS))
print("R^2: = {0:f}".format(1-RSS))
print("R^2= {0:f}".format(regr.score(x_tr,y_tr)))

RSS: = 0.991794
R^2: = 0.008206
R^2= 0.008206


In [12]:
#test data
y_pred_ts = regr.predict(x_ts)

RSS = np.mean((y_pred_ts - y_ts) **2)/(np.std(y_ts)**2)
print("RSS: = {0:f}".format(RSS))
print("R^2: = {0:f}".format(1-RSS))




RSS: = 451701064949588129380611129344.000000
R^2: = -451701064949588129380611129344.000000
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -2.63268018953e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -3.37513206102e+15
0 2.59995936925e+15
0 -2.63268018953e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -2.63268018953e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -3.37513206102e+15
0 2.59995936925e+15
0 -2

0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -3.37513206102e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -1.85255269678e+16
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
84 -3.57360552991e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
26 -3.64508012587e+16
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 -3.37513206102e+15
0 2.59995936925e+15
0 2.59995936925e+15
0 2.599959369

In [10]:
def initTest(df):
    df['total people'] = pd.to_numeric(df['Number of survivors']) + pd.to_numeric(df['Total Dead and Missing'])
    y = np.array((pd.to_numeric(df['Number of survivors']) / df['total people']).fillna(0))
    y = np.nan_to_num(y)
    y[y == np.abs(np.inf)] = 0
    
    y = (100*y).astype(int)

    df['percent female'] = (pd.to_numeric(df['Number of Female'])/ df['total people']).fillna(0)
    df['percent male'] =  (pd.to_numeric(df['Number of Male'])/ df['total people']).fillna(0)
    df['percent kids'] =  (pd.to_numeric(df['Number of Children']) / df['total people']).fillna(0)
    

    x = np.array(df[['UNSD Geographical Grouping', 'Migrant Route', 'Region of Incident','percent female', 'percent kids', 'percent male', 'month']])
    x = np.nan_to_num(x)

    x[:,3] = (100*x[:,3]).astype(int)
    x[:,4] = (100*x[:,4]).astype(int)
    x[:,5] = (100*x[:,5]).astype(int)


    rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

    for train_index, test_index in rs.split(x):
        regr = linear_model.LinearRegression()
        regr.fit(x[train_index],y[train_index])
        print("train: ", regr.score(x[train_index],y[train_index]) )
        print("test: ", regr.score(x[test_index],y[test_index]))
initTest(sixteen)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

KeyError: "['month'] not in index"