In [1]:
# Set up Notebook
% matplotlib inline

# Standard imports
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm


# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
labels = ['clust_logFCS', 'clust_RCSI', 'clust_HDDS']

In [9]:
x_train=train_data.drop(labels, axis=1)
x_train = x_train.drop(x_train.columns[0], axis=1)
x_test=test_data.drop(labels, axis=1)
x_test = x_test.drop(x_test.columns[0], axis=1)

x_train.head()

Unnamed: 0,clust_L12raincytot,clust_L12day1rain,clust_L12maxdays,clust_floodmax,clust_cells_own,clust_price,clust_thinn,clust_roof,clust_hhsize,clust_hh_age,clust_hh_gender,clust_asset,clust_dist_road,clust_dist_admarc,clust_percent_ag,clust_nutri_reten_constrained,clust_elevation,ipc_lag1,ipc_lag12
0,1090.0613,43,17,0.0,0.125,3.824215,0.359375,0.125,5.4375,41.625,1.25,-0.327686,1.395625,5.176875,0.6,0.0,1311.875,1.0,
1,855.86176,58,40,0.0,0.4375,3.917409,0.25,0.1875,5.6875,36.5625,1.3125,-0.202549,0.756875,5.40875,0.6,0.9375,496.0,1.0,
2,1300.2437,53,29,0.0,1.5,3.683867,0.25,0.5,6.375,40.4375,1.375,0.548275,0.163125,17.038126,0.45,1.0,526.75,1.0,
3,1036.9712,52,29,0.0,0.625,4.061391,0.5,0.25,5.4375,46.875,1.1875,-0.077412,5.80375,7.724375,0.4375,0.0,564.1875,1.0,
4,952.1828,54,25,0.0,0.5625,3.8795,0.25,0.0625,7.0,43.8125,1.0,-0.202549,11.286875,11.103125,0.4875,1.0,1539.9375,1.0,


In [10]:
y_train = train_data[labels]
y_test = test_data[labels]
y_test.head()

Unnamed: 0,clust_logFCS,clust_RCSI,clust_HDDS
0,3.666015,12.3125,4.6875
1,3.718879,5.352941,5.235294
2,3.79983,5.08,5.72
3,3.844891,4.285714,5.095238
4,3.687936,0.8125,5.3125


In [11]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
x_train = imp.fit_transform(x_train)
x_test = imp.fit_transform(x_test)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)


In [7]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
x_train = poly.fit_transform(x_train)
x_test =  poly.fit_transform(x_test)



In [20]:
import numpy as np
y_train_new = np.arcsinh(np.array ( y_train[labels[1]])) 
y_test_new = np.arcsinh(np.array (y_test[labels[1]])) 

In [21]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train_new)

# Compute model predictions for test data
pred = model.predict(x_test)

actual =y_test_new
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.12297431987562661

In [22]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True)

parameters = {'max_depth':np.arange( 1,4, 1 ).tolist(), 'min_samples_leaf':np.arange( 1, 4, 1 ).tolist()}
clf = GridSearchCV(rfc, parameters,cv=6, n_jobs= 4, iid = True,  refit= True,pre_dispatch= '2*n_jobs')
clf.fit(x_train, y_train_new)

# Fit estimator and display score

# Regress on test data
pred = clf.predict(x_test)

actual =y_test_new
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.04498710944727587

In [147]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True,max_depth=4, min_samples_leaf=3 )

rfc.fit(x_train, y_train_new)
# Fit estimator and display score

# Regress on test data
pred = rfc.predict(x_test)

actual =y_test_new
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.04181011537703592

In [34]:
from sklearn.linear_model import RidgeCV
ridge = RidgeCV(alphas=(0.2,0.6,0.01), fit_intercept=True, normalize=False, scoring=None, cv=5, gcv_mode='auto', store_cv_values=False) 

# Define different alpha values for different fits
# alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge.fit(x_train, y_train_new)
pred = ridge.predict(x_test)

actual =y_test_new
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.12296352897584324

In [184]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(compute_score=True)
bridge.fit(x_train, y_train_new)
pred = bridge.predict(x_test)

actual =y_test_new
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.12311634209636622

In [142]:
from sklearn.linear_model import LassoCV

ls = LassoCV(eps=0.001, n_alphas=100, alphas=(2.1,3,0.1), fit_intercept=True,precompute='auto',n_jobs=4, random_state=0, selection='cyclic')

ls = ls.fit(x_train, y_train_new)
pred = ls.predict(x_test)

actual =y_test_new
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.12092485754124149

In [143]:
from sklearn.linear_model import ElasticNetCV
en = ElasticNetCV(alphas=(1,4,0.1), copy_X=True, cv=5, eps=0.01, fit_intercept=True,
       l1_ratio=0.3, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=0,
       selection='cyclic', tol=0.0001, verbose=0)

en.fit(x_train, y_train_new)

pred = en.predict(x_test)

actual =y_test_new
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.12006059193359798

In [144]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0,learning_rate=0.4, n_estimators=9,subsample=1, criterion='friedman_mse', min_samples_split=2)

gbr.fit(x_train, y_train_new)
pred = gbr.predict(x_test)

actual =y_test_new
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.06581221706443534

In [185]:
df = pd.DataFrame(actual)
df["pred"] = pred
df["actual"]=actual
df.to_csv('rcsi.csv')

In [183]:
import seaborn as sns

# Create a linear model plot
sns.lmplot(pred, actual,df);

KeyError: '[0.09509471 0.14944313 0.26021146 0.36238645 0.59014369 0.6112243\n 0.66433062 0.69314718 0.72442749 0.74239938 0.78388429 0.80058527\n 0.81560892 0.9307718  0.97066545 0.97458795 0.98873638 1.01166966\n 1.03124095 1.04759301 1.06971075 1.09144892 1.09371574 1.14635208\n 1.15107199 1.17944404 1.18532247 1.20361645 1.2092725  1.22939201\n 1.23752966 1.25167473 1.25175675 1.25771931 1.27102712 1.27709082\n 1.27710412 1.28379563 1.29278711 1.31203311 1.3163993  1.31771843\n 1.33178905 1.33959751 1.34196347 1.34517913 1.36651524 1.38140346\n 1.38612459 1.40405705 1.40563117 1.40898812 1.41533008 1.43408623\n 1.44363548 1.44936111 1.45358378 1.45983148 1.46820817 1.47521817\n 1.48089992 1.48186076 1.48276017 1.48592073 1.50092044 1.50300739\n 1.50575677 1.50805916 1.50808078 1.50877332 1.52056927 1.52593129\n 1.52801629 1.52850819 1.53602125 1.53881785 1.54368973 1.54786547\n 1.55015796 1.55425274 1.5545636  1.55567537 1.55934966 1.56052152\n 1.56556828 1.56958279 1.57027854 1.57201075 1.57267643 1.5775041\n 1.5789269  1.58640363 1.5867403  1.58930663 1.59118566 1.60041372\n 1.60608824 1.61249723 1.61418122 1.61525854 1.62240262 1.62376575\n 1.62567301 1.62922161 1.63625214 1.63741315 1.64214786 1.64312763\n 1.64400094 1.64563729 1.65425616 1.65603762 1.65797244 1.65809937\n 1.65860543 1.66037546 1.66092733 1.66655739 1.66705501 1.67221934\n 1.67951155 1.68374311 1.68772442 1.69434256 1.69580584 1.69740375\n 1.69848458 1.69980492 1.70352187 1.70511377 1.70741145 1.70879416\n 1.71316321 1.71355379 1.71470156 1.71493324 1.71748495 1.72007393\n 1.72588295 1.72597561 1.72888543 1.73255651 1.73627759 1.73658808\n 1.73689484 1.74968264 1.75225564 1.7532289  1.75694107 1.76278783\n 1.76679474 1.77227615 1.77441908 1.78092593 1.78215152 1.78271184\n 1.79145261 1.79408961 1.81844646 1.83272326 1.83402244 1.83579248\n 1.846809   1.84960365 1.85637386 1.85725085 1.86196032 1.86312168\n 1.86428222 1.86484275 1.87188361 1.87442633 1.87701902 1.88647106\n 1.89090672 1.89467214 1.8973916  1.89956213 1.89965782 1.90280943\n 1.90927401 1.91289283 1.91708534 1.92025035 1.92451563 1.92571578\n 1.92802163 1.93102293 1.93249078 1.93520888 1.93787931 1.94052893\n 1.94840715 1.95228471 1.95474812 1.95760888 1.95878227 1.95915887\n 1.96572047 1.97292493 1.97614763 1.97878337 1.98275187 1.98329759\n 1.98968226 1.99580536 2.00670668 2.03222462 2.03934068 2.04441931\n 2.04820449 2.05798854 2.05946935 2.06577678 2.06895202 2.07242475\n 2.07340385 2.08233474 2.08517564 2.09471255 2.09769779 2.10500527\n 2.10629757 2.10739912 2.10904269 2.11358015 2.11807914 2.11868502\n 2.12075468 2.12088657 2.12285818 2.12313661 2.12459196 2.12879052\n 2.14211186 2.14454717 2.1531768  2.157422   2.1617757  2.16360047\n 2.16501681 2.16858713 2.17693911 2.18039922 2.18495632 2.19510147\n 2.19569905 2.20026887 2.20209106 2.20294744 2.20416966 2.20532877\n 2.21449971 2.22472674 2.23081416 2.23317181 2.23431254 2.244879\n 2.25077808 2.25183956 2.2564538  2.2602618  2.26521742 2.2762351\n 2.280993   2.29041153 2.29482999 2.32164628 2.32219734 2.32270831\n 2.32800813 2.3336757  2.34007685 2.35039711 2.35092568 2.35313329\n 2.36389186 2.365049   2.36809118 2.37256078 2.37579905 2.37940608\n 2.38136827 2.41492637 2.42172183 2.42882194 2.4293282  2.4310486\n 2.44514621 2.44982412 2.45757277 2.45828064 2.46548412 2.47179951\n 2.47334674 2.47672326 2.4789088  2.48309017 2.48704185 2.48986199\n 2.49177985 2.50140456 2.51087012 2.51212419 2.52214521 2.52581235\n 2.54240923 2.54278328 2.5467815  2.56661889 2.59584526 2.62371298\n 2.63477951 2.64412076 2.65052863 2.65576677 2.65750047 2.65889859\n 2.66511428 2.70857062 2.70898132 2.71845492 2.71978111 2.73237932\n 2.75366441 2.77081851 2.78185069 2.78297935 2.80859545 2.81130179\n 2.82738894 2.84393195 2.85403107 2.92067985 2.94411398 3.01467158\n 3.05151164 3.06689587 3.07870433 3.16725065 3.17978544 3.20540722\n 3.31348878 3.48389029] not in index'

In [186]:
df

Unnamed: 0,0,pred,actual
0,3.205407,2.134089,3.205407
1,2.379406,1.534749,2.379406
2,2.328008,1.413559,2.328008
3,2.161776,1.960183,2.161776
4,0.742399,2.094010,0.742399
5,3.078704,1.585790,3.078704
6,1.481861,1.699915,1.481861
7,1.508773,1.946352,1.508773
8,1.846809,1.766949,1.846809
9,1.550158,2.616829,1.550158
