In [9]:
# Set up Notebook
% matplotlib inline

# Standard imports
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from matplotlib import cm


# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [10]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
labels = ['clust_logFCS', 'clust_RCSI', 'clust_HDDS']

In [11]:
x_train=train_data.drop(labels, axis=1)
x_train = x_train.drop(x_train.columns[0], axis=1)
x_test=test_data.drop(labels, axis=1)
x_test = x_test.drop(x_test.columns[0], axis=1)

x_train.head()

Unnamed: 0,clust_L12raincytot,clust_L12day1rain,clust_L12maxdays,clust_floodmax,clust_cells_own,clust_price,clust_thinn,clust_roof,clust_hhsize,clust_hh_age,clust_hh_gender,clust_asset,clust_dist_road,clust_dist_admarc,clust_percent_ag,clust_nutri_reten_constrained,clust_elevation,ipc_lag1,ipc_lag12
0,1090.0613,43,17,0.0,0.125,3.824215,0.359375,0.125,5.4375,41.625,1.25,-0.327686,1.395625,5.176875,0.6,0.0,1311.875,1.0,
1,855.86176,58,40,0.0,0.4375,3.917409,0.25,0.1875,5.6875,36.5625,1.3125,-0.202549,0.756875,5.40875,0.6,0.9375,496.0,1.0,
2,1300.2437,53,29,0.0,1.5,3.683867,0.25,0.5,6.375,40.4375,1.375,0.548275,0.163125,17.038126,0.45,1.0,526.75,1.0,
3,1036.9712,52,29,0.0,0.625,4.061391,0.5,0.25,5.4375,46.875,1.1875,-0.077412,5.80375,7.724375,0.4375,0.0,564.1875,1.0,
4,952.1828,54,25,0.0,0.5625,3.8795,0.25,0.0625,7.0,43.8125,1.0,-0.202549,11.286875,11.103125,0.4875,1.0,1539.9375,1.0,


In [12]:
y_train = train_data[labels]
y_test = test_data[labels]
y_test.head()

Unnamed: 0,clust_logFCS,clust_RCSI,clust_HDDS
0,3.666015,12.3125,4.6875
1,3.718879,5.352941,5.235294
2,3.79983,5.08,5.72
3,3.844891,4.285714,5.095238
4,3.687936,0.8125,5.3125


In [13]:
from sklearn.preprocessing import Imputer,StandardScaler

imp = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
x_train = imp.fit_transform(x_train)
x_test = imp.fit_transform(x_test)

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)


In [15]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
x_train = poly.fit_transform(x_train)
x_test =  poly.fit_transform(x_test)



In [16]:
from sklearn.linear_model import LinearRegression

# Create and fit our linear regression model to training data
model = LinearRegression(fit_intercept=True)
model.fit(x_train, y_train[labels[2]])

# Compute model predictions for test data
pred = model.predict(x_test)

actual = y_test[labels[2]]
r2_linear = stats.pearsonr(actual, pred)[0] ** 2
r2_linear

0.29157328215968636

In [17]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True)

parameters = {'max_depth':np.arange( 1,4, 1 ).tolist(), 'min_samples_leaf':np.arange( 1, 4, 1 ).tolist()}
clf = GridSearchCV(rfc, parameters,cv=6, n_jobs= 4, iid = True,  refit= True,pre_dispatch= '2*n_jobs')
clf.fit(x_train, y_train[labels[2]])

# Fit estimator and display score

# Regress on test data
pred = clf.predict(x_test)

actual = y_test[labels[2]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.6326097409989042

In [29]:
# fit on logFCS 
from sklearn.ensemble import RandomForestRegressor

# Create Regressor with default properties
rfc = RandomForestRegressor(random_state =0,n_jobs =4,warm_start = True,max_depth=4, min_samples_leaf=5 )

rfc.fit(x_train, y_train[labels[2]])
# Fit estimator and display score

# Regress on test data
pred = rfc.predict(x_test)

actual = y_test[labels[2]]
r2_rfc = stats.pearsonr(actual, pred)[0] ** 2
r2_rfc

0.6473430686792643

In [53]:
from sklearn.linear_model import RidgeCV
ridge = RidgeCV(alphas=(400,800), fit_intercept=True, normalize=False, scoring=None, cv=5, gcv_mode='auto', store_cv_values=False) 

# Define different alpha values for different fits
# alpha = [0.0, 1E-6, 1E-4, 1E-2, 1.0]

ridge.fit(x_train, y_train[labels[2]])
pred = ridge.predict(x_test)

actual = y_test[labels[2]]
r2_ridge= stats.pearsonr(actual, pred)[0] ** 2
r2_ridge

0.6236000039039707

In [74]:
from sklearn.linear_model import BayesianRidge
bridge = BayesianRidge(alpha_1=30, alpha_2=70,lambda_1=0.01, compute_score=True)
bridge.fit(x_train, y_train[labels[2]])
pred = bridge.predict(x_test)

actual = y_test[labels[2]]
r2_bridge= stats.pearsonr(actual, pred)[0] ** 2
r2_bridge

0.6246000099317235

In [85]:
from sklearn.linear_model import LassoCV

ls = LassoCV(eps=0.001, n_alphas=100, alphas=(0.01, 2), fit_intercept=True,precompute='auto',n_jobs=4, random_state=0, selection='cyclic')

ls = ls.fit(x_train, y_train[labels[2]])
pred = ls.predict(x_test)

actual = y_test[labels[2]]
r2_ls= stats.pearsonr(actual, pred)[0] ** 2
r2_ls

0.6687626368972859

In [129]:
from sklearn.linear_model import ElasticNetCV
en = ElasticNetCV(alphas=(0.1,0.02,3,2), copy_X=True, cv=10, eps=0.004, fit_intercept=True,
       l1_ratio=0.33, max_iter=1000, n_alphas=100, n_jobs=1,
       normalize=False, positive=False, precompute='auto', random_state=0,
       selection='cyclic', tol=0.0001, verbose=0)

en.fit(x_train, y_train[labels[2]])

pred = en.predict(x_test)

actual = y_test[labels[2]]
r2_en= stats.pearsonr(actual, pred)[0] ** 2
r2_en


0.6836678406828095

In [125]:
from sklearn.ensemble import GradientBoostingRegressor

# Create Regressor with default properties
gbr = GradientBoostingRegressor(random_state=0,learning_rate=0.1, n_estimators=19,subsample=1, criterion='friedman_mse', min_samples_split=3)

gbr.fit(x_train, y_train[labels[2]])
pred = gbr.predict(x_test)

actual = y_test[labels[2]]
r2_gbr= stats.pearsonr(actual, pred)[0] ** 2
r2_gbr


0.6639836307166467

In [128]:
type(pred)

numpy.ndarray

In [133]:
df = pd.DataFrame(actual)
df["pred"] = pred
df.to_csv('hddsplot.csv')

In [134]:
df

Unnamed: 0,clust_HDDS,pred
0,4.687500,4.724105
1,5.235294,5.036210
2,5.720000,5.740496
3,5.095238,4.761530
4,5.312500,4.997085
5,5.421053,5.203568
6,5.260870,4.856672
7,5.650000,4.888986
8,5.318182,4.648327
9,5.125000,5.141421
