# Basic data read & split

In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
X = []
Y = []
with open('selected_features_x.csv') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    line = 0
    for row in csv_reader:
        if line != 0:
            X.append(row)
        line = line + 1
with open('selected_features_y.csv') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    line = 0
    for row in csv_reader:
        if line != 0:
            Y.append(row[0])
        line = line + 1
X = np.array(X, dtype=int)
Y = np.array(Y, dtype=int)

# 80 % training. 10 % validation. 10 % testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.1)

x_ensemble_train = []
x_ensemble_test = []
x_ensemble_val = []

Function to calculate RMSE

In [3]:
def rmse(y_actual, y_pred):
    return sqrt(mean_squared_error(y_actual, y_pred))

# Ordinal Regression

In [4]:
import mord

In [5]:
step = 10000
yo_train = np.copy(y_train)
yo_test = np.copy(y_test)
yo_val = np.copy(y_val)
for i in range(yo_train.size):
    yo_train[i] = yo_train[i] // step
for i in range(yo_test.size):
    yo_test[i] = yo_test[i] // step
for i in range(yo_val.size):
    yo_val[i] = yo_val[i] // step

Train and validation

In [6]:
alpha = 110.0
ord_reg = mord.OrdinalRidge(alpha)
ord_reg.fit(x_train, yo_train)

# RMSE = rmse(yo_val, ord_reg.predict(x_val))
# print('RMSE on validation data', RMSE * step)

print('R2 loss on test data is', r2_score(yo_val, ord_reg.predict(x_val)))

R2 loss on test data is 0.8724538113386091


In [7]:
yo_pred = ord_reg.predict(x_test)
print('R2 loss on test data is', r2_score(yo_test, yo_pred))
# RMSE = rmse(yo_test, yo_pred)
# print('RMSE on testing data', RMSE * step)

R2 loss on test data is 0.9066593259974464


# PCA followed by Ridge Regression

In [8]:
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

PCA with n_comp components

In [9]:
n_comp = 31
pca = PCA(n_comp)
proj = pca.fit(x_train)

xhat_val = pca.transform(x_val)
xhat_train = pca.transform(x_train)

Ridge Regression on transformed data

In [10]:
alpha = 20
RR = Ridge(alpha)
RR.fit(xhat_train, y_train)

# RMSE = rmse(y_val, RR.predict(xhat_val))
# print('RMSE on validation data', RMSE)
print('R2 loss on validation data', r2_score(y_val, RR.predict(xhat_val)))

R2 loss on validation data 0.8700033373967015


In [11]:
xhat_test = pca.transform(x_test)
y_pred = RR.predict(xhat_test)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.8981699322932527


# Adaboost

In [12]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

  from numpy.core.umath_tests import inner1d


Train and validation 

In [13]:
n_est = 100
dt_max_dep = 12
adareg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=dt_max_dep), n_estimators=n_est)
adareg.fit(x_train, y_train)

# RMSE = rmse(y_val, adareg.predict(x_val))
# print('RMSE on validation data', RMSE)
x_ensemble_train.append(adareg.predict(x_train))
x_ensemble_val.append(adareg.predict(x_val))
print('R2 loss on validation data', r2_score(y_val, adareg.predict(x_val)))

R2 loss on validation data 0.891599445964292


In [14]:
y_pred = adareg.predict(x_test)
x_ensemble_test.append(y_pred)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.9978005695690043


# Random Forest Regressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
n_est = 100
rf_reg = RandomForestRegressor(n_estimators=n_est, criterion='mse')
rf_reg.fit(x_train, y_train)

# RMSE = rmse(y_val, rfreg.predict(x_val))
# print('RMSE on validation data', RMSE)
x_ensemble_train.append(rf_reg.predict(x_train))
x_ensemble_val.append(rf_reg.predict(x_val))
print('R2 loss on validation data', r2_score(y_val, rf_reg.predict(x_val)))

R2 loss on validation data 0.8995334163110568


In [17]:
y_pred = rf_reg.predict(x_test)
x_ensemble_test.append(y_pred)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.985271923453513


# Extra Trees Regression

In [18]:
from sklearn.ensemble import ExtraTreesRegressor

In [19]:
n_est = 120
ext_reg = ExtraTreesRegressor(n_estimators=n_est, criterion='mse')
ext_reg.fit(x_train, y_train)

# RMSE = rmse(y_val, extreg.predict(x_val))
# print('RMSE on validation data', RMSE)
x_ensemble_train.append(ext_reg.predict(x_train))
x_ensemble_val.append(ext_reg.predict(x_val))
print('R2 loss on validation data', r2_score(y_val, ext_reg.predict(x_val)))

R2 loss on validation data 0.8804973291864597


In [20]:
y_pred = ext_reg.predict(x_test)
x_ensemble_test.append(y_pred)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.9950417034402779


# Gradient Boosting Regressor

In [21]:
from sklearn.ensemble import GradientBoostingRegressor

In [22]:
n_est = 600
max_dep = 5
gb_reg = GradientBoostingRegressor(n_estimators=n_est, max_depth=max_dep)
gb_reg.fit(x_train, y_train)

# RMSE = rmse(y_val, gbreg.predict(x_val))
# print('RMSE on validation data', RMSE)
x_ensemble_train.append(gb_reg.predict(x_train))
x_ensemble_val.append(gb_reg.predict(x_val))
print('R2 Loss on validation data', r2_score(y_val, gb_reg.predict(x_val)))

R2 Loss on validation data 0.8903744433401195


In [23]:
y_pred = gb_reg.predict(x_test)
x_ensemble_test.append(y_pred)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.9959337493845545


# Bagging Regressor

In [24]:
from sklearn.ensemble import BaggingRegressor

Train and validation 

In [25]:
n_est = 20
bag_reg = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=15),n_estimators=n_est)
bag_reg.fit(x_train, y_train)

# RMSE = rmse(y_val, bagreg.predict(x_val))
# print('RMSE on validation data', RMSE)
x_ensemble_train.append(bag_reg.predict(x_train))
x_ensemble_val.append(bag_reg.predict(x_val))
print('R2 Loss on validation data', r2_score(y_val, bag_reg.predict(x_val)))

R2 Loss on validation data 0.8896291558931003


In [26]:
y_pred = bag_reg.predict(x_test)
x_ensemble_test.append(y_pred)
print('R2 loss on test data', r2_score(y_test, y_pred))
# print('RMSE on testing data', rmse(y_test, y_pred))

R2 loss on test data 0.9819826654876976


# Ensembling all the above models using Ridge Regression

In [27]:
x_ensemble_train = np.transpose(np.array(x_ensemble_train))
x_ensemble_test = np.transpose(np.array(x_ensemble_test))
x_ensemble_val = np.transpose(np.array(x_ensemble_val))

In [28]:
print(x_ensemble_train.shape)

(1305, 5)


In [29]:
from sklearn.linear_model import Ridge

In [30]:
a = 50
RR_ens = Ridge(alpha=a)
RR_ens.fit(x_ensemble_train, y_train)
print('R2 Loss on validation data', r2_score(y_val, RR_ens.predict(x_ensemble_val)))

R2 Loss on validation data 0.8804973356332058


In [31]:
y_pred = RR_ens.predict(x_ensemble_test)
print('R2 loss on test data', r2_score(y_test, y_pred))

R2 loss on test data 0.9950417043786949
