# Prediction using Linear models

### Packages

In [None]:
# os libraries
import os

In [None]:
# numerical libraries
import numpy as np
import pandas as pd

In [None]:
# statistical learning libraries
import sklearn.ensemble as ens
import sklearn.feature_selection as fs
import sklearn.model_selection as ms
import sklearn.preprocessing as pr
import sklearn.linear_model as lm
import sklearn.neighbors as nb
import sklearn.svm as sv
import sklearn.neural_network as nn

### Functions

In [None]:
def MSE(model, X, y):
    '''
    Get MSE of model on a dataset.
    
    Arguments:
        model: fitted prediction model exposing a predict(X) method
        X: features, passed unchanged to model.predict
        y: true target values, one entry per row of X
        
    Returns:
        score: MSE loss, i.e. the sum of squared residuals divided by the
               number of rows of y
               
    Raises:
        ValueError: if predictions and targets do not have the same shape
    '''
    
    # work on plain arrays so that pandas index alignment cannot interfere
    y_true = np.asarray(y)
    y_pred = np.asarray(model.predict(X))
    
    # a column-vector target of shape (n, 1) would broadcast against
    # predictions of shape (n,) into an (n, n) matrix and silently give a
    # wrong loss; flatten both sides when their ranks differ
    if y_true.ndim != y_pred.ndim:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()
    
    if y_true.shape != y_pred.shape:
        raise ValueError('predictions have shape {} but targets have shape {}'
                         .format(y_pred.shape, y_true.shape))
    
    # compute number of points in data
    n = y_true.shape[0]
    
    # return loss
    return (1/n) * np.sum(np.square(y_pred - y_true))

In [None]:
def export_results(model, X, path='submissions/submit.csv'):
    '''
    Export results into CSV file for submission.
    
    The file has two columns: '_ID' (the index of X) and '0' (the
    predictions of the model on X).
    
    Arguments:
        model: regression model exposing a predict(X) method
        X: DataFrame of features; its index provides the '_ID' column
        path: destination of the CSV file (defaults to the original
              hard-coded location)
    '''
    
    # obtain predictions
    pred = model.predict(X)
    
    # obtain index of data
    idx = X.index
    
    # set in dataframe
    df_results = pd.DataFrame({'_ID': idx, '0': pred})
    
    # make sure the destination folder exists, otherwise to_csv fails
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    
    # save dataframe; the IDs are a regular column so no index is written
    df_results.to_csv(path, sep=',', index=False)

### Data Loading

In [None]:
# load the training features; the first CSV column is used as the row index
df_X_train = pd.read_csv(
    'data/input_training.csv',
    sep=',',
    header=0,
    index_col=0,
)

# keep a plain array view of the features alongside the DataFrame
X_train = df_X_train.values

In [None]:
# load the training targets; the first CSV column is used as the row index
df_y_train = pd.read_csv(
    'data/output_training.csv',
    sep=',',
    header=0,
    index_col=0,
)

# flatten the targets into a 1-D array
y_train = np.ravel(df_y_train.values)

In [None]:
# load the test features; the first CSV column is used as the row index
df_X_test = pd.read_csv(
    'data/input_testing.csv',
    sep=',',
    header=0,
    index_col=0,
)

# keep a plain array view of the features alongside the DataFrame
X_test = df_X_test.values

### Data Visualisation

In [None]:
import matplotlib.pyplot as plt

# plot the training targets in increasing order
sorted_targets = np.sort(y_train)
plt.plot(sorted_targets)

### Data Normalisation

In [None]:
# stack the train and test features (train rows first)
df = pd.concat([df_X_train, df_X_test])

# fit a MinMaxScaler on the stacked features and scale them in one step
# NOTE(review): the scaler sees the test features as well as the training
# ones - confirm this is intended
scaler = pr.MinMaxScaler()
df_scaled = scaler.fit_transform(df)

# wrap the scaled array back into a DataFrame with the original labels
df = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

In [None]:
# number of training rows, read before df_X_train is reassigned below
n_train = df_X_train.shape[0]

# split the scaled frame by position: the training rows were stacked first,
# so the first n_train rows are the training set whatever the index labels
# are (truncate() selects by index label, which is only equivalent when the
# labels happen to be 1..n in sorted order)
df_X_train = df.iloc[:n_train].copy()
X_train = df_X_train.values

# the remaining rows are the test set
df_X_test = df.iloc[n_train:].copy()
X_test = df_X_test.values

### Exploration and creation of an augmented dataset

In [None]:
# create summary dataset with one row per feature of the scaled data
summary = pd.DataFrame(columns=['Mean', 'Standard deviation', 'Range', 'Number of values', 'Values'], index=df.columns)

# create Pandas summary dataset (rows are labelled 'mean', 'std', 'min', 'max', ...)
summary_df = df.describe()

# features with more distinct values than this do not get their values listed
max_listed_values = 50

# compute statistics for each feature
for feature in df.columns:
    # look statistics up by label rather than by position, and keep them in
    # one Series so that the builtins min/max are not overwritten
    stats = summary_df[feature]
    values = set(df[feature])
    n_values = len(values)
    
    # list the distinct values only when there are few of them
    if n_values <= max_listed_values:
        listed_values = ', '.join(["{:0.2f}".format(x) for x in sorted(values)])
    else:
        listed_values = 'NA'
    
    # populate dataset
    summary.loc[feature] = pd.Series({'Mean': '{:0.2f}'.format(stats['mean']),
                                      'Standard deviation': '{:0.2f}'.format(stats['std']),
                                      'Range': '[{:0.2f}, {:0.2f}]'.format(stats['min'], stats['max']),
                                      'Number of values': '{:0.0f}'.format(n_values),
                                      'Values': listed_values})

In [None]:
# display the per-feature summary table as the cell output
summary

In [None]:
# uncomment to retrieve categorical features

#for x in summary.index:
#    print(x, summary.loc[x]['Values'])

In [None]:
# set list of categorical features (hard-coded column names)
# NOTE(review): presumably picked by hand from the 'Values' column of the
# summary table - confirm against the data before reusing
categorical_features = ['X3', 'X6', 'X11', 'X15', 'X16', 'X18', 'X19', 'X22', 'X28', 'X32', 'X33', 'X35', 'X36',
                        'X42', 'X49', 'X56', 'X58', 'X60', 'X62', 'X64', 'X68', 'X73', 'X74', 'X83', 'X86', 'X90',
                        'X104', 'X108', 'X109', 'X116', 'X117', 'X122', 'X130', 'X137', 'X139', 'X140', 'X141',
                        'X143', 'X144', 'X148', 'X149', 'X151', 'X162', 'X168', 'X169', 'X172', 'X174', 'X176',
                        'X177', 'X182', 'X184', 'X186', 'X187', 'X192', 'X193', 'X195', 'X196', 'X197', 'X199',
                        'X206', 'X209', 'X217', 'X219', 'X222', 'X231', 'X235', 'X238', 'X242', 'X246', 'X256',
                        'X260', 'X270', 'X275', 'X281', 'X285', 'X286', 'X291', 'X298', 'X301', 'X303', 'X304',
                        'X307', 'X308', 'X312', 'X314', 'X318', 'X330', 'X332', 'X336', 'X337', 'X338']

In [None]:
# names of all features that take exactly two distinct values
# (the counts are stored as strings in the summary table, hence the cast)
n_distinct = summary['Number of values'].astype(int)
categorical_features_two = summary.loc[n_distinct == 2].index

In [None]:
# categorical features that are not two-valued, in their original order
two_valued_names = set(categorical_features_two)
categorical_features_more_than_two = [
    name
    for name in categorical_features
    if name not in two_valued_names
]

In [None]:
# create augmented dataset by one-hot encoding features with strictly more than two possible values
# start from the scaled data without the columns that are going to be encoded
encoded_blocks = [df.drop(columns=categorical_features_more_than_two)]

for feature in categorical_features_more_than_two:
    # request float dummies: recent pandas versions return boolean columns by
    # default, which would turn .values of the mixed frame into an object array
    dummies = pd.get_dummies(df[feature], dtype=float)
    
    # name the new columns '<feature>-1', '<feature>-2', ...
    dummies.columns = [feature+'-'+str(i) for i in range(1, len(dummies.columns)+1)]
    encoded_blocks.append(dummies)

# append all encoded columns at once, in the order of the encoded features
df_augmented = pd.concat(encoded_blocks, axis=1)

In [None]:
# number of training rows: there is one target per training row
n_train = y_train.shape[0]

# split the augmented frame by position: the training rows were stacked
# first, so the first n_train rows are the training set whatever the index
# labels are (truncate() selects by index label, which is only equivalent
# when the labels happen to be 1..n in sorted order)
df_X_train_augmented = df_augmented.iloc[:n_train].copy()
X_train_augmented = df_X_train_augmented.values

# the remaining rows are the test set
df_X_test_augmented = df_augmented.iloc[n_train:].copy()
X_test_augmented = df_X_test_augmented.values

In [None]:
# hold out 20% of the scaled training data for validation (fixed seed)
Xt, Xv, yt, yv = ms.train_test_split(
    X_train,
    y_train,
    random_state=1,
    test_size=0.2,
)

In [None]:
# hold out 20% of the augmented training data for validation (fixed seed)
Xta, Xva, yta, yva = ms.train_test_split(
    X_train_augmented,
    y_train,
    random_state=1,
    test_size=0.2,
)

### Feature selection

In [None]:
# select the 225 augmented features with the highest univariate F-score
# keep the fitted selector so that the same columns can later be picked
# from other matrices (e.g. selector.transform(X_test_augmented))
selector = fs.SelectKBest(fs.f_regression, k=225).fit(X_train_augmented, y_train)

# set dataset with feature selection
Xfs = selector.transform(X_train_augmented)

In [None]:
# hold out 20% of the feature-selected training data for validation (fixed seed)
Xtfs, Xvfs, ytfs, yvfs = ms.train_test_split(
    Xfs,
    y_train,
    random_state=1,
    test_size=0.2,
)

In [None]:
# report the dimensions of the three training splits
shape_raw = Xt.shape
shape_augmented = Xta.shape
shape_selected = Xtfs.shape
print('Train data shape:', shape_raw)
print('Train data (augmented) shape:', shape_augmented)
print('Train data (feature selection) shape:', shape_selected)

### Prediction

#### OLS regression

In [None]:
# fit ordinary least squares on the scaled training split
model = lm.LinearRegression()
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('OLS score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('OLS score (val):', val_mse)

In [None]:
# fit ordinary least squares on the augmented (one-hot encoded) training split
model = lm.LinearRegression()
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('OLS score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('OLS score (augmented data) (val):', val_mse)

In [None]:
# fit ordinary least squares on the feature-selected training split
model = lm.LinearRegression()
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('OLS score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('OLS score (feature selection) (val):', val_mse)

#### Ridge regression

In [None]:
# fit a Ridge regression (default settings) on the scaled training split
model = lm.Ridge()
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('Ridge score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('Ridge score (val):', val_mse)

In [None]:
# fit a Ridge regression (default settings) on the augmented training split
model = lm.Ridge()
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('Ridge score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('Ridge score (augmented data) (val):', val_mse)

In [None]:
# fit a Ridge regression (default settings) on the feature-selected training split
model = lm.Ridge()
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('Ridge score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('Ridge score (feature selection) (val):', val_mse)

#### Ridge regression (CV)

In [None]:
# fit a Ridge regression tuned by 10-fold cross-validation on the scaled training split
model = lm.RidgeCV(cv=10)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('Ridge CV score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('Ridge CV score (val):', val_mse)

In [None]:
# fit a Ridge regression tuned by 10-fold cross-validation on the augmented training split
model = lm.RidgeCV(cv=10)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('Ridge CV score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('Ridge CV score (augmented data) (val):', val_mse)

In [None]:
# fit a Ridge regression tuned by 10-fold cross-validation on the feature-selected training split
model = lm.RidgeCV(cv=10)
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('Ridge CV score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('Ridge CV score (feature selection) (val):', val_mse)

#### LASSO regression

In [None]:
# fit a LASSO regression (default settings) on the scaled training split
model = lm.Lasso()
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('LASSO score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('LASSO score (val):', val_mse)

In [None]:
# fit a LASSO regression (default settings) on the augmented training split
model = lm.Lasso()
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('LASSO score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('LASSO score (augmented data) (val):', val_mse)

In [None]:
# fit a LASSO regression (default settings) on the feature-selected training split
model = lm.Lasso()
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('LASSO score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('LASSO score (feature selection) (val):', val_mse)

#### LASSO regression (CV)

In [None]:
# fit a LASSO regression tuned by 10-fold cross-validation on the scaled
# training split, allowing up to 10000 solver iterations
model = lm.LassoCV(max_iter=10000, cv=10)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('LASSO CV score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('LASSO CV score (val):', val_mse)

In [None]:
# fit a LASSO regression tuned by 10-fold cross-validation on the augmented
# training split, allowing up to 10000 solver iterations
model = lm.LassoCV(max_iter=10000, cv=10)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('LASSO CV score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('LASSO CV score (augmented data) (val):', val_mse)

In [None]:
# compute LASSO CV on data with feature selection
# max_iter matches the other LassoCV cells so that the solver gets the same
# iteration budget on every dataset variant
model = lm.LassoCV(cv=10, max_iter=10000).fit(Xtfs, ytfs)

# print score
print('LASSO CV score (feature selection) (train):', MSE(model, Xtfs, ytfs))
print('LASSO CV score (feature selection) (val):', MSE(model, Xvfs, yvfs))

#### ElasticNet regression

In [None]:
# fit an ElasticNet regression (default settings) on the scaled training split
model = lm.ElasticNet()
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('ElasticNet score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('ElasticNet score (val):', val_mse)

In [None]:
# fit an ElasticNet regression (default settings) on the augmented training split
model = lm.ElasticNet()
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('ElasticNet score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('ElasticNet score (augmented data) (val):', val_mse)

In [None]:
# fit an ElasticNet regression (default settings) on the feature-selected training split
model = lm.ElasticNet()
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('ElasticNet score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('ElasticNet score (feature selection) (val):', val_mse)

#### ElasticNet regression (CV)

In [None]:
# fit an ElasticNet regression tuned by 10-fold cross-validation on the
# scaled training split, allowing up to 10000 solver iterations
model = lm.ElasticNetCV(max_iter=10000, cv=10)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('ElasticNet CV score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('ElasticNet CV score (val):', val_mse)

In [None]:
# fit an ElasticNet regression tuned by 10-fold cross-validation on the
# augmented training split, allowing up to 10000 solver iterations
model = lm.ElasticNetCV(max_iter=10000, cv=10)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('ElasticNet CV score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('ElasticNet CV score (augmented data) (val):', val_mse)

In [None]:
# compute ElasticNet CV on data with feature selection
# use the same 10-fold cross-validation and iteration budget as the other
# ElasticNetCV cells so that the three dataset variants are compared under
# identical settings
model = lm.ElasticNetCV(cv=10, max_iter=10000).fit(Xtfs, ytfs)

# print score
print('ElasticNet CV score (feature selection) (train):', MSE(model, Xtfs, ytfs))
print('ElasticNet CV score (feature selection) (val):', MSE(model, Xvfs, yvfs))

#### Huber regressor

In [None]:
# fit a Huber regressor on the scaled training split, allowing up to 10000
# solver iterations
model = lm.HuberRegressor(max_iter=10000)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('Huber score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('Huber score (val):', val_mse)

In [None]:
# fit a Huber regressor on the augmented training split, allowing up to
# 10000 solver iterations
model = lm.HuberRegressor(max_iter=10000)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('Huber score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('Huber score (augmented data) (val):', val_mse)

In [None]:
# compute Huber regressor on data with feature selection
# max_iter matches the other Huber cells so that the solver gets the same
# iteration budget on every dataset variant
model = lm.HuberRegressor(max_iter=10000).fit(Xtfs, ytfs)

# print score
print('Huber score (feature selection) (train):', MSE(model, Xtfs, ytfs))
print('Huber score (feature selection) (val):', MSE(model, Xvfs, yvfs))

#### SVR (RBF)

In [None]:
# fit a support vector regressor (default kernel) on the scaled training split
model = sv.SVR()
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('SVR (RBF) score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('SVR (RBF) score (val):', val_mse)

In [None]:
# fit a support vector regressor (default kernel) on the augmented training split
model = sv.SVR()
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('SVR (RBF) score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('SVR (RBF) score (augmented data) (val):', val_mse)

In [None]:
# fit a support vector regressor (default kernel) on the feature-selected training split
model = sv.SVR()
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('SVR (RBF) score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('SVR (RBF) score (feature selection) (val):', val_mse)

#### SVR (polynomial)

In [None]:
# fit a support vector regressor with a polynomial kernel on the scaled training split
model = sv.SVR(kernel='poly')
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('SVR (polynomial) score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('SVR (polynomial) score (val):', val_mse)

In [None]:
# fit a support vector regressor with a polynomial kernel on the augmented training split
model = sv.SVR(kernel='poly')
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('SVR (polynomial) score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('SVR (polynomial) score (augmented data) (val):', val_mse)

In [None]:
# fit a support vector regressor with a polynomial kernel on the feature-selected training split
model = sv.SVR(kernel='poly')
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('SVR (polynomial) score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('SVR (polynomial) score (feature selection) (val):', val_mse)

#### kNN

In [None]:
# fit a 5-nearest-neighbours regressor on the scaled training split
model = nb.KNeighborsRegressor(n_neighbors=5)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('kNN score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('kNN score (val):', val_mse)

In [None]:
# fit a 5-nearest-neighbours regressor on the augmented training split
model = nb.KNeighborsRegressor(n_neighbors=5)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('kNN score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('kNN score (augmented data) (val):', val_mse)

In [None]:
# fit a 5-nearest-neighbours regressor on the feature-selected training split
model = nb.KNeighborsRegressor(n_neighbors=5)
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('kNN score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('kNN score (feature selection) (val):', val_mse)

#### Neural Network

In [None]:
# fit a two-hidden-layer perceptron (32 and 16 units) with early stopping
# and L2 penalty alpha=5.0 on the scaled training split
model = nn.MLPRegressor(
    hidden_layer_sizes=(32, 16),
    early_stopping=True,
    alpha=5.0,
)
model.fit(Xt, yt)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xt, yt)
print('NN score (train):', train_mse)
val_mse = MSE(model, Xv, yv)
print('NN score (val):', val_mse)

In [None]:
# fit a two-hidden-layer perceptron (32 and 16 units) with early stopping
# and L2 penalty alpha=1.0 on the augmented training split
model = nn.MLPRegressor(
    hidden_layer_sizes=(32, 16),
    early_stopping=True,
    alpha=1.0,
)
model.fit(Xta, yta)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xta, yta)
print('NN score (augmented data) (train):', train_mse)
val_mse = MSE(model, Xva, yva)
print('NN score (augmented data) (val):', val_mse)

In [None]:
# fit a two-hidden-layer perceptron (32 and 16 units) with early stopping
# and L2 penalty alpha=.1 on the feature-selected training split
model = nn.MLPRegressor(
    hidden_layer_sizes=(32, 16),
    early_stopping=True,
    alpha=.1,
)
model.fit(Xtfs, ytfs)

# report the MSE on the training and validation splits
train_mse = MSE(model, Xtfs, ytfs)
print('NN score (feature selection) (train):', train_mse)
val_mse = MSE(model, Xvfs, yvfs)
print('NN score (feature selection) (val):', val_mse)

#### Ensemble

In [None]:
# build the six base regressors of the ensemble; all of them are trained on
# the full augmented training matrix
model1 = ens.GradientBoostingRegressor()
model2 = ens.RandomForestRegressor(n_estimators=20)
model3 = sv.SVR(kernel='rbf', C=1)
model4 = lm.RidgeCV(alphas=np.arange(0.01, 5+0.01, 0.01))
model5 = lm.LassoCV()
model6 = nn.MLPRegressor(hidden_layer_sizes=(48, 16), early_stopping=False, alpha=1.0)
base_models = [model1, model2, model3, model4, model5, model6]

# fit every base model on its own, in order
for base_model in base_models:
    base_model.fit(X_train_augmented, y_train)

# combine the base models in a voting regressor and fit it
# NOTE(review): the name 'lr' is attached to the SVR model
estimator_names = ['gb', 'rf', 'lr', 'ri', 'la', 'nn']
model = ens.VotingRegressor(estimators=list(zip(estimator_names, base_models)))
model = model.fit(X_train_augmented, y_train)

# predict the first 25 rows of the augmented validation split with each model
# NOTE(review): these rows are part of X_train_augmented, so the models have
# been trained on them
sample = Xva[:25]
pred1, pred2, pred3, pred4, pred5, pred6 = (m.predict(sample) for m in base_models)
pred = model.predict(sample)

# one (points, marker format, legend label) entry per plotted series
plotted_series = [
    (pred1, "gd", "GradientBoostingRegressor"),
    (pred2, "b^", "RandomForestRegressor"),
    (pred3, "ys", "SVR"),
    (pred4, "r*", "RidgeCV"),
    (pred5, "bd", "LassoCV"),
    (pred6, "y^", "MLPRegressor"),
    (pred, "rd", "Ensemble"),
    (yva[:25], "b*", "True"),
]

# draw all series on one figure
plt.figure()
for points, marker_format, legend_label in plotted_series:
    plt.plot(points, marker_format, label=legend_label)

# hide the x ticks, then label the axes and place the legend outside the plot
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.xlabel("training samples")
plt.ylabel("predicted")
plt.title("Regressor predictions and their average")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

In [None]:
# build the six base regressors of the ensemble; all of them are trained on
# the full augmented training matrix
model1 = ens.GradientBoostingRegressor()
model2 = ens.RandomForestRegressor(n_estimators=20)
model3 = sv.SVR(kernel='rbf', C=1)
model4 = lm.RidgeCV(alphas=np.arange(0.01, 5+0.01, 0.01))
model5 = lm.LassoCV()
model6 = nn.MLPRegressor(hidden_layer_sizes=(48, 16), early_stopping=False, alpha=1.0)
base_models = [model1, model2, model3, model4, model5, model6]

# fit every base model on its own, in order
for base_model in base_models:
    base_model.fit(X_train_augmented, y_train)

# combine the base models in a voting regressor and fit it
# NOTE(review): the name 'lr' is attached to the SVR model
estimator_names = ['gb', 'rf', 'lr', 'ri', 'la', 'nn']
model = ens.VotingRegressor(estimators=list(zip(estimator_names, base_models)))
model = model.fit(X_train_augmented, y_train)