# load packages and data

In [None]:
import pandas as pd
import numpy as py
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso, LassoCV, LassoLars, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RepeatedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn import metrics

# Directory that holds the project CSV files. Change this single constant to
# point the notebook at a different location; every read below is built from it.
DATA_DIR = '/content/drive/MyDrive/ds310 /project 1'

# Training features and training targets.
x_train = pd.read_csv(DATA_DIR + '/x_train.csv')
y_train = pd.read_csv(DATA_DIR + '/y_train.csv')

# Test features to predict on.
x_test = pd.read_csv(DATA_DIR + '/x_test.csv')

# NOTE: y_test is read from the sample-submission file, not from true test
# targets; the export cell stores the model predictions in its 'Output' column.
y_test = pd.read_csv(DATA_DIR + '/sampleSubmission.csv')

# EDA

## summary stat

In [None]:
# Descriptive statistics of the training features; the bare name on the last
# line makes the notebook render the resulting table.
x_train_summary = x_train.describe()
x_train_summary

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10,...,Col 55,Col 56,Col 57,Col 58,Col 59,Col 60,Col 61,Col 62,Col 63,Col 64
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,...,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,-0.003323,-0.001314,-0.002398,-0.00163,-0.003224,-0.002212,0.00084,-0.00314,-0.002849,-0.001591,...,0.00099,-0.000489,-0.002741,0.001206,0.000505,0.001395,-0.001923,-0.003529,0.001857,-0.000221
std,0.048495,0.047562,0.044823,0.046265,0.045687,0.046473,0.047075,0.047097,0.044921,0.048524,...,0.046503,0.052138,0.045878,0.051811,0.044714,0.043615,0.051494,0.040118,0.052074,0.047651
min,-0.107226,-0.044642,-0.083808,-0.1124,-0.126781,-0.106845,-0.102307,-0.076395,-0.126097,-0.129483,...,-0.256471,-0.076433,-0.155145,-0.151734,-0.22858,-0.143972,-0.223255,-0.160745,-0.128919,-0.092165
25%,-0.04184,-0.044642,-0.036385,-0.033214,-0.035624,-0.033177,-0.036038,-0.039493,-0.032934,-0.034215,...,-0.013098,-0.023737,-0.021494,-0.020256,-0.018353,-0.017702,-0.016053,-0.027055,-0.021352,-0.023189
50%,0.001751,-0.044642,-0.007284,-0.00854,-0.004321,-0.006168,-0.002903,-0.002592,-0.005145,-0.001078,...,0.007864,-0.015062,-0.010942,-0.010163,0.014418,0.009642,0.009135,-0.014333,-0.015864,-0.014415
75%,0.037168,0.05068,0.028284,0.031914,0.023198,0.025695,0.033914,0.034309,0.027204,0.023775,...,0.022502,0.004805,0.013687,0.012113,0.031298,0.021598,0.022525,0.013462,0.017656,0.012268
max,0.110727,0.05068,0.128521,0.125158,0.153914,0.198788,0.181179,0.185234,0.133599,0.135612,...,0.13561,0.555129,0.203381,0.299032,0.080445,0.163067,0.209905,0.157844,0.318104,0.338184


Since the features are already engineered (the columns shown in the summary above are all centered near 0 with a similar spread), I do not need to perform additional feature engineering.

# Data pre-processing

In [None]:
# Hold out 20% of the labelled rows as a validation set and keep the remaining
# 80% for fitting. The fixed random_state makes the split, and therefore every
# validation MSE reported below, reproducible across kernel restarts.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train, y_train, test_size = 0.2, random_state = 42
)

# model building

In [None]:
# Baseline LassoLars model: alpha is left at the estimator's default, only the
# iteration cap is raised.
model_lasso = LassoLars(max_iter = 10000)

# Fit on the training split only. Fitting on the full x_train / y_train would
# include the X_val rows, so the validation MSE below would be measured on
# data the model has already seen (leakage).
model_lasso.fit(X_train, Y_train['Output'])

# Predict the held-out validation rows (compared against Y_val in the next cell).
y_pred_lasso_wo_cv = model_lasso.predict(X_val)

## model evaluation

In [None]:
# Mean squared error of the baseline predictions against the validation targets.
mse_lasso_wo_cv = mean_squared_error(Y_val['Output'], y_pred_lasso_wo_cv)
print('MSE:', mse_lasso_wo_cv)

MSE: 3607.807362681958


The MSE of the lasso model seems high, so I will proceed with hyperparameter tuning.

# hyperparameter setting and tuning

In [None]:
# 5-fold cross-validation repeated 3 times (15 fits per candidate alpha).
# The fixed random_state makes the fold assignment reproducible.
cv = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state = 42)

# Candidate regularisation strengths. The original grid listed 1.16 between
# 0.15 and 0.17, which breaks the 0.15-0.19 sweep and looks like a typo for 0.16.
param_grid = {
    'alpha': [0.001, 0.01, 0.05, 0.1, 0.15, 0.16, 0.17, 0.18, 0.19, 1]
}

# Grid search over alpha, scored by negative MSE (higher is better, so the
# candidate with the lowest cross-validated MSE is selected).
model_lasso_hyp = GridSearchCV(estimator = model_lasso, param_grid = param_grid,
                           n_jobs = 1, cv = cv, scoring = 'neg_mean_squared_error',
                           verbose = 1)

# Search on the training split only, so X_val stays unseen and the validation
# MSE computed afterwards is an honest estimate. With the default refit
# behaviour, best_estimator_ is refitted on exactly the data passed here.
model_lasso_hyp.fit(X_train, Y_train['Output'])

# Predict the held-out validation rows with the best estimator found.
y_pred_lasso_cv_train = model_lasso_hyp.best_estimator_.predict(X_val)

Fitting 15 folds for each of 10 candidates, totalling 150 fits


## model evaluation after cross-validation

In [None]:
# Alpha selected by the grid search.
best_alpha = model_lasso_hyp.best_params_['alpha']
print('Best Alpha:', best_alpha)

# Validation MSE of the tuned model's predictions.
mse_lasso_cv = mean_squared_error(Y_val['Output'], y_pred_lasso_cv_train)
print('MSE:', mse_lasso_cv)

Best Alpha: 0.15
MSE: 2559.8744305112336


I can observe that the cross-validated model (with alpha chosen by grid search) achieves a lower validation MSE than the model fitted without cross-validation.

# predict y_test

In [None]:
# Take the estimator the grid search kept as best and predict the test features.
best_lasso = model_lasso_hyp.best_estimator_
y_pred_lasso_cv = best_lasso.predict(x_test)

# export y_test

In [None]:
# Store the test-set predictions in the 'Output' column of the frame that was
# loaded from the sample-submission file.
y_test['Output'] = y_pred_lasso_cv

# Write the result to CSV. index = False keeps the DataFrame's row index from
# being written as an extra unnamed first column, so the file contains only
# the frame's own columns.
# NOTE(review): assumes the expected submission layout matches the sample
# file's columns - confirm against the submission requirements.
y_test.to_csv('/content/drive/MyDrive/ds310 /project 1/y_test.csv', index = False)