In [1]:
import warnings

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

sns.set(style = "whitegrid")
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test splits from disk, then preview the first training rows.
df_train, df_test = map(
    pd.read_csv,
    ("../../datasets/ridge_train.csv", "../../datasets/ridge_test.csv"),
)
df_train.head()

Unnamed: 0,Size,Price
0,1700,286500
1,1701,549000
2,1662,249000
3,1852,550000
4,1320,170000


In [3]:
# Expand the single `Size` feature into polynomial terms up to degree 10
# (a constant column followed by Size**1 ... Size**10).
#
# `Series.reshape` has been deprecated since pandas 0.19 and is not available
# in current releases, so reshape the underlying NumPy array instead -- the
# same way `Price` is reshaped in the other cells of this notebook.
#
# NOTE(review): the expanded columns are left unscaled, so their magnitudes
# differ by many orders of magnitude; standardising them before the
# regularised fits is probably worth trying -- TODO confirm.
poly10 = PolynomialFeatures(10)
poly10_train = poly10.fit_transform(df_train.Size.values.reshape(-1, 1))  # Order 10
# Reuse the transformer fitted on the training split for the test split
# instead of fitting a second, independent transformer on test data.
poly10_test = poly10.transform(df_test.Size.values.reshape(-1, 1))  # Order 10

## Linear regression of order 10

In [4]:
# Ordinary least squares on the degree-10 polynomial features (no penalty term).
# The target is passed as an (n, 1) column vector.
model = LinearRegression()
model.fit(X=poly10_train, y=df_train["Price"].values[:, np.newaxis])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
# Root-mean-squared error of the unregularised degree-10 fit on each split.
# `mean_squared_error` is called by its directly imported name: reaching it
# through `sklearn.metrics` after a bare `import sklearn` only works when some
# other import happens to have loaded that submodule already.
est_train = model.predict(poly10_train)
print("Train RMSE: ", np.sqrt(mean_squared_error(df_train.Price.values.reshape(-1,1), est_train)))
est_test = model.predict(poly10_test)
print("Test RMSE: ", np.sqrt(mean_squared_error(df_test.Price.values.reshape(-1,1), est_test)))

Train RMSE:  198809.30767327943
Test RMSE:  4871998.393355155


## Ridge regression model using inbuilt Cross-Validation

In [6]:
# Ridge regression; the penalty strength is picked from the listed candidates
# by the estimator's built-in cross-validation (cv=40).
ridge_model = RidgeCV(cv=40, alphas=[.1, .5, .8, 1, 5, 10, 15])
ridge_model.fit(X=poly10_train, y=df_train["Price"].values[:, np.newaxis])

RidgeCV(alphas=[0.1, 0.5, 0.8, 1, 5, 10, 15], cv=40, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [7]:
# Root-mean-squared error of the cross-validated ridge fit on each split.
# `mean_squared_error` is called by its directly imported name: reaching it
# through `sklearn.metrics` after a bare `import sklearn` only works when some
# other import happens to have loaded that submodule already.
est_train = ridge_model.predict(poly10_train)
print("Train RMSE: ", np.sqrt(mean_squared_error(df_train.Price.values.reshape(-1,1), est_train)))
est_test = ridge_model.predict(poly10_test)
print("Test RMSE: ", np.sqrt(mean_squared_error(df_test.Price.values.reshape(-1,1), est_test)))

Train RMSE:  193464.93902846103
Test RMSE:  88624931.7077602


In [8]:
# Display the fitted ridge coefficients, one per column of the degree-10
# polynomial feature matrix the model was trained on.
ridge_model.coef_

array([[ 0.00000000e+00, -1.73251660e+05,  3.03255245e+02,
        -2.24213720e-01, -3.75671789e-06,  1.33635973e-07,
        -1.08393158e-10,  4.36953148e-14, -9.91045419e-18,
         1.20609123e-21, -6.13463185e-26]])

## Lasso regression model using inbuilt Cross-Validation

In [9]:
# Lasso regression; the penalty strength is picked from the listed candidates
# by the estimator's built-in cross-validation (cv=40).
# LassoCV is a single-output estimator and expects a 1-D target, so `Price`
# is passed as a flat array rather than an (n, 1) column vector. The column
# form only works through an implicit conversion whose warning is hidden by
# the notebook-wide warnings filter.
lasso_model = LassoCV(alphas=[0.1, 0.5, 1.0], cv = 40)
lasso_model.fit(poly10_train, df_train.Price.values)

LassoCV(alphas=[0.1, 0.5, 1.0], copy_X=True, cv=40, eps=0.001,
    fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [10]:
# Root-mean-squared error of the cross-validated lasso fit on each split.
# `mean_squared_error` is called by its directly imported name: reaching it
# through `sklearn.metrics` after a bare `import sklearn` only works when some
# other import happens to have loaded that submodule already.
est_train = lasso_model.predict(poly10_train)
print("Train RMSE: ", np.sqrt(mean_squared_error(df_train.Price.values.reshape(-1,1), est_train)))
est_test = lasso_model.predict(poly10_test)
print("Test RMSE: ", np.sqrt(mean_squared_error(df_test.Price.values.reshape(-1,1), est_test)))

Train RMSE:  201384.1901629178
Test RMSE:  472252.73017639277


In [11]:
# Display the fitted lasso coefficients, one per column of the degree-10
# polynomial feature matrix the model was trained on.
lasso_model.coef_

array([ 0.00000000e+00,  3.63811838e+02,  1.43414982e-02, -1.68379435e-05,
       -1.77835066e-09,  2.44067785e-13,  1.16641042e-16,  2.30124264e-20,
        2.66308275e-24, -1.95913516e-29, -1.17765553e-31])

In [12]:
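# Lasso gives by far the lowest Test RMSE of the three models, although its
# Train RMSE is slightly higher than that of Linear and Ridge.
# The single zero coefficient belongs to the constant (bias) column of the
# polynomial features; it is zero in the Ridge fit as well.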