In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [25]:
# set up display: raise pandas' printing limits so wide/long frames are shown in full

display_limits = {
    'display.max_columns': 500,
    'display.max_info_rows': 500,
    'display.max_rows': 2500,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)
# idea taken from Jeff Hale

In [26]:
# load the cleaned training data produced upstream
# NOTE(review): the saved output of this cell shows the tail of a warning; if it is
# pandas' mixed-dtype warning, consider passing dtype= (or low_memory=False) - confirm on re-run.

housing = pd.read_csv(filepath_or_buffer='./datasets/modified_train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [27]:
# add the natural log of SalePrice as a new column and preview it

sale_price = housing['SalePrice']
housing['log_price'] = np.log(sale_price)
housing['log_price'].head()

0    11.779129
1    12.301383
2    11.599103
3    12.066811
4    11.838626
Name: log_price, dtype: float64

In [39]:
# Columns kept out of the predictor set: the two identifier columns, a group of
# individual area / year columns, and both forms of the target (SalePrice, log_price).
# NOTE(review): presumably the dropped area columns are summarised by engineered
# columns such as 'Total SF' - confirm against the cleaning notebook.
excluded_columns = [
    'Id', 'PID',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Garage Yr Blt',
    '2nd Flr SF',
    'Low Qual Fin SF',
    'Gr Liv Area',
    'Wood Deck SF',
    'Open Porch SF',
    'Enclosed Porch',
    '3Ssn Porch',
    'Screen Porch',
    'Pool Area',
    'SalePrice',
    'log_price',
]
features = housing.columns.drop(excluded_columns)

In [40]:
# predictor matrix and target vector (the target is the log of the sale price)
X, y = housing[features], housing['log_price']

In [41]:
# generate polynomial features
#
# PolynomialFeatures is already imported in the notebook's import cell, so it is
# not re-imported here. With the default degree (2) this keeps the original
# columns and adds every squared term and pairwise product; include_bias=False
# leaves out the constant column.

poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(X)

In [34]:
# poly.get_feature_names(X.columns)

In [42]:
# dimensions of the expanded feature matrix: (rows, original + polynomial columns)
X_poly.shape

(2051, 59684)

In [43]:
# hold out 30% of the rows as a test set; the fixed seed makes the split repeatable

split_parts = train_test_split(X_poly, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [44]:
# peek at the first two rows of the training feature matrix
X_train[:2]

array([[6.800e+01, 4.500e+03, 6.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [6.800e+01, 8.174e+03, 7.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

In [45]:
# peek at the first two rows of the test feature matrix
X_test[:2]

array([[5.500e+01, 1.078e+04, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [8.800e+01, 1.420e+04, 7.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [46]:
# first two training targets (the index shows the original row labels after shuffling)
y_train.head(2)

226     12.013701
1742    12.089539
Name: log_price, dtype: float64

In [47]:
# first two test targets (the index shows the original row labels after shuffling)
y_test.head(2)

1124    11.794338
188     12.328290
Name: log_price, dtype: float64

In [48]:
# standardize the features: the scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

# code taken from Tim Book - Regularization lesson

In [49]:
# sanity check: scaled matrices and targets must agree on the number of rows
print(
    f'Z_train shape is: {Z_train.shape}',
    f'y_train shape is: {y_train.shape}',
    f'Z_test shape is: {Z_test.shape}',
    f'y_test shape is: {y_test.shape}',
    sep='\n',
)

Z_train shape is: (1435, 59684)
y_train shape is: (1435,)
Z_test shape is: (616, 59684)
y_test shape is: (616,)


**Linear Regression Model**

In [50]:
# keep the 45 features with the strongest univariate F-score against the target
# (45 is roughly the square root of the 2051 rows in the full dataset)
# NOTE(review): the recorded run emitted divide / invalid-value RuntimeWarnings from this
# cell, presumably caused by constant columns in the polynomial matrix - confirm.

skb = SelectKBest(score_func=f_regression, k=45)
skb.fit(Z_train, y_train)
Z_train_best = skb.transform(Z_train)
Z_test_best = skb.transform(Z_test)

  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [51]:
# 45 Best Features
#
# List the names of the polynomial features that SelectKBest retained.
# PolynomialFeatures.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out, so use whichever method
# the installed version provides; both return the same names in column order.

if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(X.columns)
else:
    poly_feature_names = poly.get_feature_names(X.columns)

# skb.get_support() is a boolean mask aligned with the polynomial columns
for feature_name, included in zip(poly_feature_names, skb.get_support()):
    if included:
        print(feature_name)

# code taken from Janos Sallai

Overall Qual
Garage Area
Total SF
Overall Qual^2
Overall Qual Year Built
Overall Qual Year Remod/Add
Overall Qual 1st Flr SF
Overall Qual Garage Area
Overall Qual Street_Pave
Overall Qual Condition 2_Norm
Overall Qual Roof Matl_CompShg
Overall Qual Foundation_PConc
Overall Qual Central Air_Y
Overall Qual Electrical_SBrkr
Overall Qual Kitchen AbvGr_1
Overall Qual Garage Qual_TA
Overall Qual Garage Cond_TA
Overall Qual Paved Drive_Y
Overall Qual Pool QC_No
Overall Qual Misc Feature_No
Overall Qual Total SF
Year Built Year Remod/Add
Year Built Garage Area
Year Built Total SF
Year Remod/Add Garage Area
Year Remod/Add Total SF
Garage Area Street_Pave
Garage Area Foundation_PConc
Garage Area Central Air_Y
Garage Area Electrical_SBrkr
Garage Area Garage Cond_TA
Garage Area Paved Drive_Y
Garage Area Total SF
Street_Pave Total SF
Condition 2_Norm Total SF
Roof Matl_CompShg Total SF
Foundation_PConc Total SF
Central Air_Y Total SF
Electrical_SBrkr Total SF
Kitchen AbvGr_1 Total SF
Garage Qual_TA

In [52]:
# ordinary least squares on the 45 selected, standardized features

ols = LinearRegression().fit(Z_train_best, y_train)
ols

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
# R-squared of the OLS model on the training split and on the held-out split
ols_train_r2 = ols.score(Z_train_best, y_train)
ols_test_r2 = ols.score(Z_test_best, y_test)
print('Train R-Squared:', ols_train_r2)
print('Test R-Squared:', ols_test_r2)

Train R-Squared: 0.871566742244593
Test R-Squared: 0.8495290606891338


In [54]:
# per-fold score of the OLS model across 5 folds of the training split
# NOTE(review): skb and the scaler were fitted on all of the training rows before this
# call, so every validation fold already influenced the feature selection; these scores
# are likely optimistic. Cross-validating a pipeline (scale -> select -> regress) on
# X_train would give a cleaner estimate.
cv_fold_scores = cross_val_score(ols, Z_train_best, y_train, cv=5)
print('Cross-validation score:', cv_fold_scores)

Cross-validation score: [0.8435982  0.68923533 0.83040294 0.79055078 0.77058947]


In [55]:
# mean of the 5-fold scores for the OLS model on the selected training features
cv_mean_score = cross_val_score(ols, Z_train_best, y_train, cv=5).mean()
print('Cross-validation score:', cv_mean_score)

Cross-validation score: 0.784875343188193


**Ridge Regression Model**

In [56]:
# candidate ridge penalties: 50 log-spaced values from 10**0 to 10**4
ridge_alphas = np.logspace(start=0, stop=4, num=50)

In [57]:
# ridge regression on all standardized polynomial features; the penalty is
# chosen from ridge_alphas by 5-fold cross-validated R-squared

ridge_settings = dict(alphas=ridge_alphas, scoring='r2', cv=5)
ridge_cv = RidgeCV(**ridge_settings)
ridge_cv.fit(Z_train, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.20679264e+00, 1.45634848e+00, 1.75751062e+00,
       2.12095089e+00, 2.55954792e+00, 3.08884360e+00, 3.72759372e+00,
       4.49843267e+00, 5.42867544e+00, 6.55128557e+00, 7.90604321e+00,
       9.54095476e+00, 1.15139540e+01, 1.38949549e+01, 1.67683294e+01,
       2.02358965e+01, 2.44205309e+01, 2.94705170e+01, 3.55648031e+01,
       4.29193426e+01, 5.17947468e+0...
       4.09491506e+02, 4.94171336e+02, 5.96362332e+02, 7.19685673e+02,
       8.68511374e+02, 1.04811313e+03, 1.26485522e+03, 1.52641797e+03,
       1.84206997e+03, 2.22299648e+03, 2.68269580e+03, 3.23745754e+03,
       3.90693994e+03, 4.71486636e+03, 5.68986603e+03, 6.86648845e+03,
       8.28642773e+03, 1.00000000e+04]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring='r2',
        store_cv_values=False)

In [58]:
# penalty strength selected by cross-validation
ridge_cv.alpha_

5689.866029018293

In [59]:
# R-squared of the ridge model on the training split and on the held-out split
ridge_train_r2 = ridge_cv.score(Z_train, y_train)
ridge_test_r2 = ridge_cv.score(Z_test, y_test)
print('Train R-Squared:', ridge_train_r2)
print('Test R-Squared:', ridge_test_r2)

Train R-Squared: 0.9881669053153816
Test R-Squared: 0.7979206075003799


**LASSO Regression Model**

In [69]:
# candidate LASSO penalties: 50 log-spaced values from 10**-4 to 10**0
lasso_alphas = np.logspace(start=-4, stop=0, num=50)
lasso_alphas

array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-03, 3.55648031e-03,
       4.29193426e-03, 5.17947468e-03, 6.25055193e-03, 7.54312006e-03,
       9.10298178e-03, 1.09854114e-02, 1.32571137e-02, 1.59985872e-02,
       1.93069773e-02, 2.32995181e-02, 2.81176870e-02, 3.39322177e-02,
       4.09491506e-02, 4.94171336e-02, 5.96362332e-02, 7.19685673e-02,
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00])

In [70]:
# LASSO whose penalty is picked from lasso_alphas by 5-fold cross-validation

lasso_settings = dict(alphas=lasso_alphas, cv=5)
lasso_cv = LassoCV(**lasso_settings)

In [71]:
# train the cross-validated LASSO on the standardized training features

lasso_cv.fit(X=Z_train, y=y_train)

LassoCV(alphas=array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-04, 3.72759372e-04,
       4.49843267e-04, 5.42867544e-04, 6.55128557e-04, 7.90604321e-04,
       9.54095476e-04, 1.15139540e-03, 1.38949549e-03, 1.67683294e-03,
       2.02358965e-03, 2.44205309e-03, 2.94705170e-03, 3.55648031e-03,
       4.29193426e-03, 5.17947468e-0...
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00]),
        copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [72]:
# penalty strength selected by cross-validation
lasso_cv.alpha_

0.0035564803062231283

In [45]:
# inspect the hyperparameters the LassoCV estimator is configured with
# NOTE(review): the saved output under this cell carries execution count 45 and lists an
# alpha grid starting at 0.01, which does not match the 1e-4..1 grid in lasso_alphas -
# the output looks stale; re-run the notebook top to bottom.
lasso_cv.get_params()

{'alphas': array([0.01      , 0.01098541, 0.01206793, 0.01325711, 0.01456348,
        0.01599859, 0.01757511, 0.01930698, 0.02120951, 0.02329952,
        0.02559548, 0.02811769, 0.03088844, 0.03393222, 0.03727594,
        0.04094915, 0.04498433, 0.04941713, 0.05428675, 0.05963623,
        0.06551286, 0.07196857, 0.07906043, 0.08685114, 0.09540955,
        0.10481131, 0.11513954, 0.12648552, 0.13894955, 0.1526418 ,
        0.16768329, 0.184207  , 0.20235896, 0.22229965, 0.24420531,
        0.26826958, 0.29470517, 0.32374575, 0.35564803, 0.39069399,
        0.42919343, 0.47148664, 0.51794747, 0.5689866 , 0.62505519,
        0.68664885, 0.75431201, 0.82864277, 0.91029818, 1.        ]),
 'copy_X': True,
 'cv': 5,
 'eps': 0.001,
 'fit_intercept': True,
 'max_iter': 1000,
 'n_alphas': 100,
 'n_jobs': None,
 'normalize': False,
 'positive': False,
 'precompute': 'auto',
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'verbose': False}

In [73]:
# R-squared of the LASSO model on the training split and on the held-out split
lasso_train_r2 = lasso_cv.score(Z_train, y_train)
lasso_test_r2 = lasso_cv.score(Z_test, y_test)
print('Train R-Squared:', lasso_train_r2)
print('Test R-Squared:', lasso_test_r2)

Train R-Squared: 0.9743046726720975
Test R-Squared: 0.8755422174454062


In [75]:
# side-by-side recap of the three models; an entry holding only '' prints a blank line
# NOTE(review): the OLS cross-validation figure uses features selected on the whole
# training split, so it is likely optimistic.
summary_rows = [
    ('OLS Train R-Squared:', ols.score(Z_train_best, y_train)),
    ('OLS Test R-Squared:', ols.score(Z_test_best, y_test)),
    ('OLS Cross-validation score:',
     cross_val_score(ols, Z_train_best, y_train, cv=5).mean()),
    ('',),
    ('Ridge Train R-Squared:', ridge_cv.score(Z_train, y_train)),
    ('Ridge Test R-Squared:', ridge_cv.score(Z_test, y_test)),
    ('',),
    ('Lasso Train R-Squared:', lasso_cv.score(Z_train, y_train)),
    ('Lasso Test R-Squared:', lasso_cv.score(Z_test, y_test)),
]
for summary_row in summary_rows:
    print(*summary_row)

OLS Train R-Squared: 0.871566742244593
OLS Test R-Squared: 0.8495290606891338
OLS Cross-validation score: 0.784875343188193

Ridge Train R-Squared: 0.9881669053153816
Ridge Test R-Squared: 0.7979206075003799

Lasso Train R-Squared: 0.9743046726720975
Lasso Test R-Squared: 0.8755422174454062
