In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine
import statsmodels.api as sm
# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

Load the houseprices data from Thinkful's database.

In [2]:

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'


engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

df = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

df_nm = df.select_dtypes(['int64', 'float64'])
df_nm.dropna(inplace = True)

Reimplement your model from the previous checkpoint.

In [51]:
insig = ['id','saleprice', 'yearremodadd', 'firstflrsf','secondflrsf','fireplaces', 'lowqualfinsf', 'bsmthalfbath',
         'fullbath','poolarea', 'screenporch','bsmtfullbath','halfbath', 'garageyrblt', 'garagearea', 'miscval', 'mosold', 'yrsold']
X = df_nm.drop(columns = insig)
y = df_nm['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [52]:
X.columns

Index(['mssubclass', 'lotfrontage', 'lotarea', 'overallqual', 'overallcond',
       'yearbuilt', 'masvnrarea', 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf',
       'totalbsmtsf', 'grlivarea', 'bedroomabvgr', 'kitchenabvgr',
       'totrmsabvgrd', 'garagecars', 'wooddecksf', 'openporchsf',
       'enclosedporch', 'threessnporch'],
      dtype='object')

Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models. Scikit-learn has RidgeCV, LassoCV, and ElasticNetCV that you can utilize to do this. Which model is the best? Why?

ElasticNet regression performed the best and had the lowest error percentage.

In [53]:
lrm = LinearRegression()
lrm.fit(X_train, y_train)

y_pred_train = lrm.predict(X_train)
y_pred_test = lrm.predict(X_test)

print(lrm.score(X_train, y_train))
print(lrm.score(X_test, y_test))

0.7926094517401946
0.8319988762205055


In [64]:
print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7926094517401946
-----Test set statistics-----
R-squared of the model in the test set is: 0.8319988762205055
Mean absolute error of the prediction is: 22885.962563091187
Mean squared error of the prediction is: 1069008590.9708884
Root mean squared error of the prediction is: 32695.696826507436
Mean absolute percentage error of the prediction is: 13.673604994929384


In [56]:
rr = Ridge(alpha = 10)
rr.fit(X_train, y_train)

ry_pred_train = rr.predict(X_train)
ry_pred_test = rr.predict(X_test)

print(rr.score(X_train, y_train))
print(rr.score(X_test, y_test))

0.7924578266733321
0.8310014114942595


In [65]:
print("R-squared of the model in the training set is: {}".format(rr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(rr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, ry_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, ry_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, ry_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - ry_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7924578266733321
-----Test set statistics-----
R-squared of the model in the test set is: 0.8310014114942595
Mean absolute error of the prediction is: 22866.519768324746
Mean squared error of the prediction is: 1075355562.571786
Root mean squared error of the prediction is: 32792.61445160764
Mean absolute percentage error of the prediction is: 13.642620513213242


In [55]:
tuned_param = [{'alpha' : [1, 5, 10, 50, 100, 1000, 10000, 100000]}]
clf = GridSearchCV(Ridge(), param_grid = tuned_param, cv = 5)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_estimator_.alpha)

0.747369277324341
10


In [59]:
lr = Lasso(alpha = 100)
lr.fit(X_train, y_train)

ly_pred_train = lr.predict(X_train)
ly_pred_test = lr.predict(X_test)

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.7925444324463121
0.8315984731474095


In [66]:
print("R-squared of the model in the training set is: {}".format(lr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, ly_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, ly_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, ly_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - ly_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7925444324463121
-----Test set statistics-----
R-squared of the model in the test set is: 0.8315984731474095
Mean absolute error of the prediction is: 22858.216167263938
Mean squared error of the prediction is: 1071556397.2912354
Root mean squared error of the prediction is: 32734.636049469613
Mean absolute percentage error of the prediction is: 13.646419693245779


In [60]:
tuned_param = [{'alpha' : [1, 5, 10, 50, 100, 1000, 10000, 100000]}]
clf1 = GridSearchCV(Lasso(), param_grid = tuned_param, cv = 5)
clf1.fit(X_train, y_train)
print(clf1.best_score_)
print(clf1.best_estimator_.alpha)

0.7469314574819985
100


In [67]:
el = ElasticNet(alpha = 1, l1_ratio= 0.9)
el.fit(X_train, y_train)

ey_pred_train = el.predict(X_train)
ey_pred_test = el.predict(X_test)

print(el.score(X_train, y_train))
print(el.score(X_test, y_test))

0.7899653522355892
0.8276092861023562


In [68]:
print("R-squared of the model in the training set is: {}".format(el.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(el.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, ey_pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, ey_pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, ey_pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - ey_pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.7899653522355892
-----Test set statistics-----
R-squared of the model in the test set is: 0.8276092861023562
Mean absolute error of the prediction is: 22709.69463062451
Mean squared error of the prediction is: 1096940008.5803418
Root mean squared error of the prediction is: 33120.084670488715
Mean absolute percentage error of the prediction is: 13.384101859635205


In [62]:
tuned_param = [{'alpha' : [1, 5, 10, 50, 100, 1000, 10000, 100000],
               'l1_ratio' : [0.1, 0.3, 0.5, 0.7, 0.9]}]
clf2 = GridSearchCV(ElasticNet(), param_grid = tuned_param, cv = 5)
clf2.fit(X_train, y_train)
print(clf2.best_score_)
print(clf2.best_estimator_.alpha)
print(clf2.best_estimator_.l1_ratio)

0.7459959117969756
1
0.9
