<a href="https://colab.research.google.com/github/alexrafkin/houseprice_regression/blob/main/Overfitting%26Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# House Prices Model

In [None]:
import os
import warnings
from urllib.parse import quote

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from scipy.stats.mstats import winsorize
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [None]:
# Connection settings for the PostgreSQL instance that hosts the data.
# Each value can be overridden with an environment variable so credentials do
# not have to be edited into the notebook; the fallbacks are the values this
# notebook has always used.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

# Percent-encode the user name and password: characters such as '@', '/' or
# ':' would otherwise be read as URL delimiters. SQLAlchemy decodes them again
# when it parses the URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Load the whole table into a DataFrame; release the connection pool even if
# the query fails.
try:
    houseprices = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [None]:
# One-hot encode the neighborhood and append the dummies to `houseprices`.
# Only columns that are not already present are appended, so re-running this
# cell does not create duplicate dummy columns.
neighborhood_dummies = pd.get_dummies(houseprices['neighborhood'])
new_dummy_cols = [c for c in neighborhood_dummies.columns if c not in houseprices.columns]
houseprices = pd.concat([houseprices, neighborhood_dummies[new_dummy_cols]], axis=1, sort=False)

# Work on an explicit copy so the column assignments below modify an
# independent frame rather than a slice of `houseprices`.
prices = houseprices[['saleprice', 'overallqual', 'grlivarea', 'garagecars', 'firstflrsf', 'NoRidge']].copy()

# Exterior-quality dummies, named through `prefix` (exterqual_<level>) instead
# of relying on the positional order of the generated columns.
exterqual_dummies = pd.get_dummies(houseprices['exterqual'], prefix='exterqual')
prices = pd.concat([prices, exterqual_dummies], axis=1)

# Add natural-log versions of the area columns and the sale price.
log_list = ['firstflrsf', 'grlivarea', 'saleprice']
for col in log_list:
    prices['log_{}'.format(col)] = np.log(prices[col])

# Drop the raw area columns (their log versions remain) and one
# exterior-quality dummy.
prices = prices.drop(['firstflrsf', 'grlivarea', 'exterqual_Fa'], axis=1)

In [None]:
# Target: the untransformed sale price (not the log_saleprice column).
Y = prices['saleprice']
# `X` is the feature set: overall quality, garage capacity, the log-transformed
# floor areas and three exterior-quality dummies.
# .copy() makes X an independent frame so the interaction columns added below
# are not written into a slice of `prices`.
X = prices[['overallqual','garagecars','log_firstflrsf','log_grlivarea','exterqual_Ex','exterqual_TA','exterqual_Gd']].copy()
# Interaction terms
X['qual*area'] = X.overallqual * X.log_grlivarea
X['exterqual_Ex*cars'] = X.garagecars * X.exterqual_Ex

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

# Candidate regularization strengths for the cross-validated models:
# powers of ten from 1e-10 up to 1e39.
alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

The number of observations in training set is 1168
The number of observations in test set is 292


# OLS

In [None]:
# Ordinary least squares baseline, fitted with scikit-learn
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Predictions for both splits
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Compute every statistic up front, then report them together
train_r2 = lrm.score(X_train, y_train)
test_r2 = lrm.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, y_preds_test)
test_mse = mse(y_test, y_preds_test)
test_rmse = rmse(y_test, y_preds_test)
test_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print(f"R-squared of the model in the training set is: {train_r2}")
print("-----Test set statistics-----")
print(f"R-squared of the model in the test set is: {test_r2}")
print(f"Mean absolute error of the prediction is: {test_mae}")
print(f"Mean squared error of the prediction is: {test_mse}")
print(f"Root mean squared error of the prediction is: {test_rmse}")
print(f"Mean absolute percentage error of the prediction is: {test_mape}")

R-squared of the model in the training set is: 0.8057967351740991
-----Test set statistics-----
R-squared of the model in the test set is: 0.8164470841106701
Mean absolute error of the prediction is: 23225.61203330008
Mean squared error of the prediction is: 1232319960.1320136
Root mean squared error of the prediction is: 35104.41510881521
Mean absolute percentage error of the prediction is: 13.77063674576265


# Lasso

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression; the penalty strength is picked from `alphas` by 5-fold CV
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

# Predictions for both splits
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

# Compute every statistic up front, then report them together
best_alpha = lasso_cv.alpha_
train_r2 = lasso_cv.score(X_train, y_train)
test_r2 = lasso_cv.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, y_preds_test)
test_mse = mse(y_test, y_preds_test)
test_rmse = rmse(y_test, y_preds_test)
test_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print(f'Best alpha value is: {best_alpha}')
print(f"R-squared of the model on the training set is: {train_r2}")
print("-----Test set statistics-----")
print(f"R-squared of the model on the test set is: {test_r2}")
print(f"Mean absolute error of the prediction is: {test_mae}")
print(f"Mean squared error of the prediction is: {test_mse}")
print(f"Root mean squared error of the prediction is: {test_rmse}")
print(f"Mean absolute percentage error of the prediction is: {test_mape}")

Best alpha value is: 100.0
R-squared of the model on the training set is: 0.8010972887755711
-----Test set statistics-----
R-squared of the model on the test set is: 0.8047240377500139
Mean absolute error of the prediction is: 23723.51234933219
Mean squared error of the prediction is: 1311025024.3027189
Root mean squared error of the prediction is: 36208.07954452596
Mean absolute percentage error of the prediction is: 13.997406732788868


# Ridge

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression; the penalty strength is picked from `alphas` by 5-fold CV
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

# Predictions for both splits
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

# Compute every statistic up front, then report them together
best_alpha = ridge_cv.alpha_
train_r2 = ridge_cv.score(X_train, y_train)
test_r2 = ridge_cv.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, y_preds_test)
test_mse = mse(y_test, y_preds_test)
test_rmse = rmse(y_test, y_preds_test)
test_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print(f'Best alpha value is: {best_alpha}')
print(f"R-squared of the model on the training set is: {train_r2}")
print("-----Test set statistics-----")
print(f"R-squared of the model on the test set is: {test_r2}")
print(f"Mean absolute error of the prediction is: {test_mae}")
print(f"Mean squared error of the prediction is: {test_mse}")
print(f"Root mean squared error of the prediction is: {test_rmse}")
print(f"Mean absolute percentage error of the prediction is: {test_mape}")


Best alpha value is: 1.0
R-squared of the model on the training set is: 0.8045236559255067
-----Test set statistics-----
R-squared of the model on the test set is: 0.8108274532655666
Mean absolute error of the prediction is: 23423.977709252198
Mean squared error of the prediction is: 1270048498.6596725
Root mean squared error of the prediction is: 35637.739808518614
Mean absolute percentage error of the prediction is: 13.82805486620613


# Elastic Net

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic Net regression; the penalty strength is picked from `alphas` by 5-fold CV
EN_cv = ElasticNetCV(alphas=alphas, cv=5)
EN_cv.fit(X_train, y_train)

# Predictions for both splits
y_preds_train = EN_cv.predict(X_train)
y_preds_test = EN_cv.predict(X_test)

# Compute every statistic up front, then report them together
best_alpha = EN_cv.alpha_
train_r2 = EN_cv.score(X_train, y_train)
test_r2 = EN_cv.score(X_test, y_test)
test_mae = mean_absolute_error(y_test, y_preds_test)
test_mse = mse(y_test, y_preds_test)
test_rmse = rmse(y_test, y_preds_test)
test_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print(f'Best alpha value is: {best_alpha}')
print(f"R-squared of the model on the training set is: {train_r2}")
print("-----Test set statistics-----")
print(f"R-squared of the model on the test set is: {test_r2}")
print(f"Mean absolute error of the prediction is: {test_mae}")
print(f"Mean squared error of the prediction is: {test_mse}")
print(f"Root mean squared error of the prediction is: {test_rmse}")
print(f"Mean absolute percentage error of the prediction is: {test_mape}")

Best alpha value is: 0.001
R-squared of the model on the training set is: 0.8046992209529641
-----Test set statistics-----
R-squared of the model on the test set is: 0.8103529498597858
Mean absolute error of the prediction is: 23445.215123569913
Mean squared error of the prediction is: 1273234174.1105971
Root mean squared error of the prediction is: 35682.407067217275
Mean absolute percentage error of the prediction is: 13.836464423750824


In [None]:
# Show the parameters learned by the OLS model: coefficients, then intercept
for heading, fitted in (('\nCoefficients: \n', lrm.coef_),
                        ('\nIntercept: \n', lrm.intercept_)):
    print(heading, fitted)


Coefficients: 
 [-158207.78304039   14623.14735193   38462.1406215   -88380.42647209
 -106568.33840703   11328.86788154   28718.31488022   24485.10367932
   63120.73186436]

Intercept: 
 381278.1113097122


In [None]:
# Show the parameters learned by the Lasso model: coefficients, then intercept
for heading, fitted in (('\nCoefficients: \n', lasso_cv.coef_),
                        ('\nIntercept: \n', lasso_cv.intercept_)):
    print(heading, fitted)


Coefficients: 
 [-108258.68296278   15183.93411722   36118.42068538  -46079.92722335
   -3910.775866         0.           16982.08233085   17657.714719
   24752.0879547 ]

Intercept: 
 100177.60906233844


In [None]:
# Show the parameters learned by the Ridge model: coefficients, then intercept
for heading, fitted in (('\nCoefficients: \n', ridge_cv.coef_),
                        ('\nIntercept: \n', ridge_cv.intercept_)):
    print(heading, fitted)


Coefficients: 
 [-128336.27676944   14882.34067684   37423.8886722   -63367.87579375
  -57534.00419683    8852.71360479   26287.55680408   20391.46359732
   46279.67351183]

Intercept: 
 208942.54568868218


In [None]:
# Show the parameters learned by the Elastic Net model: coefficients, then intercept
for heading, fitted in (('\nCoefficients: \n', EN_cv.coef_),
                        ('\nIntercept: \n', EN_cv.intercept_)):
    print(heading, fitted)


Coefficients: 
 [-127027.31326585   14829.27618995   37794.44113217  -62635.64134207
  -70718.35215066    8460.72972204   26067.06651882   20211.68339705
   50968.15232533]

Intercept: 
 201486.28272857302


# Comparing

Looking at the test statistics, Lasso performs worst of the three regularized models (test R² ≈ 0.805), while Ridge (≈ 0.811) and Elastic Net (≈ 0.810) are nearly identical.

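Also, every error metric is slightly better for the original OLS model (test R² ≈ 0.816), and its test R² is no lower than its training R² (≈ 0.806), so there is no sign of overfitting. This suggests that regularization was not needed for this small feature set.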