# Overfitting and Regularization - House Price Model
In this exercise, we'll work with the housing price data from the previous checkpoint. 

## Load the dataset from Thinkful's database

In [1]:
#Import libraries
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

# Display preferences.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by later cells
# (for example while fitting models) are hidden as well.
warnings.filterwarnings(action="ignore")

In [2]:
# Connection settings for the course database. Each value can be overridden
# with an environment variable; the defaults are the values that used to be
# hardcoded here, so the notebook still runs unchanged without any setup.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'houseprices')

# use the credentials to start a connection
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' in an overridden value cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

try:
    # Use the connection to extract SQL data
    house_price = pd.read_sql_query('SELECT * FROM houseprices', con=engine)
finally:
    # Close the connection even when the query fails, so no pooled
    # connection is left open after an error.
    engine.dispose()

## Clean and transform the data

In [3]:
# Drop the 19 columns with the largest fraction of missing values, plus 'id'.
# NOTE: house_price is rebound here, so re-running this cell without first
# re-running the load cell would drop a further 19 columns.
null_mask = house_price.isnull()
null_fraction = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
drop_list = list(null_fraction.head(19).index)
drop_list.append('id')

house_price = house_price.drop(drop_list, axis=1)

#List of features that are string categoricals
str_cat_cols = list(house_price.describe(include=['O']).columns)

#Uniques within each variable
uni_col = []
num_uni = []
avgdiff_uni = []

for col in list(house_price.columns):
    # Compute the sorted unique values once and reuse them below.
    col_uniques = np.unique(house_price[col])
    uni_col.append(list(col_uniques))
    num_uni.append(len(col_uniques))
    try:
        avgdiff_uni.append(np.mean(np.diff(col_uniques)))
    except TypeError:
        # Non-numeric values (strings) cannot be subtracted from each other.
        avgdiff_uni.append('N/A')

# One row per column: its unique values, how many there are, and the mean gap
# between consecutive sorted unique values ('N/A' when that is undefined).
uniques = pd.DataFrame({
    'Category': list(house_price.columns),
    'Unique Values': uni_col,
    'Num Uniques': num_uni,
    'Avg Diff Among Uniques': avgdiff_uni,
})

# Per-column lookups, so the classification below does not have to filter the
# uniques frame several times for every column.
num_uni_by_col = dict(zip(house_price.columns, num_uni))
avgdiff_by_col = dict(zip(house_price.columns, avgdiff_uni))

#List of features that are numerical categoricals
#If a numerical variable is categorical, its unique values will tend to be close to each other and there shouldn't
#be too many unique values
num_cat_cols = [
    col for col in house_price.columns
    if col not in str_cat_cols
    and (avgdiff_by_col[col] < 2
         or (num_uni_by_col[col] < 20 and avgdiff_by_col[col] < 20))
]

#List of features that are numerical continuous
cont_cols = [
    col for col in house_price.columns
    if col not in str_cat_cols and col not in num_cat_cols
]

#Create new dataframe containing features of interest
#In previous modules, we determined that 'grlivarea', 'garagearea', 'totalbsmtsf', 'overallqual', and 'yearremodadd' were most effective
# .copy() makes sale_df an independent frame, so the column assignments below
# never write into a slice of house_price.
sale_df = house_price[['saleprice', 'grlivarea', 'garagearea', 'totalbsmtsf', 'overallqual', 'yearremodadd']].copy()

#From previous checkpoints, we know we should also include an area feature and an area_quality interaction feature
sale_df['all_area'] = sale_df['grlivarea'] + sale_df['garagearea'] + sale_df['totalbsmtsf']
sale_df['area_qual'] = sale_df['all_area']*sale_df['overallqual']

sale_df.head(10)

Unnamed: 0,saleprice,grlivarea,garagearea,totalbsmtsf,overallqual,yearremodadd,all_area,area_qual
0,208500,1710,548,856,7,2003,3114,21798
1,181500,1262,460,1262,6,1976,2984,17904
2,223500,1786,608,920,7,2002,3314,23198
3,140000,1717,642,756,7,1970,3115,21805
4,250000,2198,836,1145,8,2000,4179,33432
5,143000,1362,480,796,5,1995,2638,13190
6,307000,1694,636,1686,8,2005,4016,32128
7,200000,2090,484,1107,7,1973,3681,25767
8,129900,1774,468,952,7,1950,3194,22358
9,118000,1077,205,991,5,1950,2273,11365


## Split data into train and test sets

In [4]:
# Column to predict; every other column of sale_df is used as a feature.
target_var = 'saleprice'
feature_set = sale_df.columns.drop([target_var]).tolist()

# X holds the features, Y the target.
X = sale_df.loc[:, feature_set]
Y = sale_df.loc[:, target_var]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=465,
)

print("The number of observations in training set is {}".format(len(X_train)))
print("The number of observations in test set is {}".format(len(X_test)))

The number of observations in training set is 1168
The number of observations in test set is 292


## Build a model using OLS regression

In [7]:
# Ordinary least squares fitted on the training split.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Predictions for both splits; only the test predictions feed the report below.
y_preds_train, y_preds_test = lrm.predict(X_train), lrm.predict(X_test)

print("R-squared of the model in training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
# Each entry pairs a message template with the test-set metric it reports.
for _template, _value in [
    ("R-squared of the model in test set is: {}", lrm.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
]:
    print(_template.format(_value))

R-squared of the model in training set is: 0.7667442648292179
-----Test set statistics-----
R-squared of the model in test set is: 0.8041744753698993
Mean absolute error of the prediction is: 23516.49009949857
Mean squared error of the prediction is: 1314714623.4957995
Root mean squared error of the prediction is: 36258.99369116302
Mean absolute percentage error of the prediction is: 14.025140883004056


## Do k-fold cross-validation to choose best hyperparameter values

In [18]:
# Candidate regularization strengths: the powers of ten from 1e-20 up to 1e39.
alphas = list(map(lambda exponent: np.power(10.0, exponent), np.arange(-20, 40)))

## Build a model using Lasso regression

In [19]:
from sklearn.linear_model import LassoCV

# Lasso regression; the regularization strength is picked from `alphas`
# by 5-fold cross-validation on the training split.
lassoregr = LassoCV(alphas=alphas, cv=5)
lassoregr.fit(X_train, y_train)

# Predictions for both splits; only the test predictions feed the report below.
y_preds_train, y_preds_test = lassoregr.predict(X_train), lassoregr.predict(X_test)

print("Best alpha parameter is: {}".format(lassoregr.alpha_))
print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
# Each entry pairs a message template with the test-set metric it reports.
for _template, _value in [
    ("R-squared of the model on the test set is: {}", lassoregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
]:
    print(_template.format(_value))


Best alpha parameter is: 10000.0
R-squared of the model on the training set is: 0.7583389212395036
-----Test set statistics-----
R-squared of the model on the test set is: 0.8308707435167693
Mean absolute error of the prediction is: 21271.912019210682
Mean squared error of the prediction is: 1135483779.1421204
Root mean squared error of the prediction is: 33696.94020444765
Mean absolute percentage error of the prediction is: 12.424790907707473


## Build a model using Ridge Regression

In [20]:
from sklearn.linear_model import RidgeCV

# Ridge regression. Alpha (often written lambda) is the regularization
# strength: the larger it is, the more the coefficients are shrunk. The value
# is picked from `alphas` by 5-fold cross-validation on the training split.
ridgeregr = RidgeCV(alphas=alphas, cv=5)
ridgeregr.fit(X_train, y_train)

# Predictions for both splits; only the test predictions feed the report below.
y_preds_train, y_preds_test = ridgeregr.predict(X_train), ridgeregr.predict(X_test)

print("Best alpha parameter is: {}".format(ridgeregr.alpha_))
print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
# Each entry pairs a message template with the test-set metric it reports.
for _template, _value in [
    ("R-squared of the model on the test set is: {}", ridgeregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
]:
    print(_template.format(_value))

Best alpha parameter is: 10000.0
R-squared of the model on the training set is: 0.7588691942850861
-----Test set statistics-----
R-squared of the model on the test set is: 0.8300983141022953
Mean absolute error of the prediction is: 21352.809003397495
Mean squared error of the prediction is: 1140669641.6528702
Root mean squared error of the prediction is: 33773.801113479516
Mean absolute percentage error of the prediction is: 12.478787895910818


## Build a model using ElasticNet Regression

In [21]:
from sklearn.linear_model import ElasticNetCV

# ElasticNet regression; the regularization strength is picked from `alphas`
# by 5-fold cross-validation on the training split.
elasticregr = ElasticNetCV(alphas=alphas, cv=5)
elasticregr.fit(X_train, y_train)

# Predictions for both splits; only the test predictions feed the report below.
y_preds_train, y_preds_test = elasticregr.predict(X_train), elasticregr.predict(X_test)

print("Best alpha parameter is: {}".format(elasticregr.alpha_))
print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, y_train)))
print("-----Test set statistics-----")
# Each entry pairs a message template with the test-set metric it reports.
for _template, _value in [
    ("R-squared of the model on the test set is: {}", elasticregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
]:
    print(_template.format(_value))


Best alpha parameter is: 100.0
R-squared of the model on the training set is: 0.7582224451036733
-----Test set statistics-----
R-squared of the model on the test set is: 0.8308562129328529
Mean absolute error of the prediction is: 21247.470054706042
Mean squared error of the prediction is: 1135581333.18263
Root mean squared error of the prediction is: 33698.387694111276
Mean absolute percentage error of the prediction is: 12.410320240695393


Based on the results for each model, it's difficult to tell which one is the best, as each has its own strengths and weaknesses. For example, OLS has the highest R-squared value on the training set (0.767, versus roughly 0.758 for the regularized models), but it also has the lowest R-squared on the test set (0.804, versus roughly 0.831) and the highest test-set error values. Because the difference in training R-squared is much smaller than the difference in test errors, I'll remove OLS from the running. In conclusion, Lasso or ElasticNet regression provides the best model, with nearly identical results; Ridge regression is only marginally behind.