# Pre-Processing, Feature Engineering, and Model Benchmarks

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

In [2]:
# Load the cleaned Ames housing data.
# keep_default_na=False: pandas does not convert strings such as 'NA' to NaN,
# so those values stay in the frame as literal strings.
ames = pd.read_csv('../datasets/ames_v1.csv', keep_default_na=False)

## Feature Engineering

Separate predictors by numerical and categorical variables

In [3]:
# Partition the columns of `ames` by dtype: numeric columns vs. object (string) columns.
# dtype-based selection adapted from:
# https://stackoverflow.com/questions/25039626/how-do-i-find-numeric-columns-in-pandas
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
category = ['object']

# One select_dtypes call per dtype group, unpacked into the two sub-frames.
df_num, df_cat = (ames.select_dtypes(include=dtypes) for dtypes in (numerics, category))

# Column labels of each group, reused by later cells to index `ames`.
n_vars, c_vars = df_num.columns, df_cat.columns

Categorical variables were dummified

In [4]:
# One-hot encode every categorical column. drop_first=True omits the first
# level of each variable, which then serves as the reference category.
ames_dummies = pd.get_dummies(
    data=ames[c_vars],
    columns=c_vars,
    drop_first=True,
)
ames_dummies.shape

(2000, 77)

In [5]:
# Shape of the numeric subset; this still includes the target column, saleprice.
ames[n_vars].shape

(2000, 16)

Numerical variables were transformed into polynomial features

In [6]:
# Numeric predictors only: remove the target so it is not expanded into features.
# NOTE(review): 'Unnamed: 0' (apparently a row index saved into the CSV) remains in
# this frame and is therefore used as a predictor downstream — confirm that is intended.
ames_n = ames[n_vars].drop(columns = 'saleprice')

In [7]:
# Preview the first rows of the numeric predictors.
ames_n.head()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,mas_vnr_area,bedroom_abvgr,kitchen,garage_cars,yr_sold,total_bath,total_sf,porch_sf,has_porch,has_2fl,has_fireplace,has_extrms
0,60,69.0,13517,289.0,3,1,2.0,2010,2.5,2204.0,44,1,1,0,1
1,60,43.0,11492,132.0,4,1,2.0,2009,3.5,3035.0,74,1,1,0,1
2,20,68.0,7922,0.0,3,1,1.0,2010,2.0,2114.0,52,1,0,0,0
3,60,73.0,9802,0.0,3,1,2.0,2010,2.5,1828.0,100,1,1,0,1
4,50,82.0,14235,0.0,3,1,2.0,2010,2.0,2121.0,59,1,1,0,1


In [8]:
# Expand the numeric predictors into degree-2 polynomial terms: the original
# columns, their squares, and all pairwise interactions (no bias column).
features = ames_n.columns
X = ames[features]
poly = PolynomialFeatures(degree = 2, include_bias=False)
X_poly = poly.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method on older releases.
if hasattr(poly, 'get_feature_names_out'):
    poly_names = poly.get_feature_names_out(features)
else:
    poly_names = poly.get_feature_names(features)

# Carry over X's index so a later column-wise concat with other frames built
# from `ames` lines up row-for-row even if the index is not the default range.
X_poly_df = pd.DataFrame(X_poly, columns=poly_names, index=X.index)

The polynomial features and the dummified categorical variables were combined into one dataframe called ames1.

In [9]:
# Place the polynomial features and the dummy columns side by side;
# rows are matched on index labels between the two frames.
ames1 = pd.concat(
    objs=[X_poly_df, ames_dummies],
    axis='columns',
)
ames1.head()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,mas_vnr_area,bedroom_abvgr,kitchen,garage_cars,yr_sold,total_bath,total_sf,...,central_air_Y,kitchen_qual_Fa,kitchen_qual_Gd,kitchen_qual_TA,garage_cond_Fa,garage_cond_Gd,garage_cond_Po,garage_cond_TA,paved_drive_P,paved_drive_Y
0,60.0,69.0,13517.0,289.0,3.0,1.0,2.0,2010.0,2.5,2204.0,...,1,0,1,0,0,0,0,1,0,1
1,60.0,43.0,11492.0,132.0,4.0,1.0,2.0,2009.0,3.5,3035.0,...,1,0,1,0,0,0,0,1,0,1
2,20.0,68.0,7922.0,0.0,3.0,1.0,1.0,2010.0,2.0,2114.0,...,1,0,1,0,0,0,0,1,0,1
3,60.0,73.0,9802.0,0.0,3.0,1.0,2.0,2010.0,2.5,1828.0,...,1,0,0,1,0,0,0,1,0,1
4,50.0,82.0,14235.0,0.0,3.0,1.0,2.0,2010.0,2.0,2121.0,...,1,0,0,1,0,0,0,1,0,0


In [10]:
# Persist the engineered feature matrix without the row index.
# ames1 holds predictors only; the target (saleprice) is not part of this file.
ames1.to_csv('../datasets/amesv2.csv', index=False)

### Initial OLS Model

The initial OLS model has a train $R^2$ of 0.92 and a test $R^2$ of 0.86. The $R^2$ score is the proportion of the variability in the (log) sale price that can be explained by the features selected in our model.

On the log scale, the train and test RMSE scores are 0.11 and 0.15 respectively. Computing the RMSE after exponentiating the targets and predictions back to the sale-price scale gives a root-mean-squared error of 19867 for my train predictions and 26369 for my test predictions. Because the model fits the training data noticeably better than the test data, it is overfit, and I can work to reduce complexity within this model. My goal is to reduce the RMSE scores as close to 0 as possible.

In [11]:
# Design matrix: all engineered features. Target: natural log of the sale price.
X = ames1
y = np.log(ames['saleprice'])

# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size=0.7, random_state=77)

# Plain least-squares fit on the training portion.
lr = LinearRegression().fit(X_train, y_train)

# NOTE(review): cross_val_score is given the full X, y, so its folds include
# rows from the hold-out test set — confirm that is intended.
cv_r2 = cross_val_score(lr, X, y, cv=5).mean()

# Printed in order: mean 5-fold CV R2, train R2, test R2.
for r2_value in (cv_r2, lr.score(X_train, y_train), lr.score(X_test, y_test)):
    print(r2_value)

0.8461452541116159
0.9206428930274038
0.8621985777386981


In [12]:
# Error metrics for the fitted OLS model on the log scale, plus RMSE on the
# original sale-price scale (targets and predictions exponentiated first).
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

# RMSE is taken as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# while sqrt(MSE) gives the same value on every version.
train_mse = mean_squared_error(y_true=y_train,y_pred=train_pred)
train_rmse = np.sqrt(train_mse)
train_rmse_e = np.sqrt(mean_squared_error(y_true=np.exp(y_train),y_pred=np.exp(train_pred)))

test_mse = mean_squared_error(y_true=y_test,y_pred=test_pred)
test_rmse = np.sqrt(test_mse)
test_rmse_e = np.sqrt(mean_squared_error(y_true=np.exp(y_test),y_pred=np.exp(test_pred)))

print(f'Train MSE: {train_mse}')
print(f'Train RMSE: {train_rmse}')
print(f'Train RMSE exp: {train_rmse_e}')
print()
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {test_rmse}')
print(f'Test RMSE exp: {test_rmse_e}')

Train MSE: 0.012128184374282612
Train RMSE: 0.11012803627724692
Train RMSE exp: 19867.57883620927

Test MSE: 0.021809584590528973
Test RMSE: 0.1476806845546464
Test RMSE exp: 26369.70283862772


### Baseline vs OLS Model

The baseline train and test RMSE scores were computed to see how well our OLS model does compared to a baseline that always predicts the mean (log) sale price. On the sale-price scale, the baseline has a root-mean-squared error of 79233 on the train set and 81871 on the test set. The OLS model does better than the baseline model.

In [13]:
# Naive baseline: predict one constant (the mean log sale price) for every row.
# np.full builds each constant vector with a single .mean() call rather than
# one call per element.
baseline_train_preds = np.full(len(y_train), y_train.mean())
# NOTE(review): the test baseline uses the test set's own mean; a baseline that
# only sees training data would use y_train.mean() here. Kept as-is so the
# reported figures do not change — confirm which is intended.
baseline_test_preds = np.full(len(y_test), y_test.mean())

# RMSE is taken as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
baseline_train_rmse = np.sqrt(mean_squared_error(y_true = y_train, y_pred= baseline_train_preds))
# exp_b_*: RMSE after exponentiating targets and predictions (undoing the log).
exp_b_train = np.sqrt(mean_squared_error(y_true = np.exp(y_train), y_pred= np.exp(baseline_train_preds)))

baseline_test_rmse = np.sqrt(mean_squared_error(y_true = y_test, y_pred= baseline_test_preds))
exp_b_test = np.sqrt(mean_squared_error(y_true = np.exp(y_test), y_pred= np.exp(baseline_test_preds)))

print(f'Baseline Train RMSE: {baseline_train_rmse}')
print(f'Baseline Train Exp RMSE: {exp_b_train}')
print()
print(f'Baseline Test RMSE: {baseline_test_rmse}')
print(f'Baseline Test Exp RMSE: {exp_b_test}')

Baseline Train RMSE: 0.3909353850255391
Baseline Train Exp RMSE: 79233.05974374324

Baseline Test RMSE: 0.39782937437542665
Baseline Test Exp RMSE: 81871.04064121071


In [14]:
# Fit OLS with statsmodels on the full X, y to obtain the coefficient summary table.
import statsmodels.api as sm

# sm.OLS does not add an intercept by itself, so a constant column is
# prepended to the design matrix before fitting.
X = sm.add_constant(X)
lrmodel = sm.OLS(endog=y, exog=X).fit()

print(lrmodel.summary())

                            OLS Regression Results                            
Dep. Variable:              saleprice   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.904
Method:                 Least Squares   F-statistic:                     92.53
Date:                Thu, 11 Mar 2021   Prob (F-statistic):               0.00
Time:                        18:01:27   Log-Likelihood:                 1483.2
No. Observations:                2000   AIC:                            -2552.
Df Residuals:                    1793   BIC:                            -1393.
Df Model:                         206                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             