# Predictive Models

### Housekeeping

In [50]:
import warnings
warnings.filterwarnings('ignore')

In [51]:
import numpy as np
import pandas as pd

from tqdm import tqdm as progress_bar
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.utils import shuffle

In [52]:
# Global Variables
%store -r CONTINUOUS_VARIABLES
%store -r DISCRETE_VARIABLES
%store -r TARGET_VARIABLE
%store -r filename

In [53]:
pd.set_option("display.max_columns", 250)

### Load the data

In [54]:
# Original dataframe 
df1 = pd.read_csv('data/{}-clean.csv'.format(filename))

In [55]:
VARIABLES = ['BldgType', 'CentralAir', 'Condition1', 'Condition2', 'Exterior1st', 'Exterior2nd', 'Foundation', 'GarageType','Heating', 'HouseStyle', 'LandContour', 'LotConfig', 'MSSubClass', 'MSZoning', 'MasVnrType', 'Neighborhood', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street']
df1 = pd.concat([
    pd.get_dummies(df1[VARIABLES], dtype=int),
    df1.drop(VARIABLES, axis=1)
], axis=1)

In [56]:
VARIABLES = ['BsmtCond', 'BsmtExposure', 'BsmtQual', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'ExterCond', 'ExterQual', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'HeatingQC', 'KitchenQual', 'LandSlope', 'LotShape', 'OverallCond', 'OverallQual', 'PavedDrive', 'Utilities']
for column in df1.columns:
    if column in VARIABLES:
        df1[column] = df1[column].astype('category')
        df1[column] = df1[column].cat.codes.astype(int) 

In [57]:
df1.head()

Unnamed: 0,MSSubClass,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,CentralAir_N,CentralAir_Y,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,Id,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SalePrice
0,60,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,8450,3,0,0,6,4,2003,2003,196,2,4,2,3,3,2,706,5,0,150,856,0,4,856,854,0,1710,1,0,2,1,3,1,2,8,6,0,1,2,548,4,4,2,0,61,0,0,0,0,2,2008,208500
1,20,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,2,9600,3,0,0,5,7,1976,1976,0,3,4,2,3,1,0,978,5,0,284,1262,0,4,1262,0,0,1262,0,1,2,0,3,1,3,6,6,1,1,2,460,4,4,2,298,0,0,0,0,0,5,2007,181500
2,60,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,3,11250,0,0,0,6,4,2001,2002,162,2,4,2,3,2,2,486,5,0,434,920,0,4,920,866,0,1786,1,0,2,1,3,1,2,6,6,1,1,2,608,4,4,2,0,42,0,0,0,0,9,2008,223500
3,70,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,4,9550,0,0,0,6,4,1915,1970,0,3,4,3,1,3,0,216,5,0,540,756,2,4,961,756,0,1717,1,0,1,0,3,1,2,7,6,1,2,3,642,4,4,2,0,35,272,0,0,0,2,2006,140000
4,60,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,5,14260,0,0,0,7,4,2000,2000,350,2,4,2,3,0,2,655,5,0,490,1145,0,4,1145,1053,0,2198,1,0,2,1,4,1,2,9,6,1,1,3,836,4,4,2,192,84,0,0,0,0,12,2008,250000


In [58]:
# Load the feature engineered dataframe
df = pd.read_csv('data/{}-final.csv'.format(filename))

In [59]:
df.shape

(1449, 97)

In [60]:
df.head()

Unnamed: 0,Id,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,MasVnrArea,ExterQual,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,SalePrice,TotalSF,TotalPorchSF,TotalBath,has_2ndFlrSF,has_TotalBsmtSF,has_Fireplaces,has_GarageArea,has_PoolArea,has_TotalPorchSF,build_type_1Fam,build_type_2Fam,build_type_Twnhs,AirCon,street_proximity,f_BrkTil,f_CBlock,f_PConc,sale_Partial,sale_Abnormal,is_NewHome,was_remodeled,yrs_since_built,yrs_since_remodeled,recently_built,recently_remodeled,nd_Blmngtn,nd_Blueste,nd_BrDale,nd_BrkSide,nd_ClearCr,nd_CollgCr,nd_Crawfor,nd_Edwards,nd_Gilbert,nd_IDOTRR,nd_MeadowV,nd_Mitchel,nd_NAmes,nd_NPkVill,nd_NWAmes,nd_NoRidge,nd_NridgHt,nd_OldTown,nd_SWISU,nd_Sawyer,nd_SawyerW,nd_Somerst,nd_StoneBr,nd_Timber,nd_Veenker
0,1,10.42753,3,0,0,6,4,196,2,2,3,3,2,706,5,0,150,198.284295,4,856,854,0,7.600483,1,0,2,1,3,1,2,8,6,0,1,2,204.517915,4,4,2,0,61,0,0,0,0,2,208500,20.729442,11.124668,3.5,1,1,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,5,5,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,10.596444,3,0,0,5,7,0,3,2,3,1,0,978,5,0,284,264.601145,4,1262,0,0,7.284379,0,1,2,0,3,1,3,6,6,1,1,2,177.329636,4,4,2,298,0,0,0,0,0,5,181500,20.638826,23.806809,2.5,0,1,1,1,0,1,1,0,0,1,1,0,1,0,0,0,0,0,31,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3,10.807339,0,0,0,6,4,162,2,2,3,2,2,486,5,0,434,209.206786,4,920,866,0,7.645779,1,0,2,1,3,1,2,6,6,1,1,2,222.579733,4,4,2,0,42,0,0,0,0,9,223500,21.023347,9.197105,3.5,1,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,7,6,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,10.589518,0,0,0,6,4,0,3,3,1,3,0,216,5,0,540,180.77993,4,961,756,0,7.604738,1,0,1,0,3,1,2,7,6,1,2,3,232.664362,4,4,2,0,35,272,0,0,0,2,140000,20.527193,24.135762,2.0,1,1,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,1,91,36,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,11.124528,0,0,0,7,4,350,2,2,3,0,2,655,5,0,490,246.150776,4,1145,1053,0,7.862154,1,0,2,1,4,1,2,9,6,1,1,3,288.451992,4,4,2,192,84,0,0,0,0,12,250000,22.227136,22.977765,3.5,1,1,1,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0,8,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 97 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   1449 non-null   int64  
 1   LotArea              1449 non-null   float64
 2   LotShape             1449 non-null   int64  
 3   Utilities            1449 non-null   int64  
 4   LandSlope            1449 non-null   int64  
 5   OverallQual          1449 non-null   int64  
 6   OverallCond          1449 non-null   int64  
 7   MasVnrArea           1449 non-null   int64  
 8   ExterQual            1449 non-null   int64  
 9   BsmtQual             1449 non-null   int64  
 10  BsmtCond             1449 non-null   int64  
 11  BsmtExposure         1449 non-null   int64  
 12  BsmtFinType1         1449 non-null   int64  
 13  BsmtFinSF1           1449 non-null   int64  
 14  BsmtFinType2         1449 non-null   int64  
 15  BsmtFinSF2           1449 non-null   i

Numpy arrays from the feature engineered dataframe

In [62]:
X = df.drop(columns='SalePrice').values
Y = df['SalePrice'].values

Numpy arrays excluding features uncorrelated with target variable

In [63]:
X1 = df.drop(columns=['SalePrice', 'LowQualFinSF', 'BsmtFinSF2', '3SsnPorch', 'MoSold', 'BsmtHalfBath', 'BsmtFinType1', 'Utilities', 'PoolArea', 'LandSlope'], axis = 1).values
Y1 = df['SalePrice'].values

Numpy arrays excluding features uncorrelated with target variable and those with evidence of multicollinearity

In [64]:
X2 = df.drop(columns=['SalePrice', 'LowQualFinSF', '3SsnPorch', 'MoSold', 'BsmtHalfBath', 'BsmtFinType1', 'Utilities', 'PoolArea', 'LandSlope', 'BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF', 'GarageArea','WoodDeckSF'], axis = 1).values
Y2 = df['SalePrice'].values

Numpy arrays from the original dataframe

In [65]:
X0 = df1.drop(columns='SalePrice').values
Y0 = df1['SalePrice'].values

### Cross-Validation Function

Function to run k-fold cross validation and estimate the following values:
- Root Mean Squared Error
- R-Square
- Mean Absolute Error
- Maximum Deviation

In [66]:
def max_deviation(y_true, y_pred):
    """Determine the maximum deviation of a prediction."""
    assert y_true.shape == y_pred.shape
    assert y_true.ndim == 1
    return np.max(np.abs(y_pred - y_true))

In [67]:
def k_fold(X, Y, *, model, k=10, log=False, desc=None):
    mae, deviation, r2, rmse = [], [], [], []
    # Iterate over the k folds
    for train, test in progress_bar(KFold(n_splits=k, random_state=1, shuffle=True).split(X), desc=desc, total=k):
        model.fit(X[train], Y[train])
        y = model.predict(X[test])
        
        # Store the scores/errors for each fold
        mae.append(mean_absolute_error(Y[test], y))
        deviation.append(max_deviation(Y[test], y))
        r2.append(r2_score(Y[test], y))
        rmse.append(mean_squared_error(Y[test], y))
        
    # Return rounded values
    return {
        'mae': np.round(np.mean(mae)),
        "deviation": np.round(np.mean(deviation)),
        "r2": np.round(np.mean(r2), 3),
        "rmse": np.round(np.sqrt(np.mean(rmse)))
    }

### Store model performance for analysis

In [68]:
metrics = {}

### Linear Regression

In [69]:
# Using the original dataframe
lm = LinearRegression()
metrics[('lm', 'o')] = k_fold(X0, Y0, model=lm)

100%|██████████| 10/10 [00:00<00:00, 70.91it/s]


In [70]:
# Using the feature engineered dataframe
lm = LinearRegression()
metrics[('lm', 'e')] = k_fold(X, Y, model=lm)

100%|██████████| 10/10 [00:00<00:00, 184.39it/s]


In [71]:
# Using the dataframe with uncorrelated features excluded
lm = LinearRegression()
metrics[('lm', 'u')] = k_fold(X1, Y1, model=lm)

100%|██████████| 10/10 [00:00<00:00, 203.99it/s]


In [72]:
# Using the dataframe with uncorrelated and multicollinear features excluded
lm = LinearRegression()
metrics[('lm', 'm')] = k_fold(X2, Y2, model=lm)

100%|██████████| 10/10 [00:00<00:00, 195.14it/s]


### Lasso Regression

In [73]:
# GridSearchCV
t = 0.1
grid_search = GridSearchCV(
    estimator=Lasso(tol=t, selection="random", random_state=np.random.RandomState(42)),
    param_grid={"alpha": [2 ** x for x in range(-8, 4)] + list(range(12, 65, 4))},
    cv=KFold(n_splits=4),
    n_jobs=-1,
)

In [74]:
# Using the original dataframe
grid_search.fit(X0, Y0)
alpha = grid_search.best_params_["alpha"]
alpha

64

In [75]:
lasso = Lasso(alpha=alpha, tol=t)
metrics[('lasso', 'o')] = k_fold(X0, Y0, model=lasso)

100%|██████████| 10/10 [00:00<00:00, 159.41it/s]


In [76]:
# Using the feature engineered dataframe
grid_search.fit(X, Y)
alpha = grid_search.best_params_["alpha"]
alpha

16

In [77]:
lasso = Lasso(alpha=alpha, tol=t)
metrics[('lasso', 'e')] = k_fold(X, Y, model=lasso)

100%|██████████| 10/10 [00:00<00:00, 245.05it/s]


In [78]:
# Using the dataframe with uncorrelated features excluded
grid_search.fit(X1, Y1)
alpha = grid_search.best_params_["alpha"]
alpha

4

In [79]:
lasso = Lasso(alpha=alpha, tol=t)
metrics[('lasso', 'u')] = k_fold(X1, Y1, model=lasso)

100%|██████████| 10/10 [00:00<00:00, 260.11it/s]


In [80]:
# Using the dataframe with uncorrelated and multicollinear features excluded
grid_search.fit(X2, Y2)
alpha = grid_search.best_params_["alpha"]
alpha

64

In [81]:
lasso = Lasso(alpha=alpha, tol=t)
metrics[('lasso', 'm')] = k_fold(X2, Y2, model=lasso)

100%|██████████| 10/10 [00:00<00:00, 301.57it/s]


### Ridge Regression

In [82]:
# GridSearchCV
grid_search = GridSearchCV(
    estimator=Ridge(),
    param_grid={"alpha": [2 ** x for x in range(-8, 4)] + list(range(12, 65, 4))},
    cv=KFold(n_splits=4),
    n_jobs=-1,
)

In [83]:
# Using the original dataframe
grid_search.fit(X0, Y0)
alpha = grid_search.best_params_["alpha"]
alpha

2

In [84]:
ridge = Ridge(alpha=alpha)
metrics[('ridge', 'o')] = k_fold(X0, Y0, model=ridge)

100%|██████████| 10/10 [00:00<00:00, 167.04it/s]


In [85]:
# Using the feature engineered dataframe
grid_search.fit(X, Y)
alpha = grid_search.best_params_["alpha"]
alpha

4

In [86]:
ridge = Ridge(alpha=alpha)
metrics[('ridge', 'e')] = k_fold(X, Y, model=ridge)

100%|██████████| 10/10 [00:00<00:00, 322.99it/s]


In [87]:
# Using the dataframe with uncorrelated features excluded
grid_search.fit(X1, Y1)
alpha = grid_search.best_params_["alpha"]
alpha

8

In [88]:
ridge = Ridge(alpha=alpha)
metrics[('ridge', 'u')] = k_fold(X1, Y1, model=ridge)

100%|██████████| 10/10 [00:00<00:00, 251.89it/s]


In [89]:
# Using the dataframe with uncorrelated and multicollinear features excluded
grid_search.fit(X2, Y2)
alpha = grid_search.best_params_["alpha"]
alpha

4

In [90]:
ridge = Ridge(alpha=alpha)
metrics[('ridge', 'm')] = k_fold(X2, Y2, model=ridge)

100%|██████████| 10/10 [00:00<00:00, 287.31it/s]


### Random Forest

In [91]:
# Random Forest Classifier
rf = RandomForestRegressor(
    n_estimators=500,
    n_jobs=-1, random_state=np.random.RandomState(42)
)

In [92]:
# Using the original dataframe
metrics[('rf', 'o')] = k_fold(X0, Y0, model=rf)

100%|██████████| 10/10 [00:33<00:00,  3.37s/it]


In [93]:
# Using the feature engineered dataframe
metrics[('rf', 'e')] = k_fold(X, Y, model=rf)

100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


In [94]:
# Using the dataframe with uncorrelated features excluded
metrics[('rf', 'u')] = k_fold(X1, Y1, model=rf)

100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


In [95]:
# Using the dataframe with uncorrelated and multicollinear features excluded
metrics[('rf', 'm')] = k_fold(X2, Y2, model=rf)

100%|██████████| 10/10 [00:21<00:00,  2.18s/it]


### Analyzing the results

Based on the below analysis, it is clear that the feature engineered data performs way better than the original data set which had minimal changes applied. We also see that the best performance is achieved when allowing the model to select its features.    

Manually, removing the features not correlated to the SalePrice drive the model performance close to its best numbers. However, if we further remove the columns which show evidence of multicollinearity, the performance of the model drops

In [96]:
def scores(source, score="r2", ascending=True):
    r = [
        (model, sc[score])
        for (model, data_source), sc in metrics.items()
        if data_source == source
    ]
    return sorted(r, key=lambda x: x[1], reverse=(not ascending))

In [97]:
# Root Mean Squared Error
for label in ['o', 'e', 'u', 'm']:
    print(scores(label, 'rmse'))

[('rf', 29416.0), ('ridge', 33625.0), ('lasso', 36471.0), ('lm', 645682895.0)]
[('ridge', 21950.0), ('lm', 21996.0), ('lasso', 22934.0), ('rf', 24112.0)]
[('ridge', 21988.0), ('lm', 22337.0), ('lasso', 23097.0), ('rf', 24087.0)]
[('ridge', 23440.0), ('lm', 23717.0), ('rf', 24091.0), ('lasso', 24989.0)]


In [98]:
# R2
for label in ['o', 'e', 'u', 'm']:
    print(scores(label, 'r2', ascending=False))

[('rf', 0.859), ('ridge', 0.809), ('lasso', 0.775), ('lm', -56613906.133)]
[('ridge', 0.91), ('lm', 0.909), ('lasso', 0.901), ('rf', 0.892)]
[('ridge', 0.909), ('lm', 0.907), ('lasso', 0.9), ('rf', 0.892)]
[('ridge', 0.897), ('lm', 0.895), ('rf', 0.892), ('lasso', 0.883)]
