# Features Engineering - 3 - Garage Area per Car

In [268]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn import set_config
set_config(transform_output='pandas')

from category_encoders import OrdinalEncoder

from src.helpers import *

In [269]:
df = pd.read_csv('../data/train.csv')

# ---Data Cleaning---

In [270]:
# Converting integers to string for the OrdinalEncoder

def convert_int_to_string(data, columns):
  for column in columns:
    data[column] = data[column].astype(str)
  return data

In [271]:
columns_to_convert = ['OverallQual', 'OverallCond']

df = convert_int_to_string(df, columns_to_convert)

In [272]:
# OutlierRemover is a custom transformer imported from src/helpers.py
# It can remove outliers from multiple columns by specifying the iqr_multipler

outlier_cols = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','1stFlrSF','GrLivArea',
                'OpenPorchSF']

outlier_remover = OutlierRemover(columns=outlier_cols,iqr_multiplier=100)

outlier_remover.fit(df)
df_proc = outlier_remover.transform(df)

df_proc.count()

Id               1460
MSSubClass       1460
MSZoning         1460
LotFrontage      1201
LotArea          1460
                 ... 
MoSold           1460
YrSold           1460
SaleType         1460
SaleCondition    1460
SalePrice        1460
Length: 81, dtype: int64

# ---Feature Engineering---

In [273]:
# Engineered features to keep

df_proc['RatioBathBed'] = (df_proc['FullBath']+df['HalfBath']) / (df_proc['BedroomAbvGr']+1)

In [274]:
# create a new feature with the ratio of garage area per car
# add 1 to denominator to avoid division by 0

df_proc['SFPerCar'] = (df_proc['GarageArea']) / (df_proc['GarageCars']+1)


In [275]:
# check value counts

df_proc['SFPerCar'].value_counts()

SFPerCar
0.000000      81
192.000000    57
120.000000    48
146.666667    47
176.000000    45
              ..
245.333333     1
115.000000     1
130.500000     1
187.250000     1
203.000000     1
Name: count, Length: 399, dtype: int64

In [276]:
# check distribution

df_proc['SFPerCar'].describe()

count    1460.000000
mean      161.527842
std        54.816337
min         0.000000
25%       141.625000
50%       162.333333
75%       192.000000
max       472.666667
Name: SFPerCar, dtype: float64

# ---Pipeline---

In [277]:
# Defining the list of columns that will go into the pipeline
# Remove columns that should not be included in the pipeline

In [278]:
# add in Engineered feature of SFPerCar

columns_to_keep = ['1stFlrSF','GarageCars','GrLivArea','LotArea','KitchenQual',
                   'TotalBsmtSF','YearBuilt','YearRemodAdd','FireplaceQu','OpenPorchSF','OverallQual','BsmtFinSF1',
                   'YrSold','BsmtQual','Fireplaces','RatioBathBed','SFPerCar']

In [279]:
# Preprocessor to filter unwanted columns, or unexpected columns.

preprocessor_filter = ColumnTransformer([
  ('passthrough','passthrough',columns_to_keep)
],remainder='drop', verbose_feature_names_out=False)

In [280]:
# Preprocessor for imputing all null variables

# cols_impute_na = ['Alley', 'GarageType', 'Fence','BsmtQual', 'BsmtCond', 'BsmtExposure',
#                   'BsmtFinType1','BsmtFinType2', 'FireplaceQu', 'GarageFinish', 'GarageQual','GarageCond',
#                   'PoolQC', 'MiscFeature']
# cols_impute_none = ['MasVnrType']
# cols_impute_most_freq = ['Electrical']
# cols_impute_zero = ['LotFrontage', 'MasVnrArea']

cols_impute_na = ['BsmtQual']
# cols_impute_none = ['MasVnrType']
# cols_impute_most_freq = ['Electrical']
# cols_impute_zero = ['LotFrontage', 'MasVnrArea']

preprocessor_imputer = ColumnTransformer([
  ('impute_na', SimpleImputer(strategy='constant', fill_value='NA'), cols_impute_na),
  # ('impute_none', SimpleImputer(strategy='constant', fill_value='None'), cols_impute_none),
  # ('impute_zero', SimpleImputer(strategy='constant', fill_value=0), cols_impute_zero),
  # ('impute_most_freq', SimpleImputer(strategy='most_frequent'), cols_impute_most_freq)
], remainder='passthrough', verbose_feature_names_out=False)

In [281]:
# Defining maps for Ordinal categories

dict_na_ex_6 = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
dict_na_gd_5 = {'NA':0,'No':1,'Mn':2,'Av':3,'Gd':4}
dict_bsmt = {'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
dict_garage = {'NA':0,'Unf':1,'RFn':2,'Fin':3}
dict_1_10 = {1:0, 2:1, 3:2, 4:3, 5:4, 6:5, 7:6, 8:7, 9:8, 10:9}
dict_fence = {'NA':0, 'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}


ordinal_cat_map = [
  {'col':'BsmtQual','mapping':dict_na_ex_6},
  {'col':'BsmtCond','mapping':dict_na_ex_6},
  {'col':'BsmtExposure','mapping':dict_na_gd_5},
  {'col':'BsmtFinType1','mapping':dict_bsmt},
  {'col':'BsmtFinType2','mapping':dict_bsmt},
  {'col':'FireplaceQu','mapping':dict_na_ex_6},
  {'col':'GarageFinish','mapping':dict_garage},
  {'col':'GarageQual','mapping':dict_na_ex_6},
  {'col':'GarageCond','mapping':dict_na_ex_6},
  {'col':'PoolQC','mapping':dict_na_ex_6},
  {'col': 'OverallQual', 'mapping':dict_1_10},
  {'col': 'OverallCond', 'mapping':dict_1_10},
  {'col': 'ExterQual', 'mapping': dict_na_ex_6},
  {'col': 'ExterCond', 'mapping': dict_na_ex_6},
  {'col': 'HeatingQC', 'mapping': dict_na_ex_6},
  {'col': 'KitchenQual', 'mapping': dict_na_ex_6},
  {'col': 'Fence', 'mapping': dict_fence}
  ]

In [282]:
# Preprocessor for encoding and scaling

# ohe_cols = ['MSSubClass','MSZoning','Alley','LotShape','LandContour','LotConfig','LandSlope',
#             'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st',
#             'Exterior2nd','MasVnrType','Foundation','Heating', 'CentralAir','Electrical','Functional',
#             'GarageType','PavedDrive','MiscFeature','MoSold','YrSold','SaleType','SaleCondition']

# oe_cols = ['OverallQual','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure',
#            'BsmtFinType1','BsmtFinType2','HeatingQC','FireplaceQu', 'GarageFinish','GarageQual', 'GarageCond',
#            'PoolQC','KitchenQual','Fence']

# robust_cols = ['LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
#                'GrLivArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','ScreenPorch','PoolArea','MiscVal']


oe_cols = ['BsmtQual','KitchenQual','FireplaceQu','OverallQual']

robust_cols = ['BsmtFinSF1','OpenPorchSF','YearBuilt','GrLivArea','1stFlrSF']


preprocessor_encode_scale = ColumnTransformer([
  # ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ohe_cols),
  ('oe', OrdinalEncoder(), oe_cols),
  # ('standard_scaler', StandardScaler(), ['']),
  ('robust_scaler', RobustScaler(), robust_cols),
  ('minmax_scaler', MinMaxScaler(), ['YearRemodAdd'])
], remainder='passthrough', verbose_feature_names_out=False)

In [283]:
X = df_proc.drop(columns='SalePrice')
y = df_proc['SalePrice'].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [284]:
main_pipe = Pipeline([
  ('preprocessor_filter', preprocessor_filter),
  ('preprocessor_imputer', preprocessor_imputer),
  ('preprocessor_encode_scale', preprocessor_encode_scale)
])

In [285]:
from sklearn.ensemble import RandomForestRegressor

rf_pipe = Pipeline([('main_pipe', main_pipe),('rf_model',RandomForestRegressor())])

In [286]:
rf_pipe.get_params()

{'memory': None,
 'steps': [('main_pipe',
   Pipeline(steps=[('preprocessor_filter',
                    ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                                     ['1stFlrSF', 'GarageCars',
                                                      'GrLivArea', 'LotArea',
                                                      'KitchenQual', 'TotalBsmtSF',
                                                      'YearBuilt', 'YearRemodAdd',
                                                      'FireplaceQu', 'OpenPorchSF',
                                                      'OverallQual', 'BsmtFinSF1',
                                                      'YrSold', 'BsmtQual',
                                                      'Fireplaces', 'RatioBathBed',
                                                      'SFPerCar'])],
                                      verbose_feature_names_out=False...
                                      

In [287]:
# GridSearch

from sklearn.model_selection import GridSearchCV

params = {'rf_model__n_estimators': [200],
    'rf_model__max_depth': [None],
    'rf_model__min_samples_split': [5],
    'rf_model__min_samples_leaf': [1]
    }

rf_gs = GridSearchCV(rf_pipe, params, cv=5, n_jobs=-1)


In [288]:
rf_gs.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [289]:
rf_gs.best_params_

{'rf_model__max_depth': None,
 'rf_model__min_samples_leaf': 1,
 'rf_model__min_samples_split': 5,
 'rf_model__n_estimators': 200}

In [290]:
rf_gs.best_score_

np.float64(0.8341378167302581)

In [291]:
pd.DataFrame(rf_gs.cv_results_).sort_values(by='rank_test_score').head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf_model__max_depth,param_rf_model__min_samples_leaf,param_rf_model__min_samples_split,param_rf_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.097204,0.11809,0.012682,0.000558,,1,5,200,"{'rf_model__max_depth': None, 'rf_model__min_s...",0.872185,0.729568,0.81405,0.868922,0.885965,0.834138,0.057761,1


In [292]:
grid_results_train = evaluate_regression(rf_gs, X_train, y_train, 'rf_gs')
grid_results_val = evaluate_regression(rf_gs, X_val, y_val, 'rf_gs')

In [293]:
grid_results_train

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2,adj_r2
rf_gs,7639.114553,174816400.0,13221.814099,0.045796,0.970691,0.968476


In [294]:
grid_results_val

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2,adj_r2
rf_gs,17260.087223,815816300.0,28562.49861,0.108779,0.89364,0.85191


In [295]:
val_preds = rf_gs.predict(X_val)
pred_log = np.log(val_preds)
y_log = np.log(y_val)
root_mean_squared_error(y_log, pred_log)

0.158015873864694