## Refetching transformed data

In [1]:
import pandas as pd

train_data = pd.read_csv('./data/aligned_encoded_train_data.csv')
test_data = pd.read_csv('./data/aligned_encoded_test_data.csv')

test_data.drop(columns=['SalePrice'], inplace=True)

train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1,0,0,0,0,3,...,0,0,1,8,856,4,0,2003,2003,2008
1,1262,0,0,3,1,0,0,0,0,3,...,0,0,1,6,1262,4,298,1976,1976,2007
2,920,866,0,3,1,0,0,0,0,3,...,0,0,1,6,920,4,0,2001,2002,2008
3,961,756,0,3,1,0,0,0,0,4,...,0,0,1,7,756,4,0,1915,1970,2006
4,1145,1053,0,4,1,0,0,0,0,3,...,0,0,1,9,1145,4,192,2000,2000,2008


## Feature scaling and normalization

References:

- https://towardsdatascience.com/the-ultimate-guide-to-data-cleaning-3969843991d4#d078
- https://www.codecademy.com/articles/normalization
- https://machinelearningmastery.com/feature-selection-machine-learning-python/
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [2]:
# space (normalize and scale):
#     LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, 
#     GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea,
#     BedroomAbvGr, KitchenAbvGr, LotFrontage (*), LotDepth (*), MasVnrArea, 
#
# quantity (scale):
#     BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces, GarageCars
#
# grade (scale):
#     LotShape, LandSlope, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure,
#     HeatingQC, CentralAir, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond
#     BsmtFinType1, BsmtFinType2, OverallQual, OverallCond, 
#
# time (scale):
#     YearBuilt, YearRemodAdd, YrSold, GarageYrBlt,
#
# currency (normalize and scale):
#     MiscVal, NeighborhoodMeanPrice, GarageTypeMeanPrice
#
# one-hot-encoded-binary (keep):
#     MSSubClass, MSZoning, LandContour, Street, LotConfig, BldgType, HouseStyle, RoofStyle, RoofMatl, MoSold,
#     MasVnrType, Foundation, Heating, Electrical, PavedDrive, Condition1, Condition2, Exterior1st, Exterior2nd

columns_to_normalize = [
    'MiscVal',
    'NeighborhoodMeanPrice',
    'GarageTypeMeanPrice',
    'LotArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'LotFrontage',
    'LotDepth',
    'MasVnrArea'
]

columns_to_scale = [
    'MiscVal',
    'NeighborhoodMeanPrice',
    'GarageTypeMeanPrice',
    'LotArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'LotFrontage',
    'LotDepth',
    'MasVnrArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'LotShape',
    'LandSlope',
    'Utilities',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'BsmtFinType1',
    'BsmtFinType2',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
    'GarageYrBlt'
]

In [3]:
# mean normalization
normalized_train_data = train_data.copy()
normalized_test_data = test_data.copy()

normalized_train_data[columns_to_normalize] = (normalized_train_data[columns_to_normalize] - normalized_train_data[columns_to_normalize].mean()) / normalized_train_data[columns_to_normalize].std()
normalized_test_data[columns_to_normalize] = (normalized_test_data[columns_to_normalize] - normalized_test_data[columns_to_normalize].mean()) / normalized_test_data[columns_to_normalize].std()

normalized_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,-0.793162,1.161454,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,8,-0.459145,4,-0.751918,2003,2003,2008
1,0.257052,-0.794891,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,0.466305,4,1.625638,1976,1976,2007
2,-0.627611,1.188943,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,-0.313261,4,-0.751918,2001,2002,2008
3,-0.521555,0.936955,-0.116299,0.163723,1,0,0,0,0,4,...,-0.270116,0,1,7,-0.687089,4,-0.751918,1915,1970,2006
4,-0.045596,1.617323,-0.116299,1.389547,1,0,0,0,0,3,...,-0.270116,0,1,9,0.199611,4,0.77993,2000,2000,2008


In [4]:
# min max scaling
# normalized_df=(df-df.min())/(df.max()-df.min())
scaled_train_data = normalized_train_data.copy()
scaled_test_data = normalized_test_data.copy()

scaled_train_data[columns_to_scale] = (scaled_train_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())
scaled_test_data[columns_to_scale] = (scaled_test_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())

scaled_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.11978,0.413559,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.5,0.140098,1.0,0.0,0.949275,0.883333,0.5
1,0.212942,0.0,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.206547,1.0,0.347725,0.753623,0.433333,0.25
2,0.134465,0.41937,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.150573,1.0,0.0,0.934783,0.866667,0.5
3,0.143873,0.366102,0.0,0.375,1,0,0,0,0,1.0,...,0.0,0,1,0.416667,0.123732,1.0,0.0,0.311594,0.333333,0.0
4,0.186095,0.509927,0.0,0.5,1,0,0,0,0,0.75,...,0.0,0,1,0.583333,0.187398,1.0,0.224037,0.927536,0.833333,0.5


## Feature Selection

References:

- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://seaborn.pydata.org/generated/seaborn.pairplot.html

### Feature correlation

- Drop highly correlated features

In [5]:
reduced_train_data = scaled_train_data.copy()
reduced_test_data = scaled_test_data.copy()

def run_corr_analysis():
    features = reduced_train_data.drop(columns=['Id', 'SalePrice'])

    corr = features.corr().abs()
    corr[corr == 1] = 0
    corr_cols = corr.max().sort_values(ascending=False)
    return corr_cols[corr_cols > 0.8]

display(run_corr_analysis())
reduced_train_data.drop(columns=['MSSubClass_190'], inplace=True)
reduced_test_data.drop(columns=['MSSubClass_190'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_VinylSd'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_VinylSd'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_CmentBd'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_CmentBd'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_MetalSd'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_MetalSd'], inplace=True)
reduced_train_data.drop(columns=['GarageQual'], inplace=True)
reduced_test_data.drop(columns=['GarageQual'], inplace=True)
reduced_train_data.drop(columns=['MSSubClass_80'], inplace=True)
reduced_test_data.drop(columns=['MSSubClass_80'], inplace=True)
reduced_train_data.drop(columns=['MSSubClass_50'], inplace=True)
reduced_test_data.drop(columns=['MSSubClass_50'], inplace=True)
reduced_train_data.drop(columns=['RoofStyle_Hip'], inplace=True)
reduced_test_data.drop(columns=['RoofStyle_Hip'], inplace=True)
reduced_train_data.drop(columns=['MSSubClass_45'], inplace=True)
reduced_test_data.drop(columns=['MSSubClass_45'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_HdBoard'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_HdBoard'], inplace=True)
reduced_train_data.drop(columns=['GarageArea'], inplace=True)
reduced_test_data.drop(columns=['GarageArea'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_Wd Sdng'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_Wd Sdng'], inplace=True)
reduced_train_data.drop(columns=['PavedDrive_N'], inplace=True)
reduced_test_data.drop(columns=['PavedDrive_N'], inplace=True)
reduced_train_data.drop(columns=['Exterior2nd_AsbShng'], inplace=True)
reduced_test_data.drop(columns=['Exterior2nd_AsbShng'], inplace=True)
reduced_train_data.drop(columns=['RoofMatl_Tar&Grv'], inplace=True)
reduced_test_data.drop(columns=['RoofMatl_Tar&Grv'], inplace=True)
reduced_train_data.drop(columns=['GrLivArea'], inplace=True)
reduced_test_data.drop(columns=['GrLivArea'], inplace=True)
run_corr_analysis()

MSSubClass_190         0.983395
BldgType_2fmCon        0.983395
Exterior2nd_VinylSd    0.977525
Exterior1st_VinylSd    0.977525
Exterior1st_CemntBd    0.974171
Exterior2nd_CmentBd    0.974171
Exterior2nd_MetalSd    0.973065
Exterior1st_MetalSd    0.973065
GarageQual             0.959172
GarageCond             0.959172
MSSubClass_80          0.942259
HouseStyle_SLvl        0.942259
HouseStyle_1.5Fin      0.940871
MSSubClass_50          0.940871
RoofStyle_Gable        0.933462
RoofStyle_Hip          0.933462
HouseStyle_1.5Unf      0.925181
MSSubClass_45          0.925181
Exterior2nd_HdBoard    0.883271
Exterior1st_HdBoard    0.883271
GarageArea             0.882475
GarageCars             0.882475
FireplaceQu            0.863241
Fireplaces             0.863241
Exterior1st_Wd Sdng    0.859244
Exterior2nd_Wd Sdng    0.859244
Electrical_SBrkr       0.857283
Electrical_FuseA       0.857283
PavedDrive_Y           0.856491
PavedDrive_N           0.856491
Exterior1st_AsbShng    0.847917
Exterior

Fireplaces            0.863241
FireplaceQu           0.863241
Electrical_FuseA      0.857283
Electrical_SBrkr      0.857283
1stFlrSF              0.819530
TotalBsmtSF           0.819530
HouseStyle_2Story     0.809150
2ndFlrSF              0.809150
MSZoning_RL           0.808585
MSZoning_RM           0.808585
MasVnrType_BrkFace    0.806337
MasVnrType_None       0.806337
dtype: float64

### Target correlation

- Drop lowly target-correlated features

In [6]:
def run_target_corr_analysis():
    features = reduced_train_data.drop(columns=['Id'])

    corr = features.corr().abs()
    target_corr = corr['SalePrice'].sort_values(ascending=True)
    return target_corr[target_corr < 0.2]

lowly_correlated_feats = run_target_corr_analysis()
display(reduced_train_data.shape)
display(lowly_correlated_feats)

reduced_train_data.drop(columns=lowly_correlated_feats.index, inplace=True)
reduced_test_data.drop(columns=lowly_correlated_feats.index, inplace=True)

display(run_target_corr_analysis())
display(reduced_train_data.shape)

(1460, 185)

RoofMatl_Metal         0.000304
RoofStyle_Mansard      0.000308
BsmtFinType2           0.000651
Foundation_Wood        0.002711
Condition2_RRAe        0.002993
BldgType_TwnhsE        0.003804
LotConfig_Corner       0.004145
MoSold_10              0.004354
Condition1_RRNe        0.004584
Condition1_RRAn        0.005893
MoSold_1               0.005980
LotConfig_FR2          0.006859
RoofMatl_ClyTile       0.006897
MoSold_2               0.007355
MoSold_3               0.008219
BsmtFinSF2             0.011378
Foundation_Stone       0.012103
MoSold_8               0.014185
Utilities              0.014314
MoSold_12              0.014465
RoofMatl_Roll          0.014479
Condition2_RRAn        0.014510
MSSubClass_75          0.015265
MSSubClass_40          0.016366
RoofStyle_Flat         0.016433
Exterior2nd_Stone      0.016754
BsmtHalfBath           0.016844
Exterior1st_Plywood    0.017719
LotConfig_FR3          0.018186
ExterCond              0.018899
                         ...   
Exterior

Series([], Name: SalePrice, dtype: float64)

(1460, 53)

### Feature variance

- Drop features with no variance (exists only in test data)

In [7]:
reduced_var_train_data = reduced_train_data.copy()
reduced_var_test_data = reduced_test_data.copy()

def run_var_analysis():
    features = reduced_var_train_data.drop(columns=['Id', 'SalePrice'])

    var_feats = features.var()
    var_feats_ordered = var_feats.sort_values(ascending=True)
    return var_feats_ordered[var_feats_ordered == 0]

low_variance_feats = run_var_analysis()
display(reduced_var_train_data.shape)
display(low_variance_feats)

reduced_var_train_data.drop(columns=low_variance_feats.index, inplace=True)
reduced_var_test_data.drop(columns=low_variance_feats.index, inplace=True)

display(run_var_analysis())
display(reduced_var_train_data.shape)

(1460, 53)

MSSubClass_150       0.0
Exterior1st_Other    0.0
dtype: float64

Series([], dtype: float64)

(1460, 51)

### Feature importance

- Drop lowly important features

In [8]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=50)

feature_selected_train_data = reduced_var_train_data.copy()
feature_selected_test_data = reduced_var_test_data.copy()

def run_importance_analysis():
    features = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
    
    rfr_importance_analyser = rfr.fit(features, feature_selected_train_data['SalePrice'])
    return features.columns[rfr_importance_analyser.feature_importances_ < 0.005]

low_importance_feats = run_importance_analysis()
display(feature_selected_train_data.shape)
display(low_importance_feats)

feature_selected_train_data.drop(columns=low_importance_feats, inplace=True)
feature_selected_test_data.drop(columns=low_importance_feats, inplace=True)

display(run_importance_analysis())
display(feature_selected_train_data.shape)

(1460, 51)

Index([u'BsmtCond', u'BsmtExposure', u'BsmtFinType1', u'BsmtFullBath',
       u'BsmtQual', u'CentralAir', u'Electrical_SBrkr', u'ExterQual',
       u'Exterior1st_VinylSd', u'Fireplaces', u'Foundation_BrkTil',
       u'Foundation_CBlock', u'Foundation_PConc', u'GarageCond',
       u'GarageFinish', u'GarageTypeMeanPrice', u'HalfBath', u'HeatingQC',
       u'HouseStyle_2Story', u'KitchenQual', u'LotShape', u'MSSubClass_30',
       u'MSSubClass_60', u'MSZoning_RL', u'MSZoning_RM', u'MasVnrType_None',
       u'MasVnrType_Stone', u'PavedDrive_Y', u'RoofStyle_Gable', u'YearBuilt'],
      dtype='object')

Index([], dtype='object')

(1460, 21)

In [9]:
feature_selected_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,BsmtFinSF1,BsmtUnfSF,FireplaceQu,FullBath,GarageCars,GarageYrBlt,Id,LotArea,...,LotFrontage,MasVnrArea,NeighborhoodMeanPrice,OpenPorchSF,OverallQual,SalePrice,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearRemodAdd
0,0.11978,0.413559,0.125089,0.064212,0.0,0.666667,0.5,0.941667,1,0.03342,...,0.045534,0.1225,0.419865,0.111517,0.666667,208500,0.5,0.140098,0.0,0.883333
1,0.212942,0.0,0.173281,0.121575,0.6,0.666667,0.5,0.716667,2,0.038795,...,0.058669,0.0,0.592251,0.0,0.555556,181500,0.333333,0.206547,0.347725,0.433333
2,0.134465,0.41937,0.086109,0.185788,0.6,0.666667,0.5,0.925,3,0.046507,...,0.048161,0.10125,0.419865,0.076782,0.666667,223500,0.333333,0.150573,0.0,0.866667
3,0.143873,0.366102,0.038271,0.231164,0.8,0.333333,0.75,0.9,4,0.038561,...,0.041156,0.0,0.473342,0.063985,0.666667,140000,0.416667,0.123732,0.0,0.333333
4,0.186095,0.509927,0.116052,0.20976,0.6,0.666667,0.75,0.916667,5,0.060576,...,0.062172,0.21875,1.0,0.153565,0.777778,250000,0.583333,0.187398,0.224037,0.833333
