## Refetching transformed data

In [1]:
import pandas as pd

# Locations of the train / test CSV files read by this notebook.
train_csv_path = './data/aligned_encoded_train_data.csv'
test_csv_path = './data/aligned_encoded_test_data.csv'

# NOTE(review): the displayed head contains an 'Unnamed: 0' column, presumably
# an index that was written when the CSV was saved -- confirm, and consider
# reading with index_col=0 if it should not be treated as a feature.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1,0,0,0,0,3,...,0,0,1,8,856,4,0,2003,2003,2008
1,1262,0,0,3,1,0,0,0,0,3,...,0,0,1,6,1262,4,298,1976,1976,2007
2,920,866,0,3,1,0,0,0,0,3,...,0,0,1,6,920,4,0,2001,2002,2008
3,961,756,0,3,1,0,0,0,0,4,...,0,0,1,7,756,4,0,1915,1970,2006
4,1145,1053,0,4,1,0,0,0,0,3,...,0,0,1,9,1145,4,192,2000,2000,2008


## Feature scaling and normalization

References:

- https://towardsdatascience.com/the-ultimate-guide-to-data-cleaning-3969843991d4#d078
- https://www.codecademy.com/articles/normalization

In [2]:
# Column groups, by the kind of quantity they hold, and the treatment each
# group receives in the cells below.
#
# one-hot-encoded-binary (kept as is, therefore in neither list):
#     MSSubClass, MSZoning, LandContour, Street, LotConfig, BldgType, HouseStyle, RoofStyle, RoofMatl, MoSold,
#     MasVnrType, Foundation, Heating, Electrical, PavedDrive, Condition1, Condition2, Exterior1st, Exterior2nd

# currency -> normalize and scale
currency_columns = ['MiscVal', 'NeighborhoodMeanPrice', 'GarageTypeMeanPrice']

# space -> normalize and scale
# (LotFrontage and LotDepth carry a "(*)" marker in the author's notes; its
#  meaning is not stated in this notebook.)
space_columns = [
    'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'BedroomAbvGr', 'KitchenAbvGr', 'LotFrontage', 'LotDepth', 'MasVnrArea',
]

# quantity -> scale only
quantity_columns = [
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
]

# grade -> scale only
grade_columns = [
    'LotShape', 'LandSlope', 'Utilities', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'CentralAir',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'GarageCond', 'BsmtFinType1', 'BsmtFinType2', 'OverallQual', 'OverallCond',
]

# time -> scale only
time_columns = ['YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt']

# Columns that are normalized first (currency, then space).
columns_to_normalize = currency_columns + space_columns

# Columns that are min-max scaled: every normalized column plus the
# quantity, grade and time groups, in that order.
columns_to_scale = columns_to_normalize + quantity_columns + grade_columns + time_columns

In [3]:
# Normalization of columns_to_normalize: (x - mean) / std.
# (Dividing by the standard deviation makes this a z-score standardization.)
#
# The mean and standard deviation are computed on the TRAINING data only and
# then reused for the test data, so both sets go through the same
# transformation and no test-set statistics are used.
normalized_train_data = train_data.copy()
# Fix: this used to be train_data.copy(), which silently discarded the test set.
normalized_test_data = test_data.copy()

train_mean = train_data[columns_to_normalize].mean()
train_std = train_data[columns_to_normalize].std()

normalized_train_data[columns_to_normalize] = (train_data[columns_to_normalize] - train_mean) / train_std
normalized_test_data[columns_to_normalize] = (test_data[columns_to_normalize] - train_mean) / train_std

normalized_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,-0.793162,1.161454,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,8,-0.459145,4,-0.751918,2003,2003,2008
1,0.257052,-0.794891,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,0.466305,4,1.625638,1976,1976,2007
2,-0.627611,1.188943,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,-0.313261,4,-0.751918,2001,2002,2008
3,-0.521555,0.936955,-0.116299,0.163723,1,0,0,0,0,4,...,-0.270116,0,1,7,-0.687089,4,-0.751918,1915,1970,2006
4,-0.045596,1.617323,-0.116299,1.389547,1,0,0,0,0,3,...,-0.270116,0,1,9,0.199611,4,0.77993,2000,2000,2008


In [4]:
# Min-max scaling of columns_to_scale: (x - min) / (max - min).
scaled_train_data = normalized_train_data.copy()
scaled_test_data = normalized_test_data.copy()

# Capture the training minimum and range BEFORE anything is rescaled.
# Fix: the test set used to be scaled with statistics read from
# scaled_train_data after it had already been rescaled (min 0, max 1),
# which left the test columns unchanged.
train_min = normalized_train_data[columns_to_scale].min()
train_range = normalized_train_data[columns_to_scale].max() - train_min

scaled_train_data[columns_to_scale] = (normalized_train_data[columns_to_scale] - train_min) / train_range
# The test set reuses the training statistics, so its values can fall
# outside [0, 1] when a test value lies beyond the training range.
scaled_test_data[columns_to_scale] = (normalized_test_data[columns_to_scale] - train_min) / train_range

scaled_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.11978,0.413559,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.5,0.140098,1.0,0.0,0.949275,0.883333,0.5
1,0.212942,0.0,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.206547,1.0,0.347725,0.753623,0.433333,0.25
2,0.134465,0.41937,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.150573,1.0,0.0,0.934783,0.866667,0.5
3,0.143873,0.366102,0.0,0.375,1,0,0,0,0,1.0,...,0.0,0,1,0.416667,0.123732,1.0,0.0,0.311594,0.333333,0.0
4,0.186095,0.509927,0.0,0.5,1,0,0,0,0,0.75,...,0.0,0,1,0.583333,0.187398,1.0,0.224037,0.927536,0.833333,0.5


## Feature Selection

References:

- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://seaborn.pydata.org/generated/seaborn.pairplot.html

In [None]:
# NOTE: every line of this cell is commented out, so nothing here is executed.
# It is a draft of the feature-selection step.
# NOTE(review): the draft reads 'SalePrice', 'Id', 'MSSubClass', 'LotFrontage'
# and 'LotArea' from train_data -- confirm these columns exist in the loaded
# CSV before enabling it.

# target = train_data['SalePrice']
# features = train_data.drop(columns=['Id'])

# for feature in features.columns:
#     if (features[feature].dtype == 'int64' or features[feature].dtype == 'float64'):
#         print(feature)

# import seaborn as sns
# sns.pairplot(features[['MSSubClass', 'LotFrontage', 'LotArea']].fillna(0))

# Remaining steps (not implemented yet):
# analyse correlation
# analyse importance
# aggregation and feature selection