# Preprocessing

In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn import set_config
set_config(transform_output='pandas')

from category_encoders import OrdinalEncoder


In [140]:
# Load the raw training data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

##### Imputing nulls with constant

In [141]:
  # LotFrontage - 0
  # MasVnrArea - 0

  # MasVnrType -	None

  # Electrical - most frequent value (categorical column, so a median is undefined)

  # Alley	- NA
  # BsmtQual - NA
  # BsmtCond - NA
  # BsmtExposure - NA
  # BsmtFinType1 - NA
  # BsmtFinType2 - NA
  # FireplaceQu	- NA
  # GarageType	- NA
  # GarageYrBlt	- NA
  # GarageFinish - NA
  # GarageQual - NA
  # GarageCond - NA
  # PoolQC - NA
  # Fence - NA
  # MiscFeature	- NA

# Separating Categorical vs Numeric

### Impute

In [164]:
# Columns grouped by the strategy used to fill their missing values.
# The nominal and ordinal 'NA' groups each get their own name: both used to be
# assigned to `impute_na`, so the nominal list was silently discarded by the
# second assignment.

### Categorical Nominal
impute_na_nominal = ['Alley', 'GarageType', 'GarageYrBlt', 'Fence']
impute_none = ['MasVnrType']
# 'Electrical' is categorical, so it is filled with its most frequent value.
impute_most = ['Electrical']
# Legacy name (a median is undefined for a categorical column); kept so that
# existing references keep working.
impute_median = list(impute_most)

### Categorical Ordinal
# NOTE(review): 'MiscFeature' has no entry in imputed_cat_map and is listed as
# nominal in nom_na_features -- confirm whether it belongs in this group.
impute_na_ordinal = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'MiscFeature']
# Backward compatibility: `impute_na` keeps the value it ended up with before
# (the ordinal list).
impute_na = list(impute_na_ordinal)

### Numeric
impute_zero = ['LotFrontage', 'MasVnrArea']

In [143]:
# Constant-fill imputers; only the fill value differs between them.
imputer_zero = SimpleImputer(fill_value=0, strategy='constant')
imputer_none = SimpleImputer(fill_value='None', strategy='constant')
imputer_na = SimpleImputer(fill_value='NA', strategy='constant')

# Fills each column with its most frequent value.
imputer_most = SimpleImputer(strategy='most_frequent')

In [144]:
# Scaler instances that the numeric pipelines are built from.
scaler_standard, scaler_robust = StandardScaler(), RobustScaler()

In [145]:
# One-hot encoder used by every nominal pipeline: dense output, a single column
# for two-category features, and categories unseen during fit are ignored.
ohe = OneHotEncoder(
    handle_unknown='ignore',
    drop='if_binary',
    sparse_output=False,
)

### Encode

In [146]:
# Ordered category levels -> integer rank, starting at 0 for the imputed 'NA' level.
dict_na_ex_6 = {level: rank for rank, level in enumerate(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}
dict_na_gd_5 = {level: rank for rank, level in enumerate(['NA', 'No', 'Mn', 'Av', 'Gd'])}
dict_bsmt = {level: rank for rank, level in enumerate(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])}
dict_garage = {level: rank for rank, level in enumerate(['NA', 'Unf', 'RFn', 'Fin'])}


# One {'col', 'mapping'} entry per ordinal column, in the list-of-dicts layout
# passed to OrdinalEncoder(mapping=...). Columns on the same scale share one
# mapping dict object.
imputed_cat_map = [
  {'col': column, 'mapping': scale}
  for column, scale in [
    ('BsmtQual', dict_na_ex_6),
    ('BsmtCond', dict_na_ex_6),
    ('BsmtExposure', dict_na_gd_5),
    ('BsmtFinType1', dict_bsmt),
    ('BsmtFinType2', dict_bsmt),
    ('FireplaceQu', dict_na_ex_6),
    ('GarageFinish', dict_garage),
    ('GarageQual', dict_na_ex_6),
    ('GarageCond', dict_na_ex_6),
    ('PoolQC', dict_na_ex_6),
  ]
]


In [147]:
# Columns grouped by the strategy used to fill their missing values.
# The nominal and ordinal 'NA' groups each get their own name: both used to be
# assigned to `impute_na`, so the nominal list was silently discarded by the
# second assignment.

### Categorical Nominal
# 'PoolQC' is no longer listed here: it has an ordinal mapping in
# imputed_cat_map and was duplicated in both groups.
impute_na_nominal = ['Alley', 'GarageType', 'GarageYrBlt', 'Fence', 'MiscFeature']
impute_none = ['MasVnrType']
impute_most = ['Electrical']

### Categorical Ordinal
impute_na_ordinal = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                     'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC']
# Backward compatibility: `impute_na` keeps the value it ended up with before
# (the ordinal list).
impute_na = list(impute_na_ordinal)

### Numeric
impute_zero = ['LotFrontage', 'MasVnrArea']


### Pipes that need imputing

In [148]:
# Nominal columns: fill missing values with 'NA', then one-hot encode.
nom_na_pipe = Pipeline(steps=[
    ('imputer_na', imputer_na),
    ('ohe', ohe),
])

# Nominal columns: fill missing values with 'None', then one-hot encode.
nom_none_pipe = Pipeline(steps=[
    ('imputer_none', imputer_none),
    ('ohe', ohe),
])

# Nominal columns: fill missing values with the most frequent value, then one-hot encode.
nom_most_pipe = Pipeline(steps=[
    ('imputer_most', imputer_most),
    ('ohe', ohe),
])

# Ordinal columns: fill missing values with 'NA', then map levels to integers
# using imputed_cat_map.
ord_na_pipe = Pipeline(steps=[
    ('imputer_na', imputer_na),
    ('ord', OrdinalEncoder(mapping=imputed_cat_map)),
])

# Numeric columns: fill missing values with 0, then apply RobustScaler.
num_robust_pipe = Pipeline(steps=[
    ('imputer_zero', imputer_zero),
    ('scaler', scaler_robust),
])

In [149]:
# Column groups fed to the imputing pipelines (one list per pipeline).
#
# 'GarageYrBlt' is a year, so it is routed to the numeric pipeline instead of
# the 'NA' + one-hot pipeline. Filling a numeric column with the string 'NA'
# leaves a mix of numbers and strings, which OneHotEncoder refuses to encode.
# NOTE(review): assumes GarageYrBlt is read as a numeric dtype from train.csv --
# confirm. Filling a year with 0 is crude; a dedicated strategy (e.g. a
# "has garage" flag) may work better.
nom_na_features = ['Alley', 'GarageType', 'Fence', 'MiscFeature']
nom_none_features = ['MasVnrType']
nom_most_features = ['Electrical']
ord_na_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC']
num_robust_features = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

### Pipes that don't need imputing

In [150]:
# Single-step pipelines for the columns that need no imputing.
nominal_pipe = Pipeline(steps=[('ohe', ohe)])
numeric_ss_pipe = Pipeline(steps=[('standard_scaler', scaler_standard)])
numeric_rs_pipe = Pipeline(steps=[('robust_scaler', scaler_robust)])
# TODO: ordinal_pipe is not defined yet.


In [151]:
# Columns that need no imputing, grouped by how they are meant to be encoded.
nominal_features = ['CentralAir','MSSubClass','MSZoning','Street','LotShape','LandContour',
                    'Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2',
                    'BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd',
                    'Foundation','Heating','KitchenQual',
                    'PavedDrive','MoSold','SaleType','SaleCondition']
ordinal_features = ['OverallQual','OverallCond','ExterQual','ExterCond','HeatingQC','Functional']

# No columns are assigned to the plain numeric scalers yet.
numeric_ss_features = []
numeric_rs_features = []

# Temporary: there is no ordinal pipeline yet, so the ordinal columns are sent
# through the nominal (one-hot) pipeline with the nominal ones. Derived from the
# two lists above instead of a hand-typed copy, so the lists cannot drift apart.
temp_all_cat = nominal_features + ordinal_features

In [152]:
# Full preprocessing step: each entry is (name, pipeline, columns it applies to).
preprocessor = ColumnTransformer(transformers=[
    # --- columns with missing values: impute, then encode/scale ---
    ('nom_na_pipe', nom_na_pipe, nom_na_features),
    ('nom_none_pipe', nom_none_pipe, nom_none_features),
    ('nom_most_pipe', nom_most_pipe, nom_most_features),
    ('ord_na_pipe', ord_na_pipe, ord_na_features),
    ('num_zero_robust_pipe', num_robust_pipe, num_robust_features),

    # --- columns without missing values: encode/scale only ---
    ('nominal_pipe', nominal_pipe, temp_all_cat),
    # ('ordinal_pipe', ordinal_pipe, ordinal_features),  # not wired in yet
    ('numeric_ss_pipe', numeric_ss_pipe, numeric_ss_features),
    ('numeric_rs_pipe', numeric_rs_pipe, numeric_rs_features),
])

In [158]:
# Reduced variant of the preprocessor with only the 'NA'-imputed nominal branch
# enabled (presumably for isolating a problem one branch at a time -- confirm).
# The other branches are left commented out so they can be switched back on.
preprocessor2 = ColumnTransformer(transformers=[
    # --- columns with missing values ---
    ('nom_na_pipe', nom_na_pipe, nom_na_features),
    # ('nom_none_pipe', nom_none_pipe, nom_none_features),
    # ('nom_most_pipe', nom_most_pipe, nom_most_features),
    # ('ord_na_pipe', ord_na_pipe, ord_na_features),
    # ('num_zero_robust_pipe', num_robust_pipe, num_robust_features),

    # --- columns without missing values ---
    # ('nominal_pipe', nominal_pipe, temp_all_cat),
    # ('ordinal_pipe', ordinal_pipe, ordinal_features),
    # ('numeric_ss_pipe', numeric_ss_pipe, numeric_ss_features),
    # ('numeric_rs_pipe', numeric_rs_pipe, numeric_rs_features),
])

# Train-val split

In [153]:
# Separate the target column from the predictors.
y = df.loc[:, 'SalePrice'].copy()
X = df.drop('SalePrice', axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [156]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2009,WD,Normal
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2008,WD,Normal
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,6,2007,WD,Normal
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal


In [None]:
# preprocessor.fit(X_train)

# X_train_proc = preprocessor.transform(X_train)
# X_val_proc = preprocessor.transform(X_val)