In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
# Load the pickled training features and target from the output/ directory.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
house_variables = joblib.load('output/x_train.pkl')
house_target = joblib.load('output/y_train.pkl')

In [3]:
house_variables.head().T

Id,1332,814,1262,269,1027
MSSubClass,80,20,20,30,20
MSZoning,RL,RL,RL,RM,RL
LotFrontage,55.0,75.0,80.0,71.0,73.0
LotArea,10780,9750,9600,6900,9300
Street,Pave,Pave,Pave,Pave,Pave
...,...,...,...,...,...
MiscVal,0,500,0,0,0
MoSold,7,4,6,2,4
YrSold,2006,2007,2009,2008,2010
SaleType,WD,COD,WD,WD,WD


In [4]:
house_variables.isnull().sum().sort_values(ascending=False) / len(house_variables)

PoolQC           0.994647
MiscFeature      0.960385
Alley            0.932548
Fence            0.807281
FireplaceQu      0.466809
                   ...   
Heating          0.000000
MSZoning         0.000000
CentralAir       0.000000
Electrical       0.000000
SaleCondition    0.000000
Length: 79, dtype: float64

In [5]:
# Columns routed through the numerical branch (selected with house_variables[NUM]).
NUM = ["MSSubClass", "LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea", "BsmtFinSF2",
       "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GrLivArea", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
      "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "MoSold", "YrSold"]

# Columns listed as categorical.
# NOTE(review): 'BsmtFinSF1' and 'LowQualFinSF' look like continuous square-footage
# measures by name, yet they are listed here rather than in NUM - confirm this is intentional.
CAT = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'LowQualFinSF', 'BsmtFullBath',
       'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

In [6]:
# Select both branches by their explicit column lists. Using the CAT list (instead of
# "everything that is not in NUM") keeps this cell consistent with run(), which selects
# by name lists as well, and raises a KeyError if an expected column is missing rather
# than silently one-hot encoding an unexpected extra column.
house_numerical = house_variables[NUM]
house_categorical = house_variables[CAT]

In [7]:
house_numerical

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,80,55.0,10780,1976,1976,0.0,0,428,911,954,...,576,0,0,0,0,0,0,0,7,2006
814,20,75.0,9750,1958,1958,243.0,0,834,1442,1442,...,301,0,0,275,0,0,0,500,4,2007
1262,20,80.0,9600,1956,1956,0.0,0,546,1050,1050,...,338,0,0,0,0,0,0,0,6,2009
269,30,71.0,6900,1940,1955,0.0,125,212,740,778,...,924,0,25,0,0,0,0,0,2,2008
1027,20,73.0,9300,1960,1960,324.0,0,571,1268,1264,...,461,0,0,0,0,143,0,0,4,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,50,79.0,9490,1941,1950,0.0,165,238,806,958,...,240,0,0,32,0,0,0,0,8,2006
421,90,78.0,7060,1997,1998,200.0,0,35,1344,1344,...,784,0,0,0,0,0,0,0,11,2008
410,60,85.0,10800,2007,2008,100.0,0,245,1034,1050,...,836,0,102,0,0,0,0,0,4,2008
481,20,98.0,16033,2004,2005,378.0,0,572,1833,1850,...,772,519,112,0,0,0,0,0,3,2006


In [8]:
house_categorical

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Normal
814,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,RFn,TA,TA,Y,,,Shed,COD,Normal
1262,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
269,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,...,Detchd,Fin,Ex,Ex,Y,,,,WD,Normal
1027,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Artery,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
421,RM,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Alloca
410,FV,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,...,Attchd,Fin,TA,TA,Y,,,,New,Partial
481,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NridgHt,Norm,...,Attchd,Fin,TA,TA,Y,,,,WD,Normal


In [9]:
house_numerical.isnull().sum().sort_values(ascending=False) / len(house_numerical)

LotFrontage      0.158458
GarageYrBlt      0.061028
MasVnrArea       0.003212
MSSubClass       0.000000
TotRmsAbvGrd     0.000000
MoSold           0.000000
MiscVal          0.000000
PoolArea         0.000000
ScreenPorch      0.000000
3SsnPorch        0.000000
EnclosedPorch    0.000000
OpenPorchSF      0.000000
WoodDeckSF       0.000000
GarageArea       0.000000
GarageCars       0.000000
Fireplaces       0.000000
KitchenAbvGr     0.000000
BedroomAbvGr     0.000000
HalfBath         0.000000
FullBath         0.000000
GrLivArea        0.000000
2ndFlrSF         0.000000
1stFlrSF         0.000000
TotalBsmtSF      0.000000
BsmtUnfSF        0.000000
BsmtFinSF2       0.000000
YearRemodAdd     0.000000
YearBuilt        0.000000
LotArea          0.000000
YrSold           0.000000
dtype: float64

In [10]:
house_categorical.isnull().sum().sort_values(ascending=False) / len(house_categorical)

PoolQC           0.994647
MiscFeature      0.960385
Alley            0.932548
Fence            0.807281
FireplaceQu      0.466809
GarageCond       0.061028
GarageQual       0.061028
GarageFinish     0.061028
GarageType       0.061028
BsmtExposure     0.021413
BsmtCond         0.020343
BsmtQual         0.020343
BsmtFinType2     0.020343
BsmtFinType1     0.020343
MasVnrType       0.003212
KitchenQual      0.000000
BsmtHalfBath     0.000000
BsmtFullBath     0.000000
LowQualFinSF     0.000000
Electrical       0.000000
CentralAir       0.000000
HeatingQC        0.000000
Heating          0.000000
BsmtFinSF1       0.000000
PavedDrive       0.000000
SaleType         0.000000
Functional       0.000000
MSZoning         0.000000
Street           0.000000
BldgType         0.000000
LotShape         0.000000
LandContour      0.000000
Utilities        0.000000
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
HouseStyle  

## IMPUTING DATA

In [11]:
def numerical_imputer(numerical,
                    state = 'transform'):
    """Fill missing numerical values with per-column means.

    Parameters
    ----------
    numerical : pandas.DataFrame
        Frame holding the numerical columns to impute.
    state : {'fit', 'transform'}, default 'transform'
        'fit' fits a new SimpleImputer (strategy="mean") on ``numerical`` and
        saves it to "output/numerical_imputer.pkl"; 'transform' loads the
        imputer previously saved at that path.

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the index and column labels of ``numerical``.

    Raises
    ------
    ValueError
        If ``state`` is neither 'fit' nor 'transform'.
    """
    # Reject unknown states up front; otherwise `imputer` would never be bound
    # and the failure would surface later as a confusing UnboundLocalError.
    if state not in ('fit', 'transform'):
        raise ValueError(
            "state must be 'fit' or 'transform', got {!r}".format(state))

    if state == 'fit':
        imputer = SimpleImputer(
            missing_values=np.nan,
            strategy="mean")

        imputer.fit(numerical)
        joblib.dump(imputer,
                    "output/numerical_imputer.pkl")
    else:
        imputer = joblib.load("output/numerical_imputer.pkl")

    # transform() returns a bare array, so re-attach the original labels.
    return pd.DataFrame(imputer.transform(numerical),
                        index=numerical.index,
                        columns=numerical.columns)

In [12]:
def categorical_imputer(df_categorical, fill_value='KOSONG'):
    """Replace missing values in a categorical frame with a sentinel label.

    Parameters
    ----------
    df_categorical : pandas.DataFrame
        Frame whose missing entries should be filled. It is not modified.
    fill_value : str, default 'KOSONG'
        Label written in place of every missing entry. The default keeps the
        sentinel this function has always used.

    Returns
    -------
    pandas.DataFrame
        A new frame with missing entries replaced by ``fill_value``.
    """
    # fillna without inplace returns a new frame, so no explicit copy is needed.
    return df_categorical.fillna(value=fill_value)

In [13]:
# Fit the mean imputer on the training numerics (state='fit' also writes
# output/numerical_imputer.pkl), then replace missing categorical values
# with a sentinel string.
df_numerical_imputed = numerical_imputer(house_numerical, state='fit')
df_categorical_imputed = categorical_imputer(house_categorical)

In [14]:
df_categorical_imputed.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,RL,Pave,KOSONG,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Detchd,Unf,TA,TA,Y,KOSONG,KOSONG,KOSONG,WD,Normal
814,RL,Pave,KOSONG,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,RFn,TA,TA,Y,KOSONG,KOSONG,Shed,COD,Normal
1262,RL,Pave,KOSONG,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,KOSONG,KOSONG,KOSONG,WD,Normal
269,RM,Pave,KOSONG,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,...,Detchd,Fin,Ex,Ex,Y,KOSONG,KOSONG,KOSONG,WD,Normal
1027,RL,Pave,KOSONG,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,...,Attchd,Unf,TA,TA,Y,KOSONG,KOSONG,KOSONG,WD,Normal


## One-Hot Encoding

In [15]:
def one_hot_encoder(x_cat,
                    state='fit'):
    """One-hot encode a categorical frame with a fitted or persisted encoder.

    Parameters
    ----------
    x_cat : pandas.DataFrame
        Frame holding the categorical columns to encode.
    state : {'fit', 'transform'}, default 'fit'
        'fit' fits a new dense OneHotEncoder (handle_unknown='ignore') on
        ``x_cat`` and saves it to "output/onehotencoder.pkl"; 'transform'
        loads the encoder previously saved at that path.

    Returns
    -------
    pandas.DataFrame
        Encoded columns, indexed like ``x_cat`` and named via
        ``encoder.get_feature_names_out``.

    Raises
    ------
    ValueError
        If ``state`` is neither 'fit' nor 'transform'.
    """
    # Reject unknown states up front; otherwise `encoder` would never be bound
    # and the failure would surface later as a confusing UnboundLocalError.
    if state not in ('fit', 'transform'):
        raise ValueError(
            "state must be 'fit' or 'transform', got {!r}".format(state))

    if state == 'fit':
        # scikit-learn renamed `sparse` to `sparse_output` (added in 1.2, old
        # name removed in 1.4). Try the new keyword first and fall back to the
        # old one so the notebook runs on both old and new releases.
        try:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(x_cat)
        joblib.dump(encoder,
                    "output/onehotencoder.pkl")
    else:
        encoder = joblib.load("output/onehotencoder.pkl")

    encoded = encoder.transform(x_cat)
    feat_names = encoder.get_feature_names_out(x_cat.columns)
    # transform() returns a bare array, so attach the row index and the
    # generated feature names.
    return pd.DataFrame(encoded, index=x_cat.index, columns=feat_names)

In [16]:
# No state argument is passed, so one_hot_encoder uses its default state='fit':
# the encoder is fitted on the imputed training categoricals and saved to
# output/onehotencoder.pkl.
df_categorical_encoded = one_hot_encoder(df_categorical_imputed)

In [17]:
df_categorical_encoded.head()

Unnamed: 0_level_0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_KOSONG,Alley_Pave,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
814,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1262,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
269,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1027,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Join Data

In [18]:
df_joined = pd.concat([df_numerical_imputed, df_categorical_encoded], axis=1)

In [19]:
import sklearn
sklearn.__version__

'1.0.1'

In [20]:
df_joined

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,80.0,55.0,10780.0,1976.0,1976.0,0.0,0.0,428.0,911.0,954.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
814,20.0,75.0,9750.0,1958.0,1958.0,243.0,0.0,834.0,1442.0,1442.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1262,20.0,80.0,9600.0,1956.0,1956.0,0.0,0.0,546.0,1050.0,1050.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
269,30.0,71.0,6900.0,1940.0,1955.0,0.0,125.0,212.0,740.0,778.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1027,20.0,73.0,9300.0,1960.0,1960.0,324.0,0.0,571.0,1268.0,1264.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,50.0,79.0,9490.0,1941.0,1950.0,0.0,165.0,238.0,806.0,958.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
421,90.0,78.0,7060.0,1997.0,1998.0,200.0,0.0,35.0,1344.0,1344.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
410,60.0,85.0,10800.0,2007.0,2008.0,100.0,0.0,245.0,1034.0,1050.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
481,20.0,98.0,16033.0,2004.0,2005.0,378.0,0.0,572.0,1833.0,1850.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Standardize All Features

In [21]:
def normalization(x_all,
                  state = 'fit'):
    """Scale every column of a frame with a fitted or persisted StandardScaler.

    Parameters
    ----------
    x_all : pandas.DataFrame
        Frame whose columns are all passed through the scaler.
    state : {'fit', 'transform'}, default 'fit'
        'fit' fits a new StandardScaler on ``x_all`` and saves it to
        "output/normalizer.pkl"; 'transform' loads the scaler previously
        saved at that path.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the index and column labels of ``x_all``.

    Raises
    ------
    ValueError
        If ``state`` is neither 'fit' nor 'transform'.
    """
    # Reject unknown states up front; otherwise `normalizer` would never be
    # bound and the failure would surface later as an UnboundLocalError.
    if state not in ('fit', 'transform'):
        raise ValueError(
            "state must be 'fit' or 'transform', got {!r}".format(state))

    if state == 'fit':
        normalizer = StandardScaler()
        normalizer.fit(x_all)
        joblib.dump(normalizer,
                    "output/normalizer.pkl")
    else:
        normalizer = joblib.load("output/normalizer.pkl")

    # transform() returns a bare array, so re-attach the original labels.
    return pd.DataFrame(normalizer.transform(x_all),
                        index=x_all.index,
                        columns=x_all.columns)

In [22]:
# No state argument is passed, so normalization uses its default state='fit':
# the scaler is fitted on the joined training frame and saved to output/normalizer.pkl.
df_normalized = normalization(df_joined)

In [23]:
df_normalized

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1332,0.532804,-0.710499,0.053439,0.159718,-0.427071,-0.583810,-0.283598,-0.318658,-0.381197,-0.571821,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
814,-0.858220,0.279401,-0.064080,-0.439787,-1.303740,0.782910,-0.283598,0.601375,0.888412,0.709200,...,-0.056766,-0.295484,-0.046324,-2.520780,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
1262,-0.858220,0.526877,-0.081194,-0.506399,-1.401148,-0.583810,-0.283598,-0.051259,-0.048851,-0.319817,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
269,-0.626383,0.081421,-0.389253,-1.039293,-1.449852,-0.583810,0.466567,-0.808133,-0.790054,-1.033828,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
1027,-0.858220,0.180411,-0.115423,-0.373176,-1.206332,1.238484,-0.283598,0.005393,0.472382,0.241942,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,-0.162708,0.477382,-0.093745,-1.005988,-1.693371,-0.583810,0.706620,-0.749215,-0.632250,-0.561320,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863
421,0.764641,0.427887,-0.370997,0.859142,0.644413,0.541063,-0.283598,-1.209231,0.654096,0.451946,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,11.507761,-0.114084,-2.112342,-0.301863
410,0.069129,0.774352,0.055720,1.192201,1.131452,-0.021374,-0.283598,-0.733352,-0.087107,-0.319817,...,-0.056766,3.384277,-0.046324,-2.520780,-0.28901,-0.046324,-0.086898,-0.114084,-2.112342,3.312757
481,-0.858220,1.417787,0.652783,1.092283,0.985340,1.542199,-0.283598,0.007660,1.823284,1.780216,...,-0.056766,-0.295484,-0.046324,0.396703,-0.28901,-0.046324,-0.086898,-0.114084,0.473408,-0.301863


In [24]:
joblib.dump(df_normalized, 'output/preprocessed_x_train.pkl')

['output/preprocessed_x_train.pkl']

In [25]:
import yaml

In [26]:
# Read the preprocessing parameters. The context manager closes the file even
# if YAML parsing raises, which the manual open()/close() pair did not guarantee.
# SafeLoader restricts parsing to plain YAML types (no arbitrary object construction).
with open("src/params/preprocess_params.yaml", "r") as f:
    params = yaml.load(f, Loader=yaml.SafeLoader)

In [27]:
params

{'TARGET_COLUMN': 'SalePrice',
 'DATA_PATH': 'data/train.csv',
 'TEST_SIZE': 0.2,
 'scoring': 'neg_mean_squared_log_error',
 'n_iter_search': 2,
 'verbosity': 0,
 'NUM_COLUMN': ['MSSubClass',
  'LotFrontage',
  'LotArea',
  'YearBuilt',
  'YearRemodAdd',
  'MasVnrArea',
  'BsmtFinSF2',
  'BsmtUnfSF',
  'TotalBsmtSF',
  '1stFlrSF',
  '2ndFlrSF',
  'GrLivArea',
  'FullBath',
  'HalfBath',
  'BedroomAbvGr',
  'KitchenAbvGr',
  'TotRmsAbvGrd',
  'Fireplaces',
  'GarageYrBlt',
  'GarageCars',
  'GarageArea',
  'WoodDeckSF',
  'OpenPorchSF',
  'EnclosedPorch',
  '3SsnPorch',
  'ScreenPorch',
  'PoolArea',
  'MiscVal',
  'MoSold',
  'YrSold'],
 'CAT_COLUMN': ['MSZoning',
  'Street',
  'Alley',
  'LotShape',
  'LandContour',
  'Utilities',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'OverallQual',
  'OverallCond',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exterior2nd',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'Fo

In [28]:
yaml.__version__

'6.0'

In [29]:
def run(params, xpath, ypath, dump_path, state='fit'):
    """Run the full preprocessing chain on one data split and persist the result.

    Steps: load the feature frame, split it into numerical and categorical
    columns by name, impute both, one-hot encode the categoricals, join the
    two branches column-wise, scale everything, and dump the final frame.

    Parameters
    ----------
    params : dict
        Must provide 'NUM_COLUMN' and 'CAT_COLUMN', the lists of column names
        for the numerical and categorical branches.
    xpath : str
        Path of the joblib-pickled feature frame to preprocess.
    ypath : str
        Path of the pickled target. It is accepted so existing callers keep
        working, but it is not read: the target was previously loaded here
        and never used by any preprocessing step.
    dump_path : str
        Destination path for the preprocessed frame (written with joblib.dump).
    state : {'fit', 'transform'}, default 'fit'
        Forwarded to numerical_imputer, one_hot_encoder and normalization.
        'fit' fits and saves new transformers; 'transform' reuses saved ones.

    Returns
    -------
    None
    """
    # NOTE: joblib.load unpickles its input; only pass paths to trusted files.
    house_variables = joblib.load(xpath)

    house_numerical = house_variables[params['NUM_COLUMN']]
    house_categorical = house_variables[params['CAT_COLUMN']]

    df_numerical_imputed = numerical_imputer(house_numerical, state=state)
    df_categorical_imputed = categorical_imputer(house_categorical)

    df_categorical_encoded = one_hot_encoder(df_categorical_imputed, state=state)

    # Both branches keep the original row index, so a column-wise concat aligns rows.
    df_joined = pd.concat([df_numerical_imputed, df_categorical_encoded], axis=1)

    df_normalized = normalization(df_joined, state=state)

    joblib.dump(df_normalized, dump_path)

In [30]:
# Apply the already-fitted imputer, encoder and scaler (state='transform') to the
# validation split and write the result to output/preprocessed_x_valid.pkl.
run(params,'output/x_valid.pkl', 'output/y_valid.pkl', 'output/preprocessed_x_valid.pkl', state='transform')