# Preprocessing the Data 

I will preprocess the data in a different way from AutoGluon to test if it improves the prediction score.

## <span style="color:red">Thoughts</span>
- the motivation for this approach is that AutoGluon encodes all categorical features with OneHotEncoder, but my EDA analysis suggests that at least some ordered categorical features are correlated with 'SalePrice'.\
Therefore, it can be meaningful to encode them using OrdinalEncoder.

In [251]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.combine import SMOTETomek

from imblearn.pipeline import Pipeline as ImbPipeline

from optuna.importance import FanovaImportanceEvaluator

from optuna.visualization import plot_optimization_history, plot_slice

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import average_precision_score

from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV, 
    PredefinedSplit,
)

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)


from src.config import (
    CLEAN_DATA,
    RANDOM_STATE,
)

from src.eda import check_columns_in_feature_dict

sns.set_theme(style='dark', palette='colorblind', context='notebook')
pd.set_option('display.max_columns', None)

In [335]:
df = pd.read_parquet(CLEAN_DATA)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Longitude,Latitude,Median_n_Closest_SalePrice
0,2217,20,a,80,14584,no_alley_access,reg,low,inside,mod,idotrr,norm,1fam,1story,1,5,1952,1952,gable,asbshng,vinylsd,none,0,fa,po,slab,no_basement,no_basement,no_basement,no_basement,0,no_basement,0,0,0,wall,po,n,fusea,733,0,0,733,0,0,1,0,2,1,fa,4,sal,0,no_fireplace,attchd,1952.0,unf,2,487,fa,po,n,0,0,0,0,0,0,no_fence,none,0,2,2008,wd,abnorml,,-93.625214,42.018806,115000.0
1,2905,20,a,125,31250,no_alley_access,reg,lvl,inside,gtl,mitchel,artery,1fam,1story,1,3,1951,1951,gable,cblock,vinylsd,none,0,ta,fa,cblock,no_basement,no_basement,no_basement,no_basement,0,no_basement,0,0,0,gasa,ta,y,fusea,1600,0,0,1600,0,0,1,1,3,1,ta,6,mod,0,no_fireplace,attchd,1951.0,unf,1,270,fa,ta,n,0,0,135,0,0,0,no_fence,none,0,5,2006,wd,normal,,-93.610268,41.992222,139000.0
2,917,20,c,50,9000,no_alley_access,reg,lvl,inside,gtl,idotrr,norm,1fam,1story,2,3,1949,1950,gable,asbshng,asbshng,none,0,ta,ta,cblock,ta,ta,av,blq,50,unf,0,430,480,gasa,ta,n,fusea,480,0,0,480,1,0,0,0,1,1,ta,4,typ,0,no_fireplace,detchd,1958.0,unf,1,308,ta,ta,y,0,0,0,0,0,0,no_fence,none,0,10,2006,wd,abnorml,35311.0,-93.615013,42.019100,117250.0
3,2581,20,c,65,6565,no_alley_access,reg,lvl,corner,gtl,idotrr,norm,1fam,1story,4,6,1957,1980,gable,metalsd,metalsd,none,0,ta,ta,cblock,ta,ta,no,glq,967,unf,0,106,1073,gasa,gd,y,fusea,1073,0,0,1073,1,0,1,0,3,1,ta,6,typ,0,no_fireplace,detchd,1985.0,unf,2,720,ta,ta,y,0,444,0,0,0,0,no_fence,none,0,8,2007,wd,abnorml,,-93.615067,42.021423,120500.0
4,813,20,c,66,8712,no_alley_access,reg,bnk,inside,mod,idotrr,norm,1fam,1story,5,5,1952,1952,hip,wd sdng,wd sdng,none,0,fa,ta,cblock,ta,ta,av,unf,0,unf,0,540,540,gasa,ta,n,fusea,1044,0,0,1044,0,0,1,0,2,1,fa,4,typ,0,no_fireplace,basment,1952.0,unf,2,504,ta,ta,n,0,0,0,0,0,0,no_fence,shed,54,6,2010,wd,alloca,55993.0,-93.608269,42.021328,124900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2101,190,rm,69,9142,no_alley_access,reg,lvl,inside,gtl,oldtown,norm,2fmcon,2story,5,5,1900,2006,gable,metalsd,metalsd,none,0,ta,fa,brktil,fa,ta,no,unf,0,unf,0,797,797,gasa,ta,n,fusea,830,797,0,1627,0,0,2,0,4,2,ta,10,typ,0,no_fireplace,detchd,1950.0,unf,2,420,fa,po,n,192,0,60,0,0,0,no_fence,none,0,2,2008,wd,normal,,-93.618439,42.029030,126000.0
2915,706,190,rm,70,5600,no_alley_access,reg,lvl,inside,gtl,idotrr,norm,2fmcon,2story,4,5,1930,1950,hip,vinylsd,wd shng,none,0,fa,fa,slab,no_basement,no_basement,no_basement,no_basement,0,no_basement,0,0,0,gasa,fa,n,sbrkr,372,720,0,1092,0,0,2,0,3,2,fa,7,mod,0,no_fireplace,no_garage,0.0,no_garage,0,0,no_garage,no_garage,n,0,0,0,0,0,0,no_fence,othr,3500,7,2010,wd,normal,55000.0,-93.626877,42.024139,123500.0
2916,704,190,rm,76,7630,no_alley_access,reg,lvl,inside,gtl,oldtown,feedr,2fmcon,2story,5,9,1900,1996,gable,wd sdng,wd sdng,none,0,ta,gd,brktil,gd,ta,no,unf,0,unf,0,360,360,gasa,gd,y,sbrkr,1032,780,0,1812,0,0,2,0,4,2,gd,8,typ,1,po,detchd,1999.0,unf,2,672,ta,ta,n,344,0,40,0,0,0,mnprv,none,0,5,2010,wd,normal,140000.0,-93.620384,42.026741,126250.0
2917,1063,190,rm,85,13600,grvl,reg,lvl,inside,gtl,oldtown,norm,2fmcon,2story,5,5,1900,1950,gable,wd sdng,wd sdng,none,0,ta,ta,brktil,ta,ta,no,unf,0,unf,0,662,662,gasa,ta,n,sbrkr,1422,915,0,2337,0,0,2,0,5,2,ta,10,min2,0,no_fireplace,detchd,1945.0,unf,2,560,ta,ta,y,0,57,0,0,0,0,no_fence,none,0,9,2007,wd,normal,90000.0,-93.616913,42.029224,129700.0


# Defining the Type of Each Feature

In [266]:
feature_dict = {
    'categoric': ['MSSubClass', 'MSZoning', 'LandContour', 'LotConfig', 
                 'Neighborhood', 'Condition1','BldgType', 
                 'RoofStyle', 'Exterior1st', 'Exterior2nd','Foundation', 'BsmtFinType1', 'BsmtFinType2',
                 'Heating','CentralAir', 'Electrical', 'GarageType', 'SaleType','SaleCondition', 'Fence', 
                 'Alley', 'MasVnrType', 'HouseStyle','MiscFeature'],

    'ordered_categoric': ['FireplaceQu', 'LotShape','LandSlope', 'OverallQual', 'OverallCond','ExterQual', 'ExterCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure','HeatingQC','KitchenQual','Functional',
                 'GarageFinish','GarageQual', 'GarageCond','PavedDrive'],
    
    'numeric': ['LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1',
                'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF','GrLivArea',
                'GarageYrBlt','GarageArea','WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                'ScreenPorch','PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'Longitude', "Latitude", 'LotFrontage', 'Median_n_Closest_SalePrice'],

    'counts': ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 
                'KitchenAbvGr', 'TotRmsAbvGrd','Fireplaces','GarageCars'],
    
    'target': ['SalePrice'],
}

#check if all columns were included
check_columns_in_feature_dict(df, feature_dict)

Missing in DataFrame: set()
Extra in DataFrame: {'Id'}


# Setting the Ordered Categorical Feature

In [337]:
df['FireplaceQu'].value_counts().index

CategoricalIndex(['no_fireplace', 'gd', 'ta', 'fa', 'po', 'ex'], categories=['ex', 'fa', 'gd', 'no_fireplace', 'po', 'ta'], ordered=False, dtype='category', name='FireplaceQu')

In [268]:
# Define the ordering for each ordered categorical feature
category_orders = {
    'FireplaceQu': ['no_fireplace', 'po', 'fa', 'ta', 'gd', 'ex'],
    'LotShape': ['ir3', 'ir2', 'ir1', 'reg'],
    'LandSlope': ['sev', 'mod', 'gtl'],
    'OverallQual': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'OverallCond': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'ExterQual': ['po', 'fa', 'ta', 'gd', 'ex'],
    'ExterCond': ['po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtQual': ['no_basement', 'po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtCond': ['no_basement', 'po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtExposure': ['no_basement', 'no', 'mn', 'av', 'gd'],
    'HeatingQC': ['po', 'fa', 'ta', 'gd', 'ex'],
    'KitchenQual': ['unknown','po', 'fa', 'ta', 'gd', 'ex'],
    'Functional': ['sal', 'sev', 'maj2', 'maj1', 'mod', 'min2', 'min1', 'typ'],
    'GarageFinish': ['no_garage', 'unf', 'rfn', 'fin'],
    'GarageQual': ['no_garage', 'po', 'fa', 'ta', 'gd', 'ex'],
    'GarageCond': ['no_garage', 'po', 'fa', 'ta', 'gd', 'ex'],
    'PavedDrive': ['n', 'p', 'y']
}

# Update each column in the DataFrame to be an ordered categorical
for column, order in category_orders.items():
    df[column] = pd.Categorical(df[column], categories=order, ordered=True)


## Preprocessing data

In [355]:
preprocessor_dict = {
    'ordinal_encoder': feature_dict['ordered_categoric'],
}

In [357]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal", OrdinalEncoder(), preprocessor_dict['ordinal_encoder']),
        ],
    remainder="passthrough",
    force_int_remainder_cols=False,
    
)


In [359]:
transformed_data = preprocessor.fit_transform(df)

# Get the column names for the transformed data
ordinal_columns = preprocessor.named_transformers_['ordinal'].get_feature_names_out(preprocessor_dict['ordinal_encoder'])

# Get the names of the columns that remain unchanged (passed through)
remaining_columns = [col for col in df.columns if col not in preprocessor_dict['ordinal_encoder']]

# Combine the columns to create the new column names
all_column_names = list(ordinal_columns) + remaining_columns

# Create the DataFrame with the new column names
transformed_df = pd.DataFrame(transformed_data, columns=all_column_names)

In [361]:
transformed_df

Unnamed: 0,FireplaceQu,LotShape,LandSlope,OverallQual,OverallCond,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,GarageFinish,GarageQual,GarageCond,PavedDrive,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,Foundation,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Longitude,Latitude,Median_n_Closest_SalePrice
0,3.0,3.0,1.0,0.0,4.0,1.0,3.0,3.0,3.0,4.0,3.0,1.0,5.0,3.0,1.0,4.0,0.0,2217,20,a,80,14584,no_alley_access,low,inside,idotrr,norm,1fam,1story,1952,1952,gable,asbshng,vinylsd,none,0,slab,no_basement,0,no_basement,0,0,0,wall,n,fusea,733,0,0,733,0,0,1,0,2,1,4,0,attchd,1952.0,2,487,0,0,0,0,0,0,no_fence,none,0,2,2008,wd,abnorml,,-93.625214,42.018806,115000.0
1,3.0,3.0,0.0,0.0,2.0,3.0,1.0,3.0,3.0,4.0,4.0,3.0,4.0,3.0,1.0,5.0,0.0,2905,20,a,125,31250,no_alley_access,lvl,inside,mitchel,artery,1fam,1story,1951,1951,gable,cblock,vinylsd,none,0,cblock,no_basement,0,no_basement,0,0,0,gasa,y,fusea,1600,0,0,1600,0,0,1,1,3,1,6,0,attchd,1951.0,1,270,0,0,135,0,0,0,no_fence,none,0,5,2006,wd,normal,,-93.610268,41.992222,139000.0
2,3.0,3.0,0.0,1.0,2.0,3.0,4.0,4.0,5.0,0.0,4.0,3.0,7.0,3.0,5.0,5.0,2.0,917,20,c,50,9000,no_alley_access,lvl,inside,idotrr,norm,1fam,1story,1949,1950,gable,asbshng,asbshng,none,0,cblock,blq,50,unf,0,430,480,gasa,n,fusea,480,0,0,480,1,0,0,0,1,1,4,0,detchd,1958.0,1,308,0,0,0,0,0,0,no_fence,none,0,10,2006,wd,abnorml,35311.0,-93.615013,42.0191,117250.0
3,3.0,3.0,0.0,3.0,5.0,3.0,4.0,4.0,5.0,3.0,2.0,3.0,7.0,3.0,5.0,5.0,2.0,2581,20,c,65,6565,no_alley_access,lvl,corner,idotrr,norm,1fam,1story,1957,1980,gable,metalsd,metalsd,none,0,cblock,glq,967,unf,0,106,1073,gasa,y,fusea,1073,0,0,1073,1,0,1,0,3,1,6,0,detchd,1985.0,2,720,0,444,0,0,0,0,no_fence,none,0,8,2007,wd,abnorml,,-93.615067,42.021423,120500.0
4,3.0,3.0,1.0,4.0,4.0,1.0,4.0,4.0,5.0,0.0,4.0,1.0,7.0,3.0,5.0,5.0,0.0,813,20,c,66,8712,no_alley_access,bnk,inside,idotrr,norm,1fam,1story,1952,1952,hip,wd sdng,wd sdng,none,0,cblock,unf,0,unf,0,540,540,gasa,n,fusea,1044,0,0,1044,0,0,1,0,2,1,4,0,basment,1952.0,2,504,0,0,0,0,0,0,no_fence,shed,54,6,2010,wd,alloca,55993.0,-93.608269,42.021328,124900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,3.0,3.0,0.0,4.0,4.0,3.0,1.0,1.0,5.0,3.0,4.0,3.0,7.0,3.0,1.0,4.0,0.0,2101,190,rm,69,9142,no_alley_access,lvl,inside,oldtown,norm,2fmcon,2story,1900,2006,gable,metalsd,metalsd,none,0,brktil,unf,0,unf,0,797,797,gasa,n,fusea,830,797,0,1627,0,0,2,0,4,2,10,0,detchd,1950.0,2,420,192,0,60,0,0,0,no_fence,none,0,2,2008,wd,normal,,-93.618439,42.02903,126000.0
2915,3.0,3.0,0.0,3.0,4.0,1.0,1.0,3.0,3.0,4.0,1.0,1.0,4.0,1.0,3.0,3.0,0.0,706,190,rm,70,5600,no_alley_access,lvl,inside,idotrr,norm,2fmcon,2story,1930,1950,hip,vinylsd,wd shng,none,0,slab,no_basement,0,no_basement,0,0,0,gasa,n,sbrkr,372,720,0,1092,0,0,2,0,3,2,7,0,no_garage,0.0,0,0,0,0,0,0,0,0,no_fence,othr,3500,7,2010,wd,normal,55000.0,-93.626877,42.024139,123500.0
2916,4.0,3.0,0.0,4.0,8.0,3.0,2.0,2.0,5.0,3.0,2.0,2.0,7.0,3.0,5.0,5.0,0.0,704,190,rm,76,7630,no_alley_access,lvl,inside,oldtown,feedr,2fmcon,2story,1900,1996,gable,wd sdng,wd sdng,none,0,brktil,unf,0,unf,0,360,360,gasa,y,sbrkr,1032,780,0,1812,0,0,2,0,4,2,8,1,detchd,1999.0,2,672,344,0,40,0,0,0,mnprv,none,0,5,2010,wd,normal,140000.0,-93.620384,42.026741,126250.0
2917,3.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,5.0,5.0,2.0,1063,190,rm,85,13600,grvl,lvl,inside,oldtown,norm,2fmcon,2story,1900,1950,gable,wd sdng,wd sdng,none,0,brktil,unf,0,unf,0,662,662,gasa,n,sbrkr,1422,915,0,2337,0,0,2,0,5,2,10,0,detchd,1945.0,2,560,0,57,0,0,0,0,no_fence,none,0,9,2007,wd,normal,90000.0,-93.616913,42.029224,129700.0


In [363]:
    # Print the result of null values per column
print(transformed_df.isnull().sum().sort_values(ascending=False).head(3))

SalePrice       1459
Fireplaces         0
TotRmsAbvGrd       0
dtype: int64


In [293]:
preprocessor_dict = {
    'one_hot_encoder': feature_dict['categoric'],
    'ordinal_encoder': feature_dict['ordered_categoric'],
    'quantile_transformer': feature_dict['numeric'],    
    'robust_scaler': feature_dict['counts'],
}

In [189]:
preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot", OneHotEncoder(drop="first"), preprocessor_dict['one_hot_encoder']),
        ("ordinal", OrdinalEncoder(), preprocessor_dict['ordinal_encoder']),
        ("quantile", QuantileTransformer(output_distribution='normal'), preprocessor_dict['quantile_transformer']),
        ("robust", RobustScaler(), preprocessor_dict['robust_scaler']),
    ],
    remainder="passthrough",
    force_int_remainder_cols=False,
    
)


In [295]:
# Create a pipeline for ordinal features
ordinal_pipeline = Pipeline([
    ("ordinal", OrdinalEncoder()),
    ("scaler", StandardScaler())
])

# Updated ColumnTransformer with ordinal pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("one_hot", OneHotEncoder(drop="first"), preprocessor_dict['one_hot_encoder']),
        ("ordinal", ordinal_pipeline, preprocessor_dict['ordinal_encoder']),
        ("quantile", QuantileTransformer(output_distribution='normal'), preprocessor_dict['quantile_transformer']),
        ("robust", RobustScaler(), preprocessor_dict['robust_scaler']),
    ],
    remainder="passthrough",
    force_int_remainder_cols=False,
)

In [297]:
transformed_data = preprocessor.fit_transform(df)

# Get the transformed column names
one_hot_columns = preprocessor.named_transformers_['one_hot'].get_feature_names_out(preprocessor_dict['one_hot_encoder'])
all_column_names = (
    list(one_hot_columns) + 
    preprocessor_dict['ordinal_encoder'] + 
    preprocessor_dict['quantile_transformer'] + 
    preprocessor_dict['robust_scaler'] +
    ['Id'] +
    ['SalePrice']
    
)

# Create a DataFrame from the transformed data
transformed_df = pd.DataFrame(transformed_data, columns=all_column_names)


In [317]:
    # Print the result of null values per column
print(transformed_df.isnull().sum().sort_values(ascending=False).head(10))

SalePrice                1459
FireplaceQu              1420
GarageCond                  1
GarageQual                  1
GarageFinish                1
SaleType_wd                 0
SaleCondition_adjland       0
SaleCondition_alloca        0
SaleCondition_family        0
SaleCondition_normal        0
dtype: int64


In [278]:
transformed_df.to_csv('data_transformed.csv',index=False)