# Preprocessing the Data 

I will preprocess the data in a different way from AutoGluon to test if it improves the prediction score.

## <span style="color:red">Thoughts</span>
- the motivation for this approach is that AutoGluon encodes all categorical features with OneHotEncoder, but my EDA analysis suggests that at least some ordered categorical features are correlated with 'SalePrice'.\
Therefore, it can be meaningful to encode them using OrdinalEncoder.

In [2]:
import pandas as pd

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OrdinalEncoder

from src.config import CLEAN_DATA

from src.eda import check_columns_in_feature_dict

In [3]:
df = pd.read_csv(CLEAN_DATA)
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,rl,65.0,8450,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,,unknown,0,2,2008,wd,normal,208500.0
1,2,20,rl,80.0,9600,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,,unknown,0,5,2007,wd,normal,181500.0
2,3,60,rl,68.0,11250,pave,no_alley_access,ir1,lvl,allpub,...,0,unknown,,unknown,0,9,2008,wd,normal,223500.0
3,4,70,rl,60.0,9550,pave,no_alley_access,ir1,lvl,allpub,...,0,unknown,,unknown,0,2,2006,wd,abnorml,140000.0
4,5,60,rl,84.0,14260,pave,no_alley_access,ir1,lvl,allpub,...,0,unknown,,unknown,0,12,2008,wd,normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,rm,21.0,1936,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,,unknown,0,6,2006,wd,normal,
2915,2916,160,rm,21.0,1894,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,,unknown,0,4,2006,wd,abnorml,
2916,2917,20,rl,160.0,20000,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,,unknown,0,9,2006,wd,abnorml,
2917,2918,85,rl,62.0,10441,pave,no_alley_access,reg,lvl,allpub,...,0,unknown,mnprv,shed,700,7,2006,wd,normal,


# Defining the Type of Each Feature

In [9]:
feature_dict = {
    'categoric': ['Id', 'MSSubClass', 'MSZoning', 'LandContour', 'LotConfig', 
                 'Neighborhood', 'Condition1','BldgType', 
                 'RoofStyle', 'Exterior1st', 'Exterior2nd','Foundation', 'BsmtFinType1', 'BsmtFinType2',
                 'Heating','CentralAir', 'Electrical', 'GarageType', 'SaleType','SaleCondition', 'Fence', 
                 'Alley', 'MasVnrType', 'HouseStyle','MiscFeature', 'PoolQC', 'Street', 'RoofMatl', 'Utilities', 'Condition2'],

    'ordered_categoric': ['FireplaceQu', 'LotShape','LandSlope', 'OverallQual', 'OverallCond','ExterQual', 'ExterCond',
                 'BsmtQual', 'BsmtCond', 'BsmtExposure','HeatingQC','KitchenQual','Functional',
                 'GarageFinish','GarageQual', 'GarageCond','PavedDrive'],
    
    'numeric': ['LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1',
                'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF','GrLivArea',
                'GarageYrBlt','GarageArea','WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                'ScreenPorch','PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'LotFrontage'],

    'counts': ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 
                'KitchenAbvGr', 'TotRmsAbvGrd','Fireplaces','GarageCars'],
    
    'target': ['SalePrice'],
}

#check if all columns were included
check_columns_in_feature_dict(df, feature_dict)

All good in feature_dict


# Setting the Ordered Categorical Feature

In [11]:
# Define the ordering for each ordered categorical feature
category_orders = {
    'FireplaceQu': ['no_fireplace', 'po', 'fa', 'ta', 'gd', 'ex'],
    'LotShape': ['ir3', 'ir2', 'ir1', 'reg'],
    'LandSlope': ['sev', 'mod', 'gtl'],
    'OverallQual': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'OverallCond': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'ExterQual': ['po', 'fa', 'ta', 'gd', 'ex'],
    'ExterCond': ['po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtQual': ['no_basement', 'po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtCond': ['no_basement', 'po', 'fa', 'ta', 'gd', 'ex'],
    'BsmtExposure': ['no_basement', 'no', 'mn', 'av', 'gd'],
    'HeatingQC': ['po', 'fa', 'ta', 'gd', 'ex'],
    'KitchenQual': ['unknown','po', 'fa', 'ta', 'gd', 'ex'],
    'Functional': ['sal', 'sev', 'maj2', 'maj1', 'mod', 'min2', 'min1', 'typ'],
    'GarageFinish': ['no_garage', 'unf', 'rfn', 'fin'],
    'GarageQual': ['no_garage', 'po', 'fa', 'ta', 'gd', 'ex'],
    'GarageCond': ['no_garage', 'po', 'fa', 'ta', 'gd', 'ex'],
    'PavedDrive': ['n', 'p', 'y']
}

# Update each column in the DataFrame to be an ordered categorical
for column, order in category_orders.items():
    df[column] = pd.Categorical(df[column], categories=order, ordered=True)


# Preprocessing data

In [14]:
preprocessor_dict = {
    'ordinal_encoder': feature_dict['ordered_categoric'],
}

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal", OrdinalEncoder(), preprocessor_dict['ordinal_encoder']),
        ],
    remainder="passthrough",
    force_int_remainder_cols=False,
    
)


In [18]:
transformed_data = preprocessor.fit_transform(df)

# Get the column names for the transformed data
ordinal_columns = preprocessor.named_transformers_['ordinal'].get_feature_names_out(preprocessor_dict['ordinal_encoder'])

# Get the names of the columns that remain unchanged (passed through)
remaining_columns = [col for col in df.columns if col not in preprocessor_dict['ordinal_encoder']]

# Combine the columns to create the new column names
all_column_names = list(ordinal_columns) + remaining_columns

# Create the DataFrame with the new column names
df_transformed = pd.DataFrame(transformed_data, columns=all_column_names)

In [20]:
df_transformed

Unnamed: 0,FireplaceQu,LotShape,LandSlope,OverallQual,OverallCond,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,,3.0,0.0,6.0,4.0,2.0,4.0,2.0,3.0,3.0,...,0,unknown,,unknown,0,2,2008,wd,normal,208500.0
1,4.0,3.0,0.0,5.0,7.0,3.0,4.0,2.0,3.0,1.0,...,0,unknown,,unknown,0,5,2007,wd,normal,181500.0
2,4.0,0.0,0.0,6.0,4.0,2.0,4.0,2.0,3.0,2.0,...,0,unknown,,unknown,0,9,2008,wd,normal,223500.0
3,2.0,0.0,0.0,6.0,4.0,3.0,4.0,3.0,1.0,3.0,...,0,unknown,,unknown,0,2,2006,wd,abnorml,140000.0
4,4.0,0.0,0.0,7.0,4.0,2.0,4.0,2.0,3.0,0.0,...,0,unknown,,unknown,0,12,2008,wd,normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,,3.0,0.0,3.0,6.0,3.0,4.0,3.0,3.0,3.0,...,0,unknown,,unknown,0,6,2006,wd,normal,
2915,,3.0,0.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0,...,0,unknown,,unknown,0,4,2006,wd,abnorml,
2916,4.0,3.0,0.0,4.0,6.0,3.0,4.0,3.0,3.0,3.0,...,0,unknown,,unknown,0,9,2006,wd,abnorml,
2917,,3.0,0.0,4.0,4.0,3.0,4.0,2.0,3.0,0.0,...,0,unknown,mnprv,shed,700,7,2006,wd,normal,


In [22]:
df_transformed.to_csv('data_transformed.csv',index=False)