# Data Processing Steps:

1. Split data into train and test
    - Drop columns noted through EDA:
        - `Id`, `MiscFeature`, `MSSubClass`
2. Preprocessing
    - Instantiate transformers
    - Build pipelines
    - Select columns
    - Create tuples
    - Column transformer
    - Encoding

3. New baseline model submission with processed data

# Library imports

In [None]:
import math
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.21")

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

import joblib

# Preprocessing

In [None]:
# column selection
# Each list names the raw columns handled by one preprocessing branch.

# one-hot encoded categoricals
ohe_cols = [
    "MoSold", "GarageFinish", "CentralAir", "Street", "Alley",
    "LotShape", "Utilities", "LandContour", "LotConfig", "LandSlope",
]

# numeric columns
num_cols = [
    # sale / misc
    "MiscVal", "YrSold", "PoolArea",
    # garage
    "GarageCars", "GarageArea", "GarageYrBlt",
    # above-ground living space
    "Fireplaces", "TotRmsAbvGrd", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    # bathrooms / rooms
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
    # basement
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    # lot / exterior
    "MasVnrArea", "LotFrontage", "LotArea",
    # age and overall ratings
    "YearBuilt", "YearRemodAdd", "OverallQual", "OverallCond",
]

# porch / deck area columns
porch_cols = ["ScreenPorch", "3SsnPorch", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch"]

# categoricals sent to the hashing encoder
hash_cols = [
    "SaleCondition", "SaleType", "GarageType", "Heating", "MSZoning", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation",
    "BsmtFinType1", "BsmtFinType2",
]

# categoricals with a natural order
ord_cols = [
    "Fence", "PavedDrive", "Functional", "Electrical", "BsmtExposure",
    "HeatingQC", "KitchenQual", "GarageCond", "GarageQual", "FireplaceQu",
    "BsmtQual", "BsmtCond", "ExterCond", "ExterQual", "PoolQC",
]

In [None]:
"""Intermediaries Needed for Encoding"""
# porch summing function
# def porch_func(df):
#     df_porch = df.copy()
#     df_porch["Porch_sum"] = 0
#     for porch in porch_cols:
#         df_porch["Porch_sum"] = df_porch["Porch_sum"] + df_porch[porch]
#     df_porch.drop(porch_cols, axis=1, inplace=True)
#     return df_porch

def porch_func(X):
    """Collapse all columns of ``X`` into a single row-wise total.

    Parameters
    ----------
    X : object exposing ``.sum(axis=1)`` (e.g. a DataFrame or a 2-D array)
        The columns to add together, one row per observation.

    Returns
    -------
    pandas.DataFrame
        A one-column frame named ``'porch_cols'`` holding each row's sum.
    """
    # Imported locally so the function body does not rely on a module-level alias.
    import pandas as pd

    row_totals = X.sum(axis=1)
    return pd.DataFrame(row_totals, columns=['porch_cols'])

# find hash components needed and add one for safety
file_path = "../../house-prices-advanced-regression-techniques/input/train.csv"
houses = pd.read_csv(file_path)

# number of distinct values in every object-dtype column of the raw file
object_cardinalities = houses.select_dtypes(include="object").nunique()
# bits needed to enumerate the busiest column, rounded up, plus one spare
# NOTE(review): HashingEncoder's n_components is the number of output columns,
# not a bit width -- confirm that a log2-sized value is what is intended here.
hash_n = math.ceil(math.log2(max(object_cardinalities))) + 1

# ordinal map
# Hand-written rank for every ordered categorical; "Missing" is the placeholder
# category used for absent values.

# Ex/Gd/TA/Fa/Po scale shared by the ten quality / condition columns
_quality_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}
_quality_cols = ["HeatingQC", "KitchenQual", "GarageCond", "GarageQual", "FireplaceQu",
                 "BsmtQual", "BsmtCond", "ExterCond", "ExterQual", "PoolQC"]

# columns with their own bespoke scale
ord_map = [
    {"col": "Fence",
     "mapping": {"Missing": 0, "MnWw": 1, "GdWo": 2, "MnPrv": 3, "GdPrv": 4}},
    {"col": "PavedDrive",
     "mapping": {"N": 1, "P": 2, "Y": 3}},
    {"col": "Functional",
     "mapping": {"Sal": -7, "Sev": -6, "Maj2": -5, "Maj1": -4, "Mod": -3,
                 "Min2": -2, "Min1": -1, "Missing": 0, "Typ": 0}},
    {"col": "Electrical",
     "mapping": {"Missing": 0, "FuseP": -2, "FuseF": -1, "Mix": 0, "FuseA": 1, "SBrkr": 2}},
    {"col": "BsmtExposure",
     "mapping": {"Missing": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}},
]

# each quality column gets its own copy of the shared scale
ord_map += [{"col": name, "mapping": dict(_quality_scale)} for name in _quality_cols]

In [None]:
# instantiate transformers

# constant-fill imputers: 0 for numeric gaps, the literal "Missing" for categorical gaps
zero_imputer = SimpleImputer(fill_value=0, strategy="constant")
missing_imputer = SimpleImputer(fill_value="Missing", strategy="constant")

# encoders
ohe = OneHotEncoder(handle_unknown="infrequent_if_exist", sparse_output=False)
porch_sum = FunctionTransformer(func=porch_func)
# NOTE(review): `hash` and `ord` shadow Python builtins; the names are kept
# because the pipeline cell below refers to them.
hash = HashingEncoder(n_components=hash_n, cols=hash_cols, drop_invariant=True)
ord = OrdinalEncoder(mapping=ord_map, cols=ord_cols, drop_invariant=True)

In [None]:
# pipelines
# Each categorical branch fills gaps with "Missing" before encoding;
# the porch branch fills gaps with 0 before summing.
ohe_pipe = Pipeline(steps=[
    ("Missing Imputer", missing_imputer),
    ("One Hot Encoder", ohe),
])
hash_pipe = Pipeline(steps=[
    ("Missing Imputer", missing_imputer),
    ("Hashing Encoder", hash),
])
ord_pipe = Pipeline(steps=[
    ("Missing Imputer", missing_imputer),
    ("Ordinal Encoder", ord),
])
porch_pipe = Pipeline(steps=[
    ("Zero Imputer", zero_imputer),
    ("Porch Sum", porch_sum),
])

In [None]:
# column transformer tuples, each laid out as (name, transformer, columns)

# numeric columns only need imputation, so the bare imputer is used directly
num_tuple = ("Numeric Imputation", zero_imputer, num_cols)

# branches that run a two-step pipeline
ohe_tuple = ("One Hot Encoder", ohe_pipe, ohe_cols)
porch_tuple = ("Sum of Porches", porch_pipe, porch_cols)
hash_tuple = ("Hashing Encoder", hash_pipe, hash_cols)
ord_tuple = ("Ordinal Encoder", ord_pipe, ord_cols)


In [None]:
# column selector
# Columns not claimed by any branch are dropped, and output feature names are
# not prefixed with the branch name.
preprocessor = ColumnTransformer(
    transformers=[num_tuple, ohe_tuple, porch_tuple, hash_tuple, ord_tuple],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [None]:
# saving preprocessor for future use
# NOTE(review): when the cells run top to bottom, this object has not been fit
# yet, so the file holds only the configuration. Loading it elsewhere presumably
# also needs `porch_func` to be defined -- confirm.
joblib.dump(value=preprocessor, filename="preprocessor.joblib")

# Load dataset & Train/Test Split

In [None]:
file_path = "../../house-prices-advanced-regression-techniques/input/train.csv"
houses = pd.read_csv(file_path)

# features: everything except the columns rejected during EDA and the target
X = houses.drop(columns=["Id", "MiscFeature", "MSSubClass", "SalePrice"])

# target: taking logarithm for a more normally distributed target
y = np.log(houses["SalePrice"].copy())

# hold out 20% of the rows; the seed is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Transform Data
# Fit on the training split only, then apply that same fit to both splits.
preprocessor.fit(X_train)
X_train_proc, X_test_proc = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fitted preprocessor on all data
# `fit` returns the estimator itself, so calling `preprocessor.fit(X)` directly
# would overwrite the train-only fit used for X_train_proc / X_test_proc and
# leave `preprocessor` fitted on rows from the held-out split. Fit an
# independent, unfitted copy instead so the two objects stay separate.
from sklearn.base import clone

fitted_preprocessor = clone(preprocessor).fit(X)
joblib.dump(fitted_preprocessor, "fitted_preprocessor.joblib")