This notebook is a cleaned up version of https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory 

# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
- https://www.kaggle.com/code/marto24/beginners-prediction-top3

# Import libraries

In [90]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import mutual_info_regression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer

from functools import reduce

from category_encoders import MEstimateEncoder, cat_boost

from sklearn.compose import ColumnTransformer

import optuna
import time

# Global variables

In [91]:
# Random seed reused below for KMeans, PCA and the model's random_state
SEED = 0

# Load data and preprocess function

In [92]:
def load_and_preprocess_data(train_data = True, perform_impute = True):
    """Read the train or test CSV and return the cleaned ``(X, y)`` pair.

    Parameters
    ----------
    train_data : bool
        When true, ``train.csv`` is read, rows without ``SalePrice`` are
        dropped and the target is split off. Otherwise ``test.csv`` is read.
    perform_impute : bool
        When true, ``impute`` is applied after ``encode``.

    Returns
    -------
    tuple
        ``(X, y)``; ``y`` is the ``SalePrice`` column for training data and
        ``None`` for test data.
    """
    if not train_data:
        print("Test data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
        y = None
    else:
        print("Train data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
        X = X.dropna(axis=0, subset=['SalePrice'])
        y = X.SalePrice
        X = X.drop(columns=['SalePrice'])
    print("Loaded data")
    print(X.shape)

    # Missing garage years and years up to 2024 are kept; anything later is
    # replaced by the remodel year (there is 1 GarageYrBlt with value 2207).
    garage_year = X["GarageYrBlt"]
    garage_year_ok = garage_year.isna() | (garage_year <= 2024)
    X["GarageYrBlt"] = garage_year.where(garage_year_ok, X.YearRemodAdd)
    # Use a single spelling for this Exterior2nd level
    X["Exterior2nd"] = X["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    X = encode(X)
    return (impute(X) if perform_impute else X, y)

def encode(df):
    """Cast the known categorical columns of ``df`` to pandas categoricals.

    Columns named in the module-level ``features_nom`` become unordered
    categoricals that always contain a "None" category. Columns named in
    ``ordered_levels`` are then cast to ordered categoricals with the listed
    levels; a column present in both ends up ordered because that cast runs
    last. ``df`` is modified in place and is also returned.
    """
    # Nominal categories
    for name in features_nom:
        nominal = df[name].astype("category")
        # Add a None category for missing values
        if "None" not in nominal.cat.categories:
            nominal = nominal.cat.add_categories("None")
        df[name] = nominal
    # Ordinal categories
    for name in ordered_levels:
        ordinal_dtype = CategoricalDtype(ordered_levels[name], ordered=True)
        df[name] = df[name].astype(ordinal_dtype)
    return df

def impute(df):
    """Fill the missing values of ``df`` in place and return it.

    Rows that have a ``GarageType`` but no ``GarageYrBlt`` get the row's
    ``YearRemodAdd`` as garage year. After that, missing values are replaced
    by 0 in numeric columns and by "None" in categorical columns (so every
    categorical column with missing values must have a "None" category).
    """
    needs_garage_year = df.GarageYrBlt.isna() & df.GarageType.notna()
    df.loc[needs_garage_year, "GarageYrBlt"] = df.YearRemodAdd

    # Numeric columns are filled first, then the categorical ones
    fill_values = {"number": 0, "category": "None"}
    for dtype_selector, fill_value in fill_values.items():
        for name in df.select_dtypes(dtype_selector).columns:
            df[name] = df[name].fillna(fill_value)
    return df

# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [93]:
# The nominal (unordered) categorical features.
# NOTE: "CentralAir" is also listed in ordered_levels below; encode() applies
# the ordinal cast last, so that column ends up as an ordered categorical.
features_nom = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature",
    "SaleType", "SaleCondition", "Fence", "Electrical",
]


# The ordinal (ordered) categorical features.
# Each list gives the levels in the order used for the ordered categorical.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1, 11))

ordered_levels = {
    # Columns rated on the 1-10 scale
    **dict.fromkeys(["OverallQual", "OverallCond"], ten_levels),
    # Columns rated on the five-level scale
    **dict.fromkeys(
        ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
         "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"],
        five_levels,
    ),
    # Columns with their own level lists
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
}

# Put "None" in front of every level list so that it becomes the first level
ordered_levels = {name: ["None", *levels] for name, levels in ordered_levels.items()}
ordered_levels.keys()

dict_keys(['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'LotShape', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageFinish', 'PavedDrive', 'Utilities', 'CentralAir'])

# Append features

In [94]:
# MSSubClass code -> textual description of the dwelling type
ms_subclass_mapping = {
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES",
}

# Textual description -> coarser dwelling class, written as
# class -> descriptions and flattened into the lookup direction used by .map()
ms_class_mapping = {
    description: ms_class
    for ms_class, descriptions in {
        "1-Story": (
            "1-STORY 1946 & NEWER ALL STYLES",
            "1-STORY 1945 & OLDER",
            "1-STORY W/FINISHED ATTIC ALL AGES",
            "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
        ),
        "1-1/2 Story": (
            "1-1/2 STORY - UNFINISHED ALL AGES",
            "1-1/2 STORY FINISHED ALL AGES",
            "1-1/2 STORY PUD - ALL AGES",
        ),
        "2-Story": (
            "2-STORY 1946 & NEWER",
            "2-STORY 1945 & OLDER",
            "2-STORY PUD - 1946 & NEWER",
        ),
        "Split-Level": (
            "SPLIT OR MULTI-LEVEL",
            "SPLIT FOYER",
            "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
        ),
        "Multi-Family/Duplex": (
            "DUPLEX - ALL STYLES AND AGES",
            "2 FAMILY CONVERSION - ALL STYLES AND AGES",
        ),
        "2-1/2 Story": (
            "2-1/2 STORY ALL AGES",
        ),
    }.items()
    for description in descriptions
}

In [95]:
def append_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Added columns:

    - ``LivLotRatio``: ``GrLivArea / LotArea``
    - ``Spaciousness``: ``GrLivArea`` divided by the sum of ``TotRmsAbvGrd``,
      ``FullBath``, ``HalfBath`` and ``KitchenAbvGr``
    - ``MSClass``: dwelling class looked up from ``MSSubClass`` through
      ``ms_subclass_mapping`` and ``ms_class_mapping`` ("None" if unmapped)
    - ``IsPUD``: whether the ``MSSubClass`` description contains "PUD"
      ("None" if unmapped)
    - ``GarageAreaPerCar``: ``GarageArea / (GarageCars + 0.1)``

    The input frame is not modified.
    """
    df = df.copy()

    #The commented features below ended up decreasing the overall score

    df["LivLotRatio"] = df.GrLivArea / df.LotArea
    # df["Spaciousness"] = (df['1stFlrSF'] + df['2ndFlrSF']) / df.TotRmsAbvGrd
    # df["Spaciousness"] = df.GrLivArea / df.TotRmsAbvGrd
    df["Spaciousness"] = df.GrLivArea / (df.TotRmsAbvGrd + df.FullBath + df.HalfBath + df.KitchenAbvGr)

    # df["Age"] = df.YrSold - df.YearBuilt
    # df["Age_since_mod"] = df.YrSold - df.YearRemodAdd
    # print(df.Age_since_mod.describe())

    # bldg_dummies = pd.get_dummies(df.BldgType, prefix="Bldg")
    # df = df.join(bldg_dummies.mul(df.GrLivArea, axis=0))

    # df["PorchTypes"] = df[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)

    # df["TotalOutsideSF"] = df.WoodDeckSF + df.OpenPorchSF + df.EnclosedPorch + df["3SsnPorch"] + df.ScreenPorch

    # Both lookups must read MSSubClass from the frame being processed.
    # Reading it from another frame (e.g. the training data while processing
    # the test data) would align on the index and assign values from
    # unrelated rows, or only missing values when the indexes are disjoint.
    subclass_description = df["MSSubClass"].map(ms_subclass_mapping)
    df["MSClass"] = (subclass_description.map(ms_class_mapping)
                                         .astype('category')
                                         .cat.add_categories("None")
                                         .fillna("None"))
    df["IsPUD"] = (subclass_description.str.contains('PUD')
                                       .astype('category')
                                       .cat.add_categories("None")
                                       .fillna("None"))
    # df.drop(columns = "MSSubClass", inplace = True)

    # df["MedNhbdArea"] = df.groupby("Neighborhood")["GrLivArea"].transform("median")

    # #PCA inspired as specified in https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
    # df["Feature1"] = df.GrLivArea + df.TotalBsmtSF
    # df["Feature2"] = df.YearRemodAdd * df.TotalBsmtSF

    # df["OverallScore"] = df.OverallQual.cat.codes * df.OverallCond.cat.codes
    # df["OverallScore"] = df.OverallQual.cat.codes + df.OverallCond.cat.codes

    # df["LotAreaFrontage"] = df.LotArea * (df.LotFrontage + 21.0/10)
    #                                     # adding a small value to avoid effect of 0 LotFrontage.
    #                                     # 21 is minimum LotFrontage before replacing NA with 0
    # df["LotAreaFrontage"] = df.LotArea * df.LotFrontage

    # df["Age_with_quality"] = (df.YrSold - df.YearBuilt) * df.OverallQual.cat.codes

    # df["TotalBathrooms"] = df.FullBath + (0.5 * df.HalfBath) + df.BsmtFullBath + (0.5 * df.BsmtHalfBath)

    # The 0.1 keeps the denominator non-zero for houses with 0 garage cars
    df["GarageAreaPerCar"] = df.GarageArea / (df.GarageCars + 0.1)
    # print(df["GarageAreaPerCar"].describe())

    return df

# Load data and process

In [96]:
# Load the training data (with target) and the test data (without)
X, y = load_and_preprocess_data()
X_test, _ = load_and_preprocess_data(train_data = False)

# Drop the same set of columns from both frames
print("removing less important features")
features_to_drop = ['PoolQC', 'MiscVal', 'MoSold', 'PoolArea', 'MiscFeature', 'Utilities']
X = X.drop(columns = features_to_drop)
X_test = X_test.drop(columns = features_to_drop)
print(X.shape)
print(X_test.shape)

# Add the engineered columns, training frame first
print("appending features")
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)

def remove_columns_from_list(orig_list, to_remove):
    """Return a list of the items of ``orig_list`` that are not in ``to_remove``.

    The relative order of the kept items is preserved; ``orig_list`` may be
    any iterable.
    """
    kept = []
    for item in orig_list:
        if item in to_remove:
            continue
        kept.append(item)
    return kept
    
# Keep only the categorical column names that survived the drop above.
# Iterating the ordered_levels dict yields its keys (the ordinal column names).
ordinal_categorical_cols = remove_columns_from_list(ordered_levels, features_to_drop)
features_nom = remove_columns_from_list(features_nom, features_to_drop)

Train data
Loaded data
(1460, 79)
Test data
Loaded data
(1459, 79)
removing less important features
(1460, 73)
(1459, 73)
appending features
(1460, 78)
(1459, 78)


# Append Cluster information as training features

In [97]:
class AppendKMeans(BaseEstimator, TransformerMixin):
    """Transformer that appends K-Means cluster information to a DataFrame.

    Parameters
    ----------
    cluster_columns : list-like of column names
        Columns used for clustering. Categorical columns are replaced by
        their category codes and all selected columns are standardized first.
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    return_cluster : bool
        If true, ``transform`` adds a ``Cluster`` column with the predicted
        cluster of each row.
    return_distances : bool
        If true, ``transform`` adds one ``distance_centroid_{i}`` column per
        cluster, taken from ``KMeans.transform``.
    """

    def __init__(self, cluster_columns, n_clusters=20, return_cluster=True, return_distances=False):
        self.cluster_columns = cluster_columns
        self.n_clusters = n_clusters
        self.return_cluster = return_cluster
        self.return_distances = return_distances

    def _cluster_inputs(self, X):
        """Return the clustering columns of ``X`` with categoricals as codes."""
        coded = X.copy()
        for colname in coded.select_dtypes(["category"]).columns:
            coded[colname] = coded[colname].cat.codes
        return coded[self.cluster_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the K-Means model on the clustering columns."""
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(self._cluster_inputs(X))  # Scale features
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=SEED)
        self.kmeans.fit(X_scaled)  # Fit K-Means on scaled features
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the requested cluster columns added."""
        X_scaled = self.scaler.transform(self._cluster_inputs(X))  # Apply same scaling as training
        result = X.copy()
        if self.return_cluster:
            result["Cluster"] = self.kmeans.predict(X_scaled)  # Get cluster
        if self.return_distances:
            distances = self.kmeans.transform(X_scaled)
            distance_names = [f"distance_centroid_{i}" for i in range(distances.shape[1])]
            # Re-use the index of X so that the join lines the rows up
            distance_frame = pd.DataFrame(distances, columns=distance_names).set_index(X.index)
            result = result.join(distance_frame)
        return result

# Append PCA

In [98]:
class AppendPCA(BaseEstimator, TransformerMixin):
    """Transformer that appends PCA component columns to a DataFrame.

    Parameters
    ----------
    pca_columns : list-like of column names
        Columns the PCA is computed on. Categorical columns are replaced by
        their category codes and all selected columns are standardized first.
    n_components : int
        Number of components passed to ``PCA``.
    pca_col_prefix : str
        Prefix of the appended columns, which are named ``{prefix}_{i}``.
    """

    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def _pca_inputs(self, X):
        """Return the PCA columns of ``X`` with categoricals as codes."""
        coded = X.copy()
        for colname in coded.select_dtypes(["category"]).columns:
            coded[colname] = coded[colname].cat.codes
        return coded[self.pca_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the PCA on the selected columns."""
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(self._pca_inputs(X))  # Scale features
        self.pca = PCA(n_components=self.n_components, random_state=SEED)
        self.pca.fit(X_scaled)  # Fit PCA on scaled features
        return self

    def transform(self, X):
        """Return a copy of ``X`` joined with its PCA component columns."""
        X_scaled = self.scaler.transform(self._pca_inputs(X))  # Apply same scaling as training
        components = self.pca.transform(X_scaled)  # Apply PCA
        # print(self.pca.explained_variance_ratio_)
        # print(np.cumsum(self.pca.explained_variance_ratio_))
        component_names = [f"{self.pca_col_prefix}_{i}" for i in range(components.shape[1])]
        # Re-use the index of X so that the join lines the rows up
        component_frame = pd.DataFrame(components, columns=component_names).set_index(X.index)
        return X.copy().join(component_frame)

# Target Encoding

In [99]:
class CrossFoldEncoder(BaseEstimator, TransformerMixin):
    """Target-encode ``cols`` with an ensemble of encoders, one per K-fold split.

    ``fit`` trains one encoder on the training part of each of the 5 folds.
    ``transform`` encodes the data with every fitted encoder and averages the
    results. The encoded columns replace the originals and are named
    ``{col}_encoded``.

    NOTE: ``fit`` does not compute out-of-fold encodings. Every row passed to
    ``transform`` is encoded with all fitted encoders, including the ones
    that were trained on that row.

    Parameters
    ----------
    cols : list of str
        Columns to encode.
    encoder : class
        Encoder class; it is instantiated as
        ``encoder(cols=cols, **encoder_other_params)``.
    encoder_other_params : dict
        Extra keyword arguments for the encoder, as argument name -> value.
        A dict is used instead of ``**kwargs`` so that the transformer works
        inside a Pipeline: Pipeline uses ``sklearn.base.clone()``, and clone
        does not retain kwargs.
    """

    def __init__(self, cols, encoder, encoder_other_params):
        self.cols = cols
        self.encoder = encoder
        self.cv = KFold(n_splits=5)
        self.encoder_other_params = encoder_other_params

    def fit(self, X, y):
        """Fit one encoder on the training rows of each fold.

        The held-out rows of each fold are not used. The fitted encoders are
        stored in ``fitted_encoders_``.
        """
        self.fitted_encoders_ = []
        for idx_encode, _ in self.cv.split(X):
            fitted_encoder = self.encoder(cols=self.cols, **self.encoder_other_params)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            self.fitted_encoders_.append(fitted_encoder)
        return self

    def transform(self, X):
        """Return ``X`` with ``cols`` replaced by their averaged encodings."""
        # Encode with each fold's encoder and keep only the encoded columns
        X_encoded_list = [
            fitted_encoder.transform(X)[self.cols]
            for fitted_encoder in self.fitted_encoders_
        ]
        # Average the encodings learned from each fold
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [f"{name}_encoded" for name in X_encoded.columns]
        # Drop columns for which target encoding has been created and join with target encodings
        return X.drop(columns=self.cols).join(X_encoded)

# Training pipeline

Let's define a transformer that converts categorical columns to their integer category codes.

In [100]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces every column by its category codes.

    All columns of the input must be pandas categoricals, because each one is
    read through the ``.cat`` accessor.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each column holds ``.cat.codes``."""
        encoded = X.copy()
        for col_name in encoded.columns:
            column = encoded[col_name]
            encoded[col_name] = column.cat.codes
        return encoded

In [101]:
# Split the columns of X by dtype
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "category"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Number of distinct values of every categorical column that is not ordinal
_nominal_cardinality = {cname: X[cname].nunique() for cname in categorical_cols
                        if cname not in ordinal_categorical_cols}

# Bucket those columns by cardinality: <= 4, 5 to 9, and >= 10 distinct values
small_cat_categorical_cols = [cname for cname, n_unique in _nominal_cardinality.items()
                              if n_unique <= 4]
med_cat_categorical_cols = [cname for cname, n_unique in _nominal_cardinality.items()
                            if 4 < n_unique < 10]
large_cat_categorical_cols = [cname for cname, n_unique in _nominal_cardinality.items()
                              if n_unique >= 10]

for _column_group in (ordinal_categorical_cols, small_cat_categorical_cols,
                      med_cat_categorical_cols, large_cat_categorical_cols,
                      categorical_cols, numerical_cols):
    print(len(_column_group))

20
5
16
4
45
33


In [102]:
def score_dataset(X, y, model=None):
    """Cross-validate ``model`` on ``X`` against ``log(y)`` and return the mean RMSE.

    Parameters
    ----------
    X : feature data passed to ``cross_val_score``
    y : target values; the model is scored against ``np.log(y)``
    model : estimator or None
        Estimator (or pipeline) to evaluate. A fresh ``XGBRegressor()`` is
        created when omitted, so no estimator instance is built at function
        definition time or shared between calls.

    Returns
    -------
    float
        Mean of the 5 per-fold root mean squared errors on the log target.

    Also prints the per-fold (negated) scores, the median RMSE and the
    standard deviation of the fold scores.
    """
    if model is None:
        model = XGBRegressor()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    fold_scores = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_root_mean_squared_error", n_jobs = -1
    )
    print(fold_scores)
    # The scorer returns negated errors, hence the sign flips
    print(-1*np.median(fold_scores))
    print(np.std(fold_scores))
    return -1 * np.mean(fold_scores)

In [103]:
# Pipeline 1
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OrdinalEncoder())
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor())         
# ])
# score_dataset(X, y, pipeline)

# [-0.12334956 -0.13390977 -0.13267923 -0.11532492 -0.12691669]
# 0.12691668822938887
# 0.006753496607733489
# 0.12643603427331637

In [104]:
# Pipeline 2
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder()),
#     ('scaler', StandardScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, numerical_cols+[col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor())         
# ])

# score_dataset(X, y, pipeline)

# # [-0.12328024 -0.14403148 -0.14249333 -0.11536911 -0.12724898]
# # 0.12724898077297406
# # 0.011122748447938504
# # 0.1304846298772378

In [105]:
# Pipeline 3
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder()),
#     ('scaler', StandardScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)), 
#     ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
#                                    n_clusters = 10,
#                                    return_cluster=True, return_distances=True)),
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, (numerical_cols + 
#                                            [col + "_encoded" for col in large_cat_categorical_cols] + 
#                                            [f"PCA_{i}" for i in range(5)] + 
#                                            [f"distance_centroid_{i}" for i in range(10)])
#             ),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster'])
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor())         
# ])

# score_dataset(X, y, pipeline)

# # [-0.12575089 -0.14977669 -0.14573617 -0.11280657 -0.12453592]
# # 0.12575089246295004
# # 0.013910111600061406
# # 0.13172124644075708

In [106]:
# Pipeline 4
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OrdinalEncoder())
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', RandomForestRegressor())         
# ])
# score_dataset(X, y, pipeline)
# [-0.13334223 -0.14826003 -0.14243637 -0.12530829 -0.13019875]
# 0.13334223315889074
# 0.008327364089523639
# 0.1359091355635815

In [107]:
# Pipeline 5
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
ord_categorical_transformer = Pipeline(steps=[
    ('catcode', OrdinalEncoder())
])
small_categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
med_categorical_transformer = Pipeline(steps=[
    ('catcode_medcat', OrdinalEncoder())
])

pipeline = Pipeline([
    ('append_pca', AppendPCA(X.columns, n_components = 5)),  
    ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
                                        encoder=MEstimateEncoder, 
                                        encoder_other_params={"m":10.0})),
    ('encoder_scaler', ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('med_cat', med_categorical_transformer, med_cat_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
        ],
        remainder="passthrough")
    ),
    ('model', XGBRegressor())         
])

score_dataset(X, y, pipeline)

# [-0.12651235 -0.13380629 -0.13357361 -0.11428085 -0.12639507]
# 0.12651234561050542
# 0.007097623139040923
# 0.126913633236225

# med_cat [> 5 and < 10]
# [-0.12506024 -0.14160065 -0.13534954 -0.11813159 -0.11999781]
# 0.12506024282103534
# 0.00904491349257496
# 0.12802796794855642

# med_cat [> 6 and < 10]
# [-0.1277651  -0.14225836 -0.13436323 -0.11917849 -0.12597884]
# 0.12776509761340055
# 0.007842616722643862
# 0.12990880359437723

# med_cat [> 4 and < 10]
# [-0.12548204 -0.142939   -0.13416994 -0.11402189 -0.12682665]
# 0.12682664692146164
# 0.009612285820809875
# 0.12868790109795897

[-0.12548204 -0.142939   -0.13416994 -0.11402189 -0.12682665]
0.12682664692146164
0.009612285820809875


0.12868790109795897

In [127]:
# Pipeline 6
# numerical_transformer = Pipeline(steps=[
#     ('skew_handler', PowerTransformer(method='yeo-johnson', standardize=False)),
#     ('scaler', RobustScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder()),
#     ('scaler', RobustScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, numerical_cols+[col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor())         
# ])

# score_dataset(X, y, pipeline)

# [-0.12212421 -0.14414503 -0.14381811 -0.11467164 -0.12970438]
# 0.12970437937124102
# 0.011697107907623815
# 0.13089267445607045

[-0.12212421 -0.14414503 -0.14381811 -0.11467164 -0.12970438]
0.12970437937124102
0.011697107907623815


0.13089267445607045

# Optimize hyperparameters

In [None]:
def xgb_objective(trial):
    """Optuna objective: cross-validated score of the pipeline for one trial.

    Samples the hyperparameters of the pipeline's ``model`` step from
    ``trial``, applies them to a clone of the module-level ``pipeline`` and
    returns the value of ``score_dataset`` on ``X`` and ``y``.
    """
    params = {
        'model__random_state':       SEED,
        'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
        'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.5, log=True),
        'model__max_depth':          trial.suggest_int('model__max_depth', 0, 8),
        'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 0, 10),
    }
    # The two regularisation terms share the range [0, 10]
    for name in ('model__lambda', 'model__alpha'):
        params[name] = trial.suggest_float(name, 0, 10.0, step = 0.0001)
    # The four sampling fractions share the range [0.4, 1.0]
    for name in ('model__subsample', 'model__colsample_bytree',
                 'model__colsample_bylevel', 'model__colsample_bynode'):
        params[name] = trial.suggest_float(name, 0.4, 1.0, step = 0.0001)

    # Work on a clone so that the shared pipeline object is left untouched
    candidate = clone(pipeline)
    candidate.set_params(**params)
    return score_dataset(X, y, candidate)

# Run the hyperparameter search and report how long it took.
# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()
# Seed the sampler with SEED so that the sequence of suggested
# hyperparameters is repeatable from run to run.
study = optuna.create_study(direction = 'minimize',
                            sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(xgb_objective, n_trials = 100)
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"XGB tuning took {elapsed_time:.2f} seconds.")
print(elapsed_time)

print(study.best_params)
print(study.best_value)
print(study.best_trial)

[I 2025-01-18 08:05:02,501] A new study created in memory with name: no-name-d91e400b-f0f2-47f7-bf8f-3322309db620
[I 2025-01-18 08:05:05,721] Trial 0 finished with value: 0.31540859481270117 and parameters: {'model__n_estimators': 750, 'model__learning_rate': 0.0005511932936141188, 'model__max_depth': 3, 'model__min_child_weight': 8, 'model__lambda': 0.11660000000000001, 'model__alpha': 3.9617, 'model__subsample': 0.8061, 'model__colsample_bytree': 0.9622, 'model__colsample_bylevel': 0.41750000000000004, 'model__colsample_bynode': 0.45920000000000005}. Best is trial 0 with value: 0.31540859481270117.


[-0.30044611 -0.33812794 -0.32852132 -0.3021699  -0.3077777 ]
0.307777700132494
0.0151359301710554


[I 2025-01-18 08:05:11,062] Trial 1 finished with value: 0.2671094044108245 and parameters: {'model__n_estimators': 550, 'model__learning_rate': 0.0014468134889327948, 'model__max_depth': 8, 'model__min_child_weight': 6, 'model__lambda': 8.168800000000001, 'model__alpha': 6.4327000000000005, 'model__subsample': 0.8436, 'model__colsample_bytree': 0.4349, 'model__colsample_bylevel': 0.8528, 'model__colsample_bynode': 0.901}. Best is trial 1 with value: 0.2671094044108245.


[-0.24862678 -0.28812241 -0.28226385 -0.25549757 -0.26103641]
0.26103640695823477
0.015391730252390367


[I 2025-01-18 08:05:13,764] Trial 2 finished with value: 0.37795647068268734 and parameters: {'model__n_estimators': 950, 'model__learning_rate': 0.00015687466560882532, 'model__max_depth': 1, 'model__min_child_weight': 2, 'model__lambda': 4.505, 'model__alpha': 2.3133, 'model__subsample': 0.5064000000000001, 'model__colsample_bytree': 0.5426, 'model__colsample_bylevel': 0.4404, 'model__colsample_bynode': 0.7005}. Best is trial 1 with value: 0.2671094044108245.


[-0.36499075 -0.40339983 -0.38915227 -0.36198025 -0.37025925]
0.3702592495921705
0.015844918165465124


[I 2025-01-18 08:05:17,834] Trial 3 finished with value: 0.1306446686874421 and parameters: {'model__n_estimators': 1050, 'model__learning_rate': 0.008919812485947802, 'model__max_depth': 3, 'model__min_child_weight': 9, 'model__lambda': 9.5759, 'model__alpha': 2.3312, 'model__subsample': 0.9245000000000001, 'model__colsample_bytree': 0.6960000000000001, 'model__colsample_bylevel': 0.7488, 'model__colsample_bynode': 0.6259}. Best is trial 3 with value: 0.1306446686874421.


[-0.11887484 -0.13882098 -0.14237467 -0.12288653 -0.13026633]
0.13026632671984703
0.00898115438541487


[I 2025-01-18 08:05:21,432] Trial 4 finished with value: 0.31566825113991775 and parameters: {'model__n_estimators': 1650, 'model__learning_rate': 0.00038229819995605604, 'model__max_depth': 1, 'model__min_child_weight': 7, 'model__lambda': 7.961600000000001, 'model__alpha': 7.2029000000000005, 'model__subsample': 0.7339, 'model__colsample_bytree': 0.9515, 'model__colsample_bylevel': 0.5911000000000001, 'model__colsample_bynode': 0.9389000000000001}. Best is trial 3 with value: 0.1306446686874421.


[-0.30082333 -0.33909486 -0.32918815 -0.3012185  -0.30801642]
0.30801641637094573
0.015616087096790241


[I 2025-01-18 08:05:31,969] Trial 5 finished with value: 0.14793579645458907 and parameters: {'model__n_estimators': 1350, 'model__learning_rate': 0.0022612942648834716, 'model__max_depth': 5, 'model__min_child_weight': 6, 'model__lambda': 1.5329000000000002, 'model__alpha': 1.9565000000000001, 'model__subsample': 0.5478000000000001, 'model__colsample_bytree': 0.9891000000000001, 'model__colsample_bylevel': 0.7275, 'model__colsample_bynode': 0.8204}. Best is trial 3 with value: 0.1306446686874421.


In [None]:
# def rf_objective(trial):  
    
#     params = {
#         'model__random_state':          SEED,
#         'model__n_estimators':          trial.suggest_int('model__n_estimators', 500, 10000, step = 100),
#         'model__max_depth':             trial.suggest_categorical('model__max_depth', [None] + list(range(4, 9))),
#         'model__min_samples_split':     trial.suggest_int('model__min_samples_split', 2, 20),
#         'model__min_samples_leaf':      trial.suggest_int('model__min_samples_leaf', 1, 20),
#         'model__max_features':          trial.suggest_categorical('model__max_features', ["sqrt", "log2", None]),
#         'model__min_impurity_decrease': trial.suggest_categorical('model__min_impurity_decrease',
#                                                                   [0, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(rf_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"RF tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# # Best params
# best_params = {'model__n_estimators': 650, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# Best is trial 62 finished with value: 0.11742375540973124 
# [-0.10837224 -0.12998627 -0.12850005 -0.10610519 -0.11415502]
# 0.11415502442730315
# 0.010012248568044236

# # Best params
# best_params = {'model__n_estimators': 9000, 'model__learning_rate': 0.002221965665898899, 
#                'model__max_depth': 4, 'model__min_child_weight': 5, 
#                'model__lambda': 0.3279, 'model__alpha': 0.2417, 
#                'model__subsample': 0.8049000000000001, 'model__colsample_bytree': 0.9475, 
#                'model__colsample_bylevel': 0.4768, 'model__colsample_bynode': 0.4454}

# Best is trial 92 with value: 0.11728844905814861.
# [-0.1069524  -0.12825763 -0.12633835 -0.10719973 -0.11769413]
# 0.11769413498310685
# 0.009066468479194328

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# [-0.10795649 -0.13002858 -0.12830252 -0.10569348 -0.11434615]
# 0.11434615405227047
# 0.010137099509793399
# 0.11726544444298999

In [None]:
# # Best params
# with Age_since_mod feature

# best_params = {'model__n_estimators': 850, 'model__learning_rate': 0.017442598532274846, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 0.8255067561921624, 'model__alpha': 0.004278929472964361, 
#                'model__subsample': 0.8974473156207723, 'model__colsample_bytree': 0.6523619866997316, 
#                'model__colsample_bylevel': 0.996362051631553, 'model__colsample_bynode': 0.4628476409826708}

# Best is trial 61 with value: 0.11779737989132073.
# [-0.10790906 -0.1318243  -0.12853275 -0.10425928 -0.11646151]
# 0.11646151088245223
# 0.010907303679586703

In [None]:
# # Best params
# with pipeline2

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

# Best is trial 85 with value: 0.11782913284946137.
# [-0.1100461  -0.12623955 -0.12463757 -0.11017505 -0.1180474 ]
# 0.11804740104228104
# 0.0068745661426423185


# best_params = {'model__n_estimators': 8500, 'model__learning_rate': 0.0024356711073752263, 
#                'model__max_depth': 6, 'model__min_child_weight': 7, 
#                'model__lambda': 1.0561, 'model__alpha': 0.0106, 
#                'model__subsample': 0.7481, 'model__colsample_bytree': 0.6855, 
#                'model__colsample_bylevel': 0.8596, 'model__colsample_bynode': 0.7395}

# Best is trial 86 with value: 0.11844810379526453.
# [-0.11185226 -0.12871867 -0.12804039 -0.10766969 -0.11595951]
# 0.11595951124551873
# 0.008524891860194693


# best_params = {'model__n_estimators': 1600, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

# pipeline 2 - CV score (left) for each n_estimators value (right)
# 0.11802772081955834  2000
# 0.11790519839005424  1800
# 0.11783945720039994  1700
# 0.11775646400190418  1600
# 0.11777810773615957  1500
# 0.11777453916635619  1400
# 0.11776996564553088  1300
# 0.1177948484577633   1200

In [None]:
# # Best params
# with pipeline3

# best_params = {'model__n_estimators': 800, 'model__learning_rate': 0.03224043015635982, 
#                'model__max_depth': 4, 'model__min_child_weight': 0, 
#                'model__lambda': 1.888402035590375, 'model__alpha': 0.01316645967800336, 
#                'model__subsample': 0.7195499210117768, 'model__colsample_bytree': 0.9291979069424845, 
#                'model__colsample_bylevel': 0.41676523400723603, 'model__colsample_bynode': 0.8500081595906049}

# Best is trial 66 with value: 0.11661104365392565.
# [-0.10954966 -0.12933787 -0.1275082  -0.10417967 -0.11247982]
# 0.11247981940875118
# 0.010021866508966784

# best_params = {'model__n_estimators': 2300, 'model__learning_rate': 0.01796267061260535, 
#                'model__max_depth': 3, 'model__min_child_weight': 1, 
#                'model__lambda': 4.547000000000001, 'model__alpha': 0.2436, 
#                'model__subsample': 0.5405, 'model__colsample_bytree': 0.7052, 
#                'model__colsample_bylevel': 0.8729, 'model__colsample_bynode': 0.7699}

# Best is trial 83 with value: 0.11711228199833161.
# [-0.10811146 -0.12430323 -0.12678496 -0.10884254 -0.11751923]
# 0.11751922891170936
# 0.0076790551766297965

In [None]:
# pipeline 4 best params
# {'model__n_estimators': 500, 'model__max_depth': None, 
#  'model__min_samples_split': 4, 'model__min_samples_leaf': 1, 
#  'model__max_features': None, 'model__min_impurity_decrease': 1e-05}

# Best is trial 69 with value: 0.13452078571895393.
# [-0.13265846 -0.14837583 -0.13993977 -0.12230483 -0.12932503]
# 0.13265846285872251
# 0.00895568242880693

In [None]:
# pipeline.set_params(**best_params)

# CV Score

In [None]:
# score_dataset(X, y, pipeline)
# 0.11742375540973124
# public score : 0.12053

# 0.11779737989132073  best param with Age_since_mod feature
# public score : 0.12179

# 0.11782913284946137 best param with pipeline2
# public score : 0.12119

# 0.11661104365392565 best param with pipeline3
# public score : 0.12261

# 0.11726544444298999 pipeline1 with 1000 iter
# public score : 0.12090

# pipeline 2 - CV score (left) for each n_estimators value (right)
# 0.11802772081955834  2000
# 0.11790519839005424  1800
# 0.11783945720039994  1700
# 0.11775646400190418  1600
# 0.11777810773615957  1500
# 0.11777453916635619  1400
# 0.11776996564553088  1300
# 0.1177948484577633   1200

# Train on full data and obtain test predictions

In [None]:
# Refit the pipeline on all of X, using the hyperparameters currently set on
# `pipeline`. The target is log-transformed, so the model learns log(y).
log_y = np.log(y)
pipeline.fit(X, log_y)

# Predict on X_test (the pipeline applies its own preprocessing steps) and
# undo the log transform so `pred` is back on the original target scale.
log_pred = pipeline.predict(X_test)
pred = np.exp(log_pred)

# Quick sanity check: show the first ten predictions.
print(pred[:10])

In [None]:
# Bare expression as the cell's last line: the notebook renders the pipeline object.
pipeline

In [None]:
# Build the submission table: ids come from X_test's index, prices from `pred`.
submission_columns = {'Id': X_test.index, 'SalePrice': pred}
output = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame's own row index out of the CSV, so the file
# contains only the 'Id' and 'SalePrice' columns.
output.to_csv('submission.csv', index=False)
print('saved output file')