This notebook explores different pipeline configurations and performs hyperparameter optimization and model ensembling, building on the exploratory analysis from this [notebook](https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory).

Do check the [exploratory notebook](https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory) if you'd like to get a better idea of data distribution.

Please upvote if you find this notebook helpful : https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2

# Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge, RidgeCV, LinearRegression

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.feature_selection import mutual_info_regression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer

from functools import reduce

from category_encoders import MEstimateEncoder, cat_boost

from sklearn.compose import ColumnTransformer

import optuna
import time

In [None]:
# import xgboost
# import lightgbm
# print(xgboost.__version__)
# print(lightgbm.__version__)

# 2.0.3
# 4.5.0

# Global variables

In [None]:
# Single seed passed as random_state to the stochastic components in this notebook
# (KMeans, PCA, the stratified CV splitter and the regressors).
SEED = 0

# Load data and preprocess function

In [None]:
def load_and_preprocess_data(train_data=True, perform_impute=True,
                             data_dir='/kaggle/input/house-prices-advanced-regression-techniques'):
    """Load the competition train or test CSV and run the shared preprocessing.

    Parameters
    ----------
    train_data : bool
        If True read ``train.csv`` and split off the ``SalePrice`` target,
        otherwise read ``test.csv`` (no target available).
    perform_impute : bool
        If True, missing values are filled by ``impute`` after encoding.
    data_dir : str
        Directory holding ``train.csv`` / ``test.csv``. Defaults to the Kaggle
        input folder, so existing calls behave exactly as before.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature frame indexed by ``Id`` and ``y``
        is the ``SalePrice`` series (``None`` for the test data).
    """
    if train_data:
        print("Train data")
        X = pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col='Id')
        # Rows without a target cannot be used for training.
        X = X.dropna(axis=0, subset=['SalePrice'])
        y = X['SalePrice']
        X = X.drop(columns=['SalePrice'])
    else:
        print("Test data")
        X = pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col='Id')
        y = None
    print("Loaded data")
    print(X.shape)

    # There is 1 GarageYrBlt with value 2207: keep missing or plausible (<= 2024)
    # years and replace anything else with the remodel year.
    garage_year_ok = X["GarageYrBlt"].isna() | (X["GarageYrBlt"] <= 2024)
    X["GarageYrBlt"] = X["GarageYrBlt"].where(garage_year_ok, X["YearRemodAdd"])
    # Map the "Brk Cmn" spelling in Exterior2nd to "BrkComm".
    X["Exterior2nd"] = X["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    X = encode(X)
    if perform_impute:
        X = impute(X)

    return (X, y)

def encode(df):
    """Cast the categorical columns of ``df`` to pandas categorical dtypes, in place.

    Relies on the module-level ``features_nom`` list and ``ordered_levels`` dict,
    which are defined in the next notebook cell. Returns the same ``df`` object.
    """
    # Nominal features: unordered categories, guaranteed to contain a "None"
    # category so that missing values can later be filled with it.
    for column in features_nom:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category

    # Ordinal features: ordered categories using the explicit level lists.
    for column, levels in ordered_levels.items():
        ordinal_dtype = CategoricalDtype(categories=levels, ordered=True)
        df[column] = df[column].astype(ordinal_dtype)

    return df

def impute(df):
    """Fill missing values in ``df`` in place and return it.

    * ``GarageYrBlt`` of houses that do have a garage type falls back to ``YearRemodAdd``.
    * Remaining missing numbers become 0.
    * Missing categorical values become the "None" category.
    """
    garage_year_missing = df["GarageYrBlt"].isna() & df["GarageType"].notna()
    df.loc[garage_year_missing, "GarageYrBlt"] = df["YearRemodAdd"]

    # Numeric columns first, then categorical ones, each with its own filler.
    for dtype_selector, filler in (("number", 0), ("category", "None")):
        for column in df.select_dtypes(dtype_selector).columns:
            df[column] = df[column].fillna(filler)
    return df

# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [None]:
# The nominal (unordered) categorical features
features_nom = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LandContour",
    "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType",
    "MiscFeature", "SaleType", "SaleCondition", "Fence", "Electrical",
]

# The ordinal (ordered) categorical features, each listed from worst to best level
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = [*range(1, 11)]

ordered_levels = {
    # 1-10 ratings
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    # Po/Fa/TA/Gd/Ex quality scales
    **dict.fromkeys(
        ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
         "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"],
        five_levels,
    ),
    # Feature-specific scales
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
}

# Every ordinal scale gets "None" as its lowest level, used for missing values.
ordered_levels = {feature: ["None", *levels] for feature, levels in ordered_levels.items()}
ordered_levels.keys()

# Append features

In [None]:
# MSSubClass code -> dwelling-type description.
ms_subclass_mapping = dict([
    (20, "1-STORY 1946 & NEWER ALL STYLES"),
    (30, "1-STORY 1945 & OLDER"),
    (40, "1-STORY W/FINISHED ATTIC ALL AGES"),
    (45, "1-1/2 STORY - UNFINISHED ALL AGES"),
    (50, "1-1/2 STORY FINISHED ALL AGES"),
    (60, "2-STORY 1946 & NEWER"),
    (70, "2-STORY 1945 & OLDER"),
    (75, "2-1/2 STORY ALL AGES"),
    (80, "SPLIT OR MULTI-LEVEL"),
    (85, "SPLIT FOYER"),
    (90, "DUPLEX - ALL STYLES AND AGES"),
    (120, "1-STORY PUD (Planned Unit Development) - 1946 & NEWER"),
    (150, "1-1/2 STORY PUD - ALL AGES"),
    (160, "2-STORY PUD - 1946 & NEWER"),
    (180, "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER"),
    (190, "2 FAMILY CONVERSION - ALL STYLES AND AGES"),
])

# Dwelling-type description -> coarse building class, written as
# class -> member descriptions and flattened into the lookup direction used by `.map`.
ms_class_mapping = {
    description: building_class
    for building_class, descriptions in {
        "1-Story": [
            "1-STORY 1946 & NEWER ALL STYLES",
            "1-STORY 1945 & OLDER",
            "1-STORY W/FINISHED ATTIC ALL AGES",
            "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
        ],
        "1-1/2 Story": [
            "1-1/2 STORY - UNFINISHED ALL AGES",
            "1-1/2 STORY FINISHED ALL AGES",
            "1-1/2 STORY PUD - ALL AGES",
        ],
        "2-Story": [
            "2-STORY 1946 & NEWER",
            "2-STORY 1945 & OLDER",
            "2-STORY PUD - 1946 & NEWER",
        ],
        "Split-Level": [
            "SPLIT OR MULTI-LEVEL",
            "SPLIT FOYER",
            "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
        ],
        "Multi-Family/Duplex": [
            "DUPLEX - ALL STYLES AND AGES",
            "2 FAMILY CONVERSION - ALL STYLES AND AGES",
        ],
        "2-1/2 Story": [
            "2-1/2 STORY ALL AGES",
        ],
    }.items()
    for description in descriptions
}

In [None]:
def append_features(df):
    """Return a copy of ``df`` with the engineered features appended.

    Added columns: ``LivLotRatio``, ``Spaciousness``, ``MSClass``, ``IsPUD`` and
    ``GarageAreaPerCar``. The input frame is not modified.

    Candidates that were tried and ended up decreasing the overall score (so they
    are not created here): Spaciousness as (1stFlrSF + 2ndFlrSF) / TotRmsAbvGrd or
    GrLivArea / TotRmsAbvGrd, Age (YrSold - YearBuilt), Age_since_mod
    (YrSold - YearRemodAdd), BldgType dummies multiplied by GrLivArea, PorchTypes,
    TotalOutsideSF, dropping MSSubClass, MedNhbdArea (neighborhood median GrLivArea),
    the PCA-inspired Feature1 / Feature2 from
    https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices,
    OverallScore (product or sum of OverallQual and OverallCond codes),
    LotAreaFrontage, Age_with_quality and TotalBathrooms.
    """
    out = df.copy()

    # Area ratios.
    out["LivLotRatio"] = out.GrLivArea / out.LotArea
    rooms_incl_baths_and_kitchens = out.TotRmsAbvGrd + out.FullBath + out.HalfBath + out.KitchenAbvGr
    out["Spaciousness"] = out.GrLivArea / rooms_incl_baths_and_kitchens

    # Both MSSubClass-derived features start from the textual sub-class description.
    subclass_description = out["MSSubClass"].map(ms_subclass_mapping)

    building_class = subclass_description.map(ms_class_mapping).astype('category')
    out["MSClass"] = building_class.cat.add_categories("None").fillna("None")

    pud_flag = subclass_description.str.contains('PUD').astype('category')
    out["IsPUD"] = pud_flag.cat.add_categories("None").fillna("None")

    # The 0.1 offset avoids dividing by zero for houses without garage capacity.
    out["GarageAreaPerCar"] = out.GarageArea / (out.GarageCars + 0.1)

    return out

# Load data and process

In [None]:
# Load both data sets with the default preprocessing (encoding + imputation).
X, y = load_and_preprocess_data(train_data=True)
X_test, _ = load_and_preprocess_data(train_data=False)

# Drop the features judged less important from train and test alike.
print("removing less important features")
features_to_drop = ['PoolQC', 'MiscVal', 'MoSold', 'PoolArea', 'MiscFeature', 'Utilities']
X = X.drop(columns=features_to_drop)
X_test = X_test.drop(columns=features_to_drop)
print(X.shape)
print(X_test.shape)

# Add the engineered features, reporting the resulting shape of each frame.
print("appending features")
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)

def remove_columns_from_list(orig_list, to_remove):
    """Return the items of ``orig_list``, in order, that do not occur in ``to_remove``."""
    kept = []
    for candidate in orig_list:
        if candidate in to_remove:
            continue
        kept.append(candidate)
    return kept
    
# Keep the column-name lists in sync with the features dropped above.
ordinal_categorical_cols = remove_columns_from_list(orig_list=ordered_levels.keys(),
                                                    to_remove=features_to_drop)
features_nom = remove_columns_from_list(orig_list=features_nom,
                                        to_remove=features_to_drop)

# Append Cluster information as training features

In [None]:
class AppendKMeans(BaseEstimator, TransformerMixin):
    """Transformer that appends K-Means cluster information to a DataFrame.

    Parameters
    ----------
    cluster_columns : list-like of column names
        Columns used for clustering (categoricals are replaced by their codes).
    n_clusters : int
        Number of K-Means clusters.
    return_cluster : bool
        If True, add a ``Cluster`` column with the predicted cluster label.
    return_distances : bool
        If True, add one ``distance_centroid_<i>`` column per cluster.
    """

    def __init__(self, cluster_columns, n_clusters=20, return_cluster=True, return_distances=False):
        self.cluster_columns = cluster_columns
        self.n_clusters = n_clusters
        self.return_cluster = return_cluster
        self.return_distances = return_distances

    def _clustering_frame(self, X):
        """Return the clustering columns of ``X`` with categoricals turned into integer codes."""
        coded = X.copy()
        for column in coded.select_dtypes(["category"]):
            coded[column] = coded[column].cat.codes
        return coded[self.cluster_columns]

    def fit(self, X, y=None):
        """Fit the scaler and K-Means on the standardised clustering columns."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._clustering_frame(X))
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=SEED)
        self.kmeans.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the requested cluster columns appended."""
        # Reuse the scaling learned during fit.
        scaled = self.scaler.transform(self._clustering_frame(X))
        augmented = X.copy()
        if self.return_cluster:
            augmented["Cluster"] = self.kmeans.predict(scaled)
        if self.return_distances:
            distances = self.kmeans.transform(scaled)
            distance_frame = pd.DataFrame(
                distances,
                index=X.index,
                columns=[f"distance_centroid_{i}" for i in range(distances.shape[1])],
            )
            augmented = augmented.join(distance_frame)
        return augmented

# Append PCA

In [None]:
class AppendPCA(BaseEstimator, TransformerMixin):
    """Transformer that appends principal components to a DataFrame.

    Parameters
    ----------
    pca_columns : list-like of column names
        Columns fed to the PCA (categoricals are replaced by their codes).
    n_components : int
        Number of principal components to keep.
    pca_col_prefix : str
        Prefix of the appended columns, which are named ``<prefix>_<i>``.
    """

    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def _pca_frame(self, X):
        """Return the PCA input columns of ``X`` with categoricals turned into integer codes."""
        coded = X.copy()
        for column in coded.select_dtypes(["category"]):
            coded[column] = coded[column].cat.codes
        return coded[self.pca_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the PCA on the standardised input columns."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._pca_frame(X))
        self.pca = PCA(n_components=self.n_components, random_state=SEED)
        self.pca.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the principal-component columns appended."""
        # Reuse the scaling learned during fit, then project onto the components.
        scaled = self.scaler.transform(self._pca_frame(X))
        components = self.pca.transform(scaled)
        # Debug aids:
        # print(self.pca.explained_variance_ratio_)
        # print(np.cumsum(self.pca.explained_variance_ratio_))
        component_frame = pd.DataFrame(
            components,
            index=X.index,
            columns=[f"{self.pca_col_prefix}_{i}" for i in range(components.shape[1])],
        )
        return X.copy().join(component_frame)

# Target Encoding

In [None]:
class CrossFoldEncoder(BaseEstimator, TransformerMixin):
    """Target-encode columns with an ensemble of encoders fitted on K-fold subsets.

    Parameters
    ----------
    cols : list of str
        Columns to encode. They are replaced by ``<name>_encoded`` columns.
    encoder : class
        Encoder class; it is instantiated as ``encoder(cols=cols, **encoder_other_params)``.
    encoder_other_params : dict
        Extra constructor arguments for ``encoder``, as argument name -> value.
        This is a dict rather than ``**kwargs`` so that the transformer works
        properly within Pipeline: Pipeline uses sklearn.base.clone() and clone
        does not retain kwargs.
    """

    def __init__(self, cols, encoder, encoder_other_params):
        self.cols = cols
        self.encoder = encoder
        self.cv = KFold(n_splits=5)
        self.encoder_other_params = encoder_other_params

    def fit(self, X, y):
        """Fit one encoder per fold, each on that fold's training rows.

        Nothing is transformed here; the fitted encoders are stored in
        ``fitted_encoders_`` and combined in ``transform``.
        ``y`` must support ``.iloc`` (e.g. a pandas Series).
        """
        self.fitted_encoders_ = []
        for idx_encode, _ in self.cv.split(X):
            fitted_encoder = self.encoder(cols=self.cols, **self.encoder_other_params)
            fitted_encoder.fit(X.iloc[idx_encode, :], y.iloc[idx_encode])
            self.fitted_encoders_.append(fitted_encoder)
        return self

    def transform(self, X):
        """Encode ``X`` with every fitted encoder and average the encodings.

        NOTE(review): every row passed to ``fit`` belongs to the training part of
        all but one fold, so the encodings of the fitting data are not
        out-of-fold values.
        """
        X_encoded_list = [
            fitted_encoder.transform(X)[self.cols]
            for fitted_encoder in self.fitted_encoders_
        ]
        X_encoded = reduce(
            lambda total, fold_encoding: total.add(fold_encoding, fill_value=0),
            X_encoded_list,
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        # Drop columns for which target encoding has been created and join with target encodings
        return X.drop(columns=self.cols).join(X_encoded)

# Training pipeline

Let's define a transformer that converts categorical columns to their integer codes.

In [None]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces every categorical column by its category codes.

    All columns of the input must have a pandas categorical dtype. The code of a
    value is its position in the column's category list; missing values get -1.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present only for the scikit-learn transformer API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each column holds its integer category codes."""
        encoded = X.copy()
        for column_name in encoded.columns:
            categorical_values = encoded[column_name]
            encoded[column_name] = categorical_values.cat.codes
        return encoded

In [None]:
# Split the columns of X by dtype.
categorical_cols = [name for name, dtype in X.dtypes.items()
                    if dtype == "category"]

numerical_cols = [name for name, dtype in X.dtypes.items()
                  if dtype in ['int64', 'float64']]

# Non-ordinal categoricals are further split by cardinality (fewer than 10 unique values or not).
small_cat_categorical_cols = [
    name for name in categorical_cols
    if name not in ordinal_categorical_cols and X[name].nunique() < 10
]
large_cat_categorical_cols = [
    name for name in categorical_cols
    if name not in ordinal_categorical_cols and X[name].nunique() >= 10
]

# Sizes of: ordinal, small nominal, large nominal, all categorical, numerical.
print(
    len(ordinal_categorical_cols),
    len(small_cat_categorical_cols),
    len(large_cat_categorical_cols),
    len(categorical_cols),
    len(numerical_cols),
    sep="\n",
)

The train and test data have the same distribution for all of the important variables, as seen in the plots in https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory

While creating train-valid splits, we'll simulate this by binning the sale price (i.e. the target variable) and ensuring that the train and valid sets have the same proportion of samples from each of these bins.

(I did explore creating a new concatenated column of OverallQual, Neighborhood, binned GarageArea, binned GrLivArea, binned YearBuilt, binned YearRemodAdd and binned YrSold. But the concatenated column had a large number of unique values with only 1 occurrence, which caused issues during the train-valid split based on this column. This issue persisted even after decreasing the bin size and using fewer columns to build the concatenated column. So I decided to bin the sale price for the train-valid split.)

In [None]:
# Stratification labels: the log sale price cut into 20 quantile bins (integer bin ids).
strat_y = pd.qcut(x=np.log(y), q=20, labels=False)
# Number of samples per bin, ordered by bin id (displayed as the cell output).
strat_y.value_counts().sort_index()

## Scoring function

In [None]:
def score_dataset(X, y, model=None):
    """Cross-validate ``model`` on ``(X, y)`` and return the mean RMSLE.

    The metric for the Housing competition is RMSLE (Root Mean Squared Log
    Error), so the model is trained on ``log(y)`` and scored with RMSE.
    Folds are stratified on ``log(y)`` cut into (up to) 20 quantile bins.

    Parameters
    ----------
    X : DataFrame
        Feature frame.
    y : Series
        Target (sale price); must be strictly positive because of the log.
    model : estimator, optional
        Estimator or pipeline to evaluate. Defaults to a fresh ``XGBRegressor()``
        created per call (instead of one instance built at definition time).

    Returns
    -------
    float
        Mean RMSLE over the 5 folds. The per-fold (negated) scores, their
        median and their standard deviation are printed.
    """
    if model is None:
        model = XGBRegressor()

    log_y = np.log(y)
    # duplicates="drop" merges identical bin edges instead of raising when many
    # targets share the same value.
    strat_y = pd.qcut(log_y, q=20, labels=False, duplicates="drop")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    splits = list(skf.split(X, strat_y))

    # Alternative without stratification: pass cv=5 instead of the explicit splits.
    score = cross_val_score(
        model, X, log_y, cv=splits, scoring="neg_root_mean_squared_error"
    )

    print(score)
    print(-1*np.median(score))
    print(np.std(score))
    score = -1 * np.mean(score)
    return score

## Pipeline definitions

In [None]:
# Pipeline 1 - XGBoost.
# PCA components and target encodings are appended and standardised; ordinal and
# low-cardinality categoricals become integer codes; all other columns pass through.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_xgb1 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])
# score_dataset(X, y, pipeline_xgb1)
# [-0.12785618 -0.13046663 -0.14520162 -0.17736176 -0.11825496]
# 0.13046663079150023
# 0.02065921056003522
# 0.13982822954073137

In [None]:
# Pipeline 2 - XGBoost.
# Like pipeline 1, but the original numerical columns are standardised too, ordinal
# codes are standardised, and low-cardinality categoricals are one-hot encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb2 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             numerical_cols
             + [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 3 - XGBoost.
# Pipeline 2 plus K-Means (10 clusters) on the 5 PCA components: centroid distances
# are standardised with the numerical columns and the cluster label is one-hot encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb3 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             numerical_cols
             + [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]
             + [f"distance_centroid_{i}" for i in range(10)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols + ['Cluster']),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 4 - RandomForestRegressor.
# Same preprocessing as pipeline 1 with a random forest as the final model.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_rf = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', RandomForestRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 5 - XGBoost.
# Pipeline 1 with one-hot encoding (instead of integer codes) for the
# low-cardinality categoricals.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb5 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 6 - XGBoost.
# Pipeline 2 with skew handling: numerical columns go through a Yeo-Johnson power
# transform followed by RobustScaler, and ordinal codes are scaled with RobustScaler.
numerical_transformer = Pipeline([
    ('skew_handler', PowerTransformer(method='yeo-johnson', standardize=False)),
    ('scaler', RobustScaler()),
])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', RobustScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb6 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             numerical_cols
             + [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 7 - only k-means (no PCA), XGBoost.
# K-Means (10 clusters) runs on all columns and only the cluster label is appended;
# it is not listed in any transformer, so it reaches the model via the passthrough remainder.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb7 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        cluster_columns=X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 8 - only 1-hot encode for XGBRegressor.
# No appended features: ordinal columns become integer codes and every nominal
# categorical (small or large) is one-hot encoded.
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('smallcat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_xgb8 = Pipeline(steps=[
    ('encoder_scaler', ColumnTransformer(
        [
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

In [None]:
# Pipeline 9
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
ord_categorical_transformer = Pipeline(steps=[
    ('catcode', OrdinalEncoder())
])
small_categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
large_categorical_transformer = Pipeline(steps=[
    ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


pipeline_xgb9 = Pipeline([
    ('append_pca', AppendPCA(X.columns, n_components = 5)), 
    ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
                                   n_clusters = 10,
                                   return_cluster=True, return_distances=True)),
    ('encoder_scaler', ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, (numerical_cols + 
                                           [f"PCA_{i}" for i in range(5)] + 
                                           [f"distance_centroid_{i}" for i in range(10)])
            ),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster']),
            ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
        ],
        remainder="passthrough")
    ),
    ('model', XGBRegressor(random_state = SEED))         
])

# score_dataset(X, y, pipeline_xgb9)
# [-0.13190184 -0.1366358  -0.14314017 -0.17554378 -0.12275222]
# 0.13663580215769133
# 0.01804277868680093
# 0.1419947618503093

In [None]:
# Pipeline 10 - XGBoost with categorical support enabled.
# Only PCA components and centroid distances are appended; categorical columns
# are handed to XGBoost unencoded (enable_categorical=True).
pipeline_xgb10 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=False,
        return_distances=True,
    )),
    ('model', XGBRegressor(random_state=SEED, enable_categorical=True)),
])

# score_dataset(X, y, pipeline_xgb10)
# [-0.13683021 -0.12144668 -0.14358679 -0.17946046 -0.12809528]
# 0.13683021145563887
# 0.02023977746270551
# 0.14188388439875155

In [None]:
# Pipeline CatBoost 1.
# Same preprocessing as pipeline 1 with CatBoostRegressor as the final model.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_cb1 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])
# score_dataset(X, y, pipeline_cb1)
# [-0.11624901 -0.11453878 -0.12176986 -0.1552284  -0.10681426]
# 0.11624900798690328
# 0.016848130288228083
# 0.1229200608367171

In [None]:
# Pipeline CatBoost 2 - catboost of Pipeline 5.
# Low-cardinality categoricals are one-hot encoded; the final model is CatBoostRegressor.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_cb2 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]
             + [f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])


# score_dataset(X, y, pipeline_cb2)
# [-0.11657571 -0.11399993 -0.12022822 -0.151068   -0.10718364]
# 0.11657570939426244
# 0.015237178152076106
# 0.12181110094841437

In [None]:
# Pipeline Catboost 3 - CatBoost of k-means only pipeline (pipeline 7).
# K-Means (10 clusters) on all columns appends only the cluster label, which
# reaches the model through the passthrough remainder.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_cb3 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        cluster_columns=X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [name + "_encoded" for name in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])

In [None]:
# Pipeline Catboost 4.
# Only PCA components are appended; the categorical columns are declared to
# CatBoost through cat_features instead of being encoded beforehand.
pipeline_cb4 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('model', CatBoostRegressor(
        random_state=SEED,
        verbose=False,
        cat_features=categorical_cols,
    )),
])

#score_dataset(X, y, pipeline_cb4)
# [-0.1245166  -0.12251158 -0.12408372 -0.15749913 -0.11058973]
# 0.124083723980751
# 0.015689374060298498
# 0.12784015103401533

In [None]:
# Pipeline CatBoost 5 - encoding only (no PCA, k-means or target encoding)

# Ordinal columns get the ordinal encoder; both the small and the large
# categorical groups are one-hot encoded.
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('smallcat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_cb5 = Pipeline(steps=[
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
                ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
            ],
            remainder="passthrough",  # columns not listed above are forwarded unchanged
        ),
    ),
    ('model', CatBoostRegressor(verbose=False, random_state=SEED)),
])

In [None]:
# Pipeline CatBoost 6 - PCA, then k-means on the PCA columns (distances only)

pipeline_cb6 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    # K-means is given the five PCA_<i> column names; only centroid distances
    # are requested (return_cluster=False).
    (
        'append_kmeans',
        AppendKMeans([f"PCA_{idx}" for idx in range(5)],
                     n_clusters=10,
                     return_cluster=False,
                     return_distances=True),
    ),
    # `categorical_cols` is handed to CatBoost through cat_features.
    (
        'model',
        CatBoostRegressor(cat_features=categorical_cols,
                          verbose=False,
                          random_state=SEED),
    ),
])
#score_dataset(X, y, pipeline_cb6)
# [-0.12623897 -0.12059074 -0.12858833 -0.15933993 -0.11112165]
# 0.1262389656010505
# 0.0162388462195836
# 0.12917592388929722

In [None]:
# Pipeline LGBM 1 - PCA + target encoding; small categoricals use the ordinal
# encoder here (no one-hot step in this variant)
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_lgbm1 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    # Target-encodes `large_cat_categorical_cols` with an m-estimate encoder (m=10).
    (
        'append_target_encoder',
        CrossFoldEncoder(encoder=MEstimateEncoder,
                         encoder_other_params={"m": 10.0},
                         cols=large_cat_categorical_cols),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                # Scale the "<col>_encoded" target-encoding outputs and the PCA columns.
                ('num', numerical_transformer,
                 [name + "_encoded" for name in large_cat_categorical_cols]
                 + [f"PCA_{idx}" for idx in range(5)]),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ],
            remainder="passthrough",
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])
# score_dataset(X, y, pipeline_lgbm1)
# [-0.12668316 -0.1288637  -0.13902374 -0.16449322 -0.11501904]
# 0.12886369701337194
# 0.016684932158762624
# 0.1348165713973164

In [None]:
# Pipeline LGBM 2 - PCA + target encoding; small categoricals are one-hot encoded
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm2 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    # Target-encodes `large_cat_categorical_cols` with an m-estimate encoder (m=10).
    (
        'append_target_encoder',
        CrossFoldEncoder(encoder=MEstimateEncoder,
                         encoder_other_params={"m": 10.0},
                         cols=large_cat_categorical_cols),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                # Scale the "<col>_encoded" target-encoding outputs and the PCA columns.
                ('num', numerical_transformer,
                 [name + "_encoded" for name in large_cat_categorical_cols]
                 + [f"PCA_{idx}" for idx in range(5)]),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ],
            remainder="passthrough",
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])
# score_dataset(X, y, pipeline_lgbm2)
# [-0.12638687 -0.13201789 -0.13919122 -0.16584972 -0.11779284]
# 0.1320178902768938
# 0.016374693948860188
# 0.13624770822490476

In [None]:
# Pipeline LGBM 3 - PCA, k-means on the PCA columns (label + distances) and
# target encoding; ordinal codes are additionally standard-scaled here
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm3 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    # Both outputs are requested; the ColumnTransformer below consumes them as
    # 'Cluster' and distance_centroid_<i>.
    (
        'append_kmeans',
        AppendKMeans([f"PCA_{idx}" for idx in range(5)],
                     n_clusters=10,
                     return_cluster=True,
                     return_distances=True),
    ),
    # Target-encodes `large_cat_categorical_cols` with an m-estimate encoder (m=10).
    (
        'append_target_encoder',
        CrossFoldEncoder(encoder=MEstimateEncoder,
                         encoder_other_params={"m": 10.0},
                         cols=large_cat_categorical_cols),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                # Scaled group: raw numericals, target-encoded columns, PCA
                # components and the ten centroid distances.
                ('num', numerical_transformer,
                 numerical_cols
                 + [name + "_encoded" for name in large_cat_categorical_cols]
                 + [f"PCA_{idx}" for idx in range(5)]
                 + [f"distance_centroid_{idx}" for idx in range(10)]),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                # The cluster label is one-hot encoded together with the small categoricals.
                ('small_cat', small_categorical_transformer,
                 small_cat_categorical_cols + ['Cluster']),
            ],
            remainder="passthrough",
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])

In [None]:
# Pipeline LGBM 4 - k-means only (no PCA step)
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm4 = Pipeline(steps=[
    # Requests the cluster label only (return_distances=False); X.columns is
    # passed as the first argument (presumably the columns to cluster on).
    (
        'append_kmeans',
        AppendKMeans(X.columns, n_clusters=10, return_cluster=True, return_distances=False),
    ),
    # Target-encodes `large_cat_categorical_cols` with an m-estimate encoder (m=10).
    (
        'append_target_encoder',
        CrossFoldEncoder(encoder=MEstimateEncoder,
                         encoder_other_params={"m": 10.0},
                         cols=large_cat_categorical_cols),
    ),
    # NOTE(review): the cluster output is not listed in any transformer here, so it
    # reaches the model through remainder="passthrough" (pipeline_lgbm3 one-hot
    # encodes 'Cluster' instead) - confirm this is intended.
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                ('num', numerical_transformer,
                 [name + "_encoded" for name in large_cat_categorical_cols]),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ],
            remainder="passthrough",
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])

In [None]:
# Pipeline LGBM 5 - encoding only: one-hot for both categorical groups,
# ordinal encoder for the ordinal columns

ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('smallcat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm5 = Pipeline(steps=[
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
                ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
            ],
            remainder="passthrough",  # columns not listed above are forwarded unchanged
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])

In [None]:
# Pipeline LGBM 6 - PCA and k-means on the PCA columns (label + distances);
# large categoricals are one-hot encoded instead of target-encoded
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm6 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    # Both outputs are requested; the ColumnTransformer below consumes them as
    # 'Cluster' and distance_centroid_<i>.
    (
        'append_kmeans',
        AppendKMeans([f"PCA_{idx}" for idx in range(5)],
                     n_clusters=10,
                     return_cluster=True,
                     return_distances=True),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            [
                # Scaled group: raw numericals, PCA components and the ten centroid distances.
                ('num', numerical_transformer,
                 numerical_cols
                 + [f"PCA_{idx}" for idx in range(5)]
                 + [f"distance_centroid_{idx}" for idx in range(10)]),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                # The cluster label is one-hot encoded together with the small categoricals.
                ('small_cat', small_categorical_transformer,
                 small_cat_categorical_cols + ['Cluster']),
                ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
            ],
            remainder="passthrough",
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, bagging_seed=SEED, random_state=SEED)),
])

# Optimize hyperparameters

* This section contains functions to optimize the pipeline hyperparameters using Optuna
* To run the code in this section, assign one of the pipelines defined in the previous section to a variable named `pipeline` (e.g. `pipeline = pipeline_lgbm2`) and uncomment the corresponding objective function in this section

In [None]:
# def xgb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(xgb_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# #defining a separate objective function with additional params for categorical data handling

# def xgb10_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01),
#         'model__max_cat_to_onehot':  trial.suggest_int('model__max_cat_to_onehot', 2, 25),
#         'model__max_cat_threshold':  trial.suggest_int('model__max_cat_threshold', 2, 32),

#     }
#     pipeline_clone = clone(pipeline_xgb10)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(xgb10_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB10 tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# def rf_objective(trial):  
    
#     params = {
#         'model__n_estimators':          trial.suggest_int('model__n_estimators', 500, 10000, step = 100),
#         'model__max_depth':             trial.suggest_categorical('model__max_depth', [None] + list(range(4, 9))),
#         'model__min_samples_split':     trial.suggest_int('model__min_samples_split', 2, 20),
#         'model__min_samples_leaf':      trial.suggest_int('model__min_samples_leaf', 1, 20),
#         'model__max_features':          trial.suggest_categorical('model__max_features', ["sqrt", "log2", None]),
#         'model__min_impurity_decrease': trial.suggest_categorical('model__min_impurity_decrease',
#                                                                   [0, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(rf_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"RF tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# def cb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__l2_leaf_reg':        trial.suggest_float('model__l2_leaf_reg', 1e-3, 10.0, log = True),
#         'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 4, 16),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bylevel':  trial.suggest_float('model__colsample_bylevel', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(cb_objective, n_trials = 50)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"CB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# def lgbm_objective(trial):  
    
#     params = {
#          'model__n_estimators':       trial.suggest_int('model__n_estimators', 100, 2000, step = 50),
#          'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#          'model__num_leaves':         trial.suggest_int('model__num_leaves', 16, 256),
#          'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#          'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#          'model__bagging_freq':       trial.suggest_int('model__bagging_freq', 0, 7),
#          'model__bagging_fraction':   trial.suggest_float('model__bagging_fraction', 0.5, 1.0, step = 0.05),
#          'model__reg_alpha':          trial.suggest_float('model__reg_alpha', 1e-4, 10.0, log = True),
#          'model__reg_lambda':         trial.suggest_float('model__reg_lambda', 1e-4, 10.0, log = True),
#          'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(lgbm_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBM tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# Best params found with pipeline1 (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb1 = {
    'model__n_estimators': 1850,
    'model__learning_rate': 0.0164692960710159,
    'model__max_depth': 4,
    'model__min_child_weight': 2,
    'model__lambda': 0.00030967125261382463,
    'model__alpha': 0.009462027221582257,
    'model__subsample': 0.64,
    'model__colsample_bytree': 0.46,
}
# Best is trial 212 with value: 0.12096327119801184.
# [-0.11749978 -0.11468448 -0.11998631 -0.16015004 -0.09249576]
# 0.1174997758651165
# 0.02190148844115302

In [None]:
# Best params found with pipeline2 (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb2 = {
    'model__n_estimators': 1850,
    'model__learning_rate': 0.007663483674441529,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 1.3221982712197484,
    'model__alpha': 0.0005031631526708031,
    'model__subsample': 0.66,
    'model__colsample_bytree': 0.5900000000000001,
}
# Best is trial 169 with value: 0.12121695838464552.
# [-0.1190666  -0.11047241 -0.12027858 -0.15368264 -0.10258457]
# 0.11906659648415295
# 0.0174478434513716

In [None]:
# Best params found with pipeline3 (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb3 = {
    'model__n_estimators': 1350,
    'model__learning_rate': 0.019180364463132607,
    'model__max_depth': 3,
    'model__min_child_weight': 1,
    'model__lambda': 0.380879553148292,
    'model__alpha': 0.1694031754175522,
    'model__subsample': 0.52,
    'model__colsample_bytree': 0.8300000000000001,
}
# Best is trial 255 with value: 0.12291040055737397.
# [-0.1207175  -0.11784717 -0.12084213 -0.15751711 -0.09762809]
# 0.12071750059786712
# 0.01934705328976381

In [None]:
# Pipeline 4 best params (random forest).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_rf = {
    'model__n_estimators': 8100,
    'model__max_depth': None,
    'model__min_samples_split': 2,
    'model__min_samples_leaf': 1,
    'model__max_features': 'log2',
    'model__min_impurity_decrease': 1e-06,
}
# Best is trial 99 with value: 0.13801475483285586.
# [-0.1305587  -0.12929222 -0.14595504 -0.15818268 -0.12608512]
# 0.13055870069685813
# 0.0121965965512414

In [None]:
# Pipeline 5 best params (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb5 = {
    'model__n_estimators': 1700,
    'model__learning_rate': 0.016890178924066624,
    'model__max_depth': 3,
    'model__min_child_weight': 2,
    'model__lambda': 0.006476272438913827,
    'model__alpha': 0.00020362981148242182,
    'model__subsample': 0.55,
    'model__colsample_bytree': 0.62,
}

# Best is trial 251 with value: 0.12121663671265123.
# [-0.1183398  -0.11764894 -0.11819146 -0.15724026 -0.09466274]
# 0.11819145566235034
# 0.02016416246342805

In [None]:
# Pipeline 6 best params
# skf split - no run is successful, all fail. Default 5 fold CV split does give result

In [None]:
# Pipeline 7 best params (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb7 = {
    'model__n_estimators': 900,
    'model__learning_rate': 0.034132716349379644,
    'model__max_depth': 4,
    'model__min_child_weight': 3,
    'model__lambda': 0.07598434952799418,
    'model__alpha': 0.006227713546260122,
    'model__subsample': 0.62,
    'model__colsample_bytree': 0.42000000000000004,
}
# Best is trial 290 with value: 0.12016913498104102.
# [-0.1231054  -0.10620493 -0.11861282 -0.15727291 -0.09564962]
# 0.11861282156685948
# 0.020899470502765815

In [None]:
# Pipeline 8 best params (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb8 = {
    'model__n_estimators': 2000,
    'model__learning_rate': 0.013062870439665343,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 0.6898851881426148,
    'model__alpha': 0.01942788510854367,
    'model__subsample': 0.45,
    'model__colsample_bytree': 0.66,
}
# Best is trial 238 with value: 0.12218887615283583.
# [-0.12311578 -0.10393904 -0.11978129 -0.16208063 -0.10202764]
# 0.11978128899996028
# 0.021621929170878792

In [None]:
# Pipeline 9 best params (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb9 = {
    'model__n_estimators': 1300,
    'model__learning_rate': 0.017812751084206546,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 0.5374808280674902,
    'model__alpha': 0.014717951591959225,
    'model__subsample': 0.42000000000000004,
    'model__colsample_bytree': 0.5800000000000001,
}

# Best is trial 262 with value: 0.12289801572950965.
# [-0.11956882 -0.11307731 -0.12082082 -0.16199867 -0.09902446]
# 0.1195688216223085
# 0.02102690141132909

In [None]:
# Pipeline 10 best params (XGBoost).
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_xgb10 = {
    'model__n_estimators': 1800,
    'model__learning_rate': 0.009314521918435957,
    'model__max_depth': 4,
    'model__min_child_weight': 2,
    'model__lambda': 0.05452883626379855,
    'model__alpha': 0.0002684304975102518,
    'model__subsample': 0.76,
    'model__colsample_bytree': 0.46,
}
# Best is trial 253 with value: 0.12189723833484438.
# [-0.11769908 -0.11210582 -0.12542139 -0.15399192 -0.10026798]
# 0.11769907780421265
# 0.018021150486328682

#skf split with optuna with cat params
# best_params_xgb10 = {'model__n_estimators': 950, 'model__learning_rate': 0.024273156624668824, 
#                      'model__max_depth': 4, 'model__min_child_weight': 4, 
#                      'model__lambda': 0.001385577354934915, 'model__alpha': 0.0004628157478762704, 
#                      'model__subsample': 0.78, 'model__colsample_bytree': 0.4, 
#                      'model__max_cat_to_onehot': 5, 'model__max_cat_threshold': 16}
# # Best is trial 150 with value: 0.1227792632215197.
# # [-0.11871171 -0.11198431 -0.1244113  -0.15544868 -0.10334032]
# # 0.11871171137791076
# # 0.017781468360262627

In [None]:
# CatBoost Pipeline 1 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_cb1 = {
    'model__n_estimators': 1600,
    'model__learning_rate': 0.013567817011450631,
    'model__l2_leaf_reg': 2.352528904227011,
    'model__min_data_in_leaf': 33,
    'model__max_depth': 7,
    'model__subsample': 0.62,
    'model__colsample_bylevel': 0.9400000000000001,
}
# Best is trial 26 with value: 0.12146870978524071.
# [-0.11689243 -0.11222937 -0.1195251  -0.1484336  -0.11026306]
# 0.11689242795517175
# 0.013876291530023033

In [None]:
# CatBoost Pipeline 2 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_cb2 = {
    'model__n_estimators': 1000,
    'model__learning_rate': 0.020766651766510266,
    'model__l2_leaf_reg': 0.0031728824818652163,
    'model__min_data_in_leaf': 50,
    'model__max_depth': 6,
    'model__subsample': 0.8,
    'model__colsample_bylevel': 0.91,
}

# Best is trial 12 with value: 0.12063624125827858.
# [-0.1156934  -0.10911198 -0.12282546 -0.15034066 -0.1052097 ]
# 0.1156933974068409
# 0.01601476634147856

In [None]:
# CatBoost Pipeline 3 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.
best_params_cb3 = {
    'model__n_estimators': 700,
    'model__learning_rate': 0.030625945706803,
    'model__l2_leaf_reg': 0.2906471745862896,
    'model__min_data_in_leaf': 26,
    'model__max_depth': 6,
    'model__subsample': 0.66,
    'model__colsample_bylevel': 0.64,
}
# Best is trial 43 with value: 0.12117629660914572.
# [-0.11685888 -0.11011589 -0.1235822  -0.1489466  -0.10637792]
# 0.11685887726354985
# 0.015079407156906926

In [None]:
# CatBoost Pipeline 4 best params - first recorded parameter set.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_cb4_1 = {
    'model__n_estimators': 1900,
    'model__learning_rate': 0.023875793128268784,
    'model__l2_leaf_reg': 0.20539579640233013,
    'model__min_data_in_leaf': 5,
    'model__max_depth': 5,
    'model__subsample': 0.9400000000000001,
    'model__colsample_bylevel': 0.8500000000000001,
}
# # Best is trial 1 with value: 0.12746553160273738
# # [-0.12479513 -0.11602021 -0.12677251 -0.16181625 -0.10792355]
# # 0.1247951298785159
# # 0.01844162469894824

# CatBoost Pipeline 4 - second recorded parameter set (results in the comments below).
best_params_cb4_2 = {
    'model__n_estimators': 1400,
    'model__learning_rate': 0.05291628705588727,
    'model__l2_leaf_reg': 0.11661179025482198,
    'model__min_data_in_leaf': 10,
    'model__max_depth': 4,
    'model__subsample': 0.99,
    'model__colsample_bylevel': 0.67,
}
# Best is trial 13 with value: 0.12657391647714436.
# [-0.12250947 -0.11908302 -0.12198717 -0.16438165 -0.10490827]
# 0.12198717243937014
# 0.019962446238785396

In [None]:
# CatBoost Pipeline 5 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_cb5 = {
    'model__n_estimators': 1850,
    'model__learning_rate': 0.03310538371449341,
    'model__l2_leaf_reg': 0.5692385591345918,
    'model__min_data_in_leaf': 5,
    'model__max_depth': 5,
    'model__subsample': 0.53,
    'model__colsample_bylevel': 0.73,
}
# Best is trial 33 with value: 0.12232080734704395.
# [-0.11730409 -0.10672066 -0.12218277 -0.15967962 -0.1057169 ]
# 0.11730408572695568
# 0.019697317031556482

In [None]:
# CatBoost Pipeline 6 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_cb6 = {
    'model__n_estimators': 1700,
    'model__learning_rate': 0.016184394757080008,
    'model__l2_leaf_reg': 0.01949463452385371,
    'model__min_data_in_leaf': 6,
    'model__max_depth': 5,
    'model__subsample': 0.5700000000000001,
    'model__colsample_bylevel': 0.73,
}
# Best is trial 33 with value: 0.1274473054822071
# [-0.12410766 -0.11949513 -0.12663447 -0.15884384 -0.10815544]
# 0.124107658094659
# 0.01692908485762067

In [None]:
# Pipeline LGBM 1 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm1 = {
    'model__n_estimators': 1600,
    'model__learning_rate': 0.009731152795312595,
    'model__num_leaves': 114,
    'model__max_depth': 4,
    'model__min_data_in_leaf': 1,
    'model__bagging_freq': 7,
    'model__bagging_fraction': 0.55,
    'model__reg_alpha': 0.1807157822128309,
    'model__reg_lambda': 0.09215157159968763,
    'model__colsample_bytree': 0.4,
}
# Best is trial 243 with value: 0.12052610219793292.
# [-0.11954391 -0.11328187 -0.11886363 -0.15219455 -0.09874655]
# 0.11886362989527258
# 0.01751254964107485

In [None]:
# Pipeline LGBM 2 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm2 = {
    'model__n_estimators': 1050,
    'model__learning_rate': 0.014161850233991993,
    'model__num_leaves': 203,
    'model__max_depth': 5,
    'model__min_data_in_leaf': 2,
    'model__bagging_freq': 3,
    'model__bagging_fraction': 0.65,
    'model__reg_alpha': 0.0034654923995791806,
    'model__reg_lambda': 0.06528414476899567,
    'model__colsample_bytree': 0.43000000000000005,
}
# Best is trial 288 with value: 0.12006520251936215.
# [-0.1175394  -0.10907231 -0.12194504 -0.15602383 -0.09574543]
# 0.11753940364579261
# 0.020076009133654676

In [None]:
# Pipeline LGBM 3 (with kmeans) best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm3 = {
    'model__n_estimators': 1350,
    'model__learning_rate': 0.005813900232531856,
    'model__num_leaves': 248,
    'model__max_depth': 10,
    'model__min_data_in_leaf': 20,
    'model__bagging_freq': 1,
    'model__bagging_fraction': 0.5,
    'model__reg_alpha': 0.0028223797745013522,
    'model__reg_lambda': 0.005988097246415957,
    'model__colsample_bytree': 0.41000000000000003,
}
# Best is trial 241 with value: 0.12476945989203363.
# [-0.12135222 -0.11226465 -0.1272039  -0.1535579  -0.10946863]
# 0.12135222151106254
# 0.015728090881840222

In [None]:
# Pipeline LGBM 4 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm4 = {
    'model__n_estimators': 1600,
    'model__learning_rate': 0.016221004562637534,
    'model__num_leaves': 55,
    'model__max_depth': 4,
    'model__min_data_in_leaf': 3,
    'model__bagging_freq': 7,
    'model__bagging_fraction': 0.7,
    'model__reg_alpha': 0.003283241571893786,
    'model__reg_lambda': 0.042729411250030144,
    'model__colsample_bytree': 0.41000000000000003,
}
# Best is trial 233 with value: 0.12056356608490129.
# [-0.12052822 -0.10662705 -0.12023993 -0.15732043 -0.0981022 ]
# 0.12023993219991116
# 0.020248023319354637

In [None]:
# Pipeline LGBM 5 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm5 = {
    'model__n_estimators': 1700,
    'model__learning_rate': 0.030862626260224756,
    'model__num_leaves': 249,
    'model__max_depth': 4,
    'model__min_data_in_leaf': 1,
    'model__bagging_freq': 1,
    'model__bagging_fraction': 0.7,
    'model__reg_alpha': 0.00018348579563313004,
    'model__reg_lambda': 0.00038494776166918504,
    'model__colsample_bytree': 0.61,
}

# Best is trial 294 with value: 0.12194678876497556.
# [-0.12324616 -0.10222344 -0.12207349 -0.16217016 -0.1000207 ]
# 0.1220734899492754
# 0.02231316891513297

In [None]:
# Pipeline LGBM 6 best params.
# Keys carry the `model__` step prefix. The comments after the dict record the
# best Optuna value (= mean of the fold scores), the negated fold scores,
# their median and their standard deviation.

best_params_lgbm6 = {
    'model__n_estimators': 850,
    'model__learning_rate': 0.01593545381425983,
    'model__num_leaves': 140,
    'model__max_depth': 3,
    'model__min_data_in_leaf': 2,
    'model__bagging_freq': 5,
    'model__bagging_fraction': 0.6,
    'model__reg_alpha': 0.000599107791544195,
    'model__reg_lambda': 0.38106260630011507,
    'model__colsample_bytree': 0.81,
}
# Best is trial 262 with value: 0.1243739978569125.
# [-0.12021403 -0.1201359  -0.11912219 -0.16589076 -0.0965071 ]
# 0.1201359012486722
# 0.02264091018759263

# Base model results

To submit a base model result to the competition, uncomment the cell below, set `pipeline` and `best_params` to the pipeline and parameters to be submitted, and comment out the pipeline definition in the ensemble section (the next section).

The pipeline set with best params will be used to train on full data and obtain test predictions in the last section.


In [None]:
# pipeline = pipeline_lgbm2
# best_params = best_params_lgbm2
# pipeline.set_params(**best_params)

In [None]:
#xgb1
# 0.12096327119801184.
# [-0.11749978 -0.11468448 -0.11998631 -0.16015004 -0.09249576]
# 0.1174997758651165
# 0.02190148844115302
# public score : 0.12208

#xgb2
# 0.12121695838464552.
# [-0.1190666  -0.11047241 -0.12027858 -0.15368264 -0.10258457]
# 0.11906659648415295
# 0.0174478434513716
# public score : 0.12121

#xgb3
# 0.12291040055737397.
# [-0.1207175  -0.11784717 -0.12084213 -0.15751711 -0.09762809]
# 0.12071750059786712
# 0.01934705328976381
# public score : 0.12344

#xgb5
# 0.12121663671265123.
# [-0.1183398  -0.11764894 -0.11819146 -0.15724026 -0.09466274]
# 0.11819145566235034
# 0.02016416246342805
# public score : 0.12270

#xgb7
# 0.12016913498104102.
# [-0.1231054  -0.10620493 -0.11861282 -0.15727291 -0.09564962]
# 0.11861282156685948
# 0.020899470502765815
# public score : 0.12176

#xgb8
# 0.12218887615283583.
# [-0.12311578 -0.10393904 -0.11978129 -0.16208063 -0.10202764]
# 0.11978128899996028
# 0.021621929170878792
# public score : 0.12340

#xgb9
# 0.12289801572950965
# [-0.11956882 -0.11307731 -0.12082082 -0.16199867 -0.09902446]
# 0.1195688216223085
# 0.02102690141132909
# public score : 0.12279

#xgb10
# 0.12189723833484438.
# [-0.11769908 -0.11210582 -0.12542139 -0.15399192 -0.10026798]
# 0.11769907780421265
# 0.018021150486328682
# public score : 0.12595

#cb1
# 0.12146870978524071
# [-0.11689243 -0.11222937 -0.1195251  -0.1484336  -0.11026306]
# 0.11689242795517175
# 0.013876291530023033
# public score : 0.12256

#cb2
# 0.12063624125827858
# [-0.1156934  -0.10911198 -0.12282546 -0.15034066 -0.1052097 ]
# 0.1156933974068409
# 0.01601476634147856
# public score : 0.12234

#cb3
# 0.12117629660914572
# [-0.11685888 -0.11011589 -0.1235822  -0.1489466  -0.10637792]
# 0.11685887726354985
# 0.015079407156906926
# public score : 0.12243

#cb4
# 0.12746553160273738.
# [-0.12479513 -0.11602021 -0.12677251 -0.16181625 -0.10792355]
# 0.1247951298785159
# 0.01844162469894824
# public score : 0.11767

# 0.12657391647714436.
# [-0.12250947 -0.11908302 -0.12198717 -0.16438165 -0.10490827]
# 0.12198717243937014
# 0.019962446238785396
# public score : 0.11843

#cb5
# 0.12232080734704395
# [-0.11730409 -0.10672066 -0.12218277 -0.15967962 -0.1057169 ]
# 0.11730408572695568
# 0.019697317031556482
# public score : 0.12262

#cb6
# 0.1274473054822071
# [-0.12410766 -0.11949513 -0.12663447 -0.15884384 -0.10815544]
# 0.124107658094659
# 0.01692908485762067
# public score : 0.11918

#lgbm1
# 0.12052610219793292
# [-0.11954391 -0.11328187 -0.11886363 -0.15219455 -0.09874655]
# 0.11886362989527258
# 0.01751254964107485
# public score : 0.12100

#lgbm2
# 0.12006520251936215
# [-0.1175394  -0.10907231 -0.12194504 -0.15602383 -0.09574543]
# 0.11753940364579261
# 0.020076009133654676
# public score : 0.12188

#lgbm3
# 0.12476945989203363
# [-0.12135222 -0.11226465 -0.1272039  -0.1535579  -0.10946863]
# 0.12135222151106254
# 0.015728090881840222
# public score : 0.12432

#lgbm4
# 0.12056356608490129
# [-0.12052822 -0.10662705 -0.12023993 -0.15732043 -0.0981022 ]
# 0.12023993219991116
# 0.020248023319354637
# public score : 0.12081

#lgbm5
# 0.12194678876497556.
# [-0.12324616 -0.10222344 -0.12207349 -0.16217016 -0.1000207 ]
# 0.1220734899492754
# 0.02231316891513297
# public score : 0.12629

#lgbm6
# 0.1243739978569125.
# [-0.12021403 -0.1201359  -0.11912219 -0.16589076 -0.0965071 ]
# 0.1201359012486722
# 0.02264091018759263
# public score : 0.12429

#rf
# 0.13801475483285586.
# [-0.1305587  -0.12929222 -0.14595504 -0.15818268 -0.12608512]
# 0.13055870069685813
# 0.0121965965512414
# public score : 0.14004

Among the pipelines built on each of the 3 different models - XGB, CB and LGBM - the pipelines with the best CV score and the best public score were chosen for ensembling.

The best params for these pipelines are assigned below.

In [None]:
# Apply the tuned hyperparameters to every pipeline selected for ensembling.
# set_params updates the pipeline object it is called on.

# XGBoost pipelines
pipeline_xgb2.set_params(**best_params_xgb2)
pipeline_xgb7.set_params(**best_params_xgb7)

# CatBoost pipelines
pipeline_cb2.set_params(**best_params_cb2)
# Two parameter sets are kept for cb4, so each one gets its own independent copy
# of the pipeline (clone returns a new, unfitted estimator with the same parameters).
pipeline_cb4_1 = clone(pipeline_cb4).set_params(**best_params_cb4_1)
pipeline_cb4_2 = clone(pipeline_cb4).set_params(**best_params_cb4_2)

# LightGBM pipelines
pipeline_lgbm2.set_params(**best_params_lgbm2)
pipeline_lgbm4.set_params(**best_params_lgbm4)

# Ensemble

## Voting Ensemble

This ensemble method combines the predictions of the base models by taking their average.

In [None]:
# Creating a custom class because VotingRegressor passes the data as numpy array to the individual estimators
# But the pipelines used (i.e. the estimators for the VotingRegressor) expect Pandas DataFrame.
class CustomVotingRegressor(VotingRegressor):
    """Averaging ensemble that hands ``X`` to each base estimator unchanged.

    ``fit`` and ``predict`` are overridden so that the input is forwarded
    to the base estimators exactly as received (e.g. a pandas DataFrame).

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Base estimators. Entries that do not implement ``fit`` (for example
        the string ``"drop"``) are skipped.
    weights : sequence of float or None
        Optional weights, one per entry of ``estimators``, used when
        averaging the predictions. ``None`` means a plain mean.
    n_jobs, verbose
        Stored for API compatibility with ``VotingRegressor``; the overridden
        ``fit``/``predict`` below do not read them.

    Attributes
    ----------
    estimators_ : list of estimators
        Fitted clones of the usable base estimators, set by ``fit``.
    """

    def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):
        super().__init__(estimators=estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def _usable_estimators(self):
        """Return ``(estimator, weight)`` pairs for entries that implement ``fit``."""
        weights = self.weights
        if weights is None:
            weights = [None] * len(self.estimators)
        return [(estimator, weight)
                for (_, estimator), weight in zip(self.estimators, weights)
                if hasattr(estimator, "fit")]

    def fit(self, X, y):
        """Fit a clone of every usable base estimator on ``(X, y)``.

        Clones are fitted (rather than the objects passed to the constructor)
        so that fitting the ensemble does not modify the caller's estimators,
        matching what ``CustomStackingRegressor`` does.

        Returns
        -------
        self
        """
        fitted_estimators = []
        for estimator, _ in self._usable_estimators():
            fitted = clone(estimator)
            fitted.fit(X, y)
            fitted_estimators.append(fitted)
        self.estimators_ = fitted_estimators
        return self

    def predict(self, X):
        """Return the (optionally weighted) average of the base predictions.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        # Local import keeps this class self-contained within the cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "estimators_")

        # Keep each prediction paired with its weight so the two stay aligned
        # even when some entries of `estimators` were skipped.
        predictions = []
        used_weights = []
        for fitted, (_, weight) in zip(self.estimators_, self._usable_estimators()):
            if hasattr(fitted, "predict"):
                predictions.append(fitted.predict(X))
                used_weights.append(weight)

        # Combine predictions using weights (plain mean when no weights were given)
        final_predictions = np.average(
            predictions,
            axis=0,
            weights=None if self.weights is None else used_weights,
        )

        return final_predictions

The cell below shows the definition and results for 3 different voting ensembles tried.

In [None]:
# Three voting ensembles were tried. Later cells use whatever `pipeline` is bound to,
# so the uncommented definition below is the one that gets trained and submitted.
# The numbers recorded under each definition are the per-fold scores followed by
# what appear to be their median, standard deviation and mean, then the public score.

# best CV score ensemble
# pipeline = CustomVotingRegressor([("xgb7", pipeline_xgb7),
#                                   ("cb2", pipeline_cb2),
#                                   ("lgbm2", pipeline_lgbm2)])

# [-0.11683245 -0.10538843 -0.11915497 -0.15306302 -0.09637309]
# 0.11683245126666751
# 0.019280847882660095
# 0.11816239196799852
# public score : 0.11982

# best public score ensemble (one pipeline per model family) - currently active
pipeline = CustomVotingRegressor([("xgb2", pipeline_xgb2),
                                  ("cb4_1", pipeline_cb4_1),
                                  ("lgbm4", pipeline_lgbm4)])

# [-0.11909416 -0.10784494 -0.12015935 -0.15589714 -0.09967064]
# 0.1190941614942
# 0.019230481823789974
# 0.12053324647111108
# public score : 0.11756


# best public score ensemble, CatBoost pipelines only (cb4_1, cb4_2, cb6)
# pipeline = CustomVotingRegressor([("cb4_1", pipeline_cb4_1),
#                                   ("cb4_2", pipeline_cb4_2),
#                                   ("cb6", pipeline_cb6)])
# [-0.1230189  -0.11669888 -0.12419558 -0.16068482 -0.10595699]
# 0.12301890277401711
# 0.018457255239078416
# 0.12611103443993923
# public score : 0.11822

## Stacking Ensemble

This ensemble method takes the predictions from the base models as features for a new model called the stacking regressor / final estimator, and the predictions from this model are taken as the final prediction. I've tried ExtraTreesRegressor, XGB and LGBM as the final estimator.

In [None]:
class CustomStackingRegressor(StackingRegressor):
    """Stacking ensemble that keeps ``X`` as a pandas DataFrame throughout.

    The base estimators receive the DataFrame unchanged; their predictions
    become the columns (named after the estimators) of the table the final
    estimator is trained on.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Base estimators; each must implement ``fit`` and ``predict``.
    final_estimator : estimator or None
        Model trained on the base predictions. ``None`` uses
        ``RidgeCV(alphas=np.logspace(-6, 6, 13))``.
    cv : int, CV splitter, iterable of splits, ``"prefit"`` or None
        How the training meta-features are produced. Anything accepted by
        ``sklearn.model_selection.check_cv`` yields out-of-fold predictions
        (``None`` means sklearn's default K-fold). ``"prefit"`` uses the given
        estimators as they are, without refitting, and predicts directly on ``X``.
    n_jobs, passthrough, verbose
        Stored for API compatibility with ``StackingRegressor``; the
        overridden methods below do not read them.

    Attributes
    ----------
    base_estimators_ : list of (name, fitted estimator) tuples
    final_estimator_ : fitted final estimator
    """

    def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0):
        super().__init__(estimators=estimators, final_estimator=final_estimator, 
                         cv=cv, n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)

    def fit(self, X, y):
        """Fit the base estimators and then the final estimator.

        The final estimator is trained on out-of-fold base predictions:
        predicting on the very rows a base model was fitted on would feed it
        over-optimistic inputs and leak the target into the meta-model.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame or a base estimator has no ``fit``.
        """
        # Local import: only this method needs these helpers.
        from sklearn.model_selection import check_cv, cross_val_predict

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input to fit must be a pandas DataFrame.")

        for name, estimator in self.estimators:
            if not hasattr(estimator, "fit"):
                raise ValueError(f"Estimator {name} does not implement a fit method.")

        if isinstance(self.cv, str) and self.cv == "prefit":
            # Base estimators are used as given; X is taken to be data they
            # have not been trained on, so direct predictions are used.
            self.base_estimators_ = list(self.estimators)
            meta_features = self._predict_base_estimators(X)
        else:
            # Materialise the folds once so every base estimator is evaluated
            # on exactly the same splits.
            splits = list(check_cv(self.cv, y, classifier=False).split(X, y))

            # cross_val_predict fits clones of the estimator on each training
            # fold and predicts the held-out rows; the originals stay untouched.
            out_of_fold = [cross_val_predict(estimator, X, y, cv=splits)
                           for _, estimator in self.estimators]
            meta_features = pd.DataFrame(
                np.column_stack(out_of_fold),
                columns=[name for name, _ in self.estimators]
            )

            # Refit clones on the full data; these are used at predict time.
            self.base_estimators_ = []
            for name, estimator in self.estimators:
                fitted_estimator = clone(estimator)
                fitted_estimator.fit(X, y)
                self.base_estimators_.append((name, fitted_estimator))

        # Assign and fit the final estimator
        if self.final_estimator is None:
            self.final_estimator_ = RidgeCV(alphas=np.logspace(-6, 6, 13))
        else:
            self.final_estimator_ = clone(self.final_estimator)

        self.final_estimator_.fit(meta_features, y)
        return self

    def predict(self, X):
        """Predict with the final estimator on top of the base predictions.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame.
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        from sklearn.utils.validation import check_is_fitted

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input to predict must be a pandas DataFrame.")

        check_is_fitted(self, ["base_estimators_", "final_estimator_"])

        # Generate predictions from base estimators
        meta_features = self._predict_base_estimators(X)

        # Use the final estimator to make predictions
        return self.final_estimator_.predict(meta_features)

    def _predict_base_estimators(self, X):
        """
        Generate predictions from all fitted base estimators and return them
        as a DataFrame with one column per estimator, named after it.
        """
        predictions = []
        for name, estimator in self.base_estimators_:
            if hasattr(estimator, "predict"):
                predictions.append(estimator.predict(X))
            else:
                raise ValueError(f"Estimator {name} does not implement a predict method.")

        # Stack base predictions column-wise and return as a DataFrame
        meta_features = pd.DataFrame(
            np.column_stack(predictions),
            columns=[name for name, _ in self.base_estimators_]
        )
        return meta_features

The functions below are for optimizing the hyperparameters of the final estimator used for stacking.

In [None]:
# def et_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':          trial.suggest_int('n_estimators', 50, 300, step=10),  
#         'max_depth':             trial.suggest_categorical('max_depth', [None] + list(range(3, 12))),  
#         'min_samples_split':     trial.suggest_int('min_samples_split', 2, 10),  
#         'min_samples_leaf':      trial.suggest_int('min_samples_leaf', 1, 10),  
#         'max_features':          trial.suggest_categorical('max_features', ["sqrt", None, 0.5]), 
#         'bootstrap':             trial.suggest_categorical('bootstrap', [True, False]),  
#         'min_impurity_decrease': trial.suggest_categorical('min_impurity_decrease', [0, 1e-6, 1e-5, 1e-4, 1e-3])  
#     }

#     final_estimator = ExtraTreesRegressor(random_state=SEED, **params)
    
#     pipeline = CustomStackingRegressor([("xgb2", pipeline_xgb2),
#                                         ("cb4_1", pipeline_cb4_1),
#                                         ("lgbm4", pipeline_lgbm4)],
#                                        final_estimator=final_estimator)    
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 3, 'min_samples_leaf': 5, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 5, 'min_samples_leaf': 4, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 7, 
#                     'min_samples_split': 10, 'min_samples_leaf': 4, 
#                     'max_features': 0.5, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.optimize(et_stacking_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"ETStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# def xgb_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':             trial.suggest_int('n_estimators', 50, 300, step=10), 
#         'learning_rate':            trial.suggest_float('learning_rate', 0.001, 0.1, log=True),  
#         'max_depth':                trial.suggest_int('max_depth', 2, 10),  
#         'min_child_weight':         trial.suggest_int('min_child_weight', 1, 6),  
#         'lambda':                   trial.suggest_float('lambda', 1e-3, 10.0, log=True),  
#         'alpha':                    trial.suggest_float('alpha', 1e-3, 10.0, log=True),  
#         'subsample':                trial.suggest_float('subsample', 0.6, 1.0, step=0.05),  
#         'colsample_bytree':         trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05)  
#     }
#     final_estimator = XGBRegressor(random_state = SEED, **params)
#     pipeline = CustomStackingRegressor([("xgb2", pipeline_xgb2),
#                                         ("cb4_1", pipeline_cb4_1),
#                                         ("lgbm4", pipeline_lgbm4)],
#                                        final_estimator=final_estimator)      
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.enqueue_trial({'n_estimators': 150, 'learning_rate': 0.07375950393946418, 
#                      'max_depth': 3, 'min_child_weight': 5, 
#                      'lambda': 0.025091460051336875, 'alpha': 0.26969435515966034, 
#                      'subsample': 0.75, 'colsample_bytree': 1.0})
# study.optimize(xgb_stacking_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGBStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [None]:
# def lgbm_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':             trial.suggest_int('n_estimators', 100, 1000, step=50), 
#         'learning_rate':            trial.suggest_float('learning_rate', 0.001, 0.1, log=True),  
#         'max_depth':                trial.suggest_int('max_depth', 2, 10),  
#         'min_child_weight':         trial.suggest_int('min_child_weight', 1, 6),  
#         'lambda':                   trial.suggest_float('lambda', 1e-3, 10.0, log=True),  
#         'alpha':                    trial.suggest_float('alpha', 1e-3, 10.0, log=True),  
#         'subsample':                trial.suggest_float('subsample', 0.6, 1.0, step=0.05),  
#         'colsample_bytree':         trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05)  
#     }
#     final_estimator = LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1, **params)
#     pipeline = CustomStackingRegressor([("xgb2", pipeline_xgb2),
#                                         ("cb4_1", pipeline_cb4_1),
#                                         ("lgbm4", pipeline_lgbm4)],
#                                        final_estimator=final_estimator)   
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(lgbm_stacking_objective, n_trials = 50)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBMStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

The cells below show the best hyperparameters, definitions and results for the 2*3 different stacking ensembles tried.

In [None]:
# best_params_stack_1_et = {'n_estimators': 150, 'max_depth': 9, 'min_samples_split': 9, 
#                           'min_samples_leaf': 1, 'max_features': 0.5, 
#                           'bootstrap': True, 'min_impurity_decrease': 1e-06}
# final_estimator = ExtraTreesRegressor(random_state=SEED, **best_params_stack_1_et)
# # Best is trial 96 with value: 0.11811866066958998.
# # [-0.11734512 -0.10611117 -0.11802932 -0.15376507 -0.09534262]
# # 0.11734512463400401
# # 0.019668250006962325
# public score : 0.11992

# best_params_stack_1_xgb = {'n_estimators': 190, 'learning_rate': 0.08769177958535646, 
#                            'max_depth': 7, 'min_child_weight': 6, 'lambda': 0.086495311711192, 
#                            'alpha': 1.410568758087574, 'subsample': 0.65, 'colsample_bytree': 0.65}
# final_estimator = XGBRegressor(random_state = SEED, **best_params_stack_1_xgb)
# # Best is trial 89 with value: 0.11859734903879329.
# # [-0.1162992  -0.11061317 -0.11796542 -0.14896509 -0.09914387]
# # 0.11629920408371323
# # 0.016553340014904726
# public score : 0.12011

# best_params_stack_1_lgbm = {'n_estimators': 450, 'learning_rate': 0.010840522748967163, 
#                             'max_depth': 3, 'min_child_weight': 5, 'lambda': 3.3906080499991837, 
#                             'alpha': 0.003051846651081862, 'subsample': 0.75, 'colsample_bytree': 0.95}
# final_estimator = LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1, **best_params_stack_1_lgbm)
# # Best is trial 43 with value: 0.12087691224407573.
# # [-0.12269626 -0.11534181 -0.11637662 -0.1502804  -0.09968947]
# # 0.11637662226846344
# # 0.01653887974797546
# public score : 0.12159

# pipeline = CustomStackingRegressor([("xgb7", pipeline_xgb7),
#                                     ("cb2", pipeline_cb2),
#                                     ("lgbm2", pipeline_lgbm2)],
#                                    final_estimator=final_estimator)

In [None]:
# best_params_stack_2_et = {'n_estimators': 220, 'max_depth': 7, 'min_samples_split': 3, 
#                           'min_samples_leaf': 6, 'max_features': None, 
#                           'bootstrap': False, 'min_impurity_decrease': 1e-06}
# final_estimator = ExtraTreesRegressor(random_state=SEED, **best_params_stack_2_et)
# # Best is trial 80 with value: 0.1192352445126231.
# # [-0.11904213 -0.10864097 -0.11902749 -0.14953201 -0.09993363]
# # 0.11902749083788132
# # 0.01674972014959425
# public score : 0.11795

# best_params_stack_2_xgb = {'n_estimators': 120, 'learning_rate': 0.08323395974043807, 
#                            'max_depth': 5, 'min_child_weight': 2, 'lambda': 0.01966703130433511, 
#                            'alpha': 1.5843093533338586, 'subsample': 0.85, 'colsample_bytree': 0.8}
# final_estimator = XGBRegressor(random_state = SEED, **best_params_stack_2_xgb)
# # Best is trial 41 with value: 0.11906722703760253.
# # [-0.11932822 -0.11207885 -0.11744712 -0.14669344 -0.0997885 ]
# # 0.11744712156659413
# # 0.015403944333192962
# public score : 0.11810

# best_params_stack_2_lgbm = {'n_estimators': 400, 'learning_rate': 0.01777858244157366, 
#                             'max_depth': 2, 'min_child_weight': 5, 'lambda': 7.932044228972671, 
#                             'alpha': 0.8645676421523685, 'subsample': 0.7, 'colsample_bytree': 0.6}
# final_estimator = LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1, **best_params_stack_2_lgbm)
# # Best is trial 14 with value: 0.12029488765955257.
# # [-0.12039283 -0.11591539 -0.11684868 -0.14817247 -0.10014507]
# # 0.11684867795871483
# # 0.015583586739827472
# public score : 0.11983

# pipeline = CustomStackingRegressor([("xgb2", pipeline_xgb2),
#                                     ("cb4_1", pipeline_cb4_1),
#                                     ("lgbm4", pipeline_lgbm4)],
#                                    final_estimator=final_estimator) 

# Train on full data and obtain test predictions

In [None]:
# Retrain the selected `pipeline` on the full training data and obtain test predictions.
# The model is fitted on the log of the target.
pipeline.fit(X, np.log(y))

# Predict on the test set; exp() undoes the log applied to the target above,
# bringing the predictions back to the original scale.
pred = np.exp(pipeline.predict(X_test))

# Quick sanity check: print the first 10 predictions
print(pred[:10])

In [None]:
# Show the fitted estimator as the cell output
pipeline

In [None]:
# Save test predictions to file
# Two columns: the test index as 'Id' and the predictions as 'SalePrice'.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': pred})
# index=False: 'Id' is already a regular column, so the DataFrame's own index is not written
output.to_csv('submission.csv', index=False)
print('saved output file')

# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
- https://www.kaggle.com/code/marto24/beginners-prediction-top3

Please upvote if you found this notebook helpful : https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2