This notebook is a cleaned-up version of https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory

# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
- https://www.kaggle.com/code/marto24/beginners-prediction-top3

# Import libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge

from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import mutual_info_regression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer

from functools import reduce

from category_encoders import MEstimateEncoder, cat_boost

from sklearn.compose import ColumnTransformer

import optuna
import time

# Global variables

In [2]:
# Shared random seed: passed as `random_state` to KMeans, PCA and the
# regressors built in this notebook so that runs are repeatable.
SEED = 0

# Load data and preprocess function

In [3]:
def load_and_preprocess_data(train_data = True, perform_impute = True):
    """Read one competition CSV and return cleaned features plus the target.

    Parameters
    ----------
    train_data : bool
        True reads ``train.csv`` and splits off ``SalePrice`` as the target;
        False reads ``test.csv``.
    perform_impute : bool
        When True, ``impute`` is applied after ``encode``.

    Returns
    -------
    tuple
        ``(X, y)``; ``y`` is the ``SalePrice`` column for the training split
        and ``None`` for the test split.
    """
    if train_data:
        print("Train data")
        labelled = pd.read_csv(
            '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
            index_col='Id',
        )
        # Rows without a target cannot be used for training.
        labelled = labelled.dropna(axis=0, subset=['SalePrice'])
        y = labelled['SalePrice']
        X = labelled.drop(columns=['SalePrice'])
    else:
        print("Test data")
        X = pd.read_csv(
            '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
            index_col='Id',
        )
        y = None
    print("Loaded data")
    print(X.shape)

    # One record has GarageYrBlt == 2207; any garage year after 2024 is
    # replaced by the remodel year. Missing values are left for impute().
    garage_year = X["GarageYrBlt"]
    plausible_year = garage_year.isna() | (garage_year <= 2024)
    X["GarageYrBlt"] = garage_year.where(plausible_year, X["YearRemodAdd"])

    # Normalise the spelling so it matches the one used for Exterior1st.
    X["Exterior2nd"] = X["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    X = encode(X)
    if perform_impute:
        X = impute(X)

    return X, y

def encode(df):  # lists of columns needed for this is defined in next cell
    """Cast the known categorical columns of ``df`` to pandas categoricals.

    Columns named in ``features_nom`` become unordered categoricals with an
    extra "None" category (used later as the fill value for missing entries).
    Columns named in ``ordered_levels`` become ordered categoricals using the
    level list stored there. ``df`` is modified in place and also returned.
    """
    # Nominal (unordered) features
    for column in features_nom:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            # Reserve a category for missing values
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category

    # Ordinal (ordered) features; a column listed in both tables ends up ordinal
    for column in ordered_levels:
        ordinal_dtype = CategoricalDtype(ordered_levels[column], ordered=True)
        df[column] = df[column].astype(ordinal_dtype)

    return df

def impute(df):
    """Fill missing values in ``df`` in place and return it.

    * ``GarageYrBlt`` is set to ``YearRemodAdd`` where it is missing but a
      garage type is recorded.
    * Remaining gaps in numeric columns become 0.
    * Gaps in categorical columns become the "None" category (which must
      already exist on every categorical column).
    """
    garage_without_year = df["GarageType"].notna() & df["GarageYrBlt"].isna()
    df.loc[garage_without_year, "GarageYrBlt"] = df["YearRemodAdd"]

    # Numeric columns first, then categoricals, each with its own filler.
    for dtype_selector, filler in (("number", 0), ("category", "None")):
        for column in df.select_dtypes(dtype_selector).columns:
            df[column] = df[column].fillna(filler)
    return df

# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [4]:
# The nominal (unordered) categorical features.
# encode() casts each of these to an unordered pandas categorical and adds a
# "None" category to it.
# NOTE(review): "CentralAir" appears both here and in `ordered_levels` below;
# encode() applies the ordinal cast last, so it ends up as an ordered column.
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", 
                "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition",
                "Fence", "Electrical"]


# The ordinal (ordered) categorical features.
# Level lists are written from lowest to highest; encode() uses them to build
# ordered CategoricalDtype objects.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1, 11))  # integer ratings 1..10

# Column name -> ordered list of levels for that column.
ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"]
}

# Prepend "None" to every level list so it becomes the lowest level (code 0);
# impute() later uses "None" as the fill value for missing entries.
# A new list is built per key, so `five_levels` / `ten_levels` are not mutated.
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}
# Bare expression: shown as the notebook cell's output.
ordered_levels.keys()

dict_keys(['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'LotShape', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageFinish', 'PavedDrive', 'Utilities', 'CentralAir'])

# Append features

In [5]:
# MSSubClass numeric code -> descriptive dwelling-type label.
# append_features() maps the MSSubClass column through this table; codes that
# are not listed here map to a missing value.
ms_subclass_mapping = {
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES"
}

# Descriptive dwelling-type label -> coarse building class.
# The keys are exactly the values of `ms_subclass_mapping`; age and PUD
# qualifiers are collapsed so that only the storey layout remains.
# (The PUD flag is derived separately in append_features() as `IsPUD`.)
ms_class_mapping = {
    "1-STORY 1946 & NEWER ALL STYLES": "1-Story",
    "1-STORY 1945 & OLDER": "1-Story",
    "1-STORY W/FINISHED ATTIC ALL AGES": "1-Story",
    "1-STORY PUD (Planned Unit Development) - 1946 & NEWER": "1-Story",
    "1-1/2 STORY - UNFINISHED ALL AGES": "1-1/2 Story",
    "1-1/2 STORY FINISHED ALL AGES": "1-1/2 Story",
    "1-1/2 STORY PUD - ALL AGES": "1-1/2 Story",
    "2-STORY 1946 & NEWER": "2-Story",
    "2-STORY 1945 & OLDER": "2-Story",
    "2-STORY PUD - 1946 & NEWER": "2-Story",
    "SPLIT OR MULTI-LEVEL": "Split-Level",
    "SPLIT FOYER": "Split-Level",
    "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER": "Split-Level",
    "DUPLEX - ALL STYLES AND AGES": "Multi-Family/Duplex",
    "2 FAMILY CONVERSION - ALL STYLES AND AGES": "Multi-Family/Duplex",
    "2-1/2 STORY ALL AGES": "2-1/2 Story",
}

In [6]:
def append_features(df):
    """Return a copy of ``df`` with five engineered columns appended.

    Added columns, in this order:

    * ``LivLotRatio``      - above-ground living area / lot area
    * ``Spaciousness``     - living area per room (rooms + baths + kitchens)
    * ``MSClass``          - coarse building class derived from ``MSSubClass``
    * ``IsPUD``            - whether the ``MSSubClass`` label mentions "PUD"
    * ``GarageAreaPerCar`` - garage area per car slot

    ``MSClass`` and ``IsPUD`` are categoricals in which unmapped values are
    filled with the "None" category. The input frame is not modified.
    """
    df = df.copy()

    # Features that were tried and rejected because they lowered the overall
    # score (kept here as a record of the experiments):
    #   Spaciousness     = (df['1stFlrSF'] + df['2ndFlrSF']) / df.TotRmsAbvGrd
    #   Spaciousness     = df.GrLivArea / df.TotRmsAbvGrd
    #   Age              = df.YrSold - df.YearBuilt
    #   Age_since_mod    = df.YrSold - df.YearRemodAdd
    #   Bldg_* dummies   = pd.get_dummies(df.BldgType, prefix="Bldg") * df.GrLivArea
    #   PorchTypes       = count of WoodDeckSF, OpenPorchSF, EnclosedPorch,
    #                      3SsnPorch, ScreenPorch that are > 0
    #   TotalOutsideSF   = sum of the same five porch/deck areas
    #   dropping MSSubClass after deriving MSClass / IsPUD
    #   MedNhbdArea      = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    #   Feature1         = df.GrLivArea + df.TotalBsmtSF          (PCA inspired, see
    #   Feature2         = df.YearRemodAdd * df.TotalBsmtSF        ryanholbrook's notebook)
    #   OverallScore     = OverallQual codes * (or +) OverallCond codes
    #   LotAreaFrontage  = df.LotArea * (df.LotFrontage + 21.0/10)
    #                      (small offset avoids the effect of 0 LotFrontage;
    #                       21 is the minimum LotFrontage before NA -> 0)
    #   LotAreaFrontage  = df.LotArea * df.LotFrontage
    #   Age_with_quality = (df.YrSold - df.YearBuilt) * OverallQual codes
    #   TotalBathrooms   = FullBath + 0.5*HalfBath + BsmtFullBath + 0.5*BsmtHalfBath

    living_area = df["GrLivArea"]
    df["LivLotRatio"] = living_area / df["LotArea"]

    room_count = df["TotRmsAbvGrd"] + df["FullBath"] + df["HalfBath"] + df["KitchenAbvGr"]
    df["Spaciousness"] = living_area / room_count

    # Both derived categoricals start from the descriptive MSSubClass label.
    subclass_label = df["MSSubClass"].map(ms_subclass_mapping)

    building_class = subclass_label.map(ms_class_mapping).astype('category')
    df["MSClass"] = building_class.cat.add_categories("None").fillna("None")

    pud_flag = subclass_label.str.contains('PUD').astype('category')
    df["IsPUD"] = pud_flag.cat.add_categories("None").fillna("None")

    # The 0.1 offset keeps the ratio finite when GarageCars is 0.
    df["GarageAreaPerCar"] = df["GarageArea"] / (df["GarageCars"] + 0.1)

    return df

# Load data and process

In [7]:
# Both splits go through the same cleaning path; the test split has no target.
X, y = load_and_preprocess_data(train_data=True)
X_test, _ = load_and_preprocess_data(train_data=False)

print("removing less important features")
features_to_drop = ['PoolQC', 'MiscVal', 'MoSold', 'PoolArea', 'MiscFeature', 'Utilities']
X.drop(features_to_drop, axis=1, inplace=True)
X_test.drop(features_to_drop, axis=1, inplace=True)
print(X.shape, X_test.shape, sep="\n")

print("appending features")
# append_features() works on a copy, so the result has to be rebound.
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)

def remove_columns_from_list(orig_list, to_remove):
    """Return a new list with the items of ``orig_list`` that are not in ``to_remove``.

    The original order is kept and neither argument is modified. ``orig_list``
    may be any iterable (e.g. ``dict.keys()``).
    """
    kept = []
    for item in orig_list:
        if item in to_remove:
            continue
        kept.append(item)
    return kept
    
# Keep the column bookkeeping lists in sync with the columns that were just
# dropped, so later selectors only name columns that still exist.
ordinal_categorical_cols = remove_columns_from_list(list(ordered_levels), features_to_drop)
features_nom = remove_columns_from_list(features_nom, features_to_drop)

Train data
Loaded data
(1460, 79)
Test data
Loaded data
(1459, 79)
removing less important features
(1460, 73)
(1459, 73)
appending features
(1460, 78)
(1459, 78)


# Append Cluster information as training features

In [8]:
class AppendKMeans(BaseEstimator, TransformerMixin):
    """Transformer that appends K-Means cluster information to a DataFrame.

    Parameters
    ----------
    cluster_columns : list-like of column labels
        Columns used as clustering inputs.
    n_clusters : int
        Number of K-Means clusters.
    return_cluster : bool
        When True, ``transform`` adds a ``Cluster`` column with the label of
        the assigned cluster.
    return_distances : bool
        When True, ``transform`` adds one ``distance_centroid_<i>`` column
        per cluster with the distance to that cluster's centroid.

    The clustering inputs are standardised before K-Means is fitted, and
    categorical columns are replaced by their integer codes for that purpose
    only; the columns of the returned frame keep their original values.
    """

    def __init__(self, cluster_columns, n_clusters=20, return_cluster=True, return_distances=False):
        self.cluster_columns = cluster_columns
        self.n_clusters = n_clusters
        self.return_cluster = return_cluster
        self.return_distances = return_distances

    def _clustering_inputs(self, X):
        """Return the clustering columns of ``X`` with categoricals as integer codes."""
        coded = X.copy()
        for colname in coded.select_dtypes(["category"]).columns:
            coded[colname] = coded[colname].cat.codes
        return coded[self.cluster_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the K-Means model on the clustering columns of ``X``."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._clustering_inputs(X))
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=SEED)
        self.kmeans.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the requested cluster columns appended."""
        # Reuse the scaling learned in fit() so new data lands in the same space.
        scaled = self.scaler.transform(self._clustering_inputs(X))
        result = X.copy()
        if self.return_cluster:
            result["Cluster"] = self.kmeans.predict(scaled)
        if self.return_distances:
            distances = self.kmeans.transform(scaled)
            distance_frame = pd.DataFrame(
                distances,
                columns=[f"distance_centroid_{i}" for i in range(distances.shape[1])],
            )
            # Align on the caller's index before joining.
            distance_frame.index = X.index
            result = result.join(distance_frame)
        return result

# Append PCA

In [9]:
class AppendPCA(BaseEstimator, TransformerMixin):
    """Transformer that appends principal-component columns to a DataFrame.

    Parameters
    ----------
    pca_columns : list-like of column labels
        Columns used as PCA inputs.
    n_components : int
        Number of principal components to compute.
    pca_col_prefix : str
        Prefix of the appended columns, which are named ``<prefix>_<i>``.

    The PCA inputs are standardised first, and categorical columns are
    replaced by their integer codes for that purpose only; the columns of the
    returned frame keep their original values.
    """

    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def _pca_inputs(self, X):
        """Return the PCA columns of ``X`` with categoricals as integer codes."""
        coded = X.copy()
        for colname in coded.select_dtypes(["category"]).columns:
            coded[colname] = coded[colname].cat.codes
        return coded[self.pca_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the PCA model on the PCA columns of ``X``."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._pca_inputs(X))
        self.pca = PCA(n_components=self.n_components, random_state=SEED)
        self.pca.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one column per principal component appended."""
        # Reuse the scaling learned in fit() so new data lands in the same space.
        scaled = self.scaler.transform(self._pca_inputs(X))
        projected = self.pca.transform(scaled)
        # Debug aids:
        # print(self.pca.explained_variance_ratio_)
        # print(np.cumsum(self.pca.explained_variance_ratio_))
        component_frame = pd.DataFrame(
            projected,
            columns=[f"{self.pca_col_prefix}_{i}" for i in range(projected.shape[1])],
        )
        # Align on the caller's index before joining.
        component_frame.index = X.index
        return X.copy().join(component_frame)

# Target Encoding

In [10]:
class CrossFoldEncoder(BaseEstimator, TransformerMixin):
    """Target-encode ``cols`` with an ensemble of encoders, one per CV fold.

    Parameters
    ----------
    cols : list of column labels
        Columns to target-encode.
    encoder : class
        Encoder class; it is instantiated as ``encoder(cols=cols, **params)``
        and must provide ``fit(X, y)`` and ``transform(X)``.
    encoder_other_params : dict or None
        Extra keyword arguments for the encoder constructor. They are passed
        as a dict rather than ``**kwargs`` because Pipeline uses
        ``sklearn.base.clone()``, and clone does not retain kwargs.

    Notes
    -----
    ``fit`` trains one encoder per fold, each on that fold's training part.
    ``transform`` applies every fitted encoder to the data it is given and
    averages the results.

    NOTE(review): no out-of-fold encoding is performed for the training data.
    With the default ``TransformerMixin.fit_transform`` (fit, then transform),
    every training row is encoded by encoders that mostly saw that row's own
    target, so some target leakage is possible. Confirm this is intended.
    """

    def __init__(self, cols, encoder, encoder_other_params=None):
        self.cols = cols
        self.encoder = encoder
        # Not a constructor parameter: rebuilt whenever the estimator is cloned.
        self.cv = KFold(n_splits=5)
        self.encoder_other_params = encoder_other_params

    def fit(self, X, y):
        """Fit one encoder per fold, each on the training part of that fold.

        ``X`` must be a DataFrame. ``y`` may be a pandas object or any
        array-like with one entry per row of ``X``.
        """
        if not hasattr(y, "iloc"):
            # Accept plain arrays/lists as well; positional slicing needs pandas.
            y = pd.Series(y, index=X.index)
        extra_params = self.encoder_other_params or {}

        self.fitted_encoders_ = []
        # Only the training indices of each split are used; the held-out part
        # is ignored.
        for idx_encode, _ in self.cv.split(X):
            fitted_encoder = self.encoder(cols=self.cols, **extra_params)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            self.fitted_encoders_.append(fitted_encoder)
        return self

    def transform(self, X):
        """Replace ``cols`` in ``X`` by ``<col>_encoded`` columns.

        Each new column is the average of the encodings produced by the
        per-fold encoders fitted in ``fit``.
        """
        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols])
        X_encoded = reduce(
            lambda total, part: total.add(part, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        # Drop columns for which target encoding has been created and join
        # with the target encodings.
        return X.drop(columns=self.cols).join(X_encoded)

# Training pipeline

Let's define a transformer that converts categorical columns to their integer category codes.

In [11]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces categorical columns by their codes.

    Every column of the input frame must have a pandas categorical dtype; the
    output has the same columns and index, holding ``Series.cat.codes``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every column holds its category codes."""
        codes_by_column = {name: X[name].cat.codes for name in X.columns}
        encoded = X.copy()
        for name, codes in codes_by_column.items():
            encoded[name] = codes
        return encoded

In [12]:
# Split the feature columns by dtype.
categorical_cols = X.select_dtypes(include="category").columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Non-ordinal categoricals are further split by the number of distinct values
# observed in X: fewer than 10 -> "small", 10 or more -> "large".
small_cat_categorical_cols = [
    cname for cname, n_levels in X[categorical_cols].nunique().items()
    if n_levels < 10 and cname not in ordinal_categorical_cols
]
large_cat_categorical_cols = [
    cname for cname, n_levels in X[categorical_cols].nunique().items()
    if n_levels >= 10 and cname not in ordinal_categorical_cols
]

print(
    len(ordinal_categorical_cols),
    len(small_cat_categorical_cols),
    len(large_cat_categorical_cols),
    len(categorical_cols),
    len(numerical_cols),
    sep="\n",
)

20
21
4
45
33


In [13]:
def score_dataset(X, y, model=None):
    """Cross-validate ``model`` on the log of the target and return the mean error.

    Parameters
    ----------
    X : features accepted by ``model``
    y : array-like of positive target values (``np.log`` is applied to it)
    model : estimator or None
        Estimator or pipeline to evaluate. ``None`` (the default) evaluates a
        fresh ``XGBRegressor()``; it is created per call instead of once at
        definition time, so no estimator instance is shared between calls.

    Returns
    -------
    float
        Mean over 5 folds of the RMSE measured on ``log(y)``.

    Also prints the per-fold scores (negated RMSE, as returned by sklearn),
    their median (sign flipped to positive) and their standard deviation.
    """
    if model is None:
        model = XGBRegressor()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error),
    # so the model is trained and scored on the log of the target.
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_root_mean_squared_error"
    )
    print(score)
    print(-1*np.median(score))
    print(np.std(score))
    # sklearn reports the error negated (higher is better); flip it back.
    score = -1 * np.mean(score)
    return score

In [14]:
# Pipeline 1 - XGBoost on PCA components plus target-encoded large categoricals.
# Only the target-encoded and PCA columns are scaled; ordinal and small
# categoricals are turned into category codes, and every other column is
# passed through unchanged.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_xgb1 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m":10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [
                *[col + "_encoded" for col in large_cat_categorical_cols],
                *[f"PCA_{i}" for i in range(5)],
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])
# score_dataset(X, y, pipeline_xgb1)

# [-0.12334956 -0.13390977 -0.13267923 -0.11532492 -0.12691669]
# 0.12691668822938887
# 0.006753496607733489
# 0.12643603427331637

In [15]:
# Pipeline 2
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder()),
#     ('scaler', StandardScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, numerical_cols+[col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline)

# # [-0.12328024 -0.14403148 -0.14249333 -0.11536911 -0.12724898]
# # 0.12724898077297406
# # 0.011122748447938504
# # 0.1304846298772378

In [16]:
# Pipeline 3 - XGBoost with PCA components and K-Means run on those components.
# K-Means (10 clusters) adds both the cluster label and the centroid
# distances. Raw numerical columns, target encodings, PCA components and
# centroid distances are scaled; ordinal codes are scaled too; small
# categoricals and the cluster label are one-hot encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])


pipeline_xgb3 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m":10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [
                *numerical_cols,
                *[col + "_encoded" for col in large_cat_categorical_cols],
                *[f"PCA_{i}" for i in range(5)],
                *[f"distance_centroid_{i}" for i in range(10)],
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, [*small_cat_categorical_cols, 'Cluster']),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb3)

# # [-0.12575089 -0.14977669 -0.14573617 -0.11280657 -0.12453592]
# # 0.12575089246295004
# # 0.013910111600061406
# # 0.13172124644075708

In [17]:
# Pipeline 4 - RandomForestRegressor
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OrdinalEncoder())
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', RandomForestRegressor(random_state = SEED))         
# ])
# score_dataset(X, y, pipeline)
# [-0.13334223 -0.14826003 -0.14243637 -0.12530829 -0.13019875]
# 0.13334223315889074
# 0.008327364089523639
# 0.1359091355635815

In [18]:
# Pipeline 5
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline)

# [-0.12651235 -0.13380629 -0.13357361 -0.11428085 -0.12639507]
# 0.12651234561050542
# 0.007097623139040923
# 0.126913633236225

# med_cat [> 5 and < 10]
# [-0.12506024 -0.14160065 -0.13534954 -0.11813159 -0.11999781]
# 0.12506024282103534
# 0.00904491349257496
# 0.12802796794855642

# med_cat [> 6 and < 10]
# [-0.1277651  -0.14225836 -0.13436323 -0.11917849 -0.12597884]
# 0.12776509761340055
# 0.007842616722643862
# 0.12990880359437723

# med_cat [> 4 and < 10]
# [-0.12548204 -0.142939   -0.13416994 -0.11402189 -0.12682665]
# 0.12682664692146164
# 0.009612285820809875
# 0.12868790109795897

In [19]:
# Pipeline 6
# numerical_transformer = Pipeline(steps=[
#     ('skew_handler', PowerTransformer(method='yeo-johnson', standardize=False)),
#     ('scaler', RobustScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder()),
#     ('scaler', RobustScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, numerical_cols+[col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline)

# [-0.12212421 -0.14414503 -0.14381811 -0.11467164 -0.12970438]
# 0.12970437937124102
# 0.011697107907623815
# 0.13089267445607045

In [20]:
# Pipeline 7 - only k-means
# K-Means (10 clusters) runs on all columns and contributes only the cluster
# label. The label is not named in the ColumnTransformer, so it reaches the
# model untouched through `remainder="passthrough"`.

numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])


pipeline_xgb7 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        cluster_columns=X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m":10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb7)

# only centroid distance
# [-0.13326775 -0.14492401 -0.14185682 -0.12436563 -0.13897696]
# 0.13897696253715167
# 0.007257085655132953
# 0.13667823374117763

# only Cluster  
# [-0.13092108 -0.14709257 -0.14129975 -0.12789252 -0.13499367]
# 0.13499367109437677
# 0.006967125977370855
# 0.1364399168990166

# only cluster - left untouched    - choosing this one
# [-0.12070129 -0.14202249 -0.14128743 -0.13420726 -0.12928795]
# 0.13420725736860215
# 0.007941353221420946
# 0.13350128260930716

In [21]:
# Pipeline 8 - only 1-hot encode for XGBRegressor

# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('smallcat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([ 
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
#             ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', XGBRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline)
# [-0.13613093 -0.15101244 -0.13792802 -0.11980567 -0.13703808]
# 0.13703808145019195
# 0.00991673402099732
# 0.13638302697986404

In [22]:
# Pipeline CatBoost 1
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OrdinalEncoder())
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', CatBoostRegressor(random_state = SEED, verbose = False))         
# ])
# score_dataset(X, y, pipeline)

# [-0.1085442  -0.12963785 -0.12813562 -0.11028954 -0.11625041]
# 0.11625041178886363
# 0.008814324369377749
# 0.11857152416791084

In [23]:
# Pipeline CatBoost 2 - catboost of Pipeline 5
# PCA components and target encodings are scaled, ordinal categoricals become
# category codes, small categoricals are one-hot encoded, and the remaining
# columns are passed through unchanged.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pipeline_cb2 = Pipeline(steps=[
    ('append_pca', AppendPCA(pca_columns=X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m":10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [
                *[col + "_encoded" for col in large_cat_categorical_cols],
                *[f"PCA_{i}" for i in range(5)],
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])

# score_dataset(X, y, pipeline_cb2)
# [-0.10855696 -0.1291194  -0.12396306 -0.11102852 -0.11627649]
# 0.11627649414347431
# 0.007742341323149529
# 0.11778888615576621

In [24]:
# Pipeline Catboost 3 - CatBoost of k-means only pipeline
# Same preprocessing as the k-means-only XGBoost pipeline: the cluster label
# from K-Means (10 clusters, all columns) is passed through untouched.

numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])


pipeline_cb3 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        cluster_columns=X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m":10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])

# score_dataset(X, y, pipeline_cb3)

# [-0.10820295 -0.13116993 -0.12583494 -0.11234789 -0.11822297]
# 0.11822296922270072
# 0.008439584476879475
# 0.11915573581349706

In [25]:
# Pipeline Catboost 4 - no explicit encoding step: the categorical columns are
# handed to CatBoost directly through `cat_features`, after the PCA step.

pipeline_cb4 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    (
        'model',
        CatBoostRegressor(
            cat_features=categorical_cols,
            verbose=False,
            random_state=SEED,
        ),
    ),
])

# score_dataset(X, y, pipeline_cb4)
# [-0.11708812 -0.13720957 -0.12798973 -0.11292743 -0.12335977]
# 0.12335977226121901
# 0.008494589516422364
# 0.12371492410358331

# without PCA
# [-0.11357924 -0.14151363 -0.12863115 -0.11429514 -0.12462189]
# 0.1246218915669454
# 0.010295076083013943
# 0.12452821126916189

In [26]:
# Pipeline CatBoost 5
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('smallcat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([ 
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
#             ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', CatBoostRegressor(random_state = SEED, verbose = False))         
# ])

# score_dataset(X, y, pipeline)

# all one-hot
# [-0.10672922 -0.13633214 -0.12457854 -0.10782042 -0.12033838]
# 0.12033838274832265
# 0.011033964213400175
# 0.11915974164505086

# small_cat one-hot, large_cat ord
# [-0.11096583 -0.13340597 -0.12563391 -0.108273   -0.12293233]
# 0.12293232865898586
# 0.009369152248574352
# 0.12024220711125147

# no PCA                                                       - choosing this as best
# [-0.10711505 -0.13372507 -0.12441688 -0.10847305 -0.12105381]
# 0.12105381144002565
# 0.010024449857969605
# 0.11895677082508065

# no PCA, but with numerical standard scaling
# [-0.10507992 -0.13093361 -0.12520356 -0.10951426 -0.12321157]
# 0.1232115691438285
# 0.009819949434289373
# 0.1187885845004681

In [27]:
# Pipeline LGBM 1 - LightGBM with PCA + M-estimate encoding; both categorical
# groups are ordinal-encoded (no one-hot in this variant).
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

# Columns that get standard-scaled: "<col>_encoded" plus "PCA_0".."PCA_4".
lgbm1_num_cols = (
    [col + "_encoded" for col in large_cat_categorical_cols]
    + [f"PCA_{i}" for i in range(5)]
)

pipeline_lgbm1 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    (
        'append_target_encoder',
        CrossFoldEncoder(
            cols=large_cat_categorical_cols,
            encoder=MEstimateEncoder,
            encoder_other_params={"m": 10.0},
        ),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            remainder="passthrough",  # untouched columns flow straight to the model
            transformers=[
                ('num', numerical_transformer, lgbm1_num_cols),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ],
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, random_state=SEED, bagging_seed=SEED)),
])
# score_dataset(X, y, pipeline_lgbm1)

# [-0.12680761 -0.13677544 -0.13297276 -0.12339225 -0.12447066]
# 0.126807605437858
# 0.0051563769780464805
# 0.12888374307590683

In [28]:
# Pipeline LGBM 2 - same layout as LGBM 1, except `small_cat_categorical_cols`
# are one-hot encoded (dense output, unknown categories ignored).
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Columns that get standard-scaled: "<col>_encoded" plus "PCA_0".."PCA_4".
lgbm2_num_cols = (
    [col + "_encoded" for col in large_cat_categorical_cols]
    + [f"PCA_{i}" for i in range(5)]
)

pipeline_lgbm2 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    (
        'append_target_encoder',
        CrossFoldEncoder(
            cols=large_cat_categorical_cols,
            encoder=MEstimateEncoder,
            encoder_other_params={"m": 10.0},
        ),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            remainder="passthrough",  # untouched columns flow straight to the model
            transformers=[
                ('num', numerical_transformer, lgbm2_num_cols),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ],
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, random_state=SEED, bagging_seed=SEED)),
])
# score_dataset(X, y, pipeline_lgbm2)

# [-0.12443105 -0.13954521 -0.13391999 -0.12133567 -0.12435406]
# 0.12443105229520869
# 0.006873280279288598
# 0.12871719775451934

In [29]:
# Pipeline LGBM 3 - LightGBM with PCA, then k-means fitted on the PCA columns
# (cluster label and centroid distances both requested), then M-estimate
# encoding of the large categoricals.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
# In this variant the ordinal codes are additionally standard-scaled.
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Everything that is standard-scaled: the raw numeric columns plus the
# "<col>_encoded", "PCA_<i>" and "distance_centroid_<i>" columns.
lgbm3_num_cols = (
    numerical_cols
    + [col + "_encoded" for col in large_cat_categorical_cols]
    + [f"PCA_{i}" for i in range(5)]
    + [f"distance_centroid_{i}" for i in range(10)]
)

pipeline_lgbm3 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    (
        'append_kmeans',
        AppendKMeans(
            [f"PCA_{i}" for i in range(5)],
            n_clusters=10,
            return_cluster=True,
            return_distances=True,
        ),
    ),
    (
        'append_target_encoder',
        CrossFoldEncoder(
            cols=large_cat_categorical_cols,
            encoder=MEstimateEncoder,
            encoder_other_params={"m": 10.0},
        ),
    ),
    (
        'encoder_scaler',
        ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ('num', numerical_transformer, lgbm3_num_cols),
                ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
                # 'Cluster' is one-hot encoded together with the small categoricals.
                ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster']),
            ],
        ),
    ),
    ('model', LGBMRegressor(verbose=-1, random_state=SEED, bagging_seed=SEED)),
])

# score_dataset(X, y, pipeline_lgbm3)

# [-0.12556062 -0.14073109 -0.13393737 -0.12028624 -0.12396851]
# 0.12556061733110063
# 0.007416435833493804
# 0.12889676305259004

In [30]:
# Pipeline LGBM 4 - kmeans only
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline = Pipeline([
#     ('append_kmeans', AppendKMeans(X.columns, 
#                                    n_clusters = 10,
#                                    return_cluster=True, return_distances=False)),
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, ([col + "_encoded" for col in large_cat_categorical_cols])
#             ),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1))         
# ])

# score_dataset(X, y, pipeline)

# [-0.11694906 -0.14512712 -0.13344501 -0.1199983  -0.13515   ]
# 0.13344501041063725
# 0.010368142649721615
# 0.1301338982050343

In [31]:
# Pipeline LGBM 5 - only 1-hot encoding

# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('smallcat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([ 
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
#             ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1))         
# ])

# score_dataset(X, y, pipeline)
# [-0.12025565 -0.14658346 -0.13970263 -0.11828571 -0.1324062 ]
# 0.13240620276420592
# 0.010923859302942571
# 0.1314467317752831

In [32]:
# BayesianRidge

# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', BayesianRidge())         
# ])

# score_dataset(X, y, pipeline)

# with ord small cat
# [-0.11571775 -0.1404684  -0.13144348 -0.12177303 -0.1757111 ]
# 0.13144348097093617
# 0.021100927807127443
# 0.137022751261448

# with one-hot encoded small cat
# [-0.11508182 -0.13797263 -0.12967817 -0.11685041 -0.16775914]
# 0.12967817316312194
# 0.01910357166131022
# 0.1334684342946869

In [33]:
#HistGradientBoostingRegressor

# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OrdinalEncoder())
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', HistGradientBoostingRegressor())         
# ])

# score_dataset(X, y, pipeline)

# with ord small cat
# [-0.12457506 -0.13843074 -0.13592198 -0.12377773 -0.12229731]
# 0.1245750564606005
# 0.006762113478642641
# 0.12900056276317262

# with one-hot small cat
# [-0.12524236 -0.13972326 -0.13713656 -0.12461367 -0.1224119 ]
# 0.12524236050105655
# 0.007135083806368601
# 0.1298255490134746

In [34]:
#ExtraTreesRegressor

# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', ExtraTreesRegressor())         
# ])

# score_dataset(X, y, pipeline)

# with ord small cat
# [-0.11695888 -0.14201505 -0.1388615  -0.11885999 -0.12546584]
# 0.1254658428043362
# 0.010250146640853476
# 0.12843225351414586

# with one-hot small cat
# [-0.11463497 -0.14057763 -0.13820291 -0.11496482 -0.12522596]
# 0.1252259560096883
# 0.011048579290064723
# 0.12672125901329195

HistGradientBoostingRegressor and ExtraTreesRegressor give approximately the same score as XGB (slightly worse).
Since three other tree-based methods have already been tried and hyperparameter-optimized, these two methods are not considered further.

# Optimize hyperparameters

In [35]:
# def xgb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(xgb_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [36]:
# def xgb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 1800, 8000, step = 200),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')

# study.enqueue_trial({'model__n_estimators': 1800, 'model__learning_rate': 0.00851634744921019,
#                      'model__max_depth': 4, 'model__min_child_weight': 4,
#                      'model__lambda': 0.001364771669279719, 'model__alpha': 0.0003587496054225881,
#                      'model__subsample': 0.74, 'model__colsample_bytree': 0.45})

# study.optimize(xgb_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [37]:
# def rf_objective(trial):  
    
#     params = {
#         'model__n_estimators':          trial.suggest_int('model__n_estimators', 500, 10000, step = 100),
#         'model__max_depth':             trial.suggest_categorical('model__max_depth', [None] + list(range(4, 9))),
#         'model__min_samples_split':     trial.suggest_int('model__min_samples_split', 2, 20),
#         'model__min_samples_leaf':      trial.suggest_int('model__min_samples_leaf', 1, 20),
#         'model__max_features':          trial.suggest_categorical('model__max_features', ["sqrt", "log2", None]),
#         'model__min_impurity_decrease': trial.suggest_categorical('model__min_impurity_decrease',
#                                                                   [0, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(rf_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"RF tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [38]:
# def cb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__l2_leaf_reg':        trial.suggest_float('model__l2_leaf_reg', 1e-3, 10.0, log = True),
#         'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 4, 16),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bylevel':  trial.suggest_float('model__colsample_bylevel', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(cb_objective, n_trials = 50)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"CB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [39]:
# def lgbm_objective(trial):  
    
#     params = {
#          'model__n_estimators':       trial.suggest_int('model__n_estimators', 100, 2000, step = 50),
#          'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#          'model__num_leaves':         trial.suggest_int('model__num_leaves', 16, 256),
#          'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#          'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#          'model__bagging_freq':       trial.suggest_int('model__bagging_freq', 0, 7),
#          'model__bagging_fraction':   trial.suggest_float('model__bagging_fraction', 0.5, 1.0, step = 0.05),
#          'model__reg_alpha':          trial.suggest_float('model__reg_alpha', 1e-4, 10.0, log = True),
#          'model__reg_lambda':         trial.suggest_float('model__reg_lambda', 1e-4, 10.0, log = True),
#          'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(lgbm_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBM tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [40]:
# def lgbm_objective(trial):  
    
#     params = {
#          'model__n_estimators':       trial.suggest_int('model__n_estimators', 1800, 8000, step = 50),
#          'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#          'model__num_leaves':         trial.suggest_int('model__num_leaves', 16, 256),
#          'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#          'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#          'model__bagging_freq':       trial.suggest_int('model__bagging_freq', 0, 7),
#          'model__bagging_fraction':   trial.suggest_float('model__bagging_fraction', 0.5, 1.0, step = 0.05),
#          'model__reg_alpha':          trial.suggest_float('model__reg_alpha', 1e-4, 10.0, log = True),
#          'model__reg_lambda':         trial.suggest_float('model__reg_lambda', 1e-4, 10.0, log = True),
#          'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.enqueue_trial({'model__n_estimators': 1850, 'model__learning_rate': 0.01090254772325081,
#                      'model__num_leaves': 248, 'model__max_depth': 4,
#                      'model__min_data_in_leaf': 2, 'model__bagging_freq': 4,
#                      'model__bagging_fraction': 0.6, 'model__reg_alpha': 0.013788629591410967,
#                      'model__reg_lambda': 0.10833316245693235, 'model__colsample_bytree': 0.56})
# study.optimize(lgbm_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBM tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [41]:
# def bayesian_ridge_objective(trial):  
    
#     params = {
#          'model__n_iter':         trial.suggest_int('model__n_iter', 100, 2000, step = 50),
#          'model__tol':            trial.suggest_float('model__tol', 1e-4, 0.1, log=True),
#          'model__alpha_1':        trial.suggest_float('model__alpha_1', 1e-7, 0.1, log = True),
#          'model__alpha_2':        trial.suggest_float('model__alpha_2', 1e-7, 0.1, log = True),
#          'model__lambda_1':       trial.suggest_float('model__lambda_1', 1e-7, 0.1, log = True),
#          'model__lambda_2':       trial.suggest_float('model__lambda_2', 1e-7, 0.1, log = True)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(bayesian_ridge_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Bayesian Ridge tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [42]:
# # Best params
# best_params = {'model__n_estimators': 650, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# Best is trial 62 finished with value: 0.11742375540973124 
# [-0.10837224 -0.12998627 -0.12850005 -0.10610519 -0.11415502]
# 0.11415502442730315
# 0.010012248568044236

# # Best params
# best_params = {'model__n_estimators': 9000, 'model__learning_rate': 0.002221965665898899, 
#                'model__max_depth': 4, 'model__min_child_weight': 5, 
#                'model__lambda': 0.3279, 'model__alpha': 0.2417, 
#                'model__subsample': 0.8049000000000001, 'model__colsample_bytree': 0.9475, 
#                'model__colsample_bylevel': 0.4768, 'model__colsample_bynode': 0.4454}

# Best is trial 92 with value: 0.11728844905814861.
# [-0.1069524  -0.12825763 -0.12633835 -0.10719973 -0.11769413]
# 0.11769413498310685
# 0.009066468479194328

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# [-0.10795649 -0.13002858 -0.12830252 -0.10569348 -0.11434615]
# 0.11434615405227047
# 0.010137099509793399
# 0.11726544444298999

# best_params = {'model__n_estimators': 1650, 'model__learning_rate': 0.02234288131834421, 
#                'model__max_depth': 3, 'model__min_child_weight': 2, 
#                'model__lambda': 6.1143, 'model__alpha': 0.009000000000000001, 
#                'model__subsample': 0.4839, 'model__colsample_bytree': 0.41910000000000003, 
#                'model__colsample_bylevel': 0.7788, 'model__colsample_bynode': 0.7030000000000001}

# Best is trial 82 with value: 0.11790924808934752.
# [-0.10657961 -0.12788791 -0.12635717 -0.10699523 -0.12172632]
# 0.12172631618375584
# 0.009305776801589548

# Selected XGB hyperparameters (variant 1). Keys use the `model__<param>` form,
# the same naming the Optuna objectives in this notebook suggest and pass to
# `set_params` on a cloned pipeline.
best_params_xgb1 = {
    "model__n_estimators": 1650,
    "model__learning_rate": 0.007016525014732306,
    "model__max_depth": 5,
    "model__min_child_weight": 1,
    "model__lambda": 0.0003700954457814859,
    "model__alpha": 0.001536806691225178,
    "model__subsample": 0.8200000000000001,
    "model__colsample_bytree": 0.45,
}
# Best is trial 67 with value: 0.11733104229147182.
# [-0.110808   -0.129472   -0.12964501 -0.10534453 -0.11138566]
# 0.1113856557342387
# 0.010204027122436335

In [43]:
# # Best params
# with Age_since_mod feature

# best_params = {'model__n_estimators': 850, 'model__learning_rate': 0.017442598532274846, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 0.8255067561921624, 'model__alpha': 0.004278929472964361, 
#                'model__subsample': 0.8974473156207723, 'model__colsample_bytree': 0.6523619866997316, 
#                'model__colsample_bylevel': 0.996362051631553, 'model__colsample_bynode': 0.4628476409826708}

# Best is trial 61 with value: 0.11779737989132073.
# [-0.10790906 -0.1318243  -0.12853275 -0.10425928 -0.11646151]
# 0.11646151088245223
# 0.010907303679586703

In [44]:
# # Best params
# with pipeline2

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

#Best is trial 85 with value: 0.11782913284946137.
# [-0.1100461  -0.12623955 -0.12463757 -0.11017505 -0.1180474 ]
# 0.11804740104228104
# 0.0068745661426423185


# best_params = {'model__n_estimators': 8500, 'model__learning_rate': 0.0024356711073752263, 
#                'model__max_depth': 6, 'model__min_child_weight': 7, 
#                'model__lambda': 1.0561, 'model__alpha': 0.0106, 
#                'model__subsample': 0.7481, 'model__colsample_bytree': 0.6855, 
#                'model__colsample_bylevel': 0.8596, 'model__colsample_bynode': 0.7395}

# Best is trial 86 with value: 0.11844810379526453.
# [-0.11185226 -0.12871867 -0.12804039 -0.10766969 -0.11595951]
# 0.11595951124551873
# 0.008524891860194693


# best_params = {'model__n_estimators': 1600, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

# pipeline 2
# 0.11802772081955834  2000
# 0.11790519839005424  1800
# 0.11783945720039994  1700
# 0.11775646400190418  1600
# 0.11777810773615957  1500
# 0.11777453916635619  1400
# 0.11776996564553088  1300
# 0.1177948484577633   1200

In [45]:
# # Best params
# with pipeline3

# Tuned hyperparameters for pipeline_xgb3; applied further down via set_params.
best_params_xgb3 = {
    "model__n_estimators": 1800,
    "model__learning_rate": 0.00851634744921019,
    "model__max_depth": 4,
    "model__min_child_weight": 4,
    "model__lambda": 0.001364771669279719,
    "model__alpha": 0.0003587496054225881,
    "model__subsample": 0.74,
    "model__colsample_bytree": 0.45,
}

# Best is trial 94 with value: 0.11586713304296034.
# [-0.10719354 -0.12648766 -0.12653823 -0.103357   -0.11575924]
# 0.11575923785884161
# 0.00957511471362708

# best_params = {'model__n_estimators': 3000, 'model__learning_rate': 0.011132262126243759, 
#                'model__max_depth': 4, 'model__min_child_weight': 3, 
#                'model__lambda': 0.0009857953750043423, 'model__alpha': 0.020320630432170268, 
#                'model__subsample': 0.79, 'model__colsample_bytree': 0.48000000000000004}

# Best is trial 53 with value: 0.11543456407868449.
# [-0.10707274 -0.12636534 -0.12598445 -0.10213922 -0.11561108]
# 0.11561107564972747
# 0.009772403947525961

In [46]:
# #pipeline 4 best params
# {'model__n_estimators': 500, 'model__max_depth': None, 
#  'model__min_samples_split': 4, 'model__min_samples_leaf': 1, 
#  'model__max_features': None, 'model__min_impurity_decrease': 1e-05}

# Best is trial 69 with value: 0.13452078571895393.
# [-0.13265846 -0.14837583 -0.13993977 -0.12230483 -0.12932503]
# 0.13265846285872251
# 0.00895568242880693

In [47]:
# Pipeline 5 best params
# best_params = {'model__n_estimators': 600, 'model__learning_rate': 0.0432035923480985, 
#                'model__max_depth': 5, 'model__min_child_weight': 4, 
#                'model__lambda': 1.6142, 'model__alpha': 0.014, 
#                'model__subsample': 0.6325000000000001, 'model__colsample_bytree': 0.4691, 
#                'model__colsample_bylevel': 0.4232, 'model__colsample_bynode': 0.9493}

# Best is trial 91 with value: 0.11773786792108754.
# [-0.10644425 -0.12944014 -0.12619847 -0.10790766 -0.11869882]
# 0.11869881720548991
# 0.009312649560497113


# best_params = {'model__n_estimators': 1050, 'model__learning_rate': 0.01062872306058349, 
#                'model__max_depth': 11, 'model__min_child_weight': 4, 
#                'model__lambda': 2.1802052986711025, 'model__alpha': 0.00921920490425625, 
#                'model__subsample': 0.47000000000000003, 'model__colsample_bytree': 0.44}

# Best is trial 58 with value: 0.11967778326753713.
# [-0.10904947 -0.134964   -0.12691889 -0.10738619 -0.12007037]
# 0.12007036705813962
# 0.010490951150130552

In [48]:
# Pipeline 6 best params
# best_params = {'model__n_estimators': 1150, 'model__learning_rate': 0.03352307389535568, 
#                'model__max_depth': 3, 'model__min_child_weight': 6, 
#                'model__lambda': 6.661700000000001, 'model__alpha': 0.045700000000000005, 
#                'model__subsample': 0.43370000000000003, 'model__colsample_bytree': 0.9654, 
#                'model__colsample_bylevel': 0.6141000000000001, 'model__colsample_bynode': 0.6744000000000001}
# Best is trial 93 with value: 0.11865236004232849.
# [-0.11241793 -0.12547345 -0.12666839 -0.10996032 -0.11874172]
# 0.11874172303964096
# 0.006711357597087486

In [49]:
# Pipeline 7 best params
# Tuned hyperparameters for pipeline_xgb7; applied further down via set_params.
best_params_xgb7 = {
    "model__n_estimators": 1100,
    "model__learning_rate": 0.025937951529056733,
    "model__max_depth": 4,
    "model__min_child_weight": 4,
    "model__lambda": 2.7752000000000003,
    "model__alpha": 0.0146,
    "model__subsample": 0.44530000000000003,
    "model__colsample_bytree": 0.6413,
    "model__colsample_bylevel": 0.4937,
    "model__colsample_bynode": 0.7684,
}

# Best is trial 74 with value: 0.11660322347801237.
# [-0.1039625  -0.12587415 -0.12546581 -0.10753416 -0.12017951]
# 0.1201795084009607
# 0.009157741226268295

# best_params = {'model__n_estimators': 1950, 'model__learning_rate': 0.013543890928772154, 
#                'model__max_depth': 4, 'model__min_child_weight': 7, 
#                'model__lambda': 0.010221196759663355, 'model__alpha': 0.0013603517427302739, 
#                'model__subsample': 0.8500000000000001, 'model__colsample_bytree': 0.41000000000000003}

# Best is trial 86 with value: 0.1187055587523349.
# [-0.10663061 -0.12755437 -0.12829783 -0.10752408 -0.12352092]
# 0.1235209150589082
# 0.009636708582726212

In [50]:
# # Pipeline 8 best params
# best_params = {'model__n_estimators': 1950, 'model__learning_rate': 0.013920563709541603, 
#                'model__max_depth': 4, 'model__min_child_weight': 3, 
#                'model__lambda': 0.0047096596141103335, 'model__alpha': 0.01205623249289589, 
#                'model__subsample': 0.69, 'model__colsample_bytree': 0.49}

# Best is trial 83 with value: 0.11839759395186619.
# [-0.1051799  -0.13080341 -0.12716429 -0.10431576 -0.12452461]
# 0.12452460859487571
# 0.01132523820187055

In [51]:
# CatBoost Pipeline 1

# best_params = {'model__n_estimators': 1700, 'model__learning_rate': 0.016728902877239903, 
#                'model__l2_leaf_reg': 0.040835135117464366, 'model__min_data_in_leaf': 9, 
#                'model__max_depth': 6, 'model__subsample': 0.4963, 
#                'model__colsample_bylevel': 0.4702}

# Best is trial 34 with value: 0.11666610454039741.
# [-0.10795215 -0.12487765 -0.12579047 -0.10819275 -0.1165175 ]
# 0.11651750103817046
# 0.007725754808568546

# best_params = {'model__n_estimators': 1350, 'model__learning_rate': 0.03110573892360532, 
#                'model__l2_leaf_reg': 0.09438679768877248, 'model__min_data_in_leaf': 26, 
#                'model__max_depth': 6, 'model__subsample': 0.54, 
#                'model__colsample_bylevel': 0.79}

# Best is trial 15 with value: 0.11628226597361586.
# [-0.10668533 -0.12480329 -0.1268338  -0.10741574 -0.11567317]
# 0.11567317248352649
# 0.00842650896916564

In [52]:
# CatBoost Pipeline 2

# Tuned hyperparameters for pipeline_cb2; applied further down via set_params.
best_params_cb2 = {
    "model__n_estimators": 1150,
    "model__learning_rate": 0.017712553265578305,
    "model__l2_leaf_reg": 0.08343115659041794,
    "model__min_data_in_leaf": 4,
    "model__max_depth": 5,
    "model__subsample": 0.44410000000000005,
    "model__colsample_bylevel": 0.4676,
}

# Best is trial 41 with value: 0.115131021385492.
# [-0.10631423 -0.12341525 -0.12524074 -0.10716469 -0.11352019]
# 0.11352019431883034
# 0.00793256565121934

In [53]:
# CatBoost Pipeline 3

# best_params = {'model__n_estimators': 2000, 'model__learning_rate': 0.007906985518057994, 
#                'model__l2_leaf_reg': 0.0922415789070384, 'model__min_data_in_leaf': 31, 
#                'model__max_depth': 4, 'model__subsample': 0.8200000000000001, 
#                'model__colsample_bylevel': 0.76}
# Best is trial 12 with value: 0.11855774737155153.
# [-0.10514406 -0.12587876 -0.12669242 -0.10977513 -0.12529837]
# 0.12529837064895213
# 0.00918986104825868

# Tuned hyperparameters for pipeline_cb3; applied further down via set_params.
best_params_cb3 = {
    "model__n_estimators": 1750,
    "model__learning_rate": 0.02157894921249164,
    "model__l2_leaf_reg": 0.3194559118452819,
    "model__min_data_in_leaf": 38,
    "model__max_depth": 5,
    "model__subsample": 0.43460000000000004,
    "model__colsample_bylevel": 0.9166000000000001,
}
# Best is trial 31 with value: 0.11619643846168552.
# [-0.10281654 -0.126599   -0.12371644 -0.10972694 -0.11812327]
# 0.11812327086423803
# 0.008824425231554592

In [54]:
# CatBoost Pipeline 4

# best_params_cb4 = {'model__n_estimators': 1350, 'model__learning_rate': 0.024279894068992076,
#                    'model__l2_leaf_reg': 0.07703432557306762, 'model__min_data_in_leaf': 22,
#                    'model__max_depth': 6, 'model__subsample': 0.98,
#                    'model__colsample_bylevel': 0.81}

# Best is trial 1 with value: 0.1214007095620582.
# [-0.11295343 -0.13260574 -0.13311869 -0.11034042 -0.11798527]
# 0.11798526791881647
# 0.009676935519162008


# Active hyperparameters for pipeline_cb4; applied further down via set_params.
best_params_cb4 = {
    "model__n_estimators": 1900,
    "model__learning_rate": 0.023875793128268784,
    "model__l2_leaf_reg": 0.20539579640233013,
    "model__min_data_in_leaf": 5,
    "model__max_depth": 5,
    "model__subsample": 0.9400000000000001,
    "model__colsample_bylevel": 0.8500000000000001,
}
# Best is trial 26 with value: 0.11973587332021315.
# [-0.11301114 -0.12934981 -0.12627155 -0.10764008 -0.12240679]
# 0.12240678985145056
# 0.008170764103107636

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.20539579640233013, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11201021 -0.13352178 -0.12705421 -0.10952506 -0.11527173]
# 0.1152717345809794
# 0.00924281603375057
# 0.11947660122245345


# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188239 -0.13235365 -0.12709226 -0.10918557 -0.11390662]
# 0.11390662005291685
# 0.009128671332061692
# 0.11888409741568533

# best_params_cb4 = {'model__n_estimators': 1950, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11189285 -0.13239772 -0.12709855 -0.1091963  -0.11390332]
# 0.1139033216933719
# 0.009139291670199706
# 0.11889774821034231

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11187549 -0.13243271 -0.12707561 -0.10919286 -0.11377302]
# 0.11377302074072512
# 0.009163281155937423
# 0.11886993653590813

# best_params_cb4 = {'model__n_estimators': 2050, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11189929 -0.13239579 -0.12705958 -0.10902376 -0.1137576 ]
# 0.11375760104627411
# 0.009183495070161809
# 0.11882720488252445

# best_params_cb4 = {'model__n_estimators': 2100, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11195553 -0.13240284 -0.12707245 -0.1088707  -0.11380872]
# 0.11380872083482574
# 0.009206725554952992
# 0.11882204728095043

# best_params_cb4 = {'model__n_estimators': 2150, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188727 -0.13229883 -0.12710833 -0.10883597 -0.1138046 ]
# 0.11380459842870366
# 0.009200734837444853
# 0.11878700098596284

# best_params_cb4 = {'model__n_estimators': 2200, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11187491 -0.13226491 -0.12717628 -0.10877656 -0.11378779]
# 0.11378778899899722
# 0.009219668505541253
# 0.11877609053306046

# best_params_cb4 = {'model__n_estimators': 2250, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188312 -0.13237645 -0.12718839 -0.10881034 -0.11384685]
# 0.11384684816994045
# 0.009239622081836861
# 0.11882102814314441

# best_params_cb4 = {'model__n_estimators': 2300, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11185689 -0.13236175 -0.12719613 -0.10874102 -0.11392465]
# 0.11392464898140518
# 0.009247420514116129
# 0.11881608785479755

# best_params_cb4 = {'model__n_estimators': 2350, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11194388 -0.13233419 -0.12720186 -0.10868674 -0.11405942]
# 0.11405942235383687
# 0.009225104178520382
# 0.11884521798601644

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.018, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11274411 -0.13109232 -0.13000153 -0.10778541 -0.11933193]
# 0.11933193350986915
# 0.009221475370939702
# 0.12019106130273345

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.019, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11430754 -0.13017581 -0.12899373 -0.10906591 -0.11834235]
# 0.11834234952337444
# 0.008233877845389339
# 0.12017706659772671

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.017, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11610427 -0.13273477 -0.12919228 -0.10891501 -0.11945877]
# 0.11945877076840226
# 0.008681174676817551
# 0.12128101783289294

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.016, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11476089 -0.12941193 -0.12948647 -0.11108606 -0.11869038]
# 0.11869037844886919
# 0.007547693989539927
# 0.1206871467033112

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.015, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11363649 -0.13034886 -0.12870418 -0.10957033 -0.11996888]
# 0.11996887759049966
# 0.008138077449944102
# 0.12044574755157633

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.015, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11370692 -0.13021425 -0.12852065 -0.10942142 -0.11982283]
# 0.11982283435274602
# 0.00809820296285674
# 0.12033721580343484

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.01, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11432627 -0.13146917 -0.12834248 -0.11089592 -0.12130761]
# 0.12130761373334997
# 0.007872502061877364
# 0.12126829037306623

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.005, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11695995 -0.13324756 -0.13340318 -0.11716379 -0.12433965]
# 0.12433964901078248
# 0.007281728167621455
# 0.12502282693062136

In [55]:
# CatBoost Pipeline 5

# best_params = {'model__n_estimators': 1700, 'model__learning_rate': 0.02945430207184643, 
#                'model__l2_leaf_reg': 0.027563217977551877, 'model__min_data_in_leaf': 9, 
#                'model__max_depth': 7, 'model__subsample': 0.69, 
#                'model__colsample_bylevel': 0.8700000000000001}

# Best is trial 15 with value: 0.11733894066927993.
# [-0.10479313 -0.12909713 -0.12442671 -0.10861599 -0.11976175]
# 0.11976175179720044
# 0.009250393243949715

In [56]:
# Pipeline LGBM 1

# Tuned hyperparameters for pipeline_lgbm1; applied further down via set_params.
best_params_lgbm1 = {
    "model__n_estimators": 1850,
    "model__learning_rate": 0.00919961299356949,
    "model__num_leaves": 16,
    "model__max_depth": 15,
    "model__min_data_in_leaf": 1,
    "model__bagging_freq": 3,
    "model__bagging_fraction": 0.5,
    "model__reg_alpha": 0.006780286384457145,
    "model__reg_lambda": 0.009128122260443947,
    "model__colsample_bytree": 0.52,
}

# Best is trial 66 with value: 0.11588344796374961.
# [-0.10897157 -0.12806925 -0.12505263 -0.10505543 -0.11226835]
# 0.11226835473674715
# 0.009062634359964232

# best_params = {'model__n_estimators': 900, 'model__learning_rate': 0.016368209778087922, 
#                'model__num_leaves': 26, 'model__max_depth': 4, 
#                'model__min_data_in_leaf': 1, 'model__bagging_freq': 4, 
#                'model__bagging_fraction': 0.8500000000000001, 'model__reg_alpha': 0.07073139029094895, 
#                'model__reg_lambda': 0.9864368067144933, 'model__colsample_bytree': 0.48000000000000004}

# Best is trial 31 with value: 0.11633039703943031.
# [-0.10787882 -0.12904064 -0.12713678 -0.10639222 -0.11120351]
# 0.11120351159416869
# 0.009744842136375179

In [57]:
# Pipeline LGBM 2

# Tuned hyperparameters for pipeline_lgbm2; applied further down via set_params.
# NOTE(review): LightGBM documents bagging_freq=0 as "bagging disabled", so the
# bagging_fraction value here presumably has no effect -- confirm.
best_params_lgbm2 = {
    "model__n_estimators": 1900,
    "model__learning_rate": 0.022528140118977913,
    "model__num_leaves": 18,
    "model__max_depth": 4,
    "model__min_data_in_leaf": 1,
    "model__bagging_freq": 0,
    "model__bagging_fraction": 0.8,
    "model__reg_alpha": 0.0005985326337395847,
    "model__reg_lambda": 0.0002363245035597099,
    "model__colsample_bytree": 0.42000000000000004,
}

# Best is trial 38 with value: 0.11560446553278303.
# [-0.10663778 -0.12916449 -0.1261197  -0.10309439 -0.11300596]
# 0.11300596297350247
# 0.010374005785838268

In [58]:
# Pipeline LGBM 3 - (with kmeans)

# Tuned hyperparameters for pipeline_lgbm3; applied further down via set_params.
# NOTE(review): with max_depth 4 a tree can have at most 2**4 = 16 leaves, so
# num_leaves 248 is presumably never the binding limit -- confirm.
best_params_lgbm3 = {
    "model__n_estimators": 1850,
    "model__learning_rate": 0.01090254772325081,
    "model__num_leaves": 248,
    "model__max_depth": 4,
    "model__min_data_in_leaf": 2,
    "model__bagging_freq": 4,
    "model__bagging_fraction": 0.6,
    "model__reg_alpha": 0.013788629591410967,
    "model__reg_lambda": 0.10833316245693235,
    "model__colsample_bytree": 0.56,
}

# Best is trial 86 with value: 0.11545575046806685.
# [-0.10764055 -0.12391567 -0.12530486 -0.10516194 -0.11525574]
# 0.11525574056354786
# 0.008193353838302055

# The parameter set below scores slightly better than the one above (0.11519 vs 0.11546),
# but it needs a much larger n_estimators (6850 vs 1850), so the set above is kept as the best params.

# best_params = {'model__n_estimators': 6850, 'model__learning_rate': 0.0024998981926820743, 
#                'model__num_leaves': 92, 'model__max_depth': 4, 
#                'model__min_data_in_leaf': 1, 'model__bagging_freq': 5, 
#                'model__bagging_fraction': 0.65, 'model__reg_alpha': 0.008654799905365489, 
#                'model__reg_lambda': 0.004158569757202218, 'model__colsample_bytree': 0.46}
# Best is trial 64 with value: 0.11518509313360328.
# [-0.10615533 -0.1278085  -0.12361509 -0.10496181 -0.11338474]
# 0.11338473860489612
# 0.009161972687259528

In [59]:
# Pipeline LGBM 4 

# best_params = {'model__n_estimators': 800, 'model__learning_rate': 0.05310111992098777, 
#                'model__num_leaves': 126, 'model__max_depth': 2, 
#                'model__min_data_in_leaf': 2, 'model__bagging_freq': 4, 
#                'model__bagging_fraction': 0.8500000000000001, 'model__reg_alpha': 0.2286575446927289, 
#                'model__reg_lambda': 0.16178904582449463, 'model__colsample_bytree': 0.45}

# Best is trial 57 with value: 0.11904876713223907.
# [-0.10424882 -0.12487474 -0.1259728  -0.11114655 -0.12900091]
# 0.12487474415214168
# 0.009616786926739675

In [60]:
# Pipeline LGBM 5

# best_params = {'model__n_estimators': 1650, 'model__learning_rate': 0.010068383476147162, 
#                'model__num_leaves': 187, 'model__max_depth': 4, 
#                'model__min_data_in_leaf': 2, 'model__bagging_freq': 2, 
#                'model__bagging_fraction': 0.55, 'model__reg_alpha': 0.34281244232395713, 
#                'model__reg_lambda': 0.11822047924368936, 'model__colsample_bytree': 0.4}

# Best is trial 95 with value: 0.11911372688117125.
# [-0.10709439 -0.13089928 -0.12669271 -0.10927673 -0.12160552]
# 0.12160552202387531
# 0.009421046079262654

In [61]:
# Pipeline Bayesian Ridge Regression

# best_params = {'model__n_iter': 650, 'model__tol': 0.00012277969304736106, 
#                'model__alpha_1': 0.00010633679673272293, 'model__alpha_2': 1.4064886184001036e-05, 
#                'model__lambda_1': 7.387907093592057e-06, 'model__lambda_2': 0.024216858525360933}

# Best is trial 71 with value: 0.1330226223829522.
# [-0.11564542 -0.13708175 -0.13063521 -0.11616119 -0.16558953]
# 0.1306352120412312
# 0.018270494205095723

In [62]:
# pipeline.set_params(**best_params)

# Pair every pipeline with its tuned hyperparameter set, then apply them in
# the same order as before (XGBoost, CatBoost, LightGBM).
tuned_configurations = [
    (pipeline_xgb1, best_params_xgb1),
    (pipeline_xgb3, best_params_xgb3),
    (pipeline_xgb7, best_params_xgb7),
    (pipeline_cb2, best_params_cb2),
    (pipeline_cb3, best_params_cb3),
    (pipeline_cb4, best_params_cb4),
    (pipeline_lgbm1, best_params_lgbm1),
    (pipeline_lgbm2, best_params_lgbm2),
    (pipeline_lgbm3, best_params_lgbm3),
]
for target_pipeline, tuned_params in tuned_configurations:
    target_pipeline.set_params(**tuned_params)

# Ensemble

In [63]:
# Creating a custom class because VotingRegressor passes the data as numpy array to the individual estimators
# But the pipelines used (i.e. the estimators for the VotingRegressor) expect Pandas DataFrame.
class CustomVotingRegressor(VotingRegressor):
    """Voting ensemble that hands ``X`` and ``y`` to its estimators unchanged.

    ``fit`` trains the estimator objects listed in ``estimators`` themselves
    (no clones are made here). ``predict`` combines their predictions with an
    element-wise median when ``weights`` is None, and with a weighted average
    when ``weights`` is given.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Entries without a ``fit`` / ``predict`` method (for example the
        string ``'drop'``) are skipped by ``fit`` / ``predict``.
    weights : sequence of numbers or None, default=None
        One weight per entry of ``estimators``. None selects median voting.
    n_jobs, verbose
        Stored through ``VotingRegressor.__init__``; the overridden ``fit``
        and ``predict`` below do not read them.
    """

    def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):
        super().__init__(estimators=estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y):
        """Fit every usable estimator on ``X`` and ``y`` exactly as passed in.

        Returns
        -------
        self
        """
        for _, estimator in self.estimators:
            if hasattr(estimator, "fit"):
                estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the combined prediction of all usable estimators for ``X``.

        Raises
        ------
        ValueError
            If no estimator exposes ``predict``, or if ``weights`` is given
            but its length differs from that of ``estimators``.
        """
        # Remember each usable estimator's position so weights stay aligned
        # when some entries are skipped.
        members = [
            (position, estimator)
            for position, (_, estimator) in enumerate(self.estimators)
            if hasattr(estimator, "predict")
        ]
        if not members:
            # np.median over an empty list would quietly yield NaN.
            raise ValueError("CustomVotingRegressor has no estimator with a predict method.")

        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError(
                f"Number of weights ({len(self.weights)}) must match "
                f"number of estimators ({len(self.estimators)})."
            )

        predictions = [estimator.predict(X) for _, estimator in members]

        if self.weights is None:
            # Default: median voting across estimators.
            return np.median(predictions, axis=0)

        # Explicit weights were previously ignored; honour them with a
        # weighted average across estimators.
        member_weights = [self.weights[position] for position, _ in members]
        return np.average(predictions, axis=0, weights=member_weights)


# pipeline = CustomVotingRegressor([("xgb3", pipeline_xgb3),
#                                   ("cb2", pipeline_cb2),
#                                   ("lgbm3", pipeline_lgbm3)])
# pipeline = CustomVotingRegressor([("xgb1", pipeline_xgb1), ("xgb3", pipeline_xgb3), ("xgb7", pipeline_xgb7),
#                                   ("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4),
#                                   ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3)])
# pipeline = CustomVotingRegressor([("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4)])
# pipeline

In [64]:
# Select the model used for the final fit and submission: CatBoost pipeline 4.
pipeline = pipeline_cb4
# Bare expression on the cell's last line so the notebook displays the pipeline.
pipeline

# CV Score

In [65]:
# print(score_dataset(X, y, pipeline_xgb1))
# # [-0.110808   -0.129472   -0.12964501 -0.10534453 -0.11138566]
# # 0.1113856557342387
# # 0.010204027122436335
# # 0.11733104229147182

# print(score_dataset(X, y, pipeline_xgb3))
# # [-0.10719354 -0.12648766 -0.12653823 -0.103357   -0.11575924]
# # 0.11575923785884161
# # 0.00957511471362708
# # 0.11586713304296034

# print(score_dataset(X, y, pipeline_xgb7))
# # [-0.1039625  -0.12587415 -0.12546581 -0.10753416 -0.12017951]
# # 0.1201795084009607
# # 0.009157741226268295
# # 0.11660322347801237

# print(score_dataset(X, y, pipeline_cb2))
# # [-0.10631423 -0.12341525 -0.12524074 -0.10716469 -0.11352019]
# # 0.11352019431883034
# # 0.00793256565121934
# # 0.115131021385492

# print(score_dataset(X, y, pipeline_cb3))
# # [-0.10281654 -0.126599   -0.12371644 -0.10972694 -0.11812327]
# # 0.11812327086423803
# # 0.008824425231554592
# # 0.11619643846168552

# print(score_dataset(X, y, pipeline_cb4))
# # [-0.11295343 -0.13260574 -0.13311869 -0.11034042 -0.11798527]
# # 0.11798526791881647
# # 0.009676935519162008
# # 0.1214007095620582

# print(score_dataset(X, y, pipeline_lgbm1))
# # [-0.10897157 -0.12806925 -0.12505263 -0.10505543 -0.11226835]
# # 0.11226835473674715
# # 0.009062634359964232
# # 0.11588344796374961

# print(score_dataset(X, y, pipeline_lgbm2))
# # [-0.10663778 -0.12916449 -0.1261197  -0.10309439 -0.11300596]
# # 0.11300596297350247
# # 0.010374005785838268
# # 0.11560446553278303

# print(score_dataset(X, y, pipeline_lgbm3))
# # [-0.10764055 -0.12391567 -0.12530486 -0.10516194 -0.11525574]
# # 0.11525574056354786
# # 0.008193353838302055
# # 0.11545575046806685

In [66]:
# score_dataset(X, y, pipeline)

# pipeline_xgb3
# 0.11586713304296034
# public score : 0.12172

# pipeline_cb2
# 0.115131021385492
# public score : 0.12092

# pipeline_cb4
	# 0.1214007095620582
	# public score : 0.11915
	
	# 0.11973587332021315
	# public score : 0.11767   *best*
	
	# 0.11877609053306046
	# public score : 0.11857
	
	# 0.11886993653590813
	# public score : 0.11854
	
	# 0.11888409741568533
	# public score : 0.11855
	
	# 0.11947660122245345
	# public score : 0.11848

# pipeline_lgbm3
# 0.11545575046806685
# public score : 0.12226

##################################################################################

# CustomVotingRegressor xgb3, cb2, lgbm3
# [-0.10592071 -0.12359544 -0.12471606 -0.10401465 -0.11350561]
# 0.11350561026281031
# 0.00861996029068565
# 0.11435049565486803

# public score : 0.12054

######################################
# with median voting

# [-0.10593103 -0.12457699 -0.12517086 -0.10322035 -0.11443593]
# 0.1144359285988727
# 0.0091206939039808
# 0.11466703109172434


##################################################################################

# CustomVotingRegressor xgb 137, cb 234, lgbm 123
# [-0.10452924 -0.12480733 -0.124325   -0.10344957 -0.11256266]
# 0.11256265630472277
# 0.009235317204888441
# 0.11393475744392181

# public score : 0.11934

######################################
# with median voting
# [-0.10510494 -0.12485347 -0.1248122  -0.10324854 -0.11312523]
# 0.11312523328731622
# 0.009272764398273466
# 0.11422887489665161

# public score : 0.11979

##################################################################################

# CustomVotingRegressor cb 234

# [-0.10501766 -0.1257221  -0.12555756 -0.10671793 -0.11465395]
# 0.1146539509677765
# 0.008869725329364584
# 0.11553383925228788

######################################
# with median voting

# [-0.10542628 -0.12512008 -0.12560693 -0.10800153 -0.11600194]
# 0.11600193500340161
# 0.00838144314418917
# 0.11603135196791996


# Train on full data and obtain test predictions

In [67]:
# Refit the selected pipeline on the full training data, with the target in log space.
log_target = np.log(y)
pipeline.fit(X, log_target)

# Predict on the test set and map the log-scale predictions back to the original scale.
log_pred = pipeline.predict(X_test)
pred = np.exp(log_pred)

print(pred[:10])

[121558.65130878 159741.89327204 187494.68235669 200495.695071
 184808.55544827 172012.16088957 179926.20132713 166937.7451205
 176027.53393187 126869.91993316]


In [68]:
# Show the fitted pipeline as this cell's output.
pipeline

In [69]:
# Write the test-set predictions to the submission CSV (columns: Id, SalePrice).
submission_columns = {'Id': X_test.index, 'SalePrice': pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print('saved output file')

saved output file
