This notebook is a cleaned-up version of https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory 

# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
- https://www.kaggle.com/code/marto24/beginners-prediction-top3

# Import libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import BayesianRidge, RidgeCV, LinearRegression

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.feature_selection import mutual_info_regression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, PowerTransformer

from functools import reduce

from category_encoders import MEstimateEncoder, cat_boost

from sklearn.compose import ColumnTransformer

import optuna
import time

In [2]:
# import xgboost
# import lightgbm
# print(xgboost.__version__)
# print(lightgbm.__version__)

# 2.0.3
# 4.5.0

# Global variables

In [3]:
# Seed passed as `random_state` to the estimators and CV splitters in this notebook.
SEED = 0

# Load data and preprocess function

In [4]:
def load_and_preprocess_data(train_data = True, perform_impute = True):
    """Read one split of the house-prices data and prepare it for modelling.

    Parameters
    ----------
    train_data : bool
        True reads ``train.csv``, drops rows without ``SalePrice`` and splits
        the target off; False reads ``test.csv`` (no target available).
    perform_impute : bool
        When True, ``impute`` is applied after ``encode``.

    Returns
    -------
    tuple
        ``(X, y)``; ``y`` is the ``SalePrice`` column for the training split
        and ``None`` for the test split.
    """
    if not train_data:
        print("Test data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
        y = None
    else:
        print("Train data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
        X.dropna(axis=0, subset=['SalePrice'], inplace=True)
        # pop() removes the target column from X and hands it back in one step
        y = X.pop('SalePrice')
    print("Loaded data")
    print(X.shape)

    # Garage years later than 2024 are replaced by the remodel year
    # (there is 1 GarageYrBlt with value 2207); missing years are left alone here.
    garage_year = X["GarageYrBlt"]
    keep_as_is = garage_year.isna() | (garage_year <= 2024)
    X["GarageYrBlt"] = garage_year.where(keep_as_is, X["YearRemodAdd"])

    # Rename the "Brk Cmn" label of Exterior2nd to "BrkComm"
    X["Exterior2nd"] = X["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})

    X = encode(X)
    if not perform_impute:
        return (X, y)
    return (impute(X), y)

def encode(df):  # lists of columns needed for this is defined in next cell
    """Cast the categorical columns of ``df`` to pandas categorical dtypes.

    Columns named in ``features_nom`` become unordered categoricals that
    always include a "None" category; columns named in ``ordered_levels``
    become ordered categoricals with exactly the listed levels. ``df`` is
    modified in place and also returned.
    """
    # Nominal categories, with a "None" category reserved for missing values
    for column in features_nom:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category

    # Ordinal categories: the order of the levels is the order given in ordered_levels
    for column in ordered_levels:
        ordered_dtype = CategoricalDtype(ordered_levels[column], ordered=True)
        df[column] = df[column].astype(ordered_dtype)

    return df

def impute(df):
    """Fill missing values in ``df`` in place and return it.

    * ``GarageYrBlt`` is set to ``YearRemodAdd`` where a garage type is
      recorded but the garage year is missing.
    * Remaining missing numeric values become 0.
    * Missing categorical values become the "None" category (which each
      categorical column is expected to already contain).
    """
    garage_without_year = df["GarageYrBlt"].isna() & df["GarageType"].notna()
    df.loc[garage_without_year, "GarageYrBlt"] = df["YearRemodAdd"]

    numeric_names = df.select_dtypes("number").columns
    for column in numeric_names:
        df[column] = df[column].fillna(0)

    categorical_names = df.select_dtypes("category").columns
    for column in categorical_names:
        df[column] = df[column].fillna("None")

    return df

# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [5]:
# Nominal (unordered) categorical features.
# NOTE: "CentralAir" is also listed in ordered_levels below; encode() applies the
# ordered dtypes last, so the ordered definition is the one that ends up on the column.
features_nom = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LandContour",
    "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType",
    "MiscFeature", "SaleType", "SaleCondition", "Fence", "Electrical",
]

# Ordinal (ordered) categorical features: levels are listed from lowest to highest.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = [*range(1, 11)]

ordered_levels = {
    **dict.fromkeys(["OverallQual", "OverallCond"], ten_levels),
    **dict.fromkeys(
        ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
         "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"],
        five_levels,
    ),
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
}

# Every ordinal scale gets "None" as its lowest level (used for missing values).
ordered_levels = {feature: ["None", *levels] for feature, levels in ordered_levels.items()}
ordered_levels.keys()

dict_keys(['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'LotShape', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageFinish', 'PavedDrive', 'Utilities', 'CentralAir'])

# Append features

In [6]:
# MSSubClass code -> textual description of the dwelling type
ms_subclass_mapping = {
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES",
}

# Dwelling-type description -> coarse storey class.
# Built from the MSSubClass codes that belong to each class so the long
# description strings are written only once.
ms_class_mapping = {
    ms_subclass_mapping[code]: story_class
    for story_class, codes in (
        ("1-Story", (20, 30, 40, 120)),
        ("1-1/2 Story", (45, 50, 150)),
        ("2-Story", (60, 70, 160)),
        ("Split-Level", (80, 85, 180)),
        ("Multi-Family/Duplex", (90, 190)),
        ("2-1/2 Story", (75,)),
    )
    for code in codes
}

In [7]:
def append_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Added columns, in order: ``LivLotRatio``, ``Spaciousness``, ``MSClass``,
    ``IsPUD`` and ``GarageAreaPerCar``. The input frame is not modified.
    """
    # Features that were tried and ended up decreasing the overall score
    # (kept here as a record of the experiments):
    #   Spaciousness = (df['1stFlrSF'] + df['2ndFlrSF']) / df.TotRmsAbvGrd
    #   Spaciousness = df.GrLivArea / df.TotRmsAbvGrd
    #   Age = df.YrSold - df.YearBuilt
    #   Age_since_mod = df.YrSold - df.YearRemodAdd
    #   bldg_dummies = pd.get_dummies(df.BldgType, prefix="Bldg"); df.join(bldg_dummies.mul(df.GrLivArea, axis=0))
    #   PorchTypes = df[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)
    #   TotalOutsideSF = df.WoodDeckSF + df.OpenPorchSF + df.EnclosedPorch + df["3SsnPorch"] + df.ScreenPorch
    #   dropping MSSubClass after deriving MSClass / IsPUD
    #   MedNhbdArea = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    #   PCA inspired (https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices):
    #     Feature1 = df.GrLivArea + df.TotalBsmtSF
    #     Feature2 = df.YearRemodAdd * df.TotalBsmtSF
    #   OverallScore = df.OverallQual.cat.codes * df.OverallCond.cat.codes
    #   OverallScore = df.OverallQual.cat.codes + df.OverallCond.cat.codes
    #   LotAreaFrontage = df.LotArea * (df.LotFrontage + 21.0/10)
    #     (small offset to avoid the effect of 0 LotFrontage; 21 is the minimum
    #      LotFrontage before replacing NA with 0)
    #   LotAreaFrontage = df.LotArea * df.LotFrontage
    #   Age_with_quality = (df.YrSold - df.YearBuilt) * df.OverallQual.cat.codes
    #   TotalBathrooms = df.FullBath + (0.5 * df.HalfBath) + df.BsmtFullBath + (0.5 * df.BsmtHalfBath)
    enriched = df.copy()

    living_area = enriched["GrLivArea"]
    enriched["LivLotRatio"] = living_area / enriched["LotArea"]

    # Living area per room, counting bathrooms and kitchens as rooms
    room_count = (enriched["TotRmsAbvGrd"] + enriched["FullBath"]
                  + enriched["HalfBath"] + enriched["KitchenAbvGr"])
    enriched["Spaciousness"] = living_area / room_count

    # Both MSSubClass-derived features start from the textual description
    subclass_description = enriched["MSSubClass"].map(ms_subclass_mapping)

    story_class = subclass_description.map(ms_class_mapping).astype('category')
    enriched["MSClass"] = story_class.cat.add_categories("None").fillna("None")

    pud_flag = subclass_description.str.contains('PUD').astype('category')
    enriched["IsPUD"] = pud_flag.cat.add_categories("None").fillna("None")

    # The 0.1 offset keeps the denominator non-zero when GarageCars is 0
    enriched["GarageAreaPerCar"] = enriched["GarageArea"] / (enriched["GarageCars"] + 0.1)

    return enriched

# Load data and process

In [8]:
# Load and preprocess both splits; the test split has no target.
X, y = load_and_preprocess_data(train_data = True)
X_test, _ = load_and_preprocess_data(train_data = False)

print("removing less important features")
features_to_drop = ['PoolQC', 'MiscVal', 'MoSold', 'PoolArea', 'MiscFeature', 'Utilities']
X = X.drop(columns = features_to_drop)
X_test = X_test.drop(columns = features_to_drop)
print(X.shape, X_test.shape, sep="\n")

# Engineered features are added to train and test with the same function.
print("appending features")
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)

def remove_columns_from_list(orig_list, to_remove):
    """Return the items of ``orig_list`` that are not in ``to_remove``, order kept."""
    kept = []
    for item in orig_list:
        if item in to_remove:
            continue
        kept.append(item)
    return kept
    
# Column-role lists without the features that were dropped from X / X_test.
ordinal_categorical_cols = remove_columns_from_list(list(ordered_levels), features_to_drop)
features_nom = remove_columns_from_list(features_nom, to_remove=features_to_drop)

Train data
Loaded data
(1460, 79)
Test data
Loaded data
(1459, 79)
removing less important features
(1460, 73)
(1459, 73)
appending features
(1460, 78)
(1459, 78)


# Append Cluster information as training features

In [9]:
class AppendKMeans(BaseEstimator, TransformerMixin):
    """Transformer that appends K-Means cluster information to a DataFrame.

    Parameters
    ----------
    cluster_columns : list-like of column labels
        Columns the clustering is computed on (standard-scaled first).
    n_clusters : int
        Number of K-Means clusters.
    return_cluster : bool
        Append the predicted cluster label as column ``Cluster``.
    return_distances : bool
        Append one ``distance_centroid_<i>`` column per cluster.
    """

    def __init__(self, cluster_columns, n_clusters=20, return_cluster=True, return_distances=False):
        self.cluster_columns = cluster_columns
        self.n_clusters = n_clusters
        self.return_cluster = return_cluster
        self.return_distances = return_distances

    def _clustering_inputs(self, X):
        """Return the clustering columns of ``X`` with categoricals replaced by their codes."""
        encoded = X.copy()
        for name in encoded.select_dtypes(["category"]).columns:
            encoded[name] = encoded[name].cat.codes
        return encoded[self.cluster_columns]

    def fit(self, X, y=None):
        """Fit the scaler and K-Means on the clustering columns of ``X``."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._clustering_inputs(X))
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=SEED)
        self.kmeans.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the requested cluster columns appended."""
        # Same scaling as learned in fit()
        scaled = self.scaler.transform(self._clustering_inputs(X))
        augmented = X.copy()
        if self.return_cluster:
            augmented["Cluster"] = self.kmeans.predict(scaled)
        if self.return_distances:
            distances = self.kmeans.transform(scaled)
            distance_names = [f"distance_centroid_{i}" for i in range(distances.shape[1])]
            # Index the distances like X so that join() lines the rows up
            distance_frame = pd.DataFrame(distances, index=X.index, columns=distance_names)
            augmented = augmented.join(distance_frame)
        return augmented

# Append PCA

In [10]:
class AppendPCA(BaseEstimator, TransformerMixin):
    """Transformer that appends PCA components to a DataFrame.

    Parameters
    ----------
    pca_columns : list-like of column labels
        Columns the PCA is computed on (standard-scaled first).
    n_components : int
        Number of principal components to append.
    pca_col_prefix : str
        Components are named ``<prefix>_0``, ``<prefix>_1``, ...
    """

    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def _pca_inputs(self, X):
        """Return the PCA columns of ``X`` with categoricals replaced by their codes."""
        encoded = X.copy()
        for name in encoded.select_dtypes(["category"]).columns:
            encoded[name] = encoded[name].cat.codes
        return encoded[self.pca_columns]

    def fit(self, X, y=None):
        """Fit the scaler and the PCA on the PCA columns of ``X``."""
        self.scaler = StandardScaler()
        scaled = self.scaler.fit_transform(self._pca_inputs(X))
        self.pca = PCA(n_components=self.n_components, random_state=SEED)
        self.pca.fit(scaled)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the principal components appended."""
        # Same scaling as learned in fit()
        scaled = self.scaler.transform(self._pca_inputs(X))
        projected = self.pca.transform(scaled)
        # Useful when tuning n_components:
        #   print(self.pca.explained_variance_ratio_)
        #   print(np.cumsum(self.pca.explained_variance_ratio_))
        component_names = [f"{self.pca_col_prefix}_{i}" for i in range(projected.shape[1])]
        # Index the components like X so that join() lines the rows up
        component_frame = pd.DataFrame(projected, index=X.index, columns=component_names)
        return X.copy().join(component_frame)

# Target Encoding

In [11]:
class CrossFoldEncoder(BaseEstimator, TransformerMixin):
    """Target-encode ``cols`` with an ensemble of encoders fitted on K-fold subsets.

    Parameters
    ----------
    cols : list of column labels
        Columns to replace by their target encoding.
    encoder : class
        Encoder class; it is instantiated as ``encoder(cols=cols, **encoder_other_params)``.
    encoder_other_params : dict
        Extra keyword arguments for the encoder, as {argument_name: value}.
        Passed as a dict rather than ``**kwargs`` so the estimator works inside
        ``Pipeline``: Pipeline uses ``sklearn.base.clone()`` and clone does not
        retain kwargs.
    """

    def __init__(self, cols, encoder, encoder_other_params):
        self.cols = cols
        self.encoder = encoder
        self.cv = KFold(n_splits=5)
        self.encoder_other_params = encoder_other_params

    def fit(self, X, y):
        """Fit one encoder per fold, each on that fold's training portion only.

        NOTE(review): nothing is transformed here. ``transform`` later averages
        all fold encoders for every row, so a training row is also encoded by
        the encoders that saw it during fitting — this is not an out-of-fold
        encoding.

        ``y`` must support ``.iloc`` (e.g. a pandas Series).
        """
        self.fitted_encoders_ = []
        for idx_encode, _ in self.cv.split(X):
            fitted_encoder = self.encoder(cols=self.cols, **self.encoder_other_params)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            self.fitted_encoders_.append(fitted_encoder)
        return self

    def transform(self, X):
        """Replace ``cols`` by the average of the encodings learned on each fold.

        Returns ``X`` without the original ``cols``, joined with one
        ``<col>_encoded`` column per encoded column.
        """
        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols])
        X_encoded = reduce(
            lambda total, fold: total.add(fold, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        # drop columns for which target encoding has been created and join with target encodings
        return X.drop(columns=self.cols).join(X_encoded)

# Training pipeline

Let's define a transformer that converts categorical columns to their integer codes.

In [12]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer replacing every (categorical) column by its category codes.

    Every column handed to ``transform`` must have a categorical dtype,
    because the ``.cat`` accessor is used on each of them.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each column holds its ``cat.codes``."""
        encoded = X.copy()
        for name, column in X.items():
            encoded[name] = column.cat.codes
        return encoded

In [13]:
# Split the columns of X by dtype.
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == "category"]
numerical_cols = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

# Non-ordinal categoricals are split by cardinality: fewer than 10 distinct
# values -> "small", 10 or more -> "large".
small_cat_categorical_cols = [
    name for name in categorical_cols
    if name not in ordinal_categorical_cols and X[name].nunique() < 10
]
large_cat_categorical_cols = [
    name for name in categorical_cols
    if name not in ordinal_categorical_cols and X[name].nunique() >= 10
]

# One count per line: ordinal, small, large, all categorical, numerical
print(
    len(ordinal_categorical_cols),
    len(small_cat_categorical_cols),
    len(large_cat_categorical_cols),
    len(categorical_cols),
    len(numerical_cols),
    sep="\n",
)

20
21
4
45
33


Train and test data have same distribution for all of the important variables as seen in the plots in https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2-exploratory

When creating train-valid splits, we'll simulate this by binning the sale price (i.e. the target variable) and ensuring that the train and validation sets contain the same proportion of samples from each of these bins.

(I did explore creating a new concatenated column of OverallQual, Neighborhood, binned GarageArea, binned GrLivArea, binned YearBuilt, binned YearRemodAdd and binned YrSold. But the concatenated column had a large number of unique values with only 1 occurrence, which caused issues during the train-valid split based on this column. This issue persisted even after decreasing the bin size and using fewer columns to build the concatenated column. So I decided to bin the sale price for the train-valid split.) 

In [14]:
# 20 quantile bins of log(SalePrice); labels=False yields the integer bin index.
strat_y = pd.qcut(x=np.log(y), q=20, labels=False)
# Bin sizes, listed in bin order
strat_y.value_counts(sort=False).sort_index()

SalePrice
0     75
1     71
2     76
3     73
4     70
5     74
6     79
7     71
8     78
9     65
10    75
11    69
12    75
13    71
14    76
15    73
16    72
17    72
18    72
19    73
Name: count, dtype: int64

In [15]:
def score_dataset(X, y, model=None):
    """Cross-validate ``model`` on the log target and return the mean RMSE.

    The metric for the Housing competition is RMSLE (Root Mean Squared Log
    Error), so the model is fitted on ``log(y)`` and scored with RMSE.
    The 5 folds are stratified on 20 quantile bins of ``log(y)``.

    Parameters
    ----------
    X : DataFrame
        Features.
    y : Series
        Target (sale price); must be strictly positive for the log.
    model : estimator or None
        Estimator or pipeline to evaluate. ``None`` (the default) uses a fresh
        ``XGBRegressor()``; it is created per call instead of once at function
        definition time, so no estimator instance is shared between calls.

    Returns
    -------
    float
        Mean RMSE over the folds. The per-fold scores (negated RMSE), their
        median RMSE and their standard deviation are printed.
    """
    if model is None:
        model = XGBRegressor()

    log_y = np.log(y)
    strat_y = pd.qcut(log_y, q=20, labels=False)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    # Materialise the splits: they are stratified on the binned target, while
    # the model is fitted on the continuous log target.
    splits = list(skf.split(X, strat_y))

    score = cross_val_score(
        model, X, log_y, cv=splits, scoring="neg_root_mean_squared_error"
    )

    # Unstratified alternative used for the earlier (non-"skf") scores:
    # score = cross_val_score(
    #     model, X, log_y, cv=5, scoring="neg_root_mean_squared_error"
    # )

    print(score)
    print(-1*np.median(score))
    print(np.std(score))
    score = -1 * np.mean(score)
    return score

In [16]:
# Pipeline 1: PCA + target encoding; all categoricals as integer codes; XGBoost.
# Only the target-encoded and PCA columns are scaled; the remaining columns
# pass through the ColumnTransformer untouched.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([("small_cat_catcode", OrdinalEncoder())])

pipeline_xgb1 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])
# score_dataset(X, y, pipeline_xgb1)

# [-0.12334956 -0.13390977 -0.13267923 -0.11532492 -0.12691669]
# 0.12691668822938887
# 0.006753496607733489
# 0.12643603427331637

# skf score
# [-0.12785618 -0.13046663 -0.14520162 -0.17736176 -0.11825496]
# 0.13046663079150023
# 0.02065921056003522
# 0.13982822954073137

In [17]:
# Pipeline 2: like Pipeline 1, but the raw numeric columns and the ordinal codes
# are standard-scaled as well, and small-cardinality categoricals are one-hot encoded.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([
    ("catcode", OrdinalEncoder()),
    ("scaler", StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb2 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *numerical_cols,
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb2)

# # [-0.12328024 -0.14403148 -0.14249333 -0.11536911 -0.12724898]
# # 0.12724898077297406
# # 0.011122748447938504
# # 0.1304846298772378

In [18]:
# Pipeline 3: Pipeline 2 plus K-Means on the 5 PCA components
# (cluster label one-hot encoded, centroid distances scaled).
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([
    ("catcode", OrdinalEncoder()),
    ("scaler", StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb3 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_kmeans", AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *numerical_cols,
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, [*small_cat_categorical_cols, "Cluster"]),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb3)

# # [-0.12575089 -0.14977669 -0.14573617 -0.11280657 -0.12453592]
# # 0.12575089246295004
# # 0.013910111600061406
# # 0.13172124644075708

In [19]:
# Pipeline 4 - RandomForestRegressor: same preprocessing as Pipeline 1
# with a random forest as the final estimator.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([("small_cat_catcode", OrdinalEncoder())])

pipeline_rf = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", RandomForestRegressor(random_state=SEED)),
])
# # score_dataset(X, y, pipeline_rf)
# # [-0.13334223 -0.14826003 -0.14243637 -0.12530829 -0.13019875]
# # 0.13334223315889074
# # 0.008327364089523639
# # 0.1359091355635815

In [20]:
# Pipeline 5: like Pipeline 1, but small-cardinality categoricals are one-hot
# encoded instead of being replaced by their codes.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb5 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb5)

# [-0.12651235 -0.13380629 -0.13357361 -0.11428085 -0.12639507]
# 0.12651234561050542
# 0.007097623139040923
# 0.126913633236225

# med_cat [> 5 and < 10]
# [-0.12506024 -0.14160065 -0.13534954 -0.11813159 -0.11999781]
# 0.12506024282103534
# 0.00904491349257496
# 0.12802796794855642

# med_cat [> 6 and < 10]
# [-0.1277651  -0.14225836 -0.13436323 -0.11917849 -0.12597884]
# 0.12776509761340055
# 0.007842616722643862
# 0.12990880359437723

# med_cat [> 4 and < 10]
# [-0.12548204 -0.142939   -0.13416994 -0.11402189 -0.12682665]
# 0.12682664692146164
# 0.009612285820809875
# 0.12868790109795897

In [21]:
# Pipeline 6: like Pipeline 2, but numeric columns get a Yeo-Johnson power
# transform followed by RobustScaler, and ordinal codes use RobustScaler.
numerical_transformer = Pipeline([
    ("skew_handler", PowerTransformer(method="yeo-johnson", standardize=False)),
    ("scaler", RobustScaler()),
])
ord_categorical_transformer = Pipeline([
    ("catcode", OrdinalEncoder()),
    ("scaler", RobustScaler()),
])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb6 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *numerical_cols,
                *(f"{col}_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb6)

# [-0.12212421 -0.14414503 -0.14381811 -0.11467164 -0.12970438]
# 0.12970437937124102
# 0.011697107907623815
# 0.13089267445607045

In [22]:
# Pipeline 7 - only k-means: K-Means on all columns instead of PCA. The
# "Cluster" label is not listed in any transformer, so it passes through as is.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb7 = Pipeline(steps=[
    ("append_kmeans", AppendKMeans(
        cluster_columns=X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ("append_target_encoder", CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer,
             [f"{col}_encoded" for col in large_cat_categorical_cols]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb7)

# only centroid distance
# [-0.13326775 -0.14492401 -0.14185682 -0.12436563 -0.13897696]
# 0.13897696253715167
# 0.007257085655132953
# 0.13667823374117763

# only Cluster  
# [-0.13092108 -0.14709257 -0.14129975 -0.12789252 -0.13499367]
# 0.13499367109437677
# 0.006967125977370855
# 0.1364399168990166

# only cluster - left untouched    - choosing this one
# [-0.12070129 -0.14202249 -0.14128743 -0.13420726 -0.12928795]
# 0.13420725736860215
# 0.007941353221420946
# 0.13350128260930716

In [23]:
# Pipeline 8 - only 1-hot encode for XGBRegressor: no PCA, clustering, target
# encoding or scaling. Ordinal columns become codes, every other categorical
# is one-hot encoded, and the remaining columns pass through.
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ("smallcat_onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
large_categorical_transformer = Pipeline([
    ("largecat_onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb8 = Pipeline(steps=[
    ("encoder_scaler", ColumnTransformer(
        [
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, small_cat_categorical_cols),
            ("large_cat", large_categorical_transformer, large_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb8)
# [-0.13613093 -0.15101244 -0.13792802 -0.11980567 -0.13703808]
# 0.13703808145019195
# 0.00991673402099732
# 0.13638302697986404

In [24]:
# Pipeline 9: PCA + K-Means on the PCA components, without target encoding;
# large-cardinality categoricals are one-hot encoded instead.
numerical_transformer = Pipeline([("scaler", StandardScaler())])
ord_categorical_transformer = Pipeline([("catcode", OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
large_categorical_transformer = Pipeline([
    ("largecat_onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pipeline_xgb9 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_kmeans", AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ("encoder_scaler", ColumnTransformer(
        [
            ("num", numerical_transformer, [
                *numerical_cols,
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ("ord_cat", ord_categorical_transformer, ordinal_categorical_cols),
            ("small_cat", small_categorical_transformer, [*small_cat_categorical_cols, "Cluster"]),
            ("large_cat", large_categorical_transformer, large_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ("model", XGBRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_xgb9)

# [-0.12723568 -0.15257682 -0.1454333  -0.11587776 -0.12868696]
# 0.12868696332256468
# 0.013253012178097935
# 0.13396210582881893

# skf score
# [-0.13190184 -0.1366358  -0.14314017 -0.17554378 -0.12275222]
# 0.13663580215769133
# 0.01804277868680093
# 0.1419947618503093

In [25]:
# Pipeline 10: no explicit encoding step. PCA components and centroid distances
# are appended, and the categorical columns go to XGBoost as they are
# (enable_categorical=True).
pipeline_xgb10 = Pipeline(steps=[
    ("append_pca", AppendPCA(pca_columns=X.columns, n_components=5)),
    ("append_kmeans", AppendKMeans(
        cluster_columns=[f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=False,
        return_distances=True,
    )),
    ("model", XGBRegressor(random_state=SEED, enable_categorical=True)),
])

# score_dataset(X, y, pipeline_xgb10)
# [-0.12726289 -0.15104159 -0.14276744 -0.1217938  -0.12480024]
# 0.12726288804018787
# 0.011359663294115332
# 0.13353319178113301

# skf split
# [-0.13683021 -0.12144668 -0.14358679 -0.17946046 -0.12809528]
# 0.13683021145563887
# 0.02023977746270551
# 0.14188388439875155

In [26]:
# Pipeline CatBoost 1: PCA components + CrossFoldEncoder (MEstimateEncoder, m=10)
# on the large_cat group; the ordinal and small_cat groups are both ordinal-encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_cb1 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Only the "<col>_encoded" and PCA columns are standardised here;
            # columns not listed in any transformer go through the remainder.
            ('num', numerical_transformer, [
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])
# score_dataset(X, y, pipeline_cb1)

# [-0.1085442  -0.12963785 -0.12813562 -0.11028954 -0.11625041]
# 0.11625041178886363
# 0.008814324369377749
# 0.11857152416791084

# skf score
# [-0.11624901 -0.11453878 -0.12176986 -0.1552284  -0.10681426]
# 0.11624900798690328
# 0.016848130288228083
# 0.1229200608367171

In [27]:
# Pipeline CatBoost 2 - catboost of Pipeline 5.
# Same layout as CatBoost 1, except the small_cat group is one-hot encoded
# instead of ordinal-encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_cb2 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Standardise only the "<col>_encoded" and PCA columns.
            ('num', numerical_transformer, [
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])


# score_dataset(X, y, pipeline_cb2)
# [-0.10855696 -0.1291194  -0.12396306 -0.11102852 -0.11627649]
# 0.11627649414347431
# 0.007742341323149529
# 0.11778888615576621

# skf score
# [-0.11657571 -0.11399993 -0.12022822 -0.151068   -0.10718364]
# 0.11657570939426244
# 0.015237178152076106
# 0.12181110094841437

In [28]:
# Pipeline Catboost 3 - CatBoost of k-means only pipeline.
# No PCA step: k-means is given X.columns directly and only the cluster label
# is requested (return_distances=False).
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_cb3 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [col + "_encoded" for col in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            # NOTE(review): the cluster label is not routed through any transformer
            # here, so it can only reach the model via the remainder -- confirm intended.
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])

# score_dataset(X, y, pipeline_cb3)

# [-0.10820295 -0.13116993 -0.12583494 -0.11234789 -0.11822297]
# 0.11822296922270072
# 0.008439584476879475
# 0.11915573581349706

In [29]:
# Pipeline Catboost 4: PCA components appended, then CatBoost is handed the
# categorical columns directly through cat_features (no encoder/scaler step).
pipeline_cb4 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('model', CatBoostRegressor(
        random_state=SEED,
        verbose=False,
        cat_features=categorical_cols,
    )),
])

#score_dataset(X, y, pipeline_cb4)
# [-0.11708812 -0.13720957 -0.12798973 -0.11292743 -0.12335977]
# 0.12335977226121901
# 0.008494589516422364
# 0.12371492410358331

# without PCA
# [-0.11357924 -0.14151363 -0.12863115 -0.11429514 -0.12462189]
# 0.1246218915669454
# 0.010295076083013943
# 0.12452821126916189

#skf score
# [-0.1245166  -0.12251158 -0.12408372 -0.15749913 -0.11058973]
# 0.124083723980751
# 0.015689374060298498
# 0.12784015103401533

# skf score with best params from previous random cv split
# [-0.12479513 -0.11602021 -0.12677251 -0.16181625 -0.10792355]
# 0.1247951298785159
# 0.01844162469894824
# 0.12746553160273738

In [30]:
# Pipeline CatBoost 5: encoding only (no PCA, no k-means, no scaling).
# Ordinal group -> OrdinalEncoder; small_cat and large_cat groups -> one-hot.
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('smallcat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_cb5 = Pipeline(steps=[
    ('encoder_scaler', ColumnTransformer(
        [
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
        ],
        # Columns not named above are forwarded to the model unchanged.
        remainder="passthrough",
    )),
    ('model', CatBoostRegressor(random_state=SEED, verbose=False)),
])

# score_dataset(X, y, pipeline_cb5)

# all one-hot
# [-0.10672922 -0.13633214 -0.12457854 -0.10782042 -0.12033838]
# 0.12033838274832265
# 0.011033964213400175
# 0.11915974164505086

# small_cat one-hot, large_cat ord
# [-0.11096583 -0.13340597 -0.12563391 -0.108273   -0.12293233]
# 0.12293232865898586
# 0.009369152248574352
# 0.12024220711125147

# no PCA                                                       - choosing this as best
# [-0.10711505 -0.13372507 -0.12441688 -0.10847305 -0.12105381]
# 0.12105381144002565
# 0.010024449857969605
# 0.11895677082508065

# no PCA, but with numerical standard scaling
# [-0.10507992 -0.13093361 -0.12520356 -0.10951426 -0.12321157]
# 0.1232115691438285
# 0.009819949434289373
# 0.1187885845004681

In [31]:
# Pipeline Catboost 6: Catboost 4 plus k-means centroid distances computed on
# the PCA columns (cluster label not requested); categoricals go in via cat_features.
pipeline_cb6 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        [f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=False,
        return_distances=True,
    )),
    ('model', CatBoostRegressor(
        random_state=SEED,
        verbose=False,
        cat_features=categorical_cols,
    )),
])

# skf
#score_dataset(X, y, pipeline_cb6)
# [-0.12623897 -0.12059074 -0.12858833 -0.15933993 -0.11112165]
# 0.1262389656010505
# 0.0162388462195836
# 0.12917592388929722



In [32]:
# Pipeline LGBM 1: PCA components + CrossFoldEncoder (MEstimateEncoder, m=10)
# on the large_cat group; ordinal and small_cat groups are both ordinal-encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([('small_cat_catcode', OrdinalEncoder())])

pipeline_lgbm1 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Standardise only the "<col>_encoded" and PCA columns.
            ('num', numerical_transformer, [
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])
# score_dataset(X, y, pipeline_lgbm1)

# [-0.12680761 -0.13677544 -0.13297276 -0.12339225 -0.12447066]
# 0.126807605437858
# 0.0051563769780464805
# 0.12888374307590683

# skf score
# [-0.12668316 -0.1288637  -0.13902374 -0.16449322 -0.11501904]
# 0.12886369701337194
# 0.016684932158762624
# 0.1348165713973164

In [33]:
# Pipeline LGBM 2: same layout as LGBM 1, but the small_cat group is one-hot
# encoded instead of ordinal-encoded.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm2 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Standardise only the "<col>_encoded" and PCA columns.
            ('num', numerical_transformer, [
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])
# score_dataset(X, y, pipeline_lgbm2)

# [-0.12443105 -0.13954521 -0.13391999 -0.12133567 -0.12435406]
# 0.12443105229520869
# 0.006873280279288598
# 0.12871719775451934

#skf split
# [-0.12638687 -0.13201789 -0.13919122 -0.16584972 -0.11779284]
# 0.1320178902768938
# 0.016374693948860188
# 0.13624770822490476

In [34]:
# Pipeline LGBM 3: PCA + k-means (label and distances) + CrossFoldEncoder.
# Unlike the other LGBM cells, the ordinal group is standardised after encoding.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler()),
])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm3 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        [f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Raw numerics and every derived numeric column share one scaler.
            ('num', numerical_transformer, [
                *numerical_cols,
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            # The k-means label is one-hot encoded alongside the small_cat group.
            ('small_cat', small_categorical_transformer, [*small_cat_categorical_cols, 'Cluster']),
        ],
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])

# score_dataset(X, y, pipeline_lgbm3)

# [-0.12556062 -0.14073109 -0.13393737 -0.12028624 -0.12396851]
# 0.12556061733110063
# 0.007416435833493804
# 0.12889676305259004

In [35]:
# Pipeline LGBM 4 - kmeans only.
# No PCA step: k-means is given X.columns directly and only the cluster label
# is requested (return_distances=False).
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm4 = Pipeline(steps=[
    ('append_kmeans', AppendKMeans(
        X.columns,
        n_clusters=10,
        return_cluster=True,
        return_distances=False,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer,
             [col + "_encoded" for col in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            # NOTE(review): the cluster label is not routed through any transformer
            # here, so it can only reach the model via the remainder -- confirm intended.
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])

# score_dataset(X, y, pipeline_lgbm4)

# [-0.11694906 -0.14512712 -0.13344501 -0.1199983  -0.13515   ]
# 0.13344501041063725
# 0.010368142649721615
# 0.1301338982050343

In [36]:
# Pipeline LGBM 5 - only 1-hot encoding.
# No PCA, k-means or scaling: ordinal group -> OrdinalEncoder, small_cat and
# large_cat groups -> one-hot.
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('smallcat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm5 = Pipeline(steps=[
    ('encoder_scaler', ColumnTransformer(
        [
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
            ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
        ],
        # Columns not named above are forwarded to the model unchanged.
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])

# score_dataset(X, y, pipeline_lgbm5)
# [-0.12025565 -0.14658346 -0.13970263 -0.11828571 -0.1324062 ]
# 0.13240620276420592
# 0.010923859302942571
# 0.1314467317752831

In [37]:
# Pipeline LGBM 6: PCA + k-means (label and distances), no target encoding.
# The large_cat group is one-hot encoded instead of going through CrossFoldEncoder.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])
large_categorical_transformer = Pipeline([
    ('largecat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_lgbm6 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        [f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [
                *numerical_cols,
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            # The k-means label is one-hot encoded alongside the small_cat group.
            ('small_cat', small_categorical_transformer, [*small_cat_categorical_cols, 'Cluster']),
            ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
        ],
        remainder="passthrough",
    )),
    ('model', LGBMRegressor(random_state=SEED, bagging_seed=SEED, verbose=-1)),
])

# score_dataset(X, y, pipeline_lgbm6)

# [-0.12594525 -0.13956178 -0.13658555 -0.11936408 -0.125848  ]
# 0.12594525284059535
# 0.007485281057156706
# 0.12946093171067496

In [38]:
# BayesianRidge

# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('small_cat_catcode', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)),  
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', BayesianRidge())         
# ])

# score_dataset(X, y, pipeline)

# with ord small cat
# [-0.11571775 -0.1404684  -0.13144348 -0.12177303 -0.1757111 ]
# 0.13144348097093617
# 0.021100927807127443
# 0.137022751261448

# with one-hot encoded small cat
# [-0.11508182 -0.13797263 -0.12967817 -0.11685041 -0.16775914]
# 0.12967817316312194
# 0.01910357166131022
# 0.1334684342946869

In [39]:
# ExtraTreesRegressor Pipeline 1: PCA + k-means (label and distances) +
# CrossFoldEncoder on the large_cat group, feeding an ExtraTreesRegressor.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
ord_categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])
small_categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

pipeline_et1 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        [f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('append_target_encoder', CrossFoldEncoder(
        cols=large_cat_categorical_cols,
        encoder=MEstimateEncoder,
        encoder_other_params={"m": 10.0},
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            # Raw numerics and every derived numeric column share one scaler.
            ('num', numerical_transformer, [
                *numerical_cols,
                *(col + "_encoded" for col in large_cat_categorical_cols),
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            # The k-means label is one-hot encoded alongside the small_cat group.
            ('small_cat', small_categorical_transformer, [*small_cat_categorical_cols, 'Cluster']),
        ],
        remainder="passthrough",
    )),
    ('model', ExtraTreesRegressor(random_state=SEED)),
])

# score_dataset(X, y, pipeline_et1)

# [-0.11826076 -0.13937141 -0.13826047 -0.11569308 -0.12294652]
# 0.12294652160064375
# 0.010004575982814566
# 0.12690644772709053

In [40]:
#ExtraTreesRegressor Pipeline 2

# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline_et2 = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)), 
#     ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
#                                    n_clusters = 10,
#                                    return_cluster=True, return_distances=True)),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, (numerical_cols + 
#                                            [f"PCA_{i}" for i in range(5)] + 
#                                            [f"distance_centroid_{i}" for i in range(10)])
#             ),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster']),
#             ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', ExtraTreesRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline_et2)
# [-0.11921246 -0.14580891 -0.13797916 -0.1151756  -0.12521963]
# 0.1252196303001916
# 0.011522595856112235
# 0.1286791528690898

In [41]:
#HistGradientBoostingRegressor
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline_h1 = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)), 
#     ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
#                                    n_clusters = 10,
#                                    return_cluster=True, return_distances=True)),
#     ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
#                                         encoder=MEstimateEncoder, 
#                                         encoder_other_params={"m":10.0})),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, (numerical_cols + 
#                                            [col + "_encoded" for col in large_cat_categorical_cols] + 
#                                            [f"PCA_{i}" for i in range(5)] + 
#                                            [f"distance_centroid_{i}" for i in range(10)])
#             ),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster'])
#         ],
#         remainder="passthrough")
#     ),
#     ('model', HistGradientBoostingRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline_h1)
# [-0.12465843 -0.14172517 -0.13859008 -0.12073168 -0.1259393 ]
# 0.12593930127670075
# 0.00826621388215706
# 0.13032893294008452

In [42]:
# numerical_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])
# ord_categorical_transformer = Pipeline(steps=[
#     ('catcode', OrdinalEncoder())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('largecat_onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])


# pipeline_h2 = Pipeline([
#     ('append_pca', AppendPCA(X.columns, n_components = 5)), 
#     ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
#                                    n_clusters = 10,
#                                    return_cluster=True, return_distances=True)),
#     ('encoder_scaler', ColumnTransformer(
#         transformers=[
#             ('num', numerical_transformer, (numerical_cols + 
#                                            [f"PCA_{i}" for i in range(5)] + 
#                                            [f"distance_centroid_{i}" for i in range(10)])
#             ),
#             ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#             ('small_cat', small_categorical_transformer, small_cat_categorical_cols+['Cluster']),
#             ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#         ],
#         remainder="passthrough")
#     ),
#     ('model', HistGradientBoostingRegressor(random_state = SEED))         
# ])

# score_dataset(X, y, pipeline_h2)

# [-0.12380911 -0.14421526 -0.13960545 -0.12043507 -0.12576276]
# 0.12576275596783773
# 0.00937205999380419
# 0.13076552944033673

In [43]:
# HistGradientBoosting pipeline 3: every categorical group is ordinal-encoded and
# the model is told which feature positions are categorical via a boolean mask.
numerical_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('catcode', OrdinalEncoder())])

# Column order the mask is built against: the 'num' block, then the 'cat' block,
# then "Cluster", mirroring the transformer order used in 'encoder_scaler' below.
# NOTE(review): this assumes "Cluster" is the only column left for the
# passthrough remainder -- confirm, otherwise the mask is misaligned.
pipeline_columns = [
    *numerical_cols,
    *(f"PCA_{i}" for i in range(5)),
    *(f"distance_centroid_{i}" for i in range(10)),
    *ordinal_categorical_cols,
    *small_cat_categorical_cols,
    *large_cat_categorical_cols,
    "Cluster",
]
# The ordinal group is deliberately left out, so it is treated as numeric.
columns_to_be_categorical = [
    *small_cat_categorical_cols,
    *large_cat_categorical_cols,
    "Cluster",
]
categorical_mask = [name in columns_to_be_categorical for name in pipeline_columns]

pipeline_h3 = Pipeline(steps=[
    ('append_pca', AppendPCA(X.columns, n_components=5)),
    ('append_kmeans', AppendKMeans(
        [f"PCA_{i}" for i in range(5)],
        n_clusters=10,
        return_cluster=True,
        return_distances=True,
    )),
    ('encoder_scaler', ColumnTransformer(
        [
            ('num', numerical_transformer, [
                *numerical_cols,
                *(f"PCA_{i}" for i in range(5)),
                *(f"distance_centroid_{i}" for i in range(10)),
            ]),
            ('cat', categorical_transformer, [
                *ordinal_categorical_cols,
                *small_cat_categorical_cols,
                *large_cat_categorical_cols,
            ]),
        ],
        remainder="passthrough",
    )),
    ('model', HistGradientBoostingRegressor(
        random_state=SEED,
        categorical_features=categorical_mask,
    )),
])
# score_dataset(X, y, pipeline_h3)
# [-0.12352474 -0.1375335  -0.13739537 -0.12233368 -0.12430674]
# 0.12430674420627773
# 0.006924536123423835
# 0.12901880690726783

# ordinal_categorical_cols included in categorical features passed to HistGradientBoostingRegressor
# [-0.12586412 -0.13996687 -0.13777029 -0.12278049 -0.12320786]
# 0.1258641172460257
# 0.0074167660941668375
# 0.1299179256632085

# Optimize hyperparameters

In [44]:
# def xgb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(xgb_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [45]:
# #defining a separate objective function with additional params for categorical data handling

# def xgb10_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#         'model__min_child_weight':   trial.suggest_int('model__min_child_weight', 1, 10),
#         'model__lambda':             trial.suggest_float('model__lambda', 1e-4, 10.0, log = True),
#         'model__alpha':              trial.suggest_float('model__alpha', 1e-4, 10.0, log = True),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01),
#         'model__max_cat_to_onehot':  trial.suggest_int('model__max_cat_to_onehot', 2, 25),
#         'model__max_cat_threshold':  trial.suggest_int('model__max_cat_threshold', 2, 32),

#     }
#     pipeline_clone = clone(pipeline_xgb10)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')

# study.enqueue_trial({'model__n_estimators': 1800, 'model__learning_rate': 0.009314521918435957, 
#                      'model__max_depth': 4, 'model__min_child_weight': 2, 
#                      'model__lambda': 0.05452883626379855, 'model__alpha': 0.0002684304975102518, 
#                      'model__subsample': 0.76, 'model__colsample_bytree': 0.46, 
#                      'model__max_cat_to_onehot': 64, 'model__max_cat_threshold': 4})
# study.enqueue_trial({'model__n_estimators': 950, 'model__learning_rate': 0.024273156624668824, 
#                      'model__max_depth': 4, 'model__min_child_weight': 4, 
#                      'model__lambda': 0.001385577354934915, 'model__alpha': 0.0004628157478762704, 
#                      'model__subsample': 0.78, 'model__colsample_bytree': 0.4, 
#                      'model__max_cat_to_onehot': 5, 'model__max_cat_threshold': 16})
# study.enqueue_trial({'model__n_estimators': 1800, 'model__learning_rate': 0.009314521918435957, 
#                      'model__max_depth': 4, 'model__min_child_weight': 2, 
#                      'model__lambda': 0.05452883626379855, 'model__alpha': 0.0002684304975102518, 
#                      'model__subsample': 0.76, 'model__colsample_bytree': 0.46, 
#                      'model__max_cat_to_onehot': 25, 'model__max_cat_threshold': 4})

# study.optimize(xgb10_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGB10 tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [46]:
# def rf_objective(trial):  
    
#     params = {
#         'model__n_estimators':          trial.suggest_int('model__n_estimators', 500, 10000, step = 100),
#         'model__max_depth':             trial.suggest_categorical('model__max_depth', [None] + list(range(4, 9))),
#         'model__min_samples_split':     trial.suggest_int('model__min_samples_split', 2, 20),
#         'model__min_samples_leaf':      trial.suggest_int('model__min_samples_leaf', 1, 20),
#         'model__max_features':          trial.suggest_categorical('model__max_features', ["sqrt", "log2", None]),
#         'model__min_impurity_decrease': trial.suggest_categorical('model__min_impurity_decrease',
#                                                                   [0, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2])
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(rf_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"RF tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [47]:
# def cb_objective(trial):  
    
#     params = {
#         'model__n_estimators':       trial.suggest_int('model__n_estimators', 500, 2000, step = 50),
#         'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#         'model__l2_leaf_reg':        trial.suggest_float('model__l2_leaf_reg', 1e-3, 10.0, log = True),
#         'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#         'model__max_depth':          trial.suggest_int('model__max_depth', 4, 16),
#         'model__subsample':          trial.suggest_float('model__subsample', 0.4, 1.0, step = 0.01),
#         'model__colsample_bylevel':  trial.suggest_float('model__colsample_bylevel', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')

# #cb4
# study.enqueue_trial({'model__n_estimators': 1900, 'model__learning_rate': 0.09883553017892341, 
#  'model__l2_leaf_reg': 3.5846130127386293, 'model__min_data_in_leaf': 14, 
#  'model__max_depth': 5, 'model__subsample': 0.62, 
#  'model__colsample_bylevel': 0.8})
# study.enqueue_trial({'model__n_estimators': 1900, 'model__learning_rate': 0.023875793128268784, 
#                    'model__l2_leaf_reg': 0.20539579640233013, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.9400000000000001, 
#                    'model__colsample_bylevel': 0.8500000000000001})
# study.enqueue_trial({'model__n_estimators': 1900, 'model__learning_rate': 0.023875793128268784, 
#                    'model__l2_leaf_reg': 0.20539579640233013, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85})

# #cb6
# # study.enqueue_trial({'model__n_estimators': 1700, 'model__learning_rate': 0.016184394757080008, 
# #                'model__l2_leaf_reg': 0.01949463452385371, 'model__min_data_in_leaf': 6, 
# #                'model__max_depth': 5, 'model__subsample': 0.5700000000000001, 
# #                'model__colsample_bylevel': 0.73})

# study.optimize(cb_objective, n_trials = 50)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"CB tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [48]:
# def lgbm_objective(trial):  
    
#     params = {
#          'model__n_estimators':       trial.suggest_int('model__n_estimators', 100, 2000, step = 50),
#          'model__learning_rate':      trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#          'model__num_leaves':         trial.suggest_int('model__num_leaves', 16, 256),
#          'model__max_depth':          trial.suggest_int('model__max_depth', 0, 16),
#          'model__min_data_in_leaf':   trial.suggest_int('model__min_data_in_leaf', 1, 50),
#          'model__bagging_freq':       trial.suggest_int('model__bagging_freq', 0, 7),
#          'model__bagging_fraction':   trial.suggest_float('model__bagging_fraction', 0.5, 1.0, step = 0.05),
#          'model__reg_alpha':          trial.suggest_float('model__reg_alpha', 1e-4, 10.0, log = True),
#          'model__reg_lambda':         trial.suggest_float('model__reg_lambda', 1e-4, 10.0, log = True),
#          'model__colsample_bytree':   trial.suggest_float('model__colsample_bytree', 0.4, 1.0, step = 0.01)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(lgbm_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBM tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [49]:
# def bayesian_ridge_objective(trial):  
    
#     params = {
#          'model__n_iter':         trial.suggest_int('model__n_iter', 100, 2000, step = 50),
#          'model__tol':            trial.suggest_float('model__tol', 1e-4, 0.1, log=True),
#          'model__alpha_1':        trial.suggest_float('model__alpha_1', 1e-7, 0.1, log = True),
#          'model__alpha_2':        trial.suggest_float('model__alpha_2', 1e-7, 0.1, log = True),
#          'model__lambda_1':       trial.suggest_float('model__lambda_1', 1e-7, 0.1, log = True),
#          'model__lambda_2':       trial.suggest_float('model__lambda_2', 1e-7, 0.1, log = True)
#     }

#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(bayesian_ridge_objective, n_trials = 100)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Bayesian Ridge tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [50]:
# def et_objective(trial):  
    
#     params = {
#         'model__n_estimators':          trial.suggest_int('model__n_estimators', 100, 3000, step = 50),
#         'model__max_depth':             trial.suggest_categorical('model__max_depth', [None] + list(range(4, 16))),
#         'model__min_samples_split':     trial.suggest_int('model__min_samples_split', 2, 20),
#         'model__min_samples_leaf':      trial.suggest_int('model__min_samples_leaf', 1, 50),
#         'model__max_features':          trial.suggest_categorical('model__max_features', ["sqrt", "log2", None, 0.5]),
#         'model__bootstrap':             trial.suggest_categorical('model__bootstrap', [True, False]),
#         'model__min_impurity_decrease': trial.suggest_categorical('model__min_impurity_decrease',
#                                                                   [0, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1])
#     }
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(et_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"ET tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [51]:
# def h_objective(trial):  

#     params = {
#          'model__learning_rate':        trial.suggest_float('model__learning_rate', 1e-4, 0.1, log=True),
#          'model__max_iter':             trial.suggest_int('model__max_iter', 100, 2000, step = 50),
#          'model__max_leaf_nodes':       trial.suggest_int('model__max_leaf_nodes', 16, 256),
#          'model__max_depth':            trial.suggest_categorical('model__max_depth', [None] + list(range(4, 16))),
#          'model__min_samples_leaf':     trial.suggest_int('model__min_samples_leaf', 1, 50),
#          'model__l2_regularization':    trial.suggest_float('model__l2_regularization', 1e-6, 10.0, log = True)
#     }
    
#     pipeline_clone = clone(pipeline)
#     pipeline_clone.set_params(**params)

#     val_score = score_dataset(X, y, pipeline_clone)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(h_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"H tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [52]:
# # Best params
# best_params = {'model__n_estimators': 650, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# Best is trial 62 finished with value: 0.11742375540973124 
# [-0.10837224 -0.12998627 -0.12850005 -0.10610519 -0.11415502]
# 0.11415502442730315
# 0.010012248568044236

# # Best params
# best_params = {'model__n_estimators': 9000, 'model__learning_rate': 0.002221965665898899, 
#                'model__max_depth': 4, 'model__min_child_weight': 5, 
#                'model__lambda': 0.3279, 'model__alpha': 0.2417, 
#                'model__subsample': 0.8049000000000001, 'model__colsample_bytree': 0.9475, 
#                'model__colsample_bylevel': 0.4768, 'model__colsample_bynode': 0.4454}

# Best is trial 92 with value: 0.11728844905814861.
# [-0.1069524  -0.12825763 -0.12633835 -0.10719973 -0.11769413]
# 0.11769413498310685
# 0.009066468479194328

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.02384699181391458, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 3.6219467321680083, 'model__alpha': 0.00555739253376164,
#                'model__subsample': 0.5767916833020451, 'model__colsample_bytree': 0.799700431685384, 
#                'model__colsample_bylevel': 0.8676463787826333, 'model__colsample_bynode': 0.8271978509268489}

# [-0.10795649 -0.13002858 -0.12830252 -0.10569348 -0.11434615]
# 0.11434615405227047
# 0.010137099509793399
# 0.11726544444298999

# best_params = {'model__n_estimators': 1650, 'model__learning_rate': 0.02234288131834421, 
#                'model__max_depth': 3, 'model__min_child_weight': 2, 
#                'model__lambda': 6.1143, 'model__alpha': 0.009000000000000001, 
#                'model__subsample': 0.4839, 'model__colsample_bytree': 0.41910000000000003, 
#                'model__colsample_bylevel': 0.7788, 'model__colsample_bynode': 0.7030000000000001}

# Best is trial 82 with value: 0.11790924808934752.
# [-0.10657961 -0.12788791 -0.12635717 -0.10699523 -0.12172632]
# 0.12172631618375584
# 0.009305776801589548

# best_params_xgb1 = {'model__n_estimators': 1650, 'model__learning_rate': 0.007016525014732306,
#                     'model__max_depth': 5, 'model__min_child_weight': 1,
#                     'model__lambda': 0.0003700954457814859, 'model__alpha': 0.001536806691225178,
#                     'model__subsample': 0.8200000000000001, 'model__colsample_bytree': 0.45}
# # Best is trial 67 with value: 0.11733104229147182.
# # [-0.110808   -0.129472   -0.12964501 -0.10534453 -0.11138566]
# # 0.1113856557342387
# # 0.010204027122436335

# XGB pipeline 1 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb1 = {
    'model__n_estimators': 1850,
    'model__learning_rate': 0.0164692960710159,
    'model__max_depth': 4,
    'model__min_child_weight': 2,
    'model__lambda': 0.00030967125261382463,
    'model__alpha': 0.009462027221582257,
    'model__subsample': 0.64,
    'model__colsample_bytree': 0.46,
}
# Tuning log for the parameters above:
# Best is trial 212 with value: 0.12096327119801184.
#   (value matches the mean magnitude of the fold scores below)
# Per-fold scores: [-0.11749978 -0.11468448 -0.11998631 -0.16015004 -0.09249576]
# 0.1174997758651165   <- equals the median fold score in magnitude
# 0.02190148844115302  <- consistent with the std across folds (TODO confirm)

In [53]:
# # Best params
# with Age_since_mod feature

# best_params = {'model__n_estimators': 850, 'model__learning_rate': 0.017442598532274846, 
#                'model__max_depth': 4, 'model__min_child_weight': 1, 
#                'model__lambda': 0.8255067561921624, 'model__alpha': 0.004278929472964361, 
#                'model__subsample': 0.8974473156207723, 'model__colsample_bytree': 0.6523619866997316, 
#                'model__colsample_bylevel': 0.996362051631553, 'model__colsample_bynode': 0.4628476409826708}

# Best is trial 61 with value: 0.11779737989132073.
# [-0.10790906 -0.1318243  -0.12853275 -0.10425928 -0.11646151]
# 0.11646151088245223
# 0.010907303679586703

In [54]:
# # Best params
# with pipeline2

# best_params = {'model__n_estimators': 1000, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

#Best is trial 85 with value: 0.11782913284946137.
# [-0.1100461  -0.12623955 -0.12463757 -0.11017505 -0.1180474 ]
# 0.11804740104228104
# 0.0068745661426423185


# best_params = {'model__n_estimators': 8500, 'model__learning_rate': 0.0024356711073752263, 
#                'model__max_depth': 6, 'model__min_child_weight': 7, 
#                'model__lambda': 1.0561, 'model__alpha': 0.0106, 
#                'model__subsample': 0.7481, 'model__colsample_bytree': 0.6855, 
#                'model__colsample_bylevel': 0.8596, 'model__colsample_bynode': 0.7395}

# Best is trial 86 with value: 0.11844810379526453.
# [-0.11185226 -0.12871867 -0.12804039 -0.10766969 -0.11595951]
# 0.11595951124551873
# 0.008524891860194693


# best_params = {'model__n_estimators': 1600, 'model__learning_rate': 0.020124989124474368, 
#                'model__max_depth': 4, 'model__min_child_weight': 6, 
#                'model__lambda': 1.732780164845022, 'model__alpha': 0.03214159796407795, 
#                'model__subsample': 0.618267265549825, 'model__colsample_bytree': 0.7018360769686597, 
#                'model__colsample_bylevel': 0.5343572892012461, 'model__colsample_bynode': 0.9702456455515944}

# pipeline 2: score (left) vs. what appears to be n_estimators (right), other params as in the dict above
# 0.11802772081955834  2000
# 0.11790519839005424  1800
# 0.11783945720039994  1700
# 0.11775646400190418  1600
# 0.11777810773615957  1500
# 0.11777453916635619  1400
# 0.11776996564553088  1300
# 0.1177948484577633   1200


# XGB pipeline 2 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb2 = {
    'model__n_estimators': 1850,
    'model__learning_rate': 0.007663483674441529,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 1.3221982712197484,
    'model__alpha': 0.0005031631526708031,
    'model__subsample': 0.66,
    'model__colsample_bytree': 0.5900000000000001,
}

# Tuning log for the parameters above:
# Best is trial 169 with value: 0.12121695838464552.
# Per-fold scores: [-0.1190666  -0.11047241 -0.12027858 -0.15368264 -0.10258457]
# 0.11906659648415295  <- equals the median fold score in magnitude
# 0.0174478434513716   <- presumably the std across folds (TODO confirm)

In [55]:
# # Best params
# with pipeline3

# best_params_xgb3 = {'model__n_estimators': 1800, 'model__learning_rate': 0.00851634744921019,
#                     'model__max_depth': 4, 'model__min_child_weight': 4,
#                     'model__lambda': 0.001364771669279719, 'model__alpha': 0.0003587496054225881,
#                     'model__subsample': 0.74, 'model__colsample_bytree': 0.45}

# Best is trial 94 with value: 0.11586713304296034.
# [-0.10719354 -0.12648766 -0.12653823 -0.103357   -0.11575924]
# 0.11575923785884161
# 0.00957511471362708

# best_params = {'model__n_estimators': 3000, 'model__learning_rate': 0.011132262126243759, 
#                'model__max_depth': 4, 'model__min_child_weight': 3, 
#                'model__lambda': 0.0009857953750043423, 'model__alpha': 0.020320630432170268, 
#                'model__subsample': 0.79, 'model__colsample_bytree': 0.48000000000000004}

# Best is trial 53 with value: 0.11543456407868449.
# [-0.10707274 -0.12636534 -0.12598445 -0.10213922 -0.11561108]
# 0.11561107564972747
# 0.009772403947525961


# best_params_xgb3 = {'model__n_estimators': 2000, 'model__learning_rate': 0.01267380480160164, 
#                     'model__max_depth': 4, 'model__min_child_weight': 3, 
#                     'model__lambda': 0.0009631833544314171, 'model__alpha': 0.009342148308972915, 
#                     'model__subsample': 0.9600000000000001, 'model__colsample_bytree': 0.45}

# # Best is trial 216 with value: 0.11540221675361506.
# # [-0.10548422 -0.12680313 -0.12650509 -0.10255713 -0.11566151]
# # 0.11566150694345757
# # 0.010165479749796716


# XGB pipeline 3 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb3 = {
    'model__n_estimators': 1350,
    'model__learning_rate': 0.019180364463132607,
    'model__max_depth': 3,
    'model__min_child_weight': 1,
    'model__lambda': 0.380879553148292,
    'model__alpha': 0.1694031754175522,
    'model__subsample': 0.52,
    'model__colsample_bytree': 0.8300000000000001,
}

# Tuning log for the parameters above:
# Best is trial 255 with value: 0.12291040055737397.
# Per-fold scores: [-0.1207175  -0.11784717 -0.12084213 -0.15751711 -0.09762809]
# 0.12071750059786712  <- equals the median fold score in magnitude
# 0.01934705328976381  <- presumably the std across folds (TODO confirm)

In [56]:
# #pipeline 4 best params
# {'model__n_estimators': 500, 'model__max_depth': None, 
#  'model__min_samples_split': 4, 'model__min_samples_leaf': 1, 
#  'model__max_features': None, 'model__min_impurity_decrease': 1e-05}

# Best is trial 69 with value: 0.13452078571895393.
# [-0.13265846 -0.14837583 -0.13993977 -0.12230483 -0.12932503]
# 0.13265846285872251
# 0.00895568242880693

# Pipeline 4 (`rf`) -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_rf = {
    'model__n_estimators': 8100,
    'model__max_depth': None,
    'model__min_samples_split': 2,
    'model__min_samples_leaf': 1,
    'model__max_features': 'log2',
    'model__min_impurity_decrease': 1e-06,
}
# Tuning log for the parameters above:
# Best is trial 99 with value: 0.13801475483285586.
# Per-fold scores: [-0.1305587  -0.12929222 -0.14595504 -0.15818268 -0.12608512]
# 0.13055870069685813  <- equals the median fold score in magnitude
# 0.0121965965512414   <- presumably the std across folds (TODO confirm)

In [57]:
# Pipeline 5 best params
# best_params = {'model__n_estimators': 600, 'model__learning_rate': 0.0432035923480985, 
#                'model__max_depth': 5, 'model__min_child_weight': 4, 
#                'model__lambda': 1.6142, 'model__alpha': 0.014, 
#                'model__subsample': 0.6325000000000001, 'model__colsample_bytree': 0.4691, 
#                'model__colsample_bylevel': 0.4232, 'model__colsample_bynode': 0.9493}

# Best is trial 91 with value: 0.11773786792108754.
# [-0.10644425 -0.12944014 -0.12619847 -0.10790766 -0.11869882]
# 0.11869881720548991
# 0.009312649560497113


# best_params = {'model__n_estimators': 1050, 'model__learning_rate': 0.01062872306058349, 
#                'model__max_depth': 11, 'model__min_child_weight': 4, 
#                'model__lambda': 2.1802052986711025, 'model__alpha': 0.00921920490425625, 
#                'model__subsample': 0.47000000000000003, 'model__colsample_bytree': 0.44}

# Best is trial 58 with value: 0.11967778326753713.
# [-0.10904947 -0.134964   -0.12691889 -0.10738619 -0.12007037]
# 0.12007036705813962
# 0.010490951150130552


# XGB pipeline 5 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb5 = {
    'model__n_estimators': 1700,
    'model__learning_rate': 0.016890178924066624,
    'model__max_depth': 3,
    'model__min_child_weight': 2,
    'model__lambda': 0.006476272438913827,
    'model__alpha': 0.00020362981148242182,
    'model__subsample': 0.55,
    'model__colsample_bytree': 0.62,
}

# Tuning log for the parameters above:
# Best is trial 251 with value: 0.12121663671265123.
# Per-fold scores: [-0.1183398  -0.11764894 -0.11819146 -0.15724026 -0.09466274]
# 0.11819145566235034  <- equals the median fold score in magnitude
# 0.02016416246342805  <- presumably the std across folds (TODO confirm)

In [58]:
# Pipeline 6 best params
# best_params = {'model__n_estimators': 1150, 'model__learning_rate': 0.03352307389535568, 
#                'model__max_depth': 3, 'model__min_child_weight': 6, 
#                'model__lambda': 6.661700000000001, 'model__alpha': 0.045700000000000005, 
#                'model__subsample': 0.43370000000000003, 'model__colsample_bytree': 0.9654, 
#                'model__colsample_bylevel': 0.6141000000000001, 'model__colsample_bynode': 0.6744000000000001}
# Best is trial 93 with value: 0.11865236004232849.
# [-0.11241793 -0.12547345 -0.12666839 -0.10996032 -0.11874172]
# 0.11874172303964096
# 0.006711357597087486

# With the skf split no run succeeds (every trial fails); the default 5-fold CV split does give a result.

In [59]:
# Pipeline 7 best params
# best_params_xgb7 = {'model__n_estimators': 1100, 'model__learning_rate': 0.025937951529056733,
#                     'model__max_depth': 4, 'model__min_child_weight': 4,
#                     'model__lambda': 2.7752000000000003, 'model__alpha': 0.0146,
#                     'model__subsample': 0.44530000000000003, 'model__colsample_bytree': 0.6413,
#                     'model__colsample_bylevel': 0.4937, 'model__colsample_bynode': 0.7684}
# Best is trial 74 with value: 0.11660322347801237.
# [-0.1039625  -0.12587415 -0.12546581 -0.10753416 -0.12017951]
# 0.1201795084009607
# 0.009157741226268295

# best_params = {'model__n_estimators': 1950, 'model__learning_rate': 0.013543890928772154, 
#                'model__max_depth': 4, 'model__min_child_weight': 7, 
#                'model__lambda': 0.010221196759663355, 'model__alpha': 0.0013603517427302739, 
#                'model__subsample': 0.8500000000000001, 'model__colsample_bytree': 0.41000000000000003}

# Best is trial 86 with value: 0.1187055587523349.
# [-0.10663061 -0.12755437 -0.12829783 -0.10752408 -0.12352092]
# 0.1235209150589082
# 0.009636708582726212

# XGB pipeline 7 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb7 = {
    'model__n_estimators': 900,
    'model__learning_rate': 0.034132716349379644,
    'model__max_depth': 4,
    'model__min_child_weight': 3,
    'model__lambda': 0.07598434952799418,
    'model__alpha': 0.006227713546260122,
    'model__subsample': 0.62,
    'model__colsample_bytree': 0.42000000000000004,
}
# Tuning log for the parameters above:
# Best is trial 290 with value: 0.12016913498104102.
# Per-fold scores: [-0.1231054  -0.10620493 -0.11861282 -0.15727291 -0.09564962]
# 0.11861282156685948   <- equals the median fold score in magnitude
# 0.020899470502765815  <- presumably the std across folds (TODO confirm)

In [60]:
# # Pipeline 8 best params
# best_params_xgb8 = {'model__n_estimators': 1950, 'model__learning_rate': 0.013920563709541603,
#                     'model__max_depth': 4, 'model__min_child_weight': 3,
#                     'model__lambda': 0.0047096596141103335, 'model__alpha': 0.01205623249289589,
#                     'model__subsample': 0.69, 'model__colsample_bytree': 0.49}

# Best is trial 83 with value: 0.11839759395186619.
# [-0.1051799  -0.13080341 -0.12716429 -0.10431576 -0.12452461]
# 0.12452460859487571
# 0.01132523820187055

# XGB pipeline 8 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb8 = {
    'model__n_estimators': 2000,
    'model__learning_rate': 0.013062870439665343,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 0.6898851881426148,
    'model__alpha': 0.01942788510854367,
    'model__subsample': 0.45,
    'model__colsample_bytree': 0.66,
}
# Tuning log for the parameters above:
# Best is trial 238 with value: 0.12218887615283583.
# Per-fold scores: [-0.12311578 -0.10393904 -0.11978129 -0.16208063 -0.10202764]
# 0.11978128899996028   <- equals the median fold score in magnitude
# 0.021621929170878792  <- presumably the std across folds (TODO confirm)

In [61]:
# Pipeline 9 best params
# best_params_xgb9 = {'model__n_estimators': 950, 'model__learning_rate': 0.03759525209419164, 
#                     'model__max_depth': 4, 'model__min_child_weight': 2, 
#                     'model__lambda': 0.018733955186589393, 'model__alpha': 0.0010451677005228284, 
#                     'model__subsample': 0.78, 'model__colsample_bytree': 0.66}

# # Best is trial 210 with value: 0.11519055282052985.
# # [-0.10426655 -0.12730364 -0.12830147 -0.10071816 -0.11536294]
# # 0.11536294365393755
# # 0.01137924965344543


# XGB pipeline 9 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_xgb9 = {
    'model__n_estimators': 1300,
    'model__learning_rate': 0.017812751084206546,
    'model__max_depth': 4,
    'model__min_child_weight': 1,
    'model__lambda': 0.5374808280674902,
    'model__alpha': 0.014717951591959225,
    'model__subsample': 0.42000000000000004,
    'model__colsample_bytree': 0.5800000000000001,
}

# Tuning log for the parameters above:
# Best is trial 262 with value: 0.12289801572950965.
# Per-fold scores: [-0.11956882 -0.11307731 -0.12082082 -0.16199867 -0.09902446]
# 0.1195688216223085   <- equals the median fold score in magnitude
# 0.02102690141132909  <- presumably the std across folds (TODO confirm)

In [62]:
# best_params_xgb10 = {'model__n_estimators': 1250, 'model__learning_rate': 0.017503143345998808, 
#                      'model__max_depth': 4, 'model__min_child_weight': 4, 
#                      'model__lambda': 0.12006665662804136, 'model__alpha': 0.002899058549350107, 
#                      'model__subsample': 0.54, 'model__colsample_bytree': 0.68, 
#                      'model__max_cat_to_onehot': 4, 'model__max_cat_threshold': 28}

# # Best is trial 283 with value: 0.1157357979499694.
# # [-0.10644924 -0.12623314 -0.1276332  -0.10385512 -0.11450829]
# # 0.11450829223935358
# # 0.009804487488274161

# XGB pipeline 10 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
# This active set carries no `max_cat_*` entries.
best_params_xgb10 = {
    'model__n_estimators': 1800,
    'model__learning_rate': 0.009314521918435957,
    'model__max_depth': 4,
    'model__min_child_weight': 2,
    'model__lambda': 0.05452883626379855,
    'model__alpha': 0.0002684304975102518,
    'model__subsample': 0.76,
    'model__colsample_bytree': 0.46,
}
# Tuning log for the parameters above:
# Best is trial 253 with value: 0.12189723833484438.
# Per-fold scores: [-0.11769908 -0.11210582 -0.12542139 -0.15399192 -0.10026798]
# 0.11769907780421265   <- equals the median fold score in magnitude
# 0.018021150486328682  <- presumably the std across folds (TODO confirm)

#skf split with optuna with cat params
# best_params_xgb10 = {'model__n_estimators': 950, 'model__learning_rate': 0.024273156624668824, 
#                      'model__max_depth': 4, 'model__min_child_weight': 4, 
#                      'model__lambda': 0.001385577354934915, 'model__alpha': 0.0004628157478762704, 
#                      'model__subsample': 0.78, 'model__colsample_bytree': 0.4, 
#                      'model__max_cat_to_onehot': 5, 'model__max_cat_threshold': 16}
# # Best is trial 150 with value: 0.1227792632215197.
# # [-0.11871171 -0.11198431 -0.1244113  -0.15544868 -0.10334032]
# # 0.11871171137791076
# # 0.017781468360262627

In [63]:
# CatBoost Pipeline 1

# best_params = {'model__n_estimators': 1700, 'model__learning_rate': 0.016728902877239903, 
#                'model__l2_leaf_reg': 0.040835135117464366, 'model__min_data_in_leaf': 9, 
#                'model__max_depth': 6, 'model__subsample': 0.4963, 
#                'model__colsample_bylevel': 0.4702}

# Best is trial 34 with value: 0.11666610454039741.
# [-0.10795215 -0.12487765 -0.12579047 -0.10819275 -0.1165175 ]
# 0.11651750103817046
# 0.007725754808568546

# CatBoost pipeline 1 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_cb1 = {
    'model__n_estimators': 1600,
    'model__learning_rate': 0.013567817011450631,
    'model__l2_leaf_reg': 2.352528904227011,
    'model__min_data_in_leaf': 33,
    'model__max_depth': 7,
    'model__subsample': 0.62,
    'model__colsample_bylevel': 0.9400000000000001,
}
# Tuning log for the parameters above:
# Best is trial 26 with value: 0.12146870978524071.
# Per-fold scores: [-0.11689243 -0.11222937 -0.1195251  -0.1484336  -0.11026306]
# 0.11689242795517175   <- equals the median fold score in magnitude
# 0.013876291530023033  <- presumably the std across folds (TODO confirm)

In [64]:
# CatBoost Pipeline 2

# best_params_cb2 = {'model__n_estimators': 1150, 'model__learning_rate': 0.017712553265578305,
#                    'model__l2_leaf_reg': 0.08343115659041794, 'model__min_data_in_leaf': 4,
#                    'model__max_depth': 5, 'model__subsample': 0.44410000000000005,
#                    'model__colsample_bylevel': 0.4676}

# # Best is trial 41 with value: 0.115131021385492.
# # [-0.10631423 -0.12341525 -0.12524074 -0.10716469 -0.11352019]
# # 0.11352019431883034
# # 0.00793256565121934

# CatBoost pipeline 2 -- parameters selected with the "skf" split
# (presumably StratifiedKFold -- TODO confirm).
# Keys use the `model__` prefix, i.e. the nested-parameter form the tuning
# objectives pass to `pipeline.set_params(**params)`.
best_params_cb2 = {
    'model__n_estimators': 1000,
    'model__learning_rate': 0.020766651766510266,
    'model__l2_leaf_reg': 0.0031728824818652163,
    'model__min_data_in_leaf': 50,
    'model__max_depth': 6,
    'model__subsample': 0.8,
    'model__colsample_bylevel': 0.91,
}

# Tuning log for the parameters above:
# Best is trial 12 with value: 0.12063624125827858.
# Per-fold scores: [-0.1156934  -0.10911198 -0.12282546 -0.15034066 -0.1052097 ]
# 0.1156933974068409   <- equals the median fold score in magnitude
# 0.01601476634147856  <- presumably the std across folds (TODO confirm)

In [65]:
# CatBoost Pipeline 3

# best_params = {'model__n_estimators': 2000, 'model__learning_rate': 0.007906985518057994, 
#                'model__l2_leaf_reg': 0.0922415789070384, 'model__min_data_in_leaf': 31, 
#                'model__max_depth': 4, 'model__subsample': 0.8200000000000001, 
#                'model__colsample_bylevel': 0.76}
# Best is trial 12 with value: 0.11855774737155153.
# [-0.10514406 -0.12587876 -0.12669242 -0.10977513 -0.12529837]
# 0.12529837064895213
# 0.00918986104825868

# best_params_cb3 = {'model__n_estimators': 1750, 'model__learning_rate': 0.02157894921249164,
#                    'model__l2_leaf_reg': 0.3194559118452819, 'model__min_data_in_leaf': 38,
#                    'model__max_depth': 5, 'model__subsample': 0.43460000000000004,
#                    'model__colsample_bylevel': 0.9166000000000001}
# # Best is trial 31 with value: 0.11619643846168552.
# # [-0.10281654 -0.126599   -0.12371644 -0.10972694 -0.11812327]
# # 0.11812327086423803
# # 0.008824425231554592

# Parameters for pipeline_cb3, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_cb3 = {
    "model__n_estimators": 700,
    "model__learning_rate": 0.030625945706803,
    "model__l2_leaf_reg": 0.2906471745862896,
    "model__min_data_in_leaf": 26,
    "model__max_depth": 6,
    "model__subsample": 0.66,
    "model__colsample_bylevel": 0.64,
}
# Best is trial 43 with value: 0.12117629660914572.
# [-0.11685888 -0.11011589 -0.1235822  -0.1489466  -0.10637792]
# 0.11685887726354985
# 0.015079407156906926

In [66]:
# CatBoost Pipeline 4

# best_params_cb4 = {'model__n_estimators': 1350, 'model__learning_rate': 0.024279894068992076,
#                    'model__l2_leaf_reg': 0.07703432557306762, 'model__min_data_in_leaf': 22,
#                    'model__max_depth': 6, 'model__subsample': 0.98,
#                    'model__colsample_bylevel': 0.81}
# Best is trial 1 with value: 0.1214007095620582.
# [-0.11295343 -0.13260574 -0.13311869 -0.11034042 -0.11798527]
# 0.11798526791881647
# 0.009676935519162008

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.023875793128268784, 
#                    'model__l2_leaf_reg': 0.20539579640233013, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.9400000000000001, 
#                    'model__colsample_bylevel': 0.8500000000000001}
# Best is trial 26 with value: 0.11973587332021315.
# [-0.11301114 -0.12934981 -0.12627155 -0.10764008 -0.12240679]
# 0.12240678985145056
# 0.008170764103107636

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.20539579640233013, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11201021 -0.13352178 -0.12705421 -0.10952506 -0.11527173]
# 0.1152717345809794
# 0.00924281603375057
# 0.11947660122245345


# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188239 -0.13235365 -0.12709226 -0.10918557 -0.11390662]
# 0.11390662005291685
# 0.009128671332061692
# 0.11888409741568533

# best_params_cb4 = {'model__n_estimators': 1950, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11189285 -0.13239772 -0.12709855 -0.1091963  -0.11390332]
# 0.1139033216933719
# 0.009139291670199706
# 0.11889774821034231

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11187549 -0.13243271 -0.12707561 -0.10919286 -0.11377302]
# 0.11377302074072512
# 0.009163281155937423
# 0.11886993653590813

# best_params_cb4 = {'model__n_estimators': 2050, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11189929 -0.13239579 -0.12705958 -0.10902376 -0.1137576 ]
# 0.11375760104627411
# 0.009183495070161809
# 0.11882720488252445

# best_params_cb4 = {'model__n_estimators': 2100, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11195553 -0.13240284 -0.12707245 -0.1088707  -0.11380872]
# 0.11380872083482574
# 0.009206725554952992
# 0.11882204728095043

# best_params_cb4 = {'model__n_estimators': 2150, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188727 -0.13229883 -0.12710833 -0.10883597 -0.1138046 ]
# 0.11380459842870366
# 0.009200734837444853
# 0.11878700098596284

# best_params_cb4 = {'model__n_estimators': 2200, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11187491 -0.13226491 -0.12717628 -0.10877656 -0.11378779]
# 0.11378778899899722
# 0.009219668505541253
# 0.11877609053306046

# best_params_cb4 = {'model__n_estimators': 2250, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11188312 -0.13237645 -0.12718839 -0.10881034 -0.11384685]
# 0.11384684816994045
# 0.009239622081836861
# 0.11882102814314441

# best_params_cb4 = {'model__n_estimators': 2300, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11185689 -0.13236175 -0.12719613 -0.10874102 -0.11392465]
# 0.11392464898140518
# 0.009247420514116129
# 0.11881608785479755

# best_params_cb4 = {'model__n_estimators': 2350, 'model__learning_rate': 0.02, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11194388 -0.13233419 -0.12720186 -0.10868674 -0.11405942]
# 0.11405942235383687
# 0.009225104178520382
# 0.11884521798601644

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.018, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11274411 -0.13109232 -0.13000153 -0.10778541 -0.11933193]
# 0.11933193350986915
# 0.009221475370939702
# 0.12019106130273345

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.019, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11430754 -0.13017581 -0.12899373 -0.10906591 -0.11834235]
# 0.11834234952337444
# 0.008233877845389339
# 0.12017706659772671

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.017, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11610427 -0.13273477 -0.12919228 -0.10891501 -0.11945877]
# 0.11945877076840226
# 0.008681174676817551
# 0.12128101783289294

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.016, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11476089 -0.12941193 -0.12948647 -0.11108606 -0.11869038]
# 0.11869037844886919
# 0.007547693989539927
# 0.1206871467033112

# best_params_cb4 = {'model__n_estimators': 1900, 'model__learning_rate': 0.015, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}
# [-0.11363649 -0.13034886 -0.12870418 -0.10957033 -0.11996888]
# 0.11996887759049966
# 0.008138077449944102
# 0.12044574755157633

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.015, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11370692 -0.13021425 -0.12852065 -0.10942142 -0.11982283]
# 0.11982283435274602
# 0.00809820296285674
# 0.12033721580343484

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.01, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11432627 -0.13146917 -0.12834248 -0.11089592 -0.12130761]
# 0.12130761373334997
# 0.007872502061877364
# 0.12126829037306623

# best_params_cb4 = {'model__n_estimators': 2000, 'model__learning_rate': 0.005, 
#                    'model__l2_leaf_reg': 0.205, 'model__min_data_in_leaf': 5, 
#                    'model__max_depth': 5, 'model__subsample': 0.94, 
#                    'model__colsample_bylevel': 0.85}

# [-0.11695995 -0.13324756 -0.13340318 -0.11716379 -0.12433965]
# 0.12433964901078248
# 0.007281728167621455
# 0.12502282693062136


# First of two parameter sets that are applied to separate clones of
# pipeline_cb4 (see the set_params cell further down).
best_params_cb4_1 = {
    "model__n_estimators": 1900,
    "model__learning_rate": 0.023875793128268784,
    "model__l2_leaf_reg": 0.20539579640233013,
    "model__min_data_in_leaf": 5,
    "model__max_depth": 5,
    "model__subsample": 0.9400000000000001,
    "model__colsample_bylevel": 0.8500000000000001,
}
# # Best is trial 1 with value: 0.12746553160273738
# # [-0.12479513 -0.11602021 -0.12677251 -0.16181625 -0.10792355]
# # 0.1247951298785159
# # 0.01844162469894824

# Second parameter set for a clone of pipeline_cb4.
best_params_cb4_2 = {
    "model__n_estimators": 1400,
    "model__learning_rate": 0.05291628705588727,
    "model__l2_leaf_reg": 0.11661179025482198,
    "model__min_data_in_leaf": 10,
    "model__max_depth": 4,
    "model__subsample": 0.99,
    "model__colsample_bylevel": 0.67,
}
# Best is trial 13 with value: 0.12657391647714436.
# [-0.12250947 -0.11908302 -0.12198717 -0.16438165 -0.10490827]
# 0.12198717243937014
# 0.019962446238785396

In [67]:
# CatBoost Pipeline 5

# best_params = {'model__n_estimators': 1700, 'model__learning_rate': 0.02945430207184643, 
#                'model__l2_leaf_reg': 0.027563217977551877, 'model__min_data_in_leaf': 9, 
#                'model__max_depth': 7, 'model__subsample': 0.69, 
#                'model__colsample_bylevel': 0.8700000000000001}

# Best is trial 15 with value: 0.11733894066927993.
# [-0.10479313 -0.12909713 -0.12442671 -0.10861599 -0.11976175]
# 0.11976175179720044
# 0.009250393243949715

# CatBoost pipeline 5 parameters, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_cb5 = {
    "model__n_estimators": 1850,
    "model__learning_rate": 0.03310538371449341,
    "model__l2_leaf_reg": 0.5692385591345918,
    "model__min_data_in_leaf": 5,
    "model__max_depth": 5,
    "model__subsample": 0.53,
    "model__colsample_bylevel": 0.73,
}
# Best is trial 33 with value: 0.12232080734704395.
# [-0.11730409 -0.10672066 -0.12218277 -0.15967962 -0.1057169 ]
# 0.11730408572695568
# 0.019697317031556482

In [68]:
# CatBoost Pipeline 6
# Keys follow scikit-learn's `<step>__<param>` form expected by set_params.

best_params_cb6 = {
    "model__n_estimators": 1700,
    "model__learning_rate": 0.016184394757080008,
    "model__l2_leaf_reg": 0.01949463452385371,
    "model__min_data_in_leaf": 6,
    "model__max_depth": 5,
    "model__subsample": 0.5700000000000001,
    "model__colsample_bylevel": 0.73,
}
# Best is trial 33 with value: 0.1274473054822071
# [-0.12410766 -0.11949513 -0.12663447 -0.15884384 -0.10815544]
# 0.124107658094659
# 0.01692908485762067

In [69]:
# Pipeline LGBM 1

# best_params_lgbm1 = {'model__n_estimators': 1850, 'model__learning_rate': 0.00919961299356949,
#                      'model__num_leaves': 16, 'model__max_depth': 15,
#                      'model__min_data_in_leaf': 1, 'model__bagging_freq': 3,
#                      'model__bagging_fraction': 0.5, 'model__reg_alpha': 0.006780286384457145,
#                      'model__reg_lambda': 0.009128122260443947, 'model__colsample_bytree': 0.52}

# Best is trial 66 with value: 0.11588344796374961.
# [-0.10897157 -0.12806925 -0.12505263 -0.10505543 -0.11226835]
# 0.11226835473674715
# 0.009062634359964232

# best_params = {'model__n_estimators': 900, 'model__learning_rate': 0.016368209778087922, 
#                'model__num_leaves': 26, 'model__max_depth': 4, 
#                'model__min_data_in_leaf': 1, 'model__bagging_freq': 4, 
#                'model__bagging_fraction': 0.8500000000000001, 'model__reg_alpha': 0.07073139029094895, 
#                'model__reg_lambda': 0.9864368067144933, 'model__colsample_bytree': 0.48000000000000004}

# Best is trial 31 with value: 0.11633039703943031.
# [-0.10787882 -0.12904064 -0.12713678 -0.10639222 -0.11120351]
# 0.11120351159416869
# 0.009744842136375179

# Parameters for pipeline_lgbm1, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm1 = {
    "model__n_estimators": 1600,
    "model__learning_rate": 0.009731152795312595,
    "model__num_leaves": 114,
    "model__max_depth": 4,
    "model__min_data_in_leaf": 1,
    "model__bagging_freq": 7,
    "model__bagging_fraction": 0.55,
    "model__reg_alpha": 0.1807157822128309,
    "model__reg_lambda": 0.09215157159968763,
    "model__colsample_bytree": 0.4,
}
# Best is trial 243 with value: 0.12052610219793292.
# [-0.11954391 -0.11328187 -0.11886363 -0.15219455 -0.09874655]
# 0.11886362989527258
# 0.01751254964107485

In [70]:
# Pipeline LGBM 2

# best_params_lgbm2 = {'model__n_estimators': 1900, 'model__learning_rate': 0.022528140118977913,
#                      'model__num_leaves': 18, 'model__max_depth': 4,
#                      'model__min_data_in_leaf': 1, 'model__bagging_freq': 0,
#                      'model__bagging_fraction': 0.8, 'model__reg_alpha': 0.0005985326337395847,
#                      'model__reg_lambda': 0.0002363245035597099, 'model__colsample_bytree': 0.42000000000000004}

# # Best is trial 38 with value: 0.11560446553278303.
# # [-0.10663778 -0.12916449 -0.1261197  -0.10309439 -0.11300596]
# # 0.11300596297350247
# # 0.010374005785838268

# Parameters for pipeline_lgbm2, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm2 = {
    "model__n_estimators": 1050,
    "model__learning_rate": 0.014161850233991993,
    "model__num_leaves": 203,
    "model__max_depth": 5,
    "model__min_data_in_leaf": 2,
    "model__bagging_freq": 3,
    "model__bagging_fraction": 0.65,
    "model__reg_alpha": 0.0034654923995791806,
    "model__reg_lambda": 0.06528414476899567,
    "model__colsample_bytree": 0.43000000000000005,
}
# Best is trial 288 with value: 0.12006520251936215.
# [-0.1175394  -0.10907231 -0.12194504 -0.15602383 -0.09574543]
# 0.11753940364579261
# 0.020076009133654676

In [71]:
# Pipeline LGBM 3 - (with kmeans)

# best_params_lgbm3 = {'model__n_estimators': 1850, 'model__learning_rate': 0.01090254772325081,
#                      'model__num_leaves': 248, 'model__max_depth': 4,
#                      'model__min_data_in_leaf': 2, 'model__bagging_freq': 4,
#                      'model__bagging_fraction': 0.6, 'model__reg_alpha': 0.013788629591410967,
#                      'model__reg_lambda': 0.10833316245693235, 'model__colsample_bytree': 0.56}

# Best is trial 86 with value: 0.11545575046806685.
# [-0.10764055 -0.12391567 -0.12530486 -0.10516194 -0.11525574]
# 0.11525574056354786
# 0.008193353838302055

# The parameter set below scores slightly better than the previous one, but it needs a much larger n_estimators.
# The previous set is therefore kept as the best params.

# best_params = {'model__n_estimators': 6850, 'model__learning_rate': 0.0024998981926820743, 
#                'model__num_leaves': 92, 'model__max_depth': 4, 
#                'model__min_data_in_leaf': 1, 'model__bagging_freq': 5, 
#                'model__bagging_fraction': 0.65, 'model__reg_alpha': 0.008654799905365489, 
#                'model__reg_lambda': 0.004158569757202218, 'model__colsample_bytree': 0.46}
# Best is trial 64 with value: 0.11518509313360328.
# [-0.10615533 -0.1278085  -0.12361509 -0.10496181 -0.11338474]
# 0.11338473860489612
# 0.009161972687259528

# Parameters for pipeline_lgbm3, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm3 = {
    "model__n_estimators": 1350,
    "model__learning_rate": 0.005813900232531856,
    "model__num_leaves": 248,
    "model__max_depth": 10,
    "model__min_data_in_leaf": 20,
    "model__bagging_freq": 1,
    "model__bagging_fraction": 0.5,
    "model__reg_alpha": 0.0028223797745013522,
    "model__reg_lambda": 0.005988097246415957,
    "model__colsample_bytree": 0.41000000000000003,
}
# Best is trial 241 with value: 0.12476945989203363.
# [-0.12135222 -0.11226465 -0.1272039  -0.1535579  -0.10946863]
# 0.12135222151106254
# 0.015728090881840222

In [72]:
# Pipeline LGBM 4 

# best_params = {'model__n_estimators': 800, 'model__learning_rate': 0.05310111992098777, 
#                'model__num_leaves': 126, 'model__max_depth': 2, 
#                'model__min_data_in_leaf': 2, 'model__bagging_freq': 4, 
#                'model__bagging_fraction': 0.8500000000000001, 'model__reg_alpha': 0.2286575446927289, 
#                'model__reg_lambda': 0.16178904582449463, 'model__colsample_bytree': 0.45}

# Best is trial 57 with value: 0.11904876713223907.
# [-0.10424882 -0.12487474 -0.1259728  -0.11114655 -0.12900091]
# 0.12487474415214168
# 0.009616786926739675

# Parameters for pipeline_lgbm4, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm4 = {
    "model__n_estimators": 1600,
    "model__learning_rate": 0.016221004562637534,
    "model__num_leaves": 55,
    "model__max_depth": 4,
    "model__min_data_in_leaf": 3,
    "model__bagging_freq": 7,
    "model__bagging_fraction": 0.7,
    "model__reg_alpha": 0.003283241571893786,
    "model__reg_lambda": 0.042729411250030144,
    "model__colsample_bytree": 0.41000000000000003,
}
# Best is trial 233 with value: 0.12056356608490129.
# [-0.12052822 -0.10662705 -0.12023993 -0.15732043 -0.0981022 ]
# 0.12023993219991116
# 0.020248023319354637

In [73]:
# Pipeline LGBM 5

# best_params_lgbm5 = {'model__n_estimators': 1650, 'model__learning_rate': 0.010068383476147162,
#                      'model__num_leaves': 187, 'model__max_depth': 4,
#                      'model__min_data_in_leaf': 2, 'model__bagging_freq': 2,
#                      'model__bagging_fraction': 0.55, 'model__reg_alpha': 0.34281244232395713,
#                      'model__reg_lambda': 0.11822047924368936, 'model__colsample_bytree': 0.4}

# Best is trial 95 with value: 0.11911372688117125.
# [-0.10709439 -0.13089928 -0.12669271 -0.10927673 -0.12160552]
# 0.12160552202387531
# 0.009421046079262654

# LGBM pipeline 5 parameters, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm5 = {
    "model__n_estimators": 1700,
    "model__learning_rate": 0.030862626260224756,
    "model__num_leaves": 249,
    "model__max_depth": 4,
    "model__min_data_in_leaf": 1,
    "model__bagging_freq": 1,
    "model__bagging_fraction": 0.7,
    "model__reg_alpha": 0.00018348579563313004,
    "model__reg_lambda": 0.00038494776166918504,
    "model__colsample_bytree": 0.61,
}

# Best is trial 294 with value: 0.12194678876497556.
# [-0.12324616 -0.10222344 -0.12207349 -0.16217016 -0.1000207 ]
# 0.1220734899492754
# 0.02231316891513297

In [74]:
# Pipeline LGBM 6
# best_params_lgbm6 = {'model__n_estimators': 1950, 'model__learning_rate': 0.011909594905015761, 
#                      'model__num_leaves': 186, 'model__max_depth': 4, 
#                      'model__min_data_in_leaf': 1, 'model__bagging_freq': 5, 
#                      'model__bagging_fraction': 0.55, 'model__reg_alpha': 0.0008973495636700269, 
#                      'model__reg_lambda': 1.6499577771752032, 'model__colsample_bytree': 0.48000000000000004}

# # Best is trial 251 with value: 0.11575282236697218.
# # [-0.10518519 -0.12841047 -0.12598079 -0.10541443 -0.11377324]
# # 0.11377323964590669
# # 0.009872218200399905

# Parameters for pipeline_lgbm6, selected on the "skf" split
# (presumably StratifiedKFold — confirm against the tuning cell).
best_params_lgbm6 = {
    "model__n_estimators": 850,
    "model__learning_rate": 0.01593545381425983,
    "model__num_leaves": 140,
    "model__max_depth": 3,
    "model__min_data_in_leaf": 2,
    "model__bagging_freq": 5,
    "model__bagging_fraction": 0.6,
    "model__reg_alpha": 0.000599107791544195,
    "model__reg_lambda": 0.38106260630011507,
    "model__colsample_bytree": 0.81,
}
# Best is trial 262 with value: 0.1243739978569125.
# [-0.12021403 -0.1201359  -0.11912219 -0.16589076 -0.0965071 ]
# 0.1201359012486722
# 0.02264091018759263

In [75]:
# Pipeline Bayesian Ridge Regression

# best_params = {'model__n_iter': 650, 'model__tol': 0.00012277969304736106, 
#                'model__alpha_1': 0.00010633679673272293, 'model__alpha_2': 1.4064886184001036e-05, 
#                'model__lambda_1': 7.387907093592057e-06, 'model__lambda_2': 0.024216858525360933}

# Best is trial 71 with value: 0.1330226223829522.
# [-0.11564542 -0.13708175 -0.13063521 -0.11616119 -0.16558953]
# 0.1306352120412312
# 0.018270494205095723

In [76]:
# Pipeline ET 1
# `None` for max_depth / max_features is passed through unchanged to the
# pipeline's "model" step.
best_params_et1 = {
    "model__n_estimators": 2300,
    "model__max_depth": None,
    "model__min_samples_split": 2,
    "model__min_samples_leaf": 1,
    "model__max_features": None,
    "model__bootstrap": False,
    "model__min_impurity_decrease": 1e-07,
}

# Best is trial 288 with value: 0.12569768187431177.
# [-0.11588193 -0.13975249 -0.1361102  -0.11405137 -0.12269242]
# 0.12269241707639009
# 0.01045921027558864

In [77]:
# Pipeline ET 2
# best_params_et2 = {'model__n_estimators': 750, 'model__max_depth': None, 
#                    'model__min_samples_split': 2, 'model__min_samples_leaf': 1,
#                    'model__max_features': None, 'model__bootstrap': False, 
#                    'model__min_impurity_decrease': 1e-07}

# Best is trial 223 with value: 0.12694198504244897.
# [-0.11649969 -0.1433932  -0.13690193 -0.11405778 -0.12385734]
# 0.12385733695275575
# 0.011440299908769621

In [78]:
# HistGradientBoostingRegressor

In [79]:
# best_params_h1 = {'model__learning_rate': 0.044404188319438584, 'model__max_iter': 1400, 
#                   'model__max_leaf_nodes': 165, 'model__max_depth': 4, 
#                   'model__min_samples_leaf': 3, 'model__l2_regularization': 4.5307178884883724e-05}

# Best is trial 280 with value: 0.11847668483746401.
# [-0.1136     -0.13109049 -0.12442296 -0.10306427 -0.12020571]
# 0.12020571416839064
# 0.009578938200902679

In [80]:
# best_params_h2 = {'model__learning_rate': 0.053890996060272485, 'model__max_iter': 1000, 
#                   'model__max_leaf_nodes': 256, 'model__max_depth': 4, 
#                   'model__min_samples_leaf': 2, 'model__l2_regularization': 0.015371180014333559}

# Best is trial 69 with value: 0.11963724792628469.
# [-0.11453667 -0.12855222 -0.12750158 -0.10434391 -0.12325185]
# 0.12325185209040704
# 0.009102656229568875

In [81]:
# Pipeline H3
# Keys follow scikit-learn's `<step>__<param>` form expected by set_params.
best_params_h3 = {
    "model__learning_rate": 0.0167517247503107,
    "model__max_iter": 1700,
    "model__max_leaf_nodes": 122,
    "model__max_depth": 4,
    "model__min_samples_leaf": 11,
    "model__l2_regularization": 3.113934050131242e-05,
}

# Best is trial 258 with value: 0.1200693567478865.
# [-0.11318242 -0.12996007 -0.12947043 -0.11172037 -0.11601349]
# 0.11601348963259125
# 0.007997398100660027

In [82]:
# Apply the tuned hyperparameters to the pipelines. Entries left commented out
# are pipelines that are not configured in the current run.

# pipeline = pipeline_rf
# best_params = best_params_rf
# pipeline.set_params(**best_params)

# --- xgb pipelines ---
# pipeline_xgb1.set_params(**best_params_xgb1)
pipeline_xgb2.set_params(**best_params_xgb2)
# pipeline_xgb3.set_params(**best_params_xgb3)
pipeline_xgb7.set_params(**best_params_xgb7)
# pipeline_xgb9.set_params(**best_params_xgb9)
# pipeline_xgb10.set_params(**best_params_xgb10)

# --- cb pipelines ---
# pipeline_cb1.set_params(**best_params_cb1)
pipeline_cb2.set_params(**best_params_cb2)
# pipeline_cb3.set_params(**best_params_cb3)

# Two independently configured copies of pipeline_cb4. set_params returns the
# estimator it was called on, so each copy is cloned and configured in one step.
pipeline_cb4_1 = clone(pipeline_cb4).set_params(**best_params_cb4_1)
pipeline_cb4_2 = clone(pipeline_cb4).set_params(**best_params_cb4_2)

# --- lgbm pipelines ---
# pipeline_lgbm1.set_params(**best_params_lgbm1)
pipeline_lgbm2.set_params(**best_params_lgbm2)
# pipeline_lgbm3.set_params(**best_params_lgbm3)
pipeline_lgbm4.set_params(**best_params_lgbm4)
# pipeline_lgbm6.set_params(**best_params_lgbm6)



# Ensemble

In [83]:
class CustomVotingRegressor(VotingRegressor):
    """Weighted-average ensemble that forwards ``X`` and ``y`` to its members untouched.

    Original author's note: a custom class is used because ``VotingRegressor``
    passes the data as a NumPy array to the individual estimators, whereas the
    pipelines used as estimators here expect a pandas DataFrame.

    Parameters mirror ``VotingRegressor``: ``estimators`` is a list of
    ``(name, estimator)`` pairs and ``weights`` optionally holds one weight per
    pair. ``n_jobs`` and ``verbose`` are accepted for signature compatibility
    but are not used by the overridden ``fit``/``predict`` (members are
    processed sequentially).
    """

    def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):
        super().__init__(estimators=estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y):
        """Fit a fresh clone of every member on ``(X, y)`` and return ``self``.

        Members are cloned first, so the estimator objects passed to the
        constructor are never trained in place. This matches the
        scikit-learn meta-estimator convention and the cloning done by
        ``CustomStackingRegressor``. Entries without a ``fit`` method (for
        example a ``"drop"`` placeholder) are skipped.
        """
        fitted_members = []
        for position, (_name, estimator) in enumerate(self.estimators):
            if not hasattr(estimator, "fit"):
                continue
            member = clone(estimator)
            member.fit(X, y)
            # Remember the member's position so its weight can be found again.
            fitted_members.append((position, member))

        self._fitted_members = fitted_members
        # Same layout as the parent class: a plain list of fitted estimators.
        self.estimators_ = [member for _, member in fitted_members]
        return self

    def predict(self, X):
        """Return the (optionally weighted) average of the members' predictions.

        Uses the clones trained by ``fit``. If ``fit`` was never called on this
        object, the estimators are used as given, so an ensemble built from
        already-fitted members keeps working. Members without a ``predict``
        method are left out, and so are their weights; previously the full
        ``weights`` list was passed on and ``np.average`` failed on the length
        mismatch.
        """
        members = getattr(self, "_fitted_members", None)
        if members is None:
            members = [
                (position, estimator)
                for position, (_name, estimator) in enumerate(self.estimators)
            ]

        voters = [(position, est) for position, est in members if hasattr(est, "predict")]

        # Collect predictions from each participating estimator
        predictions = [est.predict(X) for _, est in voters]

        # Keep only the weights that belong to estimators that actually voted
        weights = self.weights
        if weights is not None:
            weights = [weights[position] for position, _ in voters]

        return np.average(predictions, axis=0, weights=weights)

In [84]:
# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9),
#                                   ("cb2", pipeline_cb2),
#                                   ("lgbm3", pipeline_lgbm3)])

# pipeline = CustomVotingRegressor([("xgb1", pipeline_xgb1), ("xgb3", pipeline_xgb3), ("xgb7", pipeline_xgb7), ("xgb9", pipeline_xgb9),
#                                   ("cb1", pipeline_cb1), ("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4),
#                                   ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)])

# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9),
#                                   ("cb4", pipeline_cb4),
#                                   ("lgbm3", pipeline_lgbm3)])

# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9),
#                                   ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                   ("lgbm3", pipeline_lgbm3)])


# pipeline = CustomVotingRegressor([("xgb3", pipeline_xgb3), ("xgb7", pipeline_xgb7), ("xgb9", pipeline_xgb9),
#                                   ("cb1", pipeline_cb1), ("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4),
#                                   ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)])

# pipeline = CustomVotingRegressor([("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9),
#                                   ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                   ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)])


# pipeline = CustomVotingRegressor([("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9),
#                                   ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                   ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3)])

# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                   ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                   ("lgbm3", pipeline_lgbm3)])

# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                    ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                    ("lgbm3", pipeline_lgbm3),
#                                    ("h3", pipeline_h3)])

# pipeline = CustomVotingRegressor([("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                    ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                    ("lgbm3", pipeline_lgbm3),
#                                    ("et1", pipeline_et1),
#                                    ("h3", pipeline_h3)])

# in 0.115
# pipeline = CustomVotingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                    ("cb2", pipeline_cb2),
#                                    ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                 ])


# pipeline = CustomVotingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                    ("cb2", pipeline_cb2), ("cb4", pipeline_cb4),
#                                    ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                 ])

# pipeline = CustomVotingRegressor([("cb1", pipeline_cb1), ("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4)])
# pipeline

In [85]:
# best CV score ensemble
# pipeline = CustomVotingRegressor([("xgb7", pipeline_xgb7),
#                                   ("cb2", pipeline_cb2),
#                                   ("lgbm2", pipeline_lgbm2)])

# [-0.11683245 -0.10538843 -0.11915497 -0.15306302 -0.09637309]
# 0.11683245126666751
# 0.019280847882660095
# 0.11816239196799852
# public score : 0.11982

# best public score ensemble
# No weights are given, so the three members are averaged equally.
pipeline = CustomVotingRegressor(
    estimators=[
        ("xgb2", pipeline_xgb2),
        ("cb4_1", pipeline_cb4_1),
        ("lgbm4", pipeline_lgbm4),
    ],
)

# [-0.11909416 -0.10784494 -0.12015935 -0.15589714 -0.09967064]
# 0.1190941614942
# 0.019230481823789974
# 0.12053324647111108
# public score :


# best public score ensemble
# pipeline = CustomVotingRegressor([("cb4_1", pipeline_cb4_1),
#                                   ("cb4_2", pipeline_cb4_2),
#                                   ("cb6", pipeline_cb6)])


# public score :


In [86]:
# score_dataset(X, y, pipeline)

In [87]:
class CustomStackingRegressor(StackingRegressor):
    """Stacking regressor that passes pandas DataFrames straight to its base estimators.

    Both ``fit`` and ``predict`` insist on a ``pd.DataFrame`` and hand it to the
    base estimators unchanged.

    The final estimator is trained on *out-of-fold* predictions of the base
    estimators (obtained with ``cross_val_predict`` using ``cv``), so it never
    sees predictions made on rows a base estimator was trained on. The base
    estimators are then refitted on the full data for use in ``predict``.

    Parameters
    ----------
    estimators : list of (str, estimator)
        Base estimators; each is cloned before fitting.
    final_estimator : estimator, default=None
        Meta-learner. When ``None`` a ``RidgeCV`` with
        ``alphas=np.logspace(-6, 6, 13)`` is used.
    cv : int, cross-validation splitter or iterable, default=None
        Splitting strategy used to build the out-of-fold meta-features
        (``None`` means the scikit-learn default for regressors).
        ``"prefit"`` is not supported by this class.
    n_jobs : int, default=None
        Forwarded to ``cross_val_predict``.
    passthrough : bool, default=False
        Stored for API compatibility only; this class does NOT append the
        original features to the meta-features.
    verbose : int, default=0
        Forwarded to ``cross_val_predict``.
    """

    def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0):
        super().__init__(estimators=estimators, final_estimator=final_estimator,
                         cv=cv, n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)

    def fit(self, X, y):
        """Fit base estimators and train the final estimator on out-of-fold predictions.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame or a base estimator has no ``fit`` method.
        """
        # Local import: the notebook's import cell does not bring these names in.
        from sklearn.model_selection import check_cv, cross_val_predict

        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input to fit must be a pandas DataFrame.")

        # Validate everything up front so we fail before any expensive fitting.
        for name, estimator in self.estimators:
            if not hasattr(estimator, "fit"):
                raise ValueError(f"Estimator {name} does not implement a fit method.")

        # Normalise `cv` once so one-shot iterables of splits are not exhausted
        # after the first base estimator.
        splitter = check_cv(self.cv, y=y, classifier=False)

        out_of_fold = []
        self.base_estimators_ = []  # Store fitted base estimators
        for name, estimator in self.estimators:
            # Out-of-fold predictions: every row is predicted by a model that
            # did not see it during training, which avoids leaking the target
            # into the meta-features.
            out_of_fold.append(
                cross_val_predict(clone(estimator), X, y, cv=splitter,
                                  n_jobs=self.n_jobs, verbose=self.verbose)
            )
            # The model used at predict time is trained on all available rows.
            self.base_estimators_.append((name, clone(estimator).fit(X, y)))

        meta_features = pd.DataFrame(
            np.column_stack(out_of_fold),
            columns=[name for name, _ in self.base_estimators_]
        )

        # Assign and fit the final estimator
        if self.final_estimator is None:
            self.final_estimator_ = RidgeCV(alphas=np.logspace(-6, 6, 13))
        else:
            self.final_estimator_ = clone(self.final_estimator)

        self.final_estimator_.fit(meta_features, y)
        return self

    def predict(self, X):
        """Predict by feeding the base estimators' predictions to the final estimator.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input to predict must be a pandas DataFrame.")

        # Generate predictions from the fully-fitted base estimators
        meta_features = self._predict_base_estimators(X)

        # Use the final estimator to make predictions
        return self.final_estimator_.predict(meta_features)

    def _predict_base_estimators(self, X):
        """
        Generate predictions from all fitted base estimators and return them as a
        DataFrame with one column per estimator (named after the estimator).
        """
        predictions = []
        for name, estimator in self.base_estimators_:
            if hasattr(estimator, "predict"):
                predictions.append(estimator.predict(X))
            else:
                raise ValueError(f"Estimator {name} does not implement a predict method.")

        # Stack base predictions column-wise and return as a DataFrame
        meta_features = pd.DataFrame(
            np.column_stack(predictions),
            columns=[name for name, _ in self.base_estimators_]
        )
        return meta_features

In [88]:
# def et_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':          trial.suggest_int('n_estimators', 50, 300, step=10),  
#         'max_depth':             trial.suggest_categorical('max_depth', [None] + list(range(3, 12))),  
#         'min_samples_split':     trial.suggest_int('min_samples_split', 2, 10),  
#         'min_samples_leaf':      trial.suggest_int('min_samples_leaf', 1, 10),  
#         'max_features':          trial.suggest_categorical('max_features', ["sqrt", None, 0.5]), 
#         'bootstrap':             trial.suggest_categorical('bootstrap', [True, False]),  
#         'min_impurity_decrease': trial.suggest_categorical('min_impurity_decrease', [0, 1e-6, 1e-5, 1e-4, 1e-3])  
#     }

#     final_estimator = ExtraTreesRegressor(random_state=SEED, **params)
#     pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                          ("cb2", pipeline_cb2),
#                                          ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                        ],
#                                        final_estimator=final_estimator)    
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 3, 'min_samples_leaf': 5, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 5, 'min_samples_leaf': 4, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.enqueue_trial({'n_estimators': 200, 'max_depth': 7, 
#                     'min_samples_split': 10, 'min_samples_leaf': 4, 
#                     'max_features': 0.5, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05})
# study.optimize(et_stacking_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"ETStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [89]:
# def xgb_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':             trial.suggest_int('n_estimators', 50, 300, step=10), 
#         'learning_rate':            trial.suggest_float('learning_rate', 0.001, 0.1, log=True),  
#         'max_depth':                trial.suggest_int('max_depth', 2, 10),  
#         'min_child_weight':         trial.suggest_int('min_child_weight', 1, 6),  
#         'lambda':                   trial.suggest_float('lambda', 1e-3, 10.0, log=True),  
#         'alpha':                    trial.suggest_float('alpha', 1e-3, 10.0, log=True),  
#         'subsample':                trial.suggest_float('subsample', 0.6, 1.0, step=0.05),  
#         'colsample_bytree':         trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05)  
#     }
#     final_estimator = XGBRegressor(random_state = SEED, **params)
#     pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                          ("cb2", pipeline_cb2),
#                                          ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                        ],
#                                        final_estimator=final_estimator)    
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.enqueue_trial({'n_estimators': 150, 'learning_rate': 0.07375950393946418, 
#                      'max_depth': 3, 'min_child_weight': 5, 
#                      'lambda': 0.025091460051336875, 'alpha': 0.26969435515966034, 
#                      'subsample': 0.75, 'colsample_bytree': 1.0})
# study.optimize(xgb_stacking_objective, n_trials = 300)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"XGBStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [90]:
# def lgbm_stacking_objective(trial):  
    
#     params = {
#         'n_estimators':             trial.suggest_int('n_estimators', 100, 1000, step=50), 
#         'learning_rate':            trial.suggest_float('learning_rate', 0.001, 0.1, log=True),  
#         'max_depth':                trial.suggest_int('max_depth', 2, 10),  
#         'min_child_weight':         trial.suggest_int('min_child_weight', 1, 6),  
#         'lambda':                   trial.suggest_float('lambda', 1e-3, 10.0, log=True),  
#         'alpha':                    trial.suggest_float('alpha', 1e-3, 10.0, log=True),  
#         'subsample':                trial.suggest_float('subsample', 0.6, 1.0, step=0.05),  
#         'colsample_bytree':         trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05)  
#     }
#     final_estimator = LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1, **params)
#     pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                          ("cb2", pipeline_cb2),
#                                          ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                        ],
#                                        final_estimator=final_estimator)    
#     val_score = score_dataset(X, y, pipeline)
#     return val_score

# start_time = time.time()
# study = optuna.create_study(direction = 'minimize')
# study.optimize(lgbm_stacking_objective, n_trials = 50)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"LGBMStack tuning took {elapsed_time:.2f} seconds.")
# print(elapsed_time)

# print(study.best_params)
# print(study.best_value)
# print(study.best_trial)

In [91]:
# best_params_f_et = {'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 5, 'min_samples_leaf': 4, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05}

# Best is trial 38 with value: 0.11259536584058619.
# [-0.10494648 -0.12383074 -0.12322337 -0.09923414 -0.11174211]
# 0.11174210587382624
# 0.009766719818534123
# public score : 0.12381

# best_params_f_et = {'n_estimators': 200, 'max_depth': 7, 
#                     'min_samples_split': 10, 'min_samples_leaf': 4, 
#                     'max_features': 0.5, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05}
# Best is trial 23 with value: 0.11255919861402615.


# best_params_f_et = {'n_estimators': 200, 'max_depth': 10, 
#                     'min_samples_split': 3, 'min_samples_leaf': 5, 
#                     'max_features': None, 'bootstrap': False, 
#                     'min_impurity_decrease': 1e-05}
# Best is trial 34 with value: 0.11262836864219486.


# Hyperparameters selected for the ExtraTreesRegressor final estimator
# (meant to be unpacked as keyword arguments).
best_params_f_et = dict(
    n_estimators=50,
    max_depth=9,
    min_samples_split=9,
    min_samples_leaf=5,
    max_features=None,
    bootstrap=False,
    min_impurity_decrease=1e-05,
)

# Best is trial 294 with value: 0.11244229505996625.
# [-0.10461246 -0.12366897 -0.12326438 -0.0992486  -0.11141707]
# 0.11141707029932142
# 0.009793740766324388


######################################################

# best_params_f_xgb = {'n_estimators': 150, 'learning_rate': 0.07375950393946418, 
#                      'max_depth': 3, 'min_child_weight': 5, 
#                      'lambda': 0.025091460051336875, 'alpha': 0.26969435515966034, 
#                      'subsample': 0.75, 'colsample_bytree': 1.0}

# Best is trial 41 with value: 0.11302519095383065.
# [-0.10429697 -0.12403466 -0.12563498 -0.09928949 -0.11186986]
# 0.11186986016214154
# 0.010453719390146561

# Hyperparameters selected for the XGBRegressor final estimator.
# Kept as a dict literal because 'lambda' is a Python keyword and cannot be
# written as a keyword argument to dict().
best_params_f_xgb = {
    'n_estimators': 180,
    'learning_rate': 0.08414082166224444,
    'max_depth': 9,
    'min_child_weight': 4,
    'lambda': 0.0016861040102727459,
    'alpha': 0.218576435429603,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
}

# Best is trial 157 with value: 0.11265372717488362.
# [-0.10423112 -0.12307777 -0.12397106 -0.09923176 -0.11275694]
# 0.1127569352884956
# 0.009877680402843174



# Hyperparameters selected for the LGBMRegressor final estimator.
# NOTE(review): in LightGBM, 'alpha' appears to be the Huber/quantile-loss
# parameter rather than L1 regularization ('reg_alpha' / 'lambda_l1') --
# confirm against the LightGBM parameter docs before relying on it as a
# regularizer. The values are left exactly as tuned.
best_params_f_lgbm = {
    'n_estimators': 350,
    'learning_rate': 0.014046781406678008,
    'max_depth': 3,
    'min_child_weight': 1,
    'lambda': 1.225131966294904,
    'alpha': 5.663169960003222,
    'subsample': 0.75,
    'colsample_bytree': 0.65,
}

# Best is trial 36 with value: 0.11498894431586999.
# [-0.10848398 -0.12815922 -0.12371949 -0.09895674 -0.11562529]
# 0.11562528518344482
# 0.010482648308667334




In [92]:
# pipeline = CustomStackingRegressor([("xgb9", pipeline_xgb9),
#                                     ("cb2", pipeline_cb2),
#                                     ("lgbm3", pipeline_lgbm3)],
#                                    final_estimator=LinearRegression())
# pipeline = CustomStackingRegressor([("xgb1", pipeline_xgb1), ("xgb3", pipeline_xgb3), ("xgb7", pipeline_xgb7), ("xgb9", pipeline_xgb9),
#                                     ("cb1", pipeline_cb1), ("cb2", pipeline_cb2), ("cb3", pipeline_cb3), ("cb4", pipeline_cb4),
#                                     ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)],
#                                    final_estimator=LinearRegression())


# individual model CV scores in [0.115, 0.116)
# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=ExtraTreesRegressor(random_state = SEED))

# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=XGBRegressor(random_state = SEED))

# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1))


# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=ExtraTreesRegressor(random_state = SEED, **best_params_f_et))

# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=XGBRegressor(random_state = SEED, **best_params_f_xgb))

# pipeline = CustomStackingRegressor([ ("xgb3", pipeline_xgb3), ("xgb9", pipeline_xgb9), ("xgb10", pipeline_xgb10),
#                                      ("cb2", pipeline_cb2),
#                                      ("lgbm1", pipeline_lgbm1), ("lgbm2", pipeline_lgbm2), ("lgbm3", pipeline_lgbm3), ("lgbm6", pipeline_lgbm6)
#                                    ],
#                                    final_estimator=LGBMRegressor(random_state = SEED, bagging_seed = SEED, verbose = -1,
#                                                                 **best_params_f_lgbm))

# Last expression of the cell: echoes the currently bound `pipeline` in the
# notebook output (every stacking assignment in this cell is commented out).
pipeline


# CV Score

In [93]:
# score_dataset(X, y, pipeline)

# pipeline_xgb3
# 0.11540221675361506
# public score : 0.12438

# pipeline_xgb8
# 0.11839759395186619
# public score : 0.12303

# pipeline_xgb9
# 0.11519055282052985
# public score : 0.12741

# pipeline_cb2
# 0.115131021385492
# public score : 0.12092

# pipeline_cb4
	# 0.1214007095620582
	# public score : 0.11915
	
	# 0.11973587332021315
	# public score : 0.11767   *best*
	
	# 0.11877609053306046
	# public score : 0.11857
	
	# 0.11886993653590813
	# public score : 0.11854
	
	# 0.11888409741568533
	# public score : 0.11855
	
	# 0.11947660122245345
	# public score : 0.11848

# pipeline_lgbm3
# 0.11545575046806685
# public score : 0.12226

# pipeline_lgbm5
# 0.11911372688117125
# public score : 0.12243

# pipeline_lgbm6
# 0.11575282236697218
# public score : 0.12254
##################################################################################

# CustomVotingRegressor xgb3, cb2, lgbm3
# [-0.10592071 -0.12359544 -0.12471606 -0.10401465 -0.11350561]
# 0.11350561026281031
# 0.00861996029068565
# 0.11435049565486803

# public score : 0.12054

######################################
# with median voting

# [-0.10593103 -0.12457699 -0.12517086 -0.10322035 -0.11443593]
# 0.1144359285988727
# 0.0091206939039808
# 0.11466703109172434


##################################################################################

# CustomVotingRegressor xgb 137, cb 234, lgbm 123
# [-0.10452924 -0.12480733 -0.124325   -0.10344957 -0.11256266]
# 0.11256265630472277
# 0.009235317204888441
# 0.11393475744392181

# public score : 0.11934

######################################
# with median voting
# [-0.10510494 -0.12485347 -0.1248122  -0.10324854 -0.11312523]
# 0.11312523328731622
# 0.009272764398273466
# 0.11422887489665161

# public score : 0.11979

##################################################################################

# CustomVotingRegressor cb 234

# [-0.10501766 -0.1257221  -0.12555756 -0.10671793 -0.11465395]
# 0.1146539509677765
# 0.008869725329364584
# 0.11553383925228788

######################################
# with median voting

# [-0.10542628 -0.12512008 -0.12560693 -0.10800153 -0.11600194]
# 0.11600193500340161
# 0.00838144314418917
# 0.11603135196791996

##################################################################################

# CustomVotingRegressor xgb9, cb2, lgbm3
# [-0.10401297 -0.12311788 -0.12432479 -0.10241974 -0.11268986]
# 0.11268986367629229
# 0.009197143492658417
# 0.11331304808833617

# public score : 0.12167

##################################################################################

# CustomVotingRegressor xgb1379, cb1234, lgbm1236
# [-0.10369239 -0.12428663 -0.12359763 -0.10263568 -0.11262192]
# 0.11262191936406919
# 0.009308258360005239
# 0.1133668506007757

# public score : 0.11976

##################################################################################

# CustomStackingRegressor default - xgb9, cb2, lgbm3

# [-0.10691649 -0.13275209 -0.13382256 -0.10219391 -0.11916575]
# 0.11916574699434053
# 0.012940687421358413
# 0.11897015913278006

##################################################################################

# CustomStackingRegressor RandomForestRegressor - xgb9, cb2, lgbm3

# [-0.1067862  -0.1275201  -0.12878482 -0.10269413 -0.11542928]
# 0.1154292784897735
# 0.010565369501369276
# 0.11624290480991635

##################################################################################

# CustomStackingRegressor ExtraTreesRegressor - xgb9, cb2, lgbm3

# [-0.10641619 -0.12591297 -0.12771407 -0.10173553 -0.11662047]
# 0.1166204699514208
# 0.010302266754528064
# 0.11567984700482883

##################################################################################

# CustomStackingRegressor XGBRegressor - xgb9, cb2, lgbm3

# [-0.10726087 -0.12657785 -0.13098654 -0.10115142 -0.11870751]
# 0.11870750578174134
# 0.011280799380537491
# 0.1169368370827479

##################################################################################

# CustomStackingRegressor LinearRegression - xgb9, cb2, lgbm3

# [-0.10691851 -0.13276061 -0.13383164 -0.10219697 -0.11916949]
# 0.1191694947181288
# 0.01294342954787272
# 0.11897544352874967

##################################################################################

# CustomStackingRegressor default - xgb1379, cb1234, lgbm1236

# [-0.10334732 -0.12459756 -0.12667014 -0.10081828 -0.11439494]
# 0.11439493633301483
# 0.010585160344522773
# 0.11396564656116766

# public score : 0.12780

##################################################################################

# CustomStackingRegressor RandomForestRegressor - xgb1379, cb1234, lgbm1236

# [-0.10530758 -0.1258567  -0.124897   -0.10206537 -0.1125273 ]
# 0.11252729691106078
# 0.009791860538931
# 0.11413078916789661

# public score : 0.12398

##################################################################################

# CustomStackingRegressor ExtraTreesRegressor - xgb1379, cb1234, lgbm1236

# [-0.1045066  -0.12478949 -0.12493746 -0.10054467 -0.1118624 ]
# 0.11186240388347506
# 0.010094788874069284
# 0.1133281245116865

# public score : 0.12374

##################################################################################

# CustomStackingRegressor XGBRegressor - xgb1379, cb1234, lgbm1236

# [-0.10461118 -0.1244706  -0.12838806 -0.10155516 -0.11142797]
# 0.11142797143066285
# 0.01064187964368029
# 0.11409059373971794

# public score : 0.12782

##################################################################################

# CustomStackingRegressor LinearRegression - xgb1379, cb1234, lgbm1236

# [-0.10336114 -0.12457695 -0.12670197 -0.10082869 -0.1144315 ]
# 0.11443149863628488
# 0.010583617624520984
# 0.11398004868765754

##################################################################################

# CustomVotingRegressor xgb9, cb4, lgbm3

# [-0.10566885 -0.1242927  -0.12409399 -0.10202532 -0.11536619]
# 0.11536618585929441
# 0.009187785645136805
# 0.11428940750379435

# public score : 0.12002

##################################################################################

# CustomVotingRegressor xgb9, cb24, lgbm3

# [-0.10510279 -0.12360204 -0.12376737 -0.10264434 -0.11413495]
# 0.11413494837001555
# 0.008895147837995483
# 0.113850298243567

# public score : 0.11960

##################################################################################

# CustomVotingRegressor xgb379, cb1234, lgbm1236

# [-0.10325114 -0.12393798 -0.12322187 -0.10258439 -0.11294147]
# 0.1129414652172083
# 0.009246393808513463
# 0.11318736819912698

# public score : 0.11973


##################################################################################

# CustomVotingRegressor xgb39, cb24, lgbm1236

# [-0.10460306 -0.12463335 -0.1236712  -0.10200868 -0.11267596]
# 0.11267595787059419
# 0.00937324016857296
# 0.1135184499881754

# public score : 0.12030

##################################################################################

# CustomVotingRegressor xgb39, cb24, lgbm23

# [-0.10448702 -0.12423125 -0.12373836 -0.10175558 -0.11332416]
# 0.11332415833679216
# 0.009372083461854544
# 0.11350727513774599

# public score : 0.12050


##################################################################################

# CustomVotingRegressor xgb9_10, cb24, lgbm3

# [-0.10449493 -0.12320307 -0.12390624 -0.10217436 -0.11349497]
# 0.11349496519678004
# 0.00907513777576779
# 0.11345471281275384

# public score : 0.12064

##################################################################################

# CustomVotingRegressor xgb9_10, cb24, lgbm3, h3

# [-0.10508806 -0.1233123  -0.12384158 -0.1029072  -0.11301524]
# 0.11301523626275962
# 0.00879027119705094
# 0.11363287487339124

# public score : 0.12017

##################################################################################

# CustomVotingRegressor xgb9_10, cb24, lgbm3, et1, h3

# [-0.10556809 -0.12437792 -0.12417388 -0.10336476 -0.11301214]
# 0.11301213988323612
# 0.008903270953142868
# 0.1140993594331654

# public score : 0.12078

##################################################################################

# CustomVotingRegressor xgb39_10, cb2, lgbm1236 - individual models in 0.115 score range

# [-0.10398498 -0.12427925 -0.12411001 -0.10175914 -0.11184123]
# 0.11184122937770229
# 0.009585760592841996
# 0.11319491934130788

# public score : 0.12181

##################################################################################

# CustomVotingRegressor xgb39_10, cb24, lgbm1236 - individual models in 0.115 score range + cb4

# [-0.10433444 -0.12428831 -0.12377054 -0.10179974 -0.11249091]
# 0.11249091382371852
# 0.009419769361899857
# 0.11333678945000407

# public score : 0.12079

##################################################################################

# CustomStacking using ET xgb39_10, cb2, lgbm1236 - individual models in 0.115 score range

# [-0.10573317 -0.12636065 -0.12449944 -0.10099839 -0.11342567]
# 0.11342566953457542
# 0.010005290491775985
# 0.11420346480175711

# public score : 0.12593

######################################
# with best hyperparam

# [-0.10494648 -0.12383074 -0.12322337 -0.09923414 -0.11174211]
# 0.11174210587382624
# 0.009766719818534123
# 0.11259536584058619

# public score : 0.12381

######################################

# [-0.10461246 -0.12366897 -0.12326438 -0.0992486  -0.11141707]
# 0.11141707029932142
# 0.009793740766324388
# 0.11244229505996625

# public score : 0.12361

##################################################################################

# CustomStacking using XGB xgb39_10, cb2, lgbm1236 - individual models in 0.115 score range

# [-0.10432277 -0.1263227  -0.12727069 -0.10158633 -0.11423715]
# 0.1142371534004282
# 0.010704835153754118
# 0.1147479279788834

# public score : 0.12946

######################################
# with best hyperparam

# [-0.10429697 -0.12403466 -0.12563498 -0.09928949 -0.11186986]
# 0.11186986016214154
# 0.010453719390146561
# 0.11302519095383065

# public score : 0.12648

######################################

# [-0.10423112 -0.12307777 -0.12397106 -0.09923176 -0.11275694]
# 0.1127569352884956
# 0.009877680402843174
# 0.11265372717488362

# public score : 0.12721

##################################################################################

# CustomStacking using LGBM xgb39_10, cb2, lgbm1236 - individual models in 0.115 score range

# [-0.11223253 -0.13014404 -0.12500044 -0.09929034 -0.11630782]
# 0.11630781998849603
# 0.010703200480852946
# 0.11659503271176211

# public score : 0.12515

######################################
#with best hyperparam

# [-0.10848398 -0.12815922 -0.12371949 -0.09895674 -0.11562529]
# 0.11562528518344482
# 0.010482648308667334
# 0.11498894431586999

##################################################################################

# CustomVotingRegressor cb 1234

# [-0.10486403 -0.12427486 -0.12376708 -0.10590704 -0.1155346 ]
# 0.11553460390005736
# 0.008348715542746495
# 0.1148695224807608

# public score : 0.11858

In [94]:
# print(score_dataset(X, y, pipeline_xgb1))
# [-0.110808   -0.129472   -0.12964501 -0.10534453 -0.11138566]
# 0.1113856557342387
# 0.010204027122436335
# 0.11733104229147182

# print(score_dataset(X, y, pipeline_xgb3))
# [-0.10548422 -0.12680313 -0.12650509 -0.10255713 -0.11566151]
# 0.11566150694345757
# 0.010165479749796716
# 0.11540221675361506

# print(score_dataset(X, y, pipeline_xgb7))
# [-0.1039625  -0.12587415 -0.12546581 -0.10753416 -0.12017951]
# 0.1201795084009607
# 0.009157741226268295
# 0.11660322347801237

# print(score_dataset(X, y, pipeline_xgb9))
# [-0.10426655 -0.12730364 -0.12830147 -0.10071816 -0.11536294]
# 0.11536294365393755
# 0.01137924965344543
# 0.11519055282052985



# print(score_dataset(X, y, pipeline_cb1))
# [-0.10668533 -0.12480329 -0.1268338  -0.10741574 -0.11567317]
# 0.11567317248352649
# 0.00842650896916564
# 0.11628226597361586

# print(score_dataset(X, y, pipeline_cb2))
# [-0.10631423 -0.12341525 -0.12524074 -0.10716469 -0.11352019]
# 0.11352019431883034
# 0.00793256565121934
# 0.115131021385492

# print(score_dataset(X, y, pipeline_cb3))
# [-0.10281654 -0.126599   -0.12371644 -0.10972694 -0.11812327]
# 0.11812327086423803
# 0.008824425231554592
# 0.11619643846168552

# print(score_dataset(X, y, pipeline_cb4))
# [-0.11301114 -0.12934981 -0.12627155 -0.10764008 -0.12240679]
# 0.12240678985145056
# 0.008170764103107636
# 0.11973587332021315



# print(score_dataset(X, y, pipeline_lgbm1))
# [-0.10897157 -0.12806925 -0.12505263 -0.10505543 -0.11226835]
# 0.11226835473674715
# 0.009062634359964232
# 0.11588344796374961

# print(score_dataset(X, y, pipeline_lgbm2))
# [-0.10663778 -0.12916449 -0.1261197  -0.10309439 -0.11300596]
# 0.11300596297350247
# 0.010374005785838268
# 0.11560446553278303

# print(score_dataset(X, y, pipeline_lgbm3))
# [-0.10764055 -0.12391567 -0.12530486 -0.10516194 -0.11525574]
# 0.11525574056354786
# 0.008193353838302055
# 0.11545575046806685

# print(score_dataset(X, y, pipeline_lgbm6))
# [-0.10518519 -0.12841047 -0.12598079 -0.10541443 -0.11377324]
# 0.11377323964590669
# 0.009872218200399905
# 0.11575282236697218

# CV score skf

In [95]:
#xgb1
# 0.12096327119801184.
# [-0.11749978 -0.11468448 -0.11998631 -0.16015004 -0.09249576]
# 0.1174997758651165
# 0.02190148844115302
# public score : 0.12208

#xgb2
# 0.12121695838464552.
# [-0.1190666  -0.11047241 -0.12027858 -0.15368264 -0.10258457]
# 0.11906659648415295
# 0.0174478434513716
# public score : 0.12121

#xgb3
# 0.12291040055737397.
# [-0.1207175  -0.11784717 -0.12084213 -0.15751711 -0.09762809]
# 0.12071750059786712
# 0.01934705328976381
# public score : 0.12344

#xgb5
# 0.12121663671265123.
# [-0.1183398  -0.11764894 -0.11819146 -0.15724026 -0.09466274]
# 0.11819145566235034
# 0.02016416246342805
# public score : 0.12270

#xgb7
# 0.12016913498104102.
# [-0.1231054  -0.10620493 -0.11861282 -0.15727291 -0.09564962]
# 0.11861282156685948
# 0.020899470502765815
# public score : 0.12176

#xgb8
# 0.12218887615283583.
# [-0.12311578 -0.10393904 -0.11978129 -0.16208063 -0.10202764]
# 0.11978128899996028
# 0.021621929170878792
# public score : 0.12340

#xgb9
# 0.12289801572950965
# [-0.11956882 -0.11307731 -0.12082082 -0.16199867 -0.09902446]
# 0.1195688216223085
# 0.02102690141132909
# public score : 0.12279

#xgb10
# 0.12189723833484438.
# [-0.11769908 -0.11210582 -0.12542139 -0.15399192 -0.10026798]
# 0.11769907780421265
# 0.018021150486328682
# public score : 0.12595

#cb1
# 0.12146870978524071
# [-0.11689243 -0.11222937 -0.1195251  -0.1484336  -0.11026306]
# 0.11689242795517175
# 0.013876291530023033
# public score : 0.12256

#cb2
# 0.12063624125827858
# [-0.1156934  -0.10911198 -0.12282546 -0.15034066 -0.1052097 ]
# 0.1156933974068409
# 0.01601476634147856
# public score : 0.12234

#cb3
# 0.12117629660914572
# [-0.11685888 -0.11011589 -0.1235822  -0.1489466  -0.10637792]
# 0.11685887726354985
# 0.015079407156906926
# public score : 0.12243

#cb4
# 0.12746553160273738.
# [-0.12479513 -0.11602021 -0.12677251 -0.16181625 -0.10792355]
# 0.1247951298785159
# 0.01844162469894824
# public score : 0.11767

# 0.12657391647714436.
# [-0.12250947 -0.11908302 -0.12198717 -0.16438165 -0.10490827]
# 0.12198717243937014
# 0.019962446238785396
# public score : 0.11843

#cb5
# 0.12232080734704395
# [-0.11730409 -0.10672066 -0.12218277 -0.15967962 -0.1057169 ]
# 0.11730408572695568
# 0.019697317031556482
# public score : 0.12262

#cb6
# 0.1274473054822071
# [-0.12410766 -0.11949513 -0.12663447 -0.15884384 -0.10815544]
# 0.124107658094659
# 0.01692908485762067
# public score : 0.11918

#lgbm1
# 0.12052610219793292
# [-0.11954391 -0.11328187 -0.11886363 -0.15219455 -0.09874655]
# 0.11886362989527258
# 0.01751254964107485
# public score : 0.12100

#lgbm2
# 0.12006520251936215
# [-0.1175394  -0.10907231 -0.12194504 -0.15602383 -0.09574543]
# 0.11753940364579261
# 0.020076009133654676
# public score : 0.12188

#lgbm3
# 0.12476945989203363
# [-0.12135222 -0.11226465 -0.1272039  -0.1535579  -0.10946863]
# 0.12135222151106254
# 0.015728090881840222
# public score : 0.12432

#lgbm4
# 0.12056356608490129
# [-0.12052822 -0.10662705 -0.12023993 -0.15732043 -0.0981022 ]
# 0.12023993219991116
# 0.020248023319354637
# public score : 0.12081

#lgbm5
# 0.12194678876497556.
# [-0.12324616 -0.10222344 -0.12207349 -0.16217016 -0.1000207 ]
# 0.1220734899492754
# 0.02231316891513297
# public score : 0.12629

#lgbm6
# 0.1243739978569125.
# [-0.12021403 -0.1201359  -0.11912219 -0.16589076 -0.0965071 ]
# 0.1201359012486722
# 0.02264091018759263
# public score : 0.12429

#rf
# 0.13801475483285586.
# [-0.1305587  -0.12929222 -0.14595504 -0.15818268 -0.12608512]
# 0.13055870069685813
# 0.0121965965512414
# public score : 0.14004

# Train on full data and obtain test predictions

In [96]:
# Refit the selected pipeline on the full training data; the target is
# modelled on the natural-log scale.
y_log = np.log(y)
pipeline.fit(X, y_log)

# Predict on the test set and map the log-scale predictions back to prices.
pred_log = pipeline.predict(X_test)
pred = np.exp(pred_log)

print(pred[:10])

[125321.43265209 157841.4066918  189251.05004532 198367.43387555
 184937.52469191 174989.94740377 173926.42958538 168606.43359007
 176047.92668074 125709.32469768]


In [97]:
# Last expression of the cell: echoes the fitted `pipeline` in the notebook output.
pipeline

In [98]:
# Write the test-set predictions to submission.csv with columns Id and
# SalePrice (the DataFrame's own index is not written).
submission_columns = {'Id': X_test.index, 'SalePrice': pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print('saved output file')

saved output file
