In [232]:
# !pip install -U scikit-learn --quiet

In [233]:
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import statistics
import re
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, TargetEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from functools import partial
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.model_selection import KFold
from lightgbm import log_evaluation, early_stopping
import warnings

warnings.filterwarnings('ignore')


In [234]:
# Make the local helper modules (train_tabular_utils, cv_split_utils, enums, ...) importable.
# The directory can be overridden with the ML_UTILS_DIR environment variable so the notebook
# is not tied to one machine; the original hard-coded location remains the default.
ML_UTILS_DIR = os.path.abspath(os.environ.get("ML_UTILS_DIR", "/home/bk_anupam/code/ML/ML_UTILS/"))
# Guard against duplicate entries when the cell is re-run in a live kernel.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [235]:
import train_tabular_utils as tt
import cv_split_utils
import enums
from enums import ModelName
import data_utils
import param_tuning_utils as ptu

In [236]:
class Config:
    """Run-wide settings for this notebook, read as class attributes (e.g. ``Config.NUM_FOLDS``)."""
    # "KAGGLE" redirects the data/output paths set right after this class to Kaggle directories.
    RUN_MODE = "LOCAL"
    # Seed forwarded to the model static params and to the fold assignment.
    RANDOM_SEED = 42
    # Number of CV folds passed to the fold splitter and to the train/predict helpers.
    NUM_FOLDS = 5
    # Name of the regression target column.
    TARGET_COL_NAME = "price"        
    SCALER = enums.Scaler.StandardScaler
    METRIC = enums.Metrics.RMSE
    # These values are more dynamic   
    MODEL_TYPE = enums.ModelName.CatBoost
    # The three tuning options below are only referenced by the commented-out tuning cell.
    NUM_TUNING_TRIALS = 2
    TUNE_ON_SINGLE_FOLD = True
    TUNE_STEPWISE = False
    # Passed as `single_fold` to tt.train_and_validate.
    TRAIN_SINGLE_FOLD = False    
    # Passed as `persist_model` to tt.persist.
    PERSIST_MODEL = True    
    # In KAGGLE mode, True switches the read path to the feature-engineered dataset.
    USE_MANUAL_FEATURES = False
    # True appends the original used_cars.csv rows to the competition training data.
    USE_ORIGINAL_DATA = True    
    FEATURE_SELECTION_METHOD = None
    NUM_CLASSES = None

# Columns that are never used as model features (identifiers, target, fold labels, helper flags).
COLS_TO_LEAVE = ["id", "price", "kfold", "target_grp", "transmission_speed", "is_price_outlier"]
CPU_COUNT = os.cpu_count()

# Local defaults; all three paths are overridden below when running on Kaggle.
DATA_READPATH = "./data/"
DATA_WRITEPATH = "./output/"
SUBMISSION_FILEPATH = DATA_READPATH
if Config.RUN_MODE == "KAGGLE":    
    DATA_READPATH = "/kaggle/input/playground-series-s4e9/"
    if Config.USE_MANUAL_FEATURES:
        # Read the pre-engineered feature dataset instead of the raw competition files.
        DATA_READPATH = "/kaggle/input/ps4e9-fe/"        
    # sample_submission.csv is always read from the competition input directory.
    SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e9/"
    DATA_WRITEPATH = "/kaggle/working/"

In [237]:
# Fixed (non-tuned) parameters per model family; merged with the tuned parameters further down.
# NOTE(review): every entry requests GPU execution ("cuda" / "gpu" / "GPU") - confirm a GPU
# is available in the target environment before running.
model_static_params = {
    enums.ModelName.XGBoost: {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": Config.RANDOM_SEED,
        "verbosity": 0,
        "device": "cuda",
        "tree_method": "hist"
    },
    enums.ModelName.LGBM: {
        "objective": "root_mean_squared_error",
        "metric": 'rmse',
        "verbosity": -1,    # <0: fatal, =0: error (warn), =1: info, >1: debug
        "boosting_type": "gbdt",
        "device":  "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0
    },
    enums.ModelName.CatBoost: {
        "objective": "RMSE",
        "verbose": 0,
        "random_seed": Config.RANDOM_SEED,
        "eval_metric": "RMSE",
        #"iterations": 1000,
        #"early_stopping_rounds": 100,
        'grow_policy':  'Lossguide',
        'bootstrap_type': 'Poisson',
        'task_type': 'GPU'
    }
}

In [238]:
# Hyper-parameters pasted in from earlier tuning runs (presumably the commented-out tuning
# cell - TODO confirm provenance). They are merged over model_static_params before training.
# For RandomForest
# tuned_model_params = {'n_estimators': 1300, 'max_depth': 17, 'min_samples_leaf': 3, 'min_samples_split': 3, 'max_features': 'sqrt'}
# For CatBoost
cb_tuned_model_params = {'learning_rate': 0.0026752932092600212, 'n_estimators': 4550, 'max_depth': 6, 'min_data_in_leaf': 73, 'num_leaves': 136, 'subsample': 0.9417643579568004, 'reg_lambda': 99.65892476752238, 'random_strength': 0.6286348851856994, 'early_stopping_rounds': 200, 'max_bin': 57}
# For XGBoost
xgb_tuned_model_params = {'learning_rate': 0.028614729311166577, 'n_estimators': 3200, 'max_depth': 4, 'min_child_weight': 17, 'subsample': 0.8642950041583155, 'colsample_bytree': 0.6804130458486328, 'reg_lambda': 3.8611185565274893, 'reg_alpha': 9.227741719171974, 'max_leaves': 157, 'early_stopping_rounds': 46, 'max_bin': 64, 'gamma': 1.2844801638262502}
# For LGBM
lgbm_tuned_model_params = {'learning_rate': 0.04384599885192124, 'n_estimators': 1850, 'max_depth': 4, 'min_data_in_leaf': 79, 'num_leaves': 20, 'min_child_weight': 1.3000000000000003, 'subsample': 0.6909074597205864, 'colsample_bytree': 0.7928274099493369, 'reg_lambda': 178.39029552790532, 'reg_alpha': 1.704574120515208, 'early_stopping_rounds': 200, 'max_bin': 184}

# Uncomment to force re-tuning (the tuning cell checks for None).
# cb_tuned_model_params = None

In [239]:
def get_train_data():
    """Load the competition training data, optionally extended with the original dataset.

    Reads ``train.csv`` from ``DATA_READPATH``. When ``Config.USE_ORIGINAL_DATA`` is set,
    ``used_cars.csv`` is read from the same directory, its ``milage`` and ``price`` columns
    are reduced to their digits and cast to ``int64``, and its rows are appended.

    Returns:
        pd.DataFrame: the (possibly augmented) training frame with a fresh 0..n-1 index.

    Raises:
        ValueError: if a ``milage``/``price`` value contains no digits at all.
    """
    df_train = pd.read_csv(DATA_READPATH + "train.csv")
    if Config.USE_ORIGINAL_DATA:
        # df_train_orig = pd.read_csv("/kaggle/input/used-car-price-prediction-dataset/" + "used_cars.csv")
        df_train_orig = pd.read_csv(DATA_READPATH + "used_cars.csv")
        # Vectorised digit extraction; replaces the deprecated DataFrame.applymap + per-cell re.sub.
        for col in ("milage", "price"):
            digits_only = df_train_orig[col].astype(str).str.replace(r"[^0-9]", "", regex=True)
            df_train_orig[col] = digits_only.astype("int64")
        # add df_train_orig rows to df_train
        df_train = pd.concat([df_train, df_train_orig], axis=0, ignore_index=True)
    return df_train

In [240]:
# Load train/test, give test a placeholder target so both frames share the same columns.
df_train = get_train_data()
print(f"df_train.shape: {df_train.shape}")
df_test = pd.read_csv(DATA_READPATH + "test.csv")
df_test["price"] = 0
print(f"df_test.shape: {df_test.shape}")
# NOTE(review): df_combined is only printed here and not referenced by later visible cells.
df_combined = pd.concat([df_train, df_test],axis=0,ignore_index=True)
print("df_combined shape:", df_combined.shape )
# keep a copy of original train and test data for later use
# (df_train_orig here is the full, un-engineered training frame, not just used_cars.csv)
df_train_orig = df_train.copy()
df_test_orig = df_test.copy()
# # drop id column
# df_train = df_train.drop("id", axis=1)
# df_test = df_test.drop("id", axis=1)

df_train.shape: (192542, 13)
df_test.shape: (125690, 13)
df_combined shape: (318232, 13)


In [241]:
def extract_age_features(df, current_year=2024):
    """Add vehicle-age and mileage-per-year features to ``df`` in place.

    Args:
        df: frame with ``model_year`` and ``milage`` columns; it is modified and returned.
        current_year: reference year used to compute the age (defaults to 2024, the value
            that was previously hard-coded).

    Returns:
        The same ``df`` with four new columns:
        ``Vehicle_Age``, ``Mileage_per_Year``, ``milage_with_age`` (mean mileage of all rows
        sharing the same age) and ``Mileage_per_Year_with_age`` (mean mileage-per-year of
        all rows sharing the same age).

    Note:
        The group means are computed from the rows of ``df`` only, so calling this
        separately on train and test yields statistics from different populations.
    """
    # Clip to a minimum of 1 year: avoids division by zero for current-year cars and
    # negative ages/ratios when model_year is later than current_year.
    df['Vehicle_Age'] = (current_year - df['model_year']).clip(lower=1)
    df['Mileage_per_Year'] = df['milage'] / df['Vehicle_Age']
    by_age = df.groupby('Vehicle_Age')
    df['milage_with_age'] = by_age['milage'].transform('mean')
    df['Mileage_per_Year_with_age'] = by_age['Mileage_per_Year'].transform('mean')
    return df

In [242]:
def extract_engine_features(df):
    """Parse horsepower and engine size out of the free-text ``engine`` column (in place).

    The parsing expects strings shaped like ``"172.0HP 1.6L 4 Cylinder Engine ..."``:
    horsepower is everything before ``"HP"``, engine size is the second space-separated
    token with ``"L"`` removed. Values that do not match become NaN.

    Args:
        df: frame with an ``engine`` column; it is modified and returned.

    Returns:
        The same ``df`` with float columns ``Horsepower``, ``Engine_Size`` and
        ``Power_to_Weight_Ratio`` (horsepower divided by engine size).
    """
    # Errors expected from malformed/missing values: non-string input (AttributeError/TypeError),
    # too few tokens (IndexError), non-numeric text (ValueError). Anything else should surface.
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def extract_horsepower(engine):
        try:
            return float(engine.split('HP')[0])
        except parse_errors:
            return np.nan

    def extract_engine_size(engine):
        try:
            return float(engine.split(' ')[1].replace('L', ''))
        except parse_errors:
            return np.nan

    # Force float64 so an all-missing column does not end up as object dtype, which
    # would make the division below raise.
    df['Horsepower'] = df['engine'].apply(extract_horsepower).astype('float64')
    df['Engine_Size'] = df['engine'].apply(extract_engine_size).astype('float64')
    df['Power_to_Weight_Ratio'] = df['Horsepower'] / df['Engine_Size']

    return df

In [243]:
def extract_other_features(df):
    """Flag rows whose ``brand`` is one of a fixed list of luxury makes.

    Adds a boolean ``is_luxury_brand`` column to ``df`` in place and returns ``df``.
    Matching is an exact comparison against the brand string.
    """
    luxury_brands = [
        'Mercedes-Benz', 'Bentley', 'Aston', 'Jaguar', 'Tesla',
        'Lamborghini', 'Land', 'RAM', 'Cadillac', 'Alfa',
        'Ferrari', 'Porsche', 'Bugatti', 'McLaren', 'Rolls-Royce',
        'Lucid', 'Maserati', 'Rivian', 'Genesis',
    ]
    brand_is_luxury = df['brand'].isin(luxury_brands)
    df['is_luxury_brand'] = brand_is_luxury
    return df

In [244]:
%%time

# Apply the feature builders to train and test independently; the group-mean features from
# extract_age_features are therefore computed per frame, not on the combined data.
df_train = extract_age_features(df_train)
df_test = extract_age_features(df_test)

# Engine features are currently disabled (note the stale `train`/`test` variable names).
# train = extract_engine_features(train)
# test = extract_engine_features(test)

df_train = extract_other_features(df_train)
df_test = extract_other_features(df_test)

CPU times: user 36.5 ms, sys: 0 ns, total: 36.5 ms
Wall time: 36.1 ms


In [245]:
def update(df, threshold=100):
    """Collapse rare categories and convert the categorical columns to ``category`` dtype.

    For the high-cardinality columns (model, engine, transmission, ext_col, int_col) every
    value occurring fewer than ``threshold`` times in ``df`` is replaced by ``"noise"``.
    Afterwards all categorical columns get missing values filled with ``"missing"`` and
    are cast to ``category``.

    Args:
        df: frame containing the nine categorical columns; modified in place and returned.
        threshold: minimum number of occurrences for a value to be kept (default 100,
            the previously hard-coded value).

    Returns:
        The same ``df``.

    Note:
        Frequencies are counted within ``df`` only, so train and test may collapse
        different values when processed separately.
    """
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']

    for col in re_:
        # Work on an object copy so the cell can be re-run after the columns have already
        # been converted to `category` (assigning a new label to a categorical raises).
        values = df[col].astype(object)
        counts = values.value_counts(dropna=False)
        values[counts[values].values < threshold] = "noise"
        df[col] = values

    for col in cat_c:
        # Same reason for the object round-trip: fillna with a label that is not yet a
        # category fails on categorical data.
        df[col] = df[col].astype(object).fillna('missing').astype('category')

    return df

# Rare-category collapsing is done per frame, so the "noise" bucket can cover different
# raw values in train and test.
df_train = update(df_train)
df_test = update(df_test)

In [246]:
# do not include 'id' column in the list of int columns
int_cols = [col for col in df_train.columns if df_train[col].dtypes == 'int64' and col not in COLS_TO_LEAVE]
float_cols = [col for col in df_train.columns if df_train[col].dtypes == 'float64']
bool_cols = [col for col in df_train.columns if df_train[col].dtypes == 'bool']
cat_cols = [col for col in df_train.columns if df_train[col].dtypes in ['object', 'category'] and col not in COLS_TO_LEAVE]
feature_cols = [x for x in df_train.columns if x not in COLS_TO_LEAVE]
print(f"feature_cols: {feature_cols}")
print(f"cat_cols: {cat_cols}")

feature_cols: ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'Vehicle_Age', 'Mileage_per_Year', 'milage_with_age', 'Mileage_per_Year_with_age', 'is_luxury_brand']
cat_cols: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']


In [247]:
# create a new column in df_train called is_price_outlier and set it 1 if price > 99250 else set it to 0
# Binary helper target for the outlier classifier trained below.
# NOTE(review): 99250 is a magic number - document how the cut-off was chosen.
df_train['is_price_outlier'] = np.where(df_train['price'] > 99250, 1, 0)
df_train['is_price_outlier'].value_counts()

is_price_outlier
0    181418
1     11124
Name: count, dtype: int64

In [248]:
# LightGBM callbacks shared (as a module-level global) by every lgb.train call below:
# log every 300 rounds, stop after 200 rounds without validation improvement.
callbacks = [log_evaluation(period=300), early_stopping(stopping_rounds=200)]

def get_is_price_outlier_oof(df, target, lgb_params, model_type='LGBM'):
    """Train a LightGBM model per fold and return out-of-fold predictions.

    Uses a shuffled 5-fold KFold (random_state=42) and the module-level ``callbacks``.

    Args:
        df: feature frame.
        target: label series aligned row-for-row with ``df``.
        lgb_params: parameter dict passed to ``lgb.train``.
        model_type: label used only in the progress messages.

    Returns:
        (oof_predictions, models): an array with one held-out prediction per row of
        ``df`` and the list of fitted boosters, one per fold.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(df))
    models, auc_scores = [], []

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(df)):
        print(f"Training fold {fold + 1}/{5} with {model_type}")

        X_fit, y_fit = df.iloc[fit_rows], target.iloc[fit_rows]
        X_holdout, y_holdout = df.iloc[holdout_rows], target.iloc[holdout_rows]

        fit_set = lgb.Dataset(X_fit, label=y_fit)
        holdout_set = lgb.Dataset(X_holdout, label=y_holdout, reference=fit_set)
        booster = lgb.train(
            lgb_params,
            fit_set,
            valid_sets=[fit_set, holdout_set],
            valid_names=['train', 'valid'],
            callbacks=callbacks
        )
        models.append(booster)

        # Score the held-out rows at the early-stopped iteration.
        holdout_pred = booster.predict(X_holdout, num_iteration=booster.best_iteration)
        fold_auc = roc_auc_score(y_holdout, holdout_pred)
        auc_scores.append(fold_auc)
        print(f'{model_type} Fold AUC: {fold_auc}')
        oof_predictions[holdout_rows] = holdout_pred

    print(f'Mean AUC: {np.mean(auc_scores)}')
    return oof_predictions, models

In [249]:
# Stage 1 of the stacking: a binary classifier for "price above the outlier cut-off".
lgb_params_1 = {
    'objective': 'binary',
    'n_estimators': 1000,
    'random_state': 42,
    'metric': 'auc',
}

X = df_train[feature_cols]
y = df_train['is_price_outlier']
oof_predictions_lgbm, models_lgbm = get_is_price_outlier_oof(X, y, lgb_params_1, model_type='LGBM')
# Train rows get their out-of-fold probability as a new feature ...
df_train['lgbm_price_outlier_proba'] = oof_predictions_lgbm

# ... while test rows get the average prediction of the per-fold models.
LGBM_preds = np.zeros(len(df_test[feature_cols]))
for model in models_lgbm:
    LGBM_preds += model.predict(df_test[feature_cols]) / len(models_lgbm)
df_test['lgbm_price_outlier_proba'] = LGBM_preds

Training fold 1/5 with LGBM
[LightGBM] [Info] Number of positive: 8863, number of negative: 145170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 154033, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.057540 -> initscore=-2.796020
[LightGBM] [Info] Start training from score -2.796020
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[40]	train's auc: 0.918758	valid's auc: 0.876206
LGBM Fold AUC: 0.8762061608901712
Training fold 2/5 with LGBM
[LightGBM] [Info] Number of positive: 8985, number of negative: 145048
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014420 seconds.
You can set `force_col_wise=tr

In [250]:
# Recompute the column lists so feature_cols picks up the new lgbm_price_outlier_proba column.
cat_cols = [col for col in df_train.columns if df_train[col].dtypes in ['object', 'category'] and col not in COLS_TO_LEAVE]
feature_cols = [x for x in df_train.columns if x not in COLS_TO_LEAVE]
print(f"feature_cols: {feature_cols}")
print(f"cat_cols: {cat_cols}")

feature_cols: ['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title', 'Vehicle_Age', 'Mileage_per_Year', 'milage_with_age', 'Mileage_per_Year_with_age', 'is_luxury_brand', 'lgbm_price_outlier_proba']
cat_cols: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']


In [251]:
# Same callbacks as defined for the outlier classifier; re-created here so this cell can run
# on its own (log every 300 rounds, early-stop after 200 rounds without improvement).
callbacks = [log_evaluation(period=300), early_stopping(stopping_rounds=200)]

def get_MAE_oof(df, target, lgb_params, cat_params=None, model_type='LGBM'):
    """Train a regressor per fold and return out-of-fold predictions.

    Despite the name, the training objective comes from the supplied parameters and the
    per-fold score that is printed is RMSE. Uses a shuffled 5-fold KFold (random_state=42).
    The LightGBM branch uses the module-level ``callbacks``; the CatBoost branch uses the
    module-level ``cat_cols`` as categorical features.

    Args:
        df: feature frame.
        target: target series aligned row-for-row with ``df``.
        lgb_params: parameter dict for ``lgb.train`` (used when ``model_type='LGBM'``).
        cat_params: keyword arguments for ``CatBoostRegressor`` (required when
            ``model_type='CAT'``).
        model_type: ``'LGBM'`` or ``'CAT'``.

    Returns:
        (oof_predictions, models): an array with one held-out prediction per row of
        ``df`` and the list of fitted models, one per fold.

    Raises:
        ValueError: for an unsupported ``model_type``, or ``model_type='CAT'`` without
            ``cat_params``.
    """
    # Fail fast: previously an unknown model_type left `model`/`pred` undefined inside the
    # loop, and a missing cat_params surfaced as an opaque TypeError from `**None`.
    if model_type not in ('LGBM', 'CAT'):
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'LGBM' or 'CAT'")
    if model_type == 'CAT' and cat_params is None:
        raise ValueError("cat_params must be provided when model_type='CAT'")

    oof_predictions = np.zeros(len(df))
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    models = []
    rmse_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        print(f"Training fold {fold + 1}/{5} with {model_type}")

        X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        if model_type == 'LGBM':
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
            model = lgb.train(
                lgb_params,
                train_data,
                valid_sets=[train_data, val_data],
                valid_names=['train', 'valid'],
                callbacks=callbacks
            )
            # Predict at the early-stopped iteration.
            pred = model.predict(X_val, num_iteration=model.best_iteration)
        else:  # 'CAT' (validated above)
            train_data = Pool(data=X_train, label=y_train , cat_features=cat_cols)
            val_data = Pool(data=X_val, label=y_val , cat_features=cat_cols )
            model = CatBoostRegressor(**cat_params)
            model.fit(train_data, eval_set=val_data, verbose=150, early_stopping_rounds=200)
            pred = model.predict(X_val)

        models.append(model)

        rmse = np.sqrt(mean_squared_error(y_val, pred))
        rmse_scores.append(rmse)
        print(f'{model_type} Fold RMSE: {rmse}')
        oof_predictions[val_idx] = pred

    print(f'Mean RMSE: {np.mean(rmse_scores)}')
    return oof_predictions, models

# Stage 2 of the stacking: an MAE-objective LightGBM whose OOF predictions become a feature.
lgb_params = {
    'objective': 'MAE',
    'n_estimators': 1000,
    'random_state': 42,
}

# feature_cols already contains lgbm_price_outlier_proba at this point.
X = df_train[feature_cols]
y = df_train['price']
oof_predictions_lgbm, models_lgbm = get_MAE_oof(X, y, lgb_params, model_type='LGBM')
df_train['LGBM_MAE'] = oof_predictions_lgbm

# Test feature = mean prediction of the per-fold models.
LGBM_preds = np.zeros(len(df_test[feature_cols]))
for model in models_lgbm:
    LGBM_preds += model.predict(df_test[feature_cols]) / len(models_lgbm)
df_test['LGBM_MAE'] = LGBM_preds

# Same procedure with an MSE objective; only the difference to the MAE model is kept.
lgb_params = {
    'objective': 'MSE',
    'n_estimators': 1000,
    'random_state': 42,
}

# X was built before LGBM_MAE was added, so the MSE model sees the same inputs as the MAE model.
oof_predictions_lgbm, models_lgbm = get_MAE_oof(X, y, lgb_params, model_type='LGBM')
df_train['LGBM_MSE_diff'] = oof_predictions_lgbm - df_train['LGBM_MAE']

# feature_cols is still the pre-LGBM_MAE list here, matching what the models were trained on.
LGBM_preds = np.zeros(len(df_test[feature_cols]))
for model in models_lgbm:
    LGBM_preds += model.predict(df_test[feature_cols]) / len(models_lgbm)
df_test['LGBM_MSE_diff'] = LGBM_preds - df_test['LGBM_MAE']

Training fold 1/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1977
[LightGBM] [Info] Number of data points in the train set: 154033, number of used features: 17
[LightGBM] [Info] Start training from score 30775.000000
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16259.3	valid's l1: 17337.8
[600]	train's l1: 15982.5	valid's l1: 17337.7
Early stopping, best iteration is:
[427]	train's l1: 16123.1	valid's l1: 17336.5
LGBM Fold RMSE: 75981.08997022854
Training fold 2/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1977
[Lig

In [252]:
# Summary statistics of the numeric columns, including the stacked features added above.
df_train.describe()

Unnamed: 0,id,model_year,milage,price,Vehicle_Age,Mileage_per_Year,milage_with_age,Mileage_per_Year_with_age,is_price_outlier,lgbm_price_outlier_proba,LGBM_MAE,LGBM_MSE_diff
count,188533.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0,192542.0
mean,94266.0,2015.823452,65684.728927,43892.07,8.177421,8684.833645,65684.728927,8684.833645,0.057774,0.057916,35906.34014,8286.823842
std,54424.933488,5.670724,49851.51298,78817.11,5.669542,6251.326532,35495.308987,1107.818872,0.233317,0.101303,23527.828508,11618.404407
min,0.0,1974.0,100.0,2000.0,1.0,5.263158,9745.115355,1258.575385,0.0,0.002808,2763.11281,-45380.402858
25%,47133.0,2013.0,24115.0,17000.0,4.0,5150.0,34436.373284,8609.093321,0.0,0.00716,17795.733112,3074.104233
50%,94266.0,2017.0,57550.0,30825.0,7.0,8000.0,67981.161419,8885.004937,0.0,0.017169,32353.852109,6226.699267
75%,141399.0,2020.0,95400.0,49900.0,11.0,11000.0,92817.275956,9522.712359,0.0,0.055415,47172.774414,10067.435026
max,188532.0,2024.0,405000.0,2954083.0,50.0,235000.0,134082.653779,9775.812469,1.0,0.829225,263745.920088,360649.941748


In [None]:
# df['Vehicle_Age'] = current_year - df['model_year']
#     df['Mileage_per_Year'] = df['milage'] / df['Vehicle_Age']
#     df['milage_with_age'] =  df.groupby('Vehicle_Age')['milage'].transform('mean')
#     df['Mileage_per_Year_with_age'] =  df.groupby('Vehicle_Age')['Mileage_per_Year'].transform('mean')

In [None]:
# Final recompute: assuming the cells above ran in order, feature_cols now also includes
# LGBM_MAE and LGBM_MSE_diff (neither is listed in COLS_TO_LEAVE).
cat_cols = [col for col in df_train.columns if df_train[col].dtypes in ['object', 'category'] and col not in COLS_TO_LEAVE]
feature_cols = [x for x in df_train.columns if x not in COLS_TO_LEAVE]
print(f"feature_cols: {feature_cols}")
print(f"cat_cols: {cat_cols}")

In [110]:
# Assign cross-validation folds with the project helper.
# NOTE(review): presumably adds the 'kfold' (and 'target_grp') columns listed in COLS_TO_LEAVE,
# stratifying on the target binned into n_bins groups - confirm against cv_split_utils.
# NOTE(review): these folds differ from the KFold splits used for the stacked OOF features
# above; verify that this is intended.
df_train = cv_split_utils.strat_kfold_dataframe(df_train, 
                                                random_state=Config.RANDOM_SEED, 
                                                num_folds=Config.NUM_FOLDS,
                                                target_col_name=Config.TARGET_COL_NAME, 
                                                n_bins=40)

In [112]:
# preprocessing
preprocessor = None
imputation_config = None
cat_encoders = {col: [
                       OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
                     ]
                    for col in cat_cols
                }

In [113]:
# CatBoost tuning configuration, returned by get_tuning_params for the (currently
# commented-out) tuning cell.
# Parameters to tune at each level, keyed by level number.
cb_level_params_totune = {
    "1": ["learning_rate", "n_estimators"],
    "2": ["max_depth", "min_data_in_leaf", "num_leaves"],
    "3": ["subsample", "reg_lambda", "random_strength", "early_stopping_rounds", "max_bin"]
}

# NOTE(review): several parameters listed above (n_estimators, min_data_in_leaf, num_leaves,
# subsample, early_stopping_rounds, max_bin) have no entry here - confirm how
# param_tuning_utils handles a tunable parameter without a range.
# NOTE(review): this dict uses type 'loguniform' while the LGBM/XGB ranges use
# type 'float' with 'log': True - confirm both spellings are supported.
cb_param_ranges = {
    'learning_rate': {'type': 'loguniform', 'min_value': 1e-3, 'max_value': 0.1},    
    'max_depth': {'type': 'int', 'min_value': 4, 'max_value': 20},    
    # 'subsample': {'type': 'float', 'min_value': 0.2, 'max_value': 1},
    'reg_lambda': {'type': 'loguniform', 'min_value': 1e-3, 'max_value': 10},
    'random_strength': {'type': 'int', 'min_value': 0, 'max_value': 10}
}

# Default values for the CatBoost parameters (presumably used for parameters not being
# tuned at the current level - TODO confirm).
cb_params_defaults = {
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'subsample': 0.8,
    # comment colsample_bylevel for GPU training
    #'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1),
    'num_leaves': 96,
    'reg_lambda': 3,
    'random_strength': 1,
    'early_stopping_rounds': 100,
    'max_bin': 254
}

In [114]:
# LightGBM tuning configuration, returned by get_tuning_params for the (currently
# commented-out) tuning cell.
# Parameters to tune at each level, keyed by level number.
lgbm_level_params_totune = {
    "1": ["learning_rate", "n_estimators", "max_depth"],
    "2": ["min_data_in_leaf", "num_leaves", "min_child_weight"],
    "3": ["subsample", "colsample_bytree", "reg_lambda", "reg_alpha", "early_stopping_rounds", "max_bin"]
}

# Search range for every tunable parameter listed above.
# NOTE(review): for min_child_weight and early_stopping_rounds the span (max - min) is not a
# multiple of 'step'; if these are forwarded to Optuna the upper bound gets adjusted - confirm.
lgbm_param_ranges = {
    'learning_rate': {'type': 'float', 'min_value': 0.005, 'max_value': 0.3, 'log': True},
    'n_estimators': {'type': 'int', 'min_value': 500, 'max_value': 5000, 'step': 50},
    'max_depth': {'type': 'int', 'min_value': 4, 'max_value': 20},
    'min_data_in_leaf': {'type': 'int', 'min_value': 5, 'max_value': 100},
    'num_leaves': {'type': 'int', 'min_value': 4, 'max_value': 256, 'step': 4},
    'min_child_weight': {'type': 'float', 'min_value': 0.1, 'max_value': 10, 'step': 0.2},
    'subsample': {'type': 'float', 'min_value': 0.5, 'max_value': 1},
    'colsample_bytree': {'type': 'float', 'min_value': 0.5, 'max_value': 1},
    'reg_lambda': {'type': 'float', 'min_value': 1, 'max_value': 300},
    'reg_alpha': {'type': 'float', 'min_value': 0, 'max_value': 5},
    'early_stopping_rounds': {'type': 'int', 'min_value': 50, 'max_value': 500, 'step': 20},
    'max_bin': {'type': 'int', 'min_value': 32, 'max_value': 255}
}

# Default values for the LightGBM parameters.
# 'random_strength' was removed: it is a CatBoost parameter (copied over from the CatBoost
# defaults) and is not a LightGBM parameter.
lgbm_params_defaults = {
    'max_depth': 5,
    'min_data_in_leaf': 20,
    'num_leaves': 31,
    'min_child_weight': 1.0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 3,
    'reg_alpha': 0,
    'early_stopping_rounds': 100,
    'max_bin': 255
}

In [115]:
# XGBoost tuning configuration, returned by get_tuning_params for the (currently
# commented-out) tuning cell.
# Parameters to tune at each level, keyed by level number.
xgb_level_params_totune = {
    "1": ["learning_rate", "n_estimators", "max_depth"],
    "2": ["min_child_weight", "subsample", "colsample_bytree"],
    "3": ["reg_lambda", "reg_alpha", "early_stopping_rounds", "max_bin", "max_leaves", "gamma"]
}

# Search range for every tunable parameter listed above.
xgb_param_ranges = {
    'learning_rate': {'type': 'float', 'min_value': 0.005, 'max_value': 0.3, 'log': True},
    'n_estimators': {'type': 'int', 'min_value': 100, 'max_value': 5000, 'step': 50},
    'max_depth': {'type': 'int', 'min_value': 4, 'max_value': 20},        
    'min_child_weight': {'type': 'int', 'min_value': 1, 'max_value': 20},
    'subsample': {'type': 'float', 'min_value': 0.5, 'max_value': 1},
    'colsample_bytree': {'type': 'float', 'min_value': 0.5, 'max_value': 1},
    'reg_lambda': {'type': 'float', 'min_value': 0.01, 'max_value': 10},
    'reg_alpha': {'type': 'float', 'min_value': 0.0, 'max_value': 10.0},   
    'max_leaves': {'type': 'int', 'min_value': 0, 'max_value': 256}, 
    'early_stopping_rounds': {'type': 'int', 'min_value': 10, 'max_value': 100},
    'max_bin': {'type': 'int', 'min_value': 32, 'max_value': 255},
    'gamma': {'type': 'float', 'min_value': 0.0, 'max_value': 10.0}
}

# Default values for the XGBoost parameters.
# NOTE(review): the reg_lambda default (0) lies below the search range minimum (0.01) - confirm intended.
xgb_params_defaults = {
    'max_depth': 5,    
    'min_child_weight': 1,
    'subsample': 0.8,    
    'colsample_bytree': 0.8,
    'reg_lambda': 0,
    'reg_alpha': 0,    
    'early_stopping_rounds': 15,
    'max_bin': 255,
    'gamma': 0.0,
    'max_leaves': 0
}

In [116]:
def get_tuning_params(model_name):
    """Return the tuning configuration for the given model family.

    Args:
        model_name: one of ``enums.ModelName.LGBM``, ``enums.ModelName.CatBoost`` or
            ``enums.ModelName.XGBoost``.

    Returns:
        tuple: ``(level_params_totune, param_ranges, params_defaults)`` - the module-level
        dictionaries defined for that model.

    Raises:
        ValueError: if ``model_name`` is not one of the supported models (previously this
            surfaced as an UnboundLocalError at the return statement).
    """
    if model_name == enums.ModelName.LGBM:
        return lgbm_level_params_totune, lgbm_param_ranges, lgbm_params_defaults
    if model_name == enums.ModelName.CatBoost:
        return cb_level_params_totune, cb_param_ranges, cb_params_defaults
    if model_name == enums.ModelName.XGBoost:
        return xgb_level_params_totune, xgb_param_ranges, xgb_params_defaults
    raise ValueError(f"No tuning configuration defined for model {model_name!r}")

In [117]:
# if cb_tuned_model_params is None:
#     #df = df_train.sample(frac=0.1, random_state=Config.RANDOM_SEED)
#     level_params_totune, param_ranges, params_defaults = get_tuning_params(Config.MODEL_TYPE)
#     tuned_model_params = ptu.tune_model_params(
#                             study_name=Config.MODEL_TYPE + "_ModelTuning", 
#                             study_direction="minimize",
#                             num_trials=Config.NUM_TUNING_TRIALS,
#                             model_name=Config.MODEL_TYPE,
#                             preprocessor=preprocessor,
#                             df=df_train,
#                             feature_cols=feature_cols,
#                             metric=Config.METRIC,
#                             target_col_name=Config.TARGET_COL_NAME,
#                             single_fold=Config.TUNE_ON_SINGLE_FOLD,
#                             num_folds=Config.NUM_FOLDS,
#                             imputation_config=imputation_config,
#                             cat_features=cat_cols,
#                             cat_encoders=cat_encoders,
#                             stepwise=Config.TUNE_STEPWISE,
#                             level_params_totune=level_params_totune,
#                             params_defaults=params_defaults,
#                             static_params=model_static_params,
#                             param_ranges=param_ranges
#                         )
#     print(f"Tuned model params: {tuned_model_params}")

In [118]:
# Build the final parameter dict per model: static params first, tuned params second, so a
# tuned value wins whenever both dicts define the same key.
model_params = None
cb_params_static = model_static_params.get(ModelName.CatBoost)
cb_model_params = {**cb_params_static, **cb_tuned_model_params}
print(f"cb_model_params: {cb_model_params}")    
lgbm_params_static = model_static_params.get(ModelName.LGBM)
lgbm_model_params = {**lgbm_params_static, **lgbm_tuned_model_params}
print(f"lgbm_model_params: {lgbm_model_params}")    
xgb_params_static = model_static_params.get(ModelName.XGBoost)
xgb_model_params = {**xgb_params_static, **xgb_tuned_model_params}
print(f"xgb_model_params: {xgb_model_params}")    

cb_model_params: {'objective': 'RMSE', 'verbose': 0, 'random_seed': 42, 'eval_metric': 'RMSE', 'grow_policy': 'Lossguide', 'bootstrap_type': 'Poisson', 'task_type': 'GPU', 'learning_rate': 0.0026752932092600212, 'n_estimators': 4550, 'max_depth': 6, 'min_data_in_leaf': 73, 'num_leaves': 136, 'subsample': 0.9417643579568004, 'reg_lambda': 99.65892476752238, 'random_strength': 0.6286348851856994, 'early_stopping_rounds': 200, 'max_bin': 57}


In [None]:
%%time
# Cross-validated CatBoost training via the project helper. The categorical columns are
# passed as cat_features and no external encoders are used (cat_encoders=None).
# The first two return values are persisted and reused below; the third is ignored.
cb_fold_metrics_model, df_oof_preds_cb, _ = tt.train_and_validate(
        model_name=ModelName.CatBoost,
        model_params=cb_model_params,
        preprocessor=None,
        df=df_train,
        feature_cols=feature_cols,
        target_col_name=Config.TARGET_COL_NAME,
        metric=Config.METRIC,
        single_fold=Config.TRAIN_SINGLE_FOLD,
        num_folds=Config.NUM_FOLDS,
        suppress_print=False,
        imputation_config=imputation_config,
        cat_features=cat_cols,
        cat_encoders=None
)

In [80]:
# Hand the CatBoost fold results and OOF predictions to tt.persist, targeting DATA_WRITEPATH.
tt.persist(
    model_name=ModelName.CatBoost, 
    fold_metrics_model=cb_fold_metrics_model, 
    df_oof_preds=df_oof_preds_cb, 
    persist_model=Config.PERSIST_MODEL, 
    output_path=DATA_WRITEPATH
)

In [81]:
%%time
# Cross-validated LightGBM training via the project helper. Unlike the CatBoost run, the
# categorical columns go through the OrdinalEncoder config (cat_encoders) and cat_features is None.
lgbm_fold_metrics_model, df_oof_preds_lgbm, _ = tt.train_and_validate(
        model_name=ModelName.LGBM,
        model_params=lgbm_model_params,
        preprocessor=None,
        df=df_train,
        feature_cols=feature_cols,
        target_col_name=Config.TARGET_COL_NAME,
        metric=Config.METRIC,
        single_fold=Config.TRAIN_SINGLE_FOLD,
        num_folds=Config.NUM_FOLDS,
        suppress_print=False,
        imputation_config=None,
        cat_features=None,
        cat_encoders=cat_encoders
)

In [82]:
# Hand the LightGBM fold results and OOF predictions to tt.persist, targeting DATA_WRITEPATH.
tt.persist(
    model_name=ModelName.LGBM, 
    fold_metrics_model=lgbm_fold_metrics_model, 
    df_oof_preds=df_oof_preds_lgbm, 
    persist_model=Config.PERSIST_MODEL, 
    output_path=DATA_WRITEPATH
)

In [None]:
%%time
# Cross-validated XGBoost training via the project helper, using the same encoder-based
# categorical handling as the LightGBM run.
xgb_fold_metrics_model, df_oof_preds_xgb, _ = tt.train_and_validate(
        model_name=ModelName.XGBoost,
        model_params=xgb_model_params,
        preprocessor=None,
        df=df_train,
        feature_cols=feature_cols,
        target_col_name=Config.TARGET_COL_NAME,
        metric=Config.METRIC,
        single_fold=Config.TRAIN_SINGLE_FOLD,
        num_folds=Config.NUM_FOLDS,
        suppress_print=False,
        imputation_config=None,
        cat_features=None,
        cat_encoders=cat_encoders
)

In [None]:
# Hand the XGBoost fold results and OOF predictions to tt.persist, targeting DATA_WRITEPATH.
tt.persist(
    model_name=ModelName.XGBoost, 
    fold_metrics_model=xgb_fold_metrics_model, 
    df_oof_preds=df_oof_preds_xgb, 
    persist_model=Config.PERSIST_MODEL, 
    output_path=DATA_WRITEPATH
)

In [None]:
# Collect the three models' OOF predictions side by side for the ensemble-weight search.
# NOTE(review): columns are combined by index; this assumes the three OOF frames share the
# same row index/order - confirm against tt.train_and_validate.
df_oof_preds = pd.DataFrame()
df_oof_preds["cb_preds"] = df_oof_preds_cb["oof_preds"]
df_oof_preds["lgbm_preds"] = df_oof_preds_lgbm["oof_preds"]
df_oof_preds["xgb_preds"] = df_oof_preds_xgb["oof_preds"]
# The target is taken from the CatBoost OOF frame.
df_oof_preds["price"] = df_oof_preds_cb["price"]

In [83]:
# CatBoost test predictions are produced BEFORE the ordinal-encoding cell below, i.e. on the
# still-categorical test columns, and written to DATA_WRITEPATH.
df_test_preds_cb = tt.get_test_preds(cb_fold_metrics_model, df_test, feature_cols, preprocessor=None, num_folds=Config.NUM_FOLDS)
df_test_preds_cb.to_csv(DATA_WRITEPATH + f'df_test_preds_{ModelName.CatBoost}.csv',index=False)

In [None]:
# perform categorical encoding for test data
if cat_encoders is not None:
    for col, encoders in cat_encoders.items():    
        for encoder in encoders:
            df_train[[col]] = encoder.fit_transform(df_train[[col]], df_train[Config.TARGET_COL_NAME])
            df_test[[col]] = encoder.transform(df_test[[col]])

In [None]:
# LightGBM and XGBoost test predictions, computed on the ordinal-encoded test frame and
# written to DATA_WRITEPATH.
df_test_preds_lgbm = tt.get_test_preds(lgbm_fold_metrics_model, df_test, feature_cols, preprocessor=None, num_folds=Config.NUM_FOLDS)
df_test_preds_lgbm.to_csv(DATA_WRITEPATH + f'df_test_preds_{ModelName.LGBM}.csv',index=False)
df_test_preds_xgb = tt.get_test_preds(xgb_fold_metrics_model, df_test, feature_cols, preprocessor=None, num_folds=Config.NUM_FOLDS)
df_test_preds_xgb.to_csv(DATA_WRITEPATH + f'df_test_preds_{ModelName.XGBoost}.csv',index=False)
print(f"Completed prediction for {len(df_test)} test rows")

In [None]:
def rmse_func(weights, oof_preds, target):
    """Return the RMSE of a weighted blend of model predictions.

    Args:
        weights: one weight per model (broadcast across the columns of ``oof_preds``).
        oof_preds: 2-D array of predictions, one column per model.
        target: true values, one per row of ``oof_preds``.
    """
    blended = (oof_preds * weights).sum(axis=1)
    squared_error_total = ((target - blended) ** 2).sum()
    return np.sqrt(1 / len(blended) * squared_error_total)

In [None]:
from scipy.optimize import minimize

def optimize_weights(oof_preds, target, initial_weights):
    """Search for non-negative blend weights (summing to 1) that minimise the blend RMSE.

    The optimiser works on unconstrained values whose squares, after normalisation,
    are the actual weights; this keeps every weight non-negative.

    Args:
        oof_preds: 2-D array of out-of-fold predictions, one column per model.
        target: true values, one per row of ``oof_preds``.
        initial_weights: starting weights, one per model.

    Returns:
        (optimized_weights, best_rmse): the normalised weights and the objective value
        reported by the optimiser.
    """
    def _as_weights(raw_values):
        # Square for non-negativity, then rescale so the weights sum to 1.
        squared = raw_values**2
        return squared / np.sum(squared)

    def blend_rmse(raw_values):
        return rmse_func(_as_weights(raw_values), oof_preds, target)

    # SLSQP supports constraints. Each constraint is a dict with keys 'type' and 'fun':
    # 'eq' requires fun(w) == 0, 'ineq' requires fun(w) >= 0.
    unit_norm_constraint = {'type': 'eq', 'fun': lambda w: np.sum(w**2) - 1}
    start_point = np.sqrt(initial_weights)  # square roots, because the objective squares them
    res = minimize(
        blend_rmse,
        start_point,
        method='SLSQP',
        constraints=unit_norm_constraint,
        options={'ftol': 1e-9, 'disp': True}
    )

    return _as_weights(res.x), res.fun

In [None]:
# Example usage
n_models = 3
pred_cols = ["cb_preds", "lgbm_preds", "xgb_preds"]
initial_weights = np.ones(n_models) / n_models
target = df_oof_preds[Config.TARGET_COL_NAME]

model_weights, rmse = optimize_weights(df_oof_preds[pred_cols].to_numpy(), target, initial_weights)

print("Optimal Model Weights:", model_weights)
print("Optimal RMSE:", rmse)

In [None]:
# Recompute the blended OOF prediction with the optimised weights and report its RMSE
# (the weights were fitted on these same OOF rows, so the score is optimistic).
df_oof_preds["ensemble_preds"] = model_weights[0] * df_oof_preds["cb_preds"] + model_weights[1] * df_oof_preds["lgbm_preds"] + model_weights[2] * df_oof_preds["xgb_preds"]
rmse = np.sqrt(1 / len(df_oof_preds) * ((df_oof_preds["price"] - df_oof_preds["ensemble_preds"])**2).sum())
print("Ensemble RMSE:", rmse)

In [84]:
# Build the submission: weighted blend of the three models' test predictions, using the
# weights in the same order they were optimised (CatBoost, LightGBM, XGBoost).
df_submission = pd.read_csv(SUBMISSION_FILEPATH + 'sample_submission.csv')
test_pred_frames = (df_test_preds_cb, df_test_preds_lgbm, df_test_preds_xgb)
df_submission[Config.TARGET_COL_NAME] = sum(
    weight * preds["test_preds"] for weight, preds in zip(model_weights, test_pred_frames)
)
df_submission.to_csv(DATA_WRITEPATH + f'submission_CB_LGBM.csv',index=False)
df_submission.head()