In [101]:
import sys
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import mean_squared_log_error
from scipy.optimize import minimize
import warnings

warnings.filterwarnings('ignore')


In [102]:
# Directory appended to sys.path ahead of the project-local imports in the next cell.
# The ML_UTILS_PATH environment variable overrides the default checkout location, so the
# notebook can run on another machine without editing this cell.
ML_UTILS_PATH = os.path.abspath(os.environ.get("ML_UTILS_PATH", "/home/bk_anupam/code/ML/ML_UTILS/"))
# Guard against duplicate sys.path entries when this cell is re-executed in the same kernel.
if ML_UTILS_PATH not in sys.path:
    sys.path.append(ML_UTILS_PATH)

In [103]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [104]:
class Config:
    """Notebook-wide settings read by the cells of this notebook."""
    # "LOCAL" or "KAGGLE"; "KAGGLE" switches the read/write paths to /kaggle/... locations.
    RUN_MODE = "LOCAL"
    # Seed passed to the stratified k-fold split.
    RANDOM_SEED = 42
    # Number of folds used for the main (non-LightGBM) split and OOF loop.
    NUM_FOLDS = 10
    # Name of the target column.
    TARGET_COL_NAME = "Rings"
    # Evaluation metric; with RMSLE the OOF predictions are mapped back with expm1.
    METRIC = enums.Metrics.RMSLE
    # NOTE(review): not read anywhere in the visible cells — presumably used by the training notebook.
    TRAIN_SINGLE_FOLD = True
    # When True the float feature columns are scaled with SCALER.
    NORMALIZE_DATA = True
    SCALER = enums.Scaler.StandardScaler
    # Read by the KAGGLE path selection; it was referenced there but never defined, which
    # raised AttributeError as soon as RUN_MODE was "KAGGLE". False means the already
    # generated (OpenFE) features are reused.
    GENERATE_AUTO_FEATURES = False

CPU_COUNT = os.cpu_count()
# Columns that are never used as model features (target, fold id, outlier tag).
COLS_TO_LEAVE = ["Rings", "kfold", "outlier_labels"]

# Defaults for a local run; overridden below when running on Kaggle.
DATA_READPATH = "./models/"
DATA_WRITEPATH = "./output/"
SUBMISSION_FILEPATH = DATA_READPATH
if Config.RUN_MODE == "KAGGLE":
    # If we are not generating features, we are using already generated features.
    # Config may not declare GENERATE_AUTO_FEATURES, so fall back to False instead of
    # raising AttributeError.
    if getattr(Config, "GENERATE_AUTO_FEATURES", False):
        DATA_READPATH = "/kaggle/input/playground-series-s4e4/"
        SUBMISSION_FILEPATH = DATA_READPATH
    else:
        DATA_READPATH = "/kaggle/input/abalone-openfe/"
        SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e4/"
    DATA_WRITEPATH = "/kaggle/working/"

In [105]:
# Load the training set (with the autoFE_* feature columns) from the local data folder.
# NOTE(review): this path is hard-coded and does not follow DATA_READPATH / RUN_MODE.
df_train = pd.read_csv("./data/train_openfe.csv")
# Separate copy that receives its own 5-fold split for the LightGBM models.
df_train_LightGBM = df_train.copy()

In [106]:
# Assign stratified fold ids for the LightGBM models, which use 5 folds rather than
# Config.NUM_FOLDS.
df_train_LightGBM = cv_split_utils.strat_kfold_dataframe(
    df=df_train_LightGBM,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=5,
    random_state=Config.RANDOM_SEED,
)
df_train_LightGBM.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,autoFE_f_2,autoFE_f_3,autoFE_f_4,autoFE_f_5,autoFE_f_6,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,3.16129,1.396774,2.451613,2.443418,0.335,0.3125,0.28,774.0,0.155,0.225,7,3
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,4.097561,1.585366,3.365854,2.28,0.3175,0.208,0.2025,148.0,0.1025,0.2425,7,3
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,2.361702,1.495745,1.87234,2.102418,0.32,0.3875,0.37,1311.0,0.235,0.205,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,2.815789,1.318421,2.157895,2.830339,0.345,0.4585,0.33,1017.0,0.19,0.22,9,4
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,1.951613,1.379032,1.467742,2.477193,0.295,0.6315,0.46,1020.0,0.31,0.145,10,2


In [107]:
# Assign stratified fold ids (Config.NUM_FOLDS folds) for the remaining models.
df_train = cv_split_utils.strat_kfold_dataframe(
    df=df_train,
    target_col_name=Config.TARGET_COL_NAME,
    num_folds=Config.NUM_FOLDS,
    random_state=Config.RANDOM_SEED,
)
df_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,autoFE_f_2,autoFE_f_3,autoFE_f_4,autoFE_f_5,autoFE_f_6,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings,kfold
0,I,0.49,0.38,0.125,0.529,0.2165,0.1375,0.155,3.16129,1.396774,2.451613,2.443418,0.335,0.3125,0.28,774.0,0.155,0.225,7,7
1,I,0.42,0.345,0.1,0.3705,0.1625,0.0795,0.1025,4.097561,1.585366,3.365854,2.28,0.3175,0.208,0.2025,148.0,0.1025,0.2425,7,7
2,M,0.555,0.44,0.135,0.739,0.3515,0.1575,0.235,2.361702,1.495745,1.87234,2.102418,0.32,0.3875,0.37,1311.0,0.235,0.205,9,0
3,F,0.535,0.41,0.14,0.709,0.2505,0.17,0.19,2.815789,1.318421,2.157895,2.830339,0.345,0.4585,0.33,1017.0,0.19,0.22,9,8
4,F,0.605,0.455,0.15,1.059,0.4275,0.221,0.31,1.951613,1.379032,1.467742,2.477193,0.295,0.6315,0.46,1020.0,0.31,0.145,10,5


In [108]:
# Group the columns by dtype; float columns outside COLS_TO_LEAVE are the ones to scale.
cols_float = [
    col
    for col in df_train.select_dtypes(include=["float"]).columns
    if col not in COLS_TO_LEAVE
]
cols_int = df_train.select_dtypes(include=["int64"]).columns.tolist()
cols_str = df_train.select_dtypes(include=["object"]).columns.tolist()
feature_cols_to_normalize = cols_float

In [109]:
# One-hot encode the string (object dtype) columns of both training frames.
df_train_onehot = pd.get_dummies(df_train, columns=cols_str)
df_train_LightGBM_onehot = pd.get_dummies(df_train_LightGBM, columns=cols_str)

In [110]:
if Config.NORMALIZE_DATA:
    # Scale the float feature columns of both frames with the configured scaler.
    # NOTE(review): how the scaler is fitted is decided inside tt.normalize_features —
    # confirm it matches what was done when the saved models were trained.
    df_train_onehot = tt.normalize_features(
        df_train_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )
    df_train_LightGBM_onehot = tt.normalize_features(
        df_train_LightGBM_onehot,
        scaler=Config.SCALER,
        features_to_normalize=feature_cols_to_normalize,
    )

In [111]:
# Model inputs: every column of the encoded frame except target / fold id / outlier tag.
feature_cols = [col for col in df_train_onehot.columns if col not in COLS_TO_LEAVE]
print(feature_cols)

['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight', 'autoFE_f_0', 'autoFE_f_1', 'autoFE_f_2', 'autoFE_f_3', 'autoFE_f_4', 'autoFE_f_5', 'autoFE_f_6', 'autoFE_f_7', 'autoFE_f_8', 'autoFE_f_9', 'Sex_F', 'Sex_I', 'Sex_M']


In [116]:
def get_oof_preds(df, feature_cols, model_name, num_folds, metric, transform_target=True):
    """Collect out-of-fold (validation) predictions from the saved per-fold models.

    For each fold the validation rows are obtained from ``tt.get_fold_df``, the model
    stored at ``{DATA_READPATH}{model_name}_{fold}.joblib`` is loaded, and it predicts
    on ``feature_cols`` of those rows.

    Args:
        df: Frame handed to ``tt.get_fold_df`` to extract each fold's rows.
        feature_cols: Names of the columns passed to ``predict``.
        model_name: Model identifier; used in the model file name and to select the
            LightGBM-specific ``predict`` call.
        num_folds: Number of folds to process (folds ``0 .. num_folds - 1``).
        metric: Evaluation metric; combined with ``transform_target`` it decides
            whether predictions are mapped back with ``np.expm1``.
        transform_target: True when the models were trained on ``np.log1p(y)``.

    Returns:
        DataFrame with the validation rows of all folds stacked in fold order and an
        added ``val_preds`` column. An empty DataFrame when ``num_folds`` is 0.
        NOTE(review): each fold keeps the index labels returned by ``tt.get_fold_df``;
        if that helper resets the index, labels repeat across folds — confirm before
        aligning the result by index.
    """
    fold_frames = []
    for fold in range(num_folds):
        # Only the validation half of the split is needed for OOF predictions.
        _, df_val_fold = tt.get_fold_df(df, fold)
        # NOTE: joblib.load unpickles the file — only load model files from a trusted source.
        fold_model = joblib.load(f"{DATA_READPATH}{model_name}_{fold}.joblib")
        if model_name == enums.ModelName.LGBM:
            fold_val_preds = fold_model.predict(df_val_fold[feature_cols], num_iteration=fold_model.best_iteration)
        else:
            fold_val_preds = fold_model.predict(df_val_fold[feature_cols])
        if metric == enums.Metrics.RMSLE and transform_target:
            # Since we have trained on np.log1p(y) instead of y, we need to reverse the transformation to extract the actual predictions
            fold_val_preds = np.expm1(fold_val_preds)
        # assign() builds a new frame, so the frame returned by tt.get_fold_df (possibly a
        # slice of df) is not written to.
        fold_frames.append(df_val_fold.assign(val_preds=fold_val_preds))
        print(f"{model_name} fold {fold} of {num_folds} prediction completed")
    if not fold_frames:
        # pd.concat raises on an empty list; keep the original "empty frame" result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame on every fold.
    return pd.concat(fold_frames, axis=0)

In [122]:
models = [enums.ModelName.XGBoost, enums.ModelName.CatBoost]
df_oof_preds = pd.DataFrame()
for model in models:
    # LightGBM uses its own 5-fold frame; every other model uses the Config.NUM_FOLDS frame.
    if model == enums.ModelName.LGBM:
        num_folds, df = 5, df_train_LightGBM_onehot
    else:
        num_folds, df = Config.NUM_FOLDS, df_train_onehot
    df_oof_preds_model = get_oof_preds(df, feature_cols, model, num_folds=num_folds, metric=Config.METRIC)
    print(f"Got OOF predictions for {model}")
    # One prediction column per model.
    df_oof_preds[f"{model}_val_preds"] = df_oof_preds_model["val_preds"]

XGBoost fold 0 of 10 prediction completed
XGBoost fold 1 of 10 prediction completed
XGBoost fold 2 of 10 prediction completed
XGBoost fold 3 of 10 prediction completed
XGBoost fold 4 of 10 prediction completed
XGBoost fold 5 of 10 prediction completed
XGBoost fold 6 of 10 prediction completed
XGBoost fold 7 of 10 prediction completed
XGBoost fold 8 of 10 prediction completed
XGBoost fold 9 of 10 prediction completed
Got OOF predictions for XGBoost
CatBoost fold 0 of 10 prediction completed
CatBoost fold 1 of 10 prediction completed
CatBoost fold 2 of 10 prediction completed
CatBoost fold 3 of 10 prediction completed
CatBoost fold 4 of 10 prediction completed
CatBoost fold 5 of 10 prediction completed
CatBoost fold 6 of 10 prediction completed
CatBoost fold 7 of 10 prediction completed
CatBoost fold 8 of 10 prediction completed
CatBoost fold 9 of 10 prediction completed
Got OOF predictions for CatBoost


In [123]:
# Inspect the OOF predictions gathered above (one <model>_val_preds column per model).
df_oof_preds

Unnamed: 0,XGBoost_val_preds,CatBoost_val_preds
0,9.578035,9.471093
1,9.847206,9.415083
2,12.908609,12.599259
3,8.913700,8.840678
4,14.141740,14.442228
...,...,...
9056,10.680146,10.624234
9057,6.575980,6.579725
9058,9.285155,9.385207
9059,12.615673,12.754968


In [127]:
# The LightGBM OOF frame is currently left out (load kept commented for reference).
# df_val_preds_LGBM = pd.read_csv(f"{DATA_READPATH}df_val_preds_LightGBM.csv")
# Load the saved OOF prediction frames of the two models from DATA_READPATH.
df_val_preds_XGB = pd.read_csv(f"{DATA_READPATH}df_val_preds_XGBoost.csv")
df_val_preds_CatBoost = pd.read_csv(f"{DATA_READPATH}df_val_preds_CatBoost.csv")
# Ground-truth values, taken from the XGBoost frame's "Rings" column.
target = df_val_preds_XGB["Rings"]
df_val_preds_XGB.head()

Unnamed: 0.1,Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,...,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings,outlier_labels,kfold,Sex_F,Sex_I,Sex_M,val_preds
0,0,0.320613,0.390926,-0.01221,-0.109326,0.052448,-0.118145,0.069908,-0.322971,-0.054909,...,0.705113,0.066487,0.522112,9,,0,False,False,True,9.578035
1,1,0.109135,0.135891,0.119344,-0.088568,-0.030711,-0.375805,-0.045297,-0.309807,0.013237,...,1.926152,-0.048601,0.343421,9,,0,False,False,True,9.847206
2,2,0.489794,0.339919,1.171772,0.084046,-0.045386,0.060234,0.377121,-0.398196,-0.855491,...,1.660962,0.373388,-0.281998,15,,0,False,False,True,12.908609
3,3,0.912749,0.900996,0.908665,0.767947,0.908498,0.976908,0.453924,-0.36984,0.914822,...,1.200523,0.450113,0.522112,9,,0,False,False,True,8.9137
4,4,1.124226,1.462073,3.013521,1.734805,1.466154,1.606192,1.49077,-0.549119,0.025773,...,-0.413929,1.485903,-0.907417,15,Height-,0,False,False,True,14.14174


In [128]:
# Preview the CatBoost OOF frame for comparison with the XGBoost one.
df_val_preds_CatBoost.head()

Unnamed: 0.1,Unnamed: 0,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,...,autoFE_f_7,autoFE_f_8,autoFE_f_9,Rings,outlier_labels,kfold,Sex_F,Sex_I,Sex_M,val_preds
0,0,0.320613,0.390926,-0.01221,-0.109326,0.052448,-0.118145,0.069908,-0.322971,-0.054909,...,0.705113,0.066487,0.522112,9,,0,False,False,True,9.471093
1,1,0.109135,0.135891,0.119344,-0.088568,-0.030711,-0.375805,-0.045297,-0.309807,0.013237,...,1.926152,-0.048601,0.343421,9,,0,False,False,True,9.415083
2,2,0.489794,0.339919,1.171772,0.084046,-0.045386,0.060234,0.377121,-0.398196,-0.855491,...,1.660962,0.373388,-0.281998,15,,0,False,False,True,12.599259
3,3,0.912749,0.900996,0.908665,0.767947,0.908498,0.976908,0.453924,-0.36984,0.914822,...,1.200523,0.450113,0.522112,9,,0,False,False,True,8.840678
4,4,1.124226,1.462073,3.013521,1.734805,1.466154,1.606192,1.49077,-0.549119,0.025773,...,-0.413929,1.485903,-0.907417,15,Height-,0,False,False,True,14.442228


In [129]:
models = [enums.ModelName.XGBoost, enums.ModelName.CatBoost]
df_oof_preds = pd.DataFrame()
# Read each model's OOF csv and keep its prediction and target columns side by side.
for model in models:
    df_model_oof = pd.read_csv(f"{DATA_READPATH}df_val_preds_{model}.csv")
    df_oof_preds = df_oof_preds.assign(
        **{
            f"{model}_val_preds": df_model_oof["val_preds"],
            f"{model}_target": df_model_oof["Rings"],
        }
    )

In [130]:
def rmsle_func(weights, oof_preds, target):
    """Return the RMSLE of the weighted blend of the models' OOF predictions.

    Args:
        weights: One weight per column of ``oof_preds``.
        oof_preds: 2-D array whose columns are averaged row-wise (``axis=1``),
            i.e. one column per model.
        target: True values the blended predictions are scored against.

    Returns:
        Square root of ``mean_squared_log_error(target, blended predictions)``.
    """
    blended = np.average(oof_preds, weights=weights, axis=1)
    msle = mean_squared_log_error(target, blended)
    return np.sqrt(msle)

In [131]:
# Inspect the per-model prediction and target columns loaded from the OOF csv files.
df_oof_preds

Unnamed: 0,XGBoost_val_preds,XGBoost_target,CatBoost_val_preds,CatBoost_target
0,9.578035,9,9.471093,9
1,9.847206,9,9.415083,9
2,12.908609,15,12.599259,15
3,8.913700,9,8.840678,9
4,14.141740,15,14.442228,15
...,...,...,...,...
90610,10.680146,11,10.624234,11
90611,6.575980,9,6.579725,9
90612,9.285155,10,9.385207,10
90613,12.615673,14,12.754968,14


In [134]:
# Prediction columns in the same order as `models`, so weight i always belongs to model i
# and the number of columns always matches the number of weights.
pred_cols = [f"{model}_val_preds" for model in models]
# Target taken from the first model's frame.
# NOTE(review): assumes every model's OOF csv lists the same rows in the same order — confirm.
target = df_oof_preds[f"{models[0]}_target"]
# Start by giving equal weight to each model ( = 1 / n_models). Sum of weights is 1.
initial_weights = np.ones(len(models)) / len(models)
# We want to find the set of weights that minimizes the RMSLE. We start with the initial weights.
res = minimize(rmsle_func, initial_weights, args=(df_oof_preds[pred_cols].to_numpy(), target), method='Nelder-Mead')
if not res["success"]:
    # Surface a failed optimisation instead of silently reporting its last iterate.
    print("Warning: weight optimisation did not converge:", res["message"])
model_weights = res["x"]
rmsle = res["fun"]
# Nelder-Mead is unconstrained, so rescale the weights to sum to 1 for reporting
# (np.average inside rmsle_func normalises by the weight sum as well).
model_weights_normalized = model_weights / np.sum(model_weights)
print("Optimal Model Weights:", model_weights_normalized)
print("Optimal RMSLE:", rmsle)


Optimal Model Weights: [0.54793446 0.45206554]
Optimal RMSLE: 0.14633776687930708


In [135]:
# Individual model scores, for comparison with the blended RMSLE.
xgb_preds, cb_preds = (df_oof_preds[col] for col in ("XGBoost_val_preds", "CatBoost_val_preds"))
print(f"XGBoost RMSLE cv score: {np.sqrt(mean_squared_log_error(target, xgb_preds))}")
print(f"CatBoost RMSLE cv score: {np.sqrt(mean_squared_log_error(target, cb_preds))}")

XGBoost RMSLE cv score: 0.1465205365842413
CatBoost RMSLE cv score: 0.1466049665951233
