In [76]:
import sys
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from scipy.optimize import minimize
import warnings

warnings.filterwarnings('ignore')


In [77]:
sys.path.append(os.path.abspath("/home/bk_anupam/code/ML/ML_UTILS/"))

In [78]:
import train_tabular_utils as tt
import cv_split_utils
import enums
import data_utils

In [79]:
class Config:
    RUN_MODE = "LOCAL"
    RANDOM_SEED = 1
    NUM_FOLDS = 5
    MODEL_TYPE = enums.ModelName.L2_Ridge
    TARGET_COL_NAME = "FloodProbability"            
    METRIC = enums.Metrics.R2
    TRAIN_SINGLE_FOLD = False        

COLS_TO_LEAVE = ["FloodProbability", "kfold"]
CPU_COUNT = os.cpu_count()

DATA_READPATH = "./data/"
DATA_WRITEPATH = "./output/"
SUBMISSION_FILEPATH = DATA_READPATH
if Config.RUN_MODE == "KAGGLE":
    # If we are not generating features, we are using already generated features
    if Config.GENERATE_AUTO_FEATURES:
        DATA_READPATH = "/kaggle/input/playground-series-s4e5/"
        SUBMISSION_FILEPATH = DATA_READPATH
    else:
        DATA_READPATH = "/kaggle/input/playground-series-s4e5/"
        SUBMISSION_FILEPATH = "/kaggle/input/playground-series-s4e5/"
    DATA_WRITEPATH = "/kaggle/working/"

In [80]:
models = [enums.ModelName.Ridge, enums.ModelName.CatBoost, enums.ModelName.XGBoost]
oof_preds_cols = [f"{item}_oof_preds" for item in models]
test_preds_cols = [f"{item}_oof_preds" for item in models]

In [81]:
df_submission = pd.read_csv(DATA_READPATH + "sample_submission.csv")
df_oof_preds = pd.DataFrame()
df_test_preds = pd.DataFrame()
# load the OOF csv for each model
for model in models:
    df_model_oof = pd.read_csv(f"{DATA_WRITEPATH}df_val_preds_{model}.csv")
    df_model_test_preds = pd.read_csv(f"{DATA_WRITEPATH}submission_{model}.csv")
    df_oof_preds[f"{model}_oof_preds"] = df_model_oof["oof_preds"]
    df_test_preds[f"{model}_oof_preds"] = df_model_test_preds[Config.TARGET_COL_NAME]
    df_oof_preds[f"{model}_target"] = df_model_oof[Config.TARGET_COL_NAME]

In [82]:
df_oof_preds.head()

Unnamed: 0,Ridge_oof_preds,Ridge_target,CatBoost_oof_preds,CatBoost_target,XGBoost_oof_preds,XGBoost_target
0,0.537162,0.56,0.537745,0.56,0.536691,0.56
1,0.543132,0.53,0.543095,0.53,0.543138,0.53
2,0.393806,0.395,0.392896,0.395,0.394089,0.395
3,0.472214,0.46,0.473575,0.46,0.472767,0.46
4,0.484581,0.48,0.485064,0.48,0.484881,0.48


In [83]:
df_test_preds

Unnamed: 0,Ridge_oof_preds,CatBoost_oof_preds,XGBoost_oof_preds
0,0.577683,0.578234,0.573975
1,0.451413,0.455703,0.447683
2,0.451772,0.448710,0.443614
3,0.472086,0.466399,0.464954
4,0.472726,0.466858,0.464387
...,...,...,...
745300,0.474074,0.475750,0.466987
745301,0.440276,0.442807,0.435625
745302,0.619215,0.620232,0.615077
745303,0.548691,0.549319,0.546552


In [84]:
def r2_func(weights, oof_preds, target):
    preds = np.average(oof_preds, axis=1, weights=weights)
    return -1 * r2_score(target, preds)

Get model weights using Nelder-Mead

In [85]:
target = df_oof_preds["CatBoost_target"]
# Start by giving equal weight to each model ( = 1 / n_models). Sum of weights is 1.
initial_weights = np.ones(len(models)) / len(models)
# We want to find the set of weights that minimizes the RMSE. We start with the initial weights.
res = minimize(r2_func, initial_weights, args=(df_oof_preds[oof_preds_cols].to_numpy(),target), method='Nelder-Mead')
model_weights = res["x"]
r2 = res["fun"]
model_weights_normalized = model_weights / np.sum(model_weights)
print("Optimal Model Weights:", model_weights_normalized)
print("Optimal R2:", r2)

Optimal Model Weights: [-0.03852327  1.27399311 -0.23546984]
Optimal R2: -0.8693880939290987


In [86]:
# # Combine the test predictions of models using the optimal weights
# df_submission[Config.TARGET_COL_NAME] = np.average(df_test_preds.to_numpy(), axis=1, weights=model_weights_normalized)
# df_submission.to_csv(f"{DATA_WRITEPATH}submission_ensemble.csv", index=False)
# df_submission

Using base models predictions fit a L2 Ridge model

In [87]:
df_oof_preds = cv_split_utils.kfold_dataframe(df_oof_preds, random_state=Config.RANDOM_SEED, num_folds=Config.NUM_FOLDS)

In [88]:
fold_metrics_model, df_oof_preds_l2, preprocessor = tt.train_and_validate(
        model_name=Config.MODEL_TYPE,
        model_params=None,
        preprocessor=None,
        df=df_oof_preds,
        feature_cols=oof_preds_cols,
        target_col_name="Ridge_target",
        metric=Config.METRIC,
        single_fold=Config.TRAIN_SINGLE_FOLD,
        num_folds=Config.NUM_FOLDS
)

In [89]:
tt.get_cv_score(
        fold_metrics_model, 
        model_name=Config.MODEL_TYPE, 
        metric=Config.METRIC, 
        df_oof_preds=df_oof_preds_l2, 
        target_col_name="Ridge_target"
)

Fold 0 - L2_Ridge - R2 : 0.8684433586911444
Fold 1 - L2_Ridge - R2 : 0.8697184632735179
Fold 2 - L2_Ridge - R2 : 0.869158797897054
Fold 3 - L2_Ridge - R2 : 0.8693631700760908
Fold 4 - L2_Ridge - R2 : 0.8696437694961201
L2_Ridge metric=R2 CV score = 0.8692665112963635
L2_Ridge Mean R2 = 0.8692655118867855, std = 0.0004571880690499159


In [90]:
test_preds_cols

['Ridge_oof_preds', 'CatBoost_oof_preds', 'XGBoost_oof_preds']

In [91]:
df_fold_test_preds = tt.get_test_preds(fold_metrics_model, df_test_preds, test_preds_cols, preprocessor=None, num_folds=Config.NUM_FOLDS)

In [92]:
df_submission[Config.TARGET_COL_NAME]= df_fold_test_preds["test_preds"]
df_submission.to_csv(DATA_WRITEPATH + f'submission_{Config.MODEL_TYPE}.csv',index=False)
df_fold_test_preds.to_csv(DATA_WRITEPATH + f'{Config.MODEL_TYPE}_test_preds.csv',index=False)
df_submission.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577375
1,1117958,0.453923
2,1117959,0.447475
3,1117960,0.46596
4,1117961,0.466189


In [97]:
fold_metrics_model[3][1].coef_

array([-0.01234562,  0.79232395,  0.22114483])

In [93]:
# # Combine the test predictions of models using the optimal weights
# df_submission[Config.TARGET_COL_NAME] = np.average(df_test_preds.to_numpy(), axis=1, weights=initial_weights)
# df_submission.to_csv(f"{DATA_WRITEPATH}submission_ensemble.csv", index=False)
# df_submission