In [42]:
import sys
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from scipy.optimize import minimize
import warnings

warnings.filterwarnings('ignore')


In [43]:
sys.path.append(os.path.abspath("/home/bk_anupam/code/ML/ML_UTILS/"))

In [44]:
import train_tabular_utils as tt
import cv_split_utils
import enums
from enums import ModelName, Metrics
import data_utils

In [45]:
class Config:
    RUN_MODE = "LOCAL"
    RANDOM_SEED = 1
    NUM_FOLDS = 5
    MODEL_TYPE = enums.ModelName.L2_Ridge
    TARGET_COL_NAME = "Target"            
    METRIC = enums.Metrics.ACCURACY
    TRAIN_SINGLE_FOLD = False
    NUM_CLASSES = 3        

COLS_TO_LEAVE = ["id", "Target", "kfold"]
CPU_COUNT = os.cpu_count()

DATA_READPATH = "./data/"
BASE_MODELS_PATH = "./output/"

In [46]:
# key is model type and value is number of trained models for that type to be used in ensemble
models = {
    ModelName.CatBoost: 2,
    ModelName.XGBoost: 1,
    #ModelName.RandomForest: 1,
    #ModelName.LogisticRegression: 1
}

# base model prediction column names (both for oof predictions and test predictions) are the same as model names
base_model_names = [f"{key}{i+1}" for key, value in models.items() for i in range(value) ]
pred_cols = [f"{model_name}_preds_proba_{i}" for model_name in base_model_names for i in range(Config.NUM_CLASSES)]
print(base_model_names)
pred_cols

['CatBoost1', 'CatBoost2', 'XGBoost1']


['CatBoost1_preds_proba_0',
 'CatBoost1_preds_proba_1',
 'CatBoost1_preds_proba_2',
 'CatBoost2_preds_proba_0',
 'CatBoost2_preds_proba_1',
 'CatBoost2_preds_proba_2',
 'XGBoost1_preds_proba_0',
 'XGBoost1_preds_proba_1',
 'XGBoost1_preds_proba_2']

In [47]:
df_submission = pd.read_csv(DATA_READPATH + "sample_submission.csv")
df_oof_preds = pd.DataFrame()
df_test_preds = pd.DataFrame()
# load the OOF csv for each model
for model_name in base_model_names:
    df_model_oof = pd.read_csv(f"{BASE_MODELS_PATH}df_val_preds_{model_name}.csv")
    df_model_test_preds = pd.read_csv(f"{BASE_MODELS_PATH}df_test_preds_{model_name}.csv")
    for i in range(Config.NUM_CLASSES):
        df_oof_preds[f"{model_name}_preds_proba_{i}"] = df_model_oof[f"oof_preds_proba_{i}"]
        df_test_preds[f"{model_name}_preds_proba_{i}"] = df_model_test_preds[f"test_preds_proba_{i}"]
    df_oof_preds[f"{model_name}_preds"] = df_model_oof["oof_preds"]
    df_test_preds[f"{model_name}_preds"] = df_model_test_preds["test_preds"]
df_oof_preds[Config.TARGET_COL_NAME] = df_model_oof[Config.TARGET_COL_NAME]

In [48]:
df_oof_preds.head()

Unnamed: 0,CatBoost1_preds_proba_0,CatBoost1_preds_proba_1,CatBoost1_preds_proba_2,CatBoost1_preds,CatBoost2_preds_proba_0,CatBoost2_preds_proba_1,CatBoost2_preds_proba_2,CatBoost2_preds,XGBoost1_preds_proba_0,XGBoost1_preds_proba_1,XGBoost1_preds_proba_2,XGBoost1_preds,Target
0,0.00795,0.006622,0.985428,2,0.022356,0.01198,0.965664,2,0.012317,0.010887,0.976796,2,2
1,0.026453,0.02098,0.952566,2,0.033137,0.018453,0.94841,2,0.028486,0.033435,0.938079,2,2
2,0.061289,0.621982,0.316728,1,0.056843,0.788141,0.155016,1,0.06991,0.755106,0.174984,1,1
3,0.221125,0.618871,0.160004,1,0.127717,0.775832,0.096452,1,0.290566,0.487754,0.221679,1,1
4,0.036233,0.597098,0.366669,1,0.067244,0.58703,0.345726,1,0.079016,0.634216,0.286768,1,1


In [49]:
df_test_preds.head()

Unnamed: 0,CatBoost1_preds_proba_0,CatBoost1_preds_proba_1,CatBoost1_preds_proba_2,CatBoost1_preds,CatBoost2_preds_proba_0,CatBoost2_preds_proba_1,CatBoost2_preds_proba_2,CatBoost2_preds,XGBoost1_preds_proba_0,XGBoost1_preds_proba_1,XGBoost1_preds_proba_2,XGBoost1_preds
0,0.990005,0.007855,0.00214,0,0.990935,0.00739,0.001675,0,0.98832,0.008497,0.003182,0
1,0.006233,0.01378,0.979988,2,0.005124,0.01146,0.983416,2,0.007958,0.01843,0.973612,2
2,0.037894,0.251453,0.710653,2,0.0348,0.246753,0.718447,2,0.039474,0.237019,0.723506,2
3,0.183586,0.225122,0.591291,2,0.292535,0.237487,0.469978,2,0.172196,0.458779,0.369024,1
4,0.305291,0.65254,0.04217,1,0.249726,0.712745,0.037528,1,0.345187,0.619451,0.035362,1


In [50]:
def acc_func(weights, oof_preds, target):
    # weighted_preds is the final weighted probability of each target class
    weighted_preds = np.zeros((oof_preds.shape[0], 3))  # Corrected initialization
    # weights sequence length is equal to number of base models
    for i in range(len(weights)):
        # for each base model class probabilites are multiplied with corresponding model weight and added to weighted_preds
        weighted_preds += weights[i] * oof_preds[:, i*3:(i+1)*3]
    # argmax of weighted_preds gives final prediction        
    final_preds = np.argmax(weighted_preds, axis=1)
    accuracy = accuracy_score(target, final_preds)
    print(f"Weights: {weights}, Accuracy: {accuracy}")  # Debug output
    return -accuracy

In [51]:
# Start by giving equal weight to each model
n_models = len(pred_cols) // Config.NUM_CLASSES
initial_weights = np.ones(n_models) / n_models
initial_weights

array([0.33333333, 0.33333333, 0.33333333])

In [52]:
from scipy.optimize import minimize

# We want to find the set of weights that maximizes the accuracy. We start with the initial weights.
target = df_oof_preds[Config.TARGET_COL_NAME]
res = minimize(acc_func, initial_weights, args=(df_oof_preds[pred_cols].to_numpy(), target), method='Nelder-Mead')
model_weights = res["x"]
acc = res["fun"]

Weights: [0.33333333 0.33333333 0.33333333], Accuracy: 0.831867262978429
Weights: [0.35       0.33333333 0.33333333], Accuracy: 0.831830199402041
Weights: [0.33333333 0.35       0.33333333], Accuracy: 0.831941390131205
Weights: [0.33333333 0.33333333 0.35      ], Accuracy: 0.831867262978429
Weights: [0.31666667 0.34444444 0.34444444], Accuracy: 0.831904326554817
Weights: [0.32222222 0.35185185 0.32407407], Accuracy: 0.8319537446566677
Weights: [0.31666667 0.36111111 0.31111111], Accuracy: 0.8320155172839812
Weights: [0.31111111 0.37037037 0.32592593], Accuracy: 0.8320896444367571
Weights: [0.3        0.38888889 0.32222222], Accuracy: 0.831978453707593
Weights: [0.32407407 0.37654321 0.30246914], Accuracy: 0.831941390131205
Weights: [0.32222222 0.36851852 0.31296296], Accuracy: 0.8320278718094438
Weights: [0.3        0.38333333 0.3       ], Accuracy: 0.831941390131205
Weights: [0.325      0.35833333 0.325     ], Accuracy: 0.8320525808603692
Weights: [0.32222222 0.37037037 0.33148148], A

In [53]:
model_weights_normalized = model_weights / np.sum(model_weights)
print("Optimal Model Weights:", model_weights_normalized)
print("Optimal Accuracy:", acc)

Optimal Model Weights: [0.31029748 0.36613272 0.32356979]
Optimal Accuracy: -0.8321267080131453


In [54]:
def get_weighted_test_preds_proba(test_preds, model_weights):
    weighted_test_preds_proba = np.zeros((test_preds.shape[0], 3))
    for i in range(len(model_weights)):
        weighted_test_preds_proba += model_weights[i] * test_preds[:, i*3:(i+1)*3]
    return weighted_test_preds_proba

In [55]:
weighted_test_preds_proba = get_weighted_test_preds_proba(df_test_preds[pred_cols].to_numpy(), model_weights_normalized)
ensemble_test_preds = pd.Series(np.argmax(weighted_test_preds_proba, axis=1))
ensemble_test_preds.value_counts()

2    26308
0    15418
1     9286
Name: count, dtype: int64

In [56]:
# Mapping dictionary for class labels
class_mapping = {0: "Dropout", 1: "Enrolled", 2: "Graduate"}
# Replace class labels with corresponding strings using map
ensemble_test_preds = ensemble_test_preds.map(class_mapping)
ensemble_test_preds.value_counts()

Graduate    26308
Dropout     15418
Enrolled     9286
Name: count, dtype: int64

In [57]:
df_submission[Config.TARGET_COL_NAME] = ensemble_test_preds
df_submission.to_csv(BASE_MODELS_PATH + f'submission_ensemble.csv',index=False)
df_submission.head()

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
