In [51]:
import sys
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.linear_model import Ridge
from scipy.optimize import minimize
import warnings

warnings.filterwarnings('ignore')


In [52]:
# Make the shared utility modules (imported in the next cell) resolvable.
# NOTE(review): this is an absolute, machine-specific path — consider moving it to configuration.
ML_UTILS_DIR = os.path.abspath("/home/bk_anupam/code/ML/ML_UTILS/")
# Only append once so that re-running the cell does not pile up duplicate entries.
if ML_UTILS_DIR not in sys.path:
    sys.path.append(ML_UTILS_DIR)

In [53]:
import train_tabular_utils as tt
import cv_split_utils
import enums
from enums import ModelName
import data_utils

In [54]:
class Config:
    """Settings shared by the ensembling cells of this notebook."""

    # Seed and fold layout used when splitting the OOF frame into folds.
    RANDOM_SEED = 42
    NUM_FOLDS = 5
    # Passed as `single_fold` to the training helper.
    TRAIN_SINGLE_FOLD = False

    # Target column, evaluation metric and the model fitted on the base-model predictions.
    TARGET_COL_NAME = "price"
    METRIC = enums.Metrics.RMSE
    MODEL_TYPE = ModelName.L2_Ridge

    # NOTE(review): not read anywhere in the visible cells.
    RUN_MODE = "LOCAL"

# Input data directory and the directory holding the saved base-model predictions.
DATA_READPATH = "./data/"
BASE_MODELS_PATH = "./output/"

# Presumably the non-feature columns of the training frame — not referenced in the visible cells.
COLS_TO_LEAVE = [
    "id",
    "price",
    "kfold",
    "transmission_speed",
    "target_grp",
]
CPU_COUNT = os.cpu_count()

In [55]:
# Maps each base-model type to the experiment serial numbers of the trained
# models of that type that take part in the ensemble.
models = {
    #ModelName.NeuralNet: [12],
    #ModelName.TabNetRegressor: [14],
    ModelName.CatBoost: [9, 18],
    ModelName.LGBM: [5, 6],
    #ModelName.XGBoost: [15]
}

# One name per (model type, experiment number) pair: the model type followed by the number.
base_model_names = [
    f"{model_type}{experiment_id}"
    for model_type, experiment_ids in models.items()
    for experiment_id in experiment_ids
]
# Name of the prediction column belonging to each base model.
pred_cols = [f"{name}_preds" for name in base_model_names]
print(base_model_names)
print(pred_cols)


['CatBoost9', 'CatBoost18', 'LightGBM5', 'LightGBM6']
['CatBoost9_preds', 'CatBoost18_preds', 'LightGBM5_preds', 'LightGBM6_preds']


In [56]:
df_submission = pd.read_csv(DATA_READPATH + "sample_submission.csv")
df_oof_preds = pd.DataFrame()
df_test_preds = pd.DataFrame()
# Collect, for every base model, its out-of-fold predictions and its test predictions
# as one column per model.
for model_name in base_model_names:
    pred_col = f"{model_name}_preds"
    df_model_oof = pd.read_csv(f"{BASE_MODELS_PATH}df_val_preds_{model_name}.csv")
    df_oof_preds[pred_col] = df_model_oof["oof_preds"]
    # These two models keep their test predictions in a submission-style file
    # ("price" column) rather than in a df_test_preds file ("test_preds" column).
    if model_name in ["NN12", "TabNetRegressor14"]:
        test_file = f"{BASE_MODELS_PATH}submission_{model_name}.csv"
        test_col = "price"
    else:
        test_file = f"{BASE_MODELS_PATH}df_test_preds_{model_name}.csv"
        test_col = "test_preds"
    df_model_test_preds = pd.read_csv(test_file)
    df_test_preds[pred_col] = df_model_test_preds[test_col]
# The target is copied from the last OOF file read.
# NOTE(review): assumes every OOF file lists the rows in the same order — confirm.
df_oof_preds[Config.TARGET_COL_NAME] = df_model_oof[Config.TARGET_COL_NAME]

In [57]:
# Inspect a few OOF rows whose target price exceeds 2,000,000.
high_price_rows = df_oof_preds["price"] > 2000000
df_oof_preds[high_price_rows].head(5)

Unnamed: 0,CatBoost9_preds,CatBoost18_preds,LightGBM5_preds,LightGBM6_preds,price
3466,72579.124707,21457.137612,81818.134155,102230.591316,2954083.0
5624,17501.740519,75782.22186,42077.212054,42081.9693,2954083.0
11503,53888.865028,23773.566581,34571.528056,33949.615149,2954083.0
15433,13402.243629,49199.235436,75380.885783,78812.613077,2954083.0
16342,24551.440165,88467.469753,14056.762866,14870.94217,2954083.0


In [58]:
# Preview the per-model test predictions (head() shows the first 5 rows by default).
df_test_preds.head()

Unnamed: 0,CatBoost9_preds,CatBoost18_preds,LightGBM5_preds,LightGBM6_preds
0,17501.813485,16895.231411,16852.724351,16901.960186
1,80169.706233,79046.194652,80879.492535,82545.374389
2,53371.576238,57597.742487,54045.232394,55073.505298
3,32236.6432,29303.107174,31765.062853,33402.10677
4,28712.21356,29187.541723,28973.723138,29356.469524


In [59]:
def rmse_func(weights, oof_preds, target):
    """Return the RMSE of a weighted blend of model predictions.

    Args:
        weights: one weight per prediction column.
        oof_preds: 2-D collection of predictions, one column per model.
        target: true values, one per row of ``oof_preds``.

    Returns:
        Root mean squared error between ``target`` and the row-wise
        weighted sum of ``oof_preds``.
    """
    # Each column is scaled by its weight; summing across columns gives the blend.
    blended = (oof_preds * weights).sum(axis=1)
    squared_errors = (target - blended) ** 2
    return np.sqrt(1 / len(blended) * squared_errors.sum())

In [60]:
# Start by giving equal weight to each model
n_models = len(base_model_names)
initial_weights = np.ones(n_models) / n_models
initial_weights

array([0.25, 0.25, 0.25, 0.25])

In [61]:
# Find the blend weights that minimise the out-of-fold RMSE, starting from the
# equal-weight blend. `minimize` is already imported in the notebook's import cell.
# Nelder-Mead is run without bounds or constraints here, so nothing forces the
# weights to be non-negative or to sum to 1.
target = df_oof_preds[Config.TARGET_COL_NAME]
res = minimize(
    rmse_func,
    initial_weights,
    args=(df_oof_preds[pred_cols].to_numpy(), target),
    method='Nelder-Mead',
)
# Attribute access matches how the result is read in optimize_weights.
model_weights = res.x
rmse = res.fun

In [62]:
def optimize_weights(oof_preds, target, initial_weights):
    """Find non-negative blend weights summing to 1 that minimise the RMSE.

    The optimiser works on free parameters ``p``; the actual weights are
    ``p**2 / sum(p**2)``, which keeps every weight non-negative and makes
    them sum to 1.

    Args:
        oof_preds: 2-D predictions, one column per model.
        target: true values, one per row of ``oof_preds``.
        initial_weights: 1-D, non-negative starting weights with a positive
            sum. They are rescaled to sum to 1 before the search starts.

    Returns:
        Tuple ``(optimized_weights, rmse)``: the weights found and the
        objective value reported by the optimiser.

    Raises:
        ValueError: if ``initial_weights`` is empty, not 1-D, contains
            negative or non-finite values, or sums to zero.
    """
    start = np.asarray(initial_weights, dtype=float)
    if start.ndim != 1 or start.size == 0:
        raise ValueError("initial_weights must be a non-empty 1-D sequence")
    if not np.isfinite(start).all() or np.any(start < 0) or start.sum() <= 0:
        raise ValueError("initial_weights must be finite, non-negative and have a positive sum")
    # Rescale so that sqrt(start) satisfies the equality constraint below.
    start = start / start.sum()

    def to_weights(params):
        # Squaring guarantees non-negativity; dividing by the total makes them sum to 1.
        weights = params**2
        return weights / np.sum(weights)

    def objective(params):
        return rmse_func(to_weights(params), oof_preds, target)

    # SLSQP supports constraints, given as dicts with keys 'type' and 'fun'.
    # An 'eq' constraint requires the function to be zero, an 'ineq' constraint
    # requires it to be non-negative. Here sum(p**2) == 1 pins the scale of the
    # free parameters, which the normalised objective does not depend on.
    res = minimize(
        objective,
        np.sqrt(start),
        method='SLSQP',
        constraints={'type': 'eq', 'fun': lambda w: np.sum(w**2) - 1},
        options={'ftol': 1e-9, 'disp': True}
    )

    return to_weights(res.x), res.fun

In [63]:
# Example usage
n_models = len(base_model_names)
initial_weights = np.ones(n_models) / n_models
target = df_oof_preds[Config.TARGET_COL_NAME]

model_weights, rmse = optimize_weights(df_oof_preds[pred_cols].to_numpy(), target, initial_weights)

print("Optimal Model Weights:", model_weights)
print("Optimal RMSE:", rmse)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 71810.0557570048
            Iterations: 30
            Function evaluations: 182
            Gradient evaluations: 30
Optimal Model Weights: [5.60008874e-13 3.07583384e-13 9.92593700e-02 9.00740630e-01]
Optimal RMSE: 71810.0557570048


In [65]:
# Weighted average of the base-model test predictions, one value per row.
# Only the base-model columns are selected, so the cell still works when it is
# re-run after "ensemble_preds" has been added to the frame, and the blend is
# computed in a single vectorised call instead of a Python-level row loop.
df_test_preds["ensemble_preds"] = np.average(
    df_test_preds[pred_cols].to_numpy(), axis=1, weights=model_weights
)

In [66]:
# Preview the test predictions together with the blended column (first 5 rows by default).
df_test_preds.head()

Unnamed: 0,CatBoost9_preds,CatBoost18_preds,LightGBM5_preds,LightGBM6_preds,ensemble_preds
0,17501.813485,16895.231411,16852.724351,16901.960186,16897.073068
1,80169.706233,79046.194652,80879.492535,82545.374389,82380.020005
2,53371.576238,57597.742487,54045.232394,55073.505298,54971.439578
3,32236.6432,29303.107174,31765.062853,33402.10677,33239.614822
4,28712.21356,29187.541723,28973.723138,29356.469524,29318.478359


In [68]:
# Put the blended predictions into the submission frame and save it.
df_submission[Config.TARGET_COL_NAME] = df_test_preds["ensemble_preds"]
ensemble_submission_path = "./output/" + "submission_ensemble.csv"
df_submission.to_csv(ensemble_submission_path, index=False)
df_submission.head()

Unnamed: 0,id,price
0,188533,16897.073068
1,188534,82380.020005
2,188535,54971.439578
3,188536,33239.614822
4,188537,29318.478359


In [64]:
# Rescale the weights by their total so that they add up to 1, then report them.
weight_total = np.sum(model_weights)
model_weights_normalized = model_weights / weight_total
print("Optimal Model Weights:", model_weights_normalized)
print("Optimal RMSE:", rmse)

Optimal Model Weights: [5.60008874e-13 3.07583384e-13 9.92593700e-02 9.00740630e-01]
Optimal RMSE: 71810.0557570048


In [46]:
# select the top 188533 rows from df_oof_preds
df_oof_preds = df_oof_preds.head(188533)

In [47]:
# Split the OOF frame into folds with the configured seed and fold count.
df_oof_preds = cv_split_utils.kfold_dataframe(
    df_oof_preds,
    random_state=Config.RANDOM_SEED,
    num_folds=Config.NUM_FOLDS,
)

In [48]:
# Train and validate the configured model using the base-model prediction
# columns as its features. The target column comes from Config, like in the
# other cells, instead of being repeated as a literal.
fold_metrics_model, df_oof_preds_l2, preprocessor = tt.train_and_validate(
    model_name=Config.MODEL_TYPE,
    model_params=None,
    preprocessor=None,
    df=df_oof_preds,
    feature_cols=pred_cols,
    target_col_name=Config.TARGET_COL_NAME,
    metric=Config.METRIC,
    single_fold=Config.TRAIN_SINGLE_FOLD,
    num_folds=Config.NUM_FOLDS
)

Fold 0 - L2_Ridge - RMSE : 71369.28607852732
Fold 1 - L2_Ridge - RMSE : 63752.668240812796
Fold 2 - L2_Ridge - RMSE : 74914.7688797453
Fold 3 - L2_Ridge - RMSE : 78903.57869638012
Fold 4 - L2_Ridge - RMSE : 73033.09179107365
L2_Ridge metric=RMSE CV score = 72566.90743360888
L2_Ridge Mean RMSE = 72394.67873730783, std = 4997.198810466754


In [49]:
# Get test-set predictions from the trained fold models, using the base-model
# prediction columns as features.
df_fold_test_preds = tt.get_test_preds(
    fold_metrics_model,
    df_test_preds,
    pred_cols,
    preprocessor=None,
    num_folds=Config.NUM_FOLDS,
)

In [50]:
# Save the level-2 model's test predictions, both as a submission file and as
# the full per-fold prediction frame.
df_submission[Config.TARGET_COL_NAME] = df_fold_test_preds["test_preds"]
l2_submission_path = f"./output/submission_{Config.MODEL_TYPE}.csv"
l2_test_preds_path = f"./output/{Config.MODEL_TYPE}_test_preds.csv"
df_submission.to_csv(l2_submission_path, index=False)
df_fold_test_preds.to_csv(l2_test_preds_path, index=False)
df_submission.head()

Unnamed: 0,id,price
0,188533,16461.518631
1,188534,83740.449933
2,188535,55579.901303
3,188536,33247.332693
4,188537,29222.05451
