In [29]:
import pandas as pd

data = pd.read_pickle("sl_final_for_model.pkl")


In [30]:
pd.set_option('display.max_columns' , None)

### Train test split and scale

In [31]:
data.shape

(10633049, 47)

In [32]:
data.columns

Index(['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id',
       'prod_price_usd', 'sub_price_usd', 'price_diff', 'price_ratio',
       'prod_serving_quantity', 'sub_serving_quantity',
       'serving_quantity_diff', 'serving_quantity_ratio',
       'prod_product_popularity', 'sub_product_popularity',
       'prod_global_reorder_prob', 'sub_global_reorder_prob',
       'same_department', 'same_aisle', 'same_brand', 'same_final_category',
       'nutriments_energy-kcal_100g_diff', 'nutriments_fat_100g_diff',
       'nutriments_carbohydrates_100g_diff', 'nutriments_proteins_100g_diff',
       'nutriments_sugars_100g_diff', 'nutriments_sodium_100g_diff',
       'nutriments_salt_100g_diff', 'nutriments_saturated-fat_100g_diff',
       'prod_ingredients_len', 'sub_ingredients_len', 'ingredients_len_diff',
       'user_total_orders', 'user_reorder_rate',
       'user_avg_days_between_orders', 'user_product_frequency_before',
       'user_substitute_frequency_before', 'us

In [33]:
data.head()

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,prod_price_usd,sub_price_usd,price_diff,price_ratio,prod_serving_quantity,sub_serving_quantity,serving_quantity_diff,serving_quantity_ratio,prod_product_popularity,sub_product_popularity,prod_global_reorder_prob,sub_global_reorder_prob,same_department,same_aisle,same_brand,same_final_category,nutriments_energy-kcal_100g_diff,nutriments_fat_100g_diff,nutriments_carbohydrates_100g_diff,nutriments_proteins_100g_diff,nutriments_sugars_100g_diff,nutriments_sodium_100g_diff,nutriments_salt_100g_diff,nutriments_saturated-fat_100g_diff,prod_ingredients_len,sub_ingredients_len,ingredients_len_diff,user_total_orders,user_reorder_rate,user_avg_days_between_orders,user_product_frequency_before,user_substitute_frequency_before,user_substitute_past_purchases,order_number,order_dow,order_hour_of_day,days_since_prior_order,basket_size,basket_unique_products,basket_total_price,basket_avg_price,label
0,6344,1506,40350,7228,0,6.79,2.99,-3.8,0.440353,30.0,30.0,0.0,1.0,647,1896.0,0.33694,0.293249,1,1,0,1,1240.0,138.0,4.43,-3.33,0.0,-1.08128,-2.70319,18.870001,274,254,-20,13,0.587302,16.008696,0,0,0,10,4,14,12.0,10,10,49.0,4.9,1
1,26938,2089,44663,45671,0,9.49,8.99,-0.5,0.947313,30.0,30.0,0.0,1.0,717,519.0,0.359833,0.300578,1,1,0,1,183.0,7.799999,26.629999,-3.33,22.200001,-0.80902,-2.02256,-1.67,264,255,-9,16,0.588235,16.155125,2,0,0,10,2,18,17.0,20,20,113.8,5.69,1
2,26938,2089,19025,45671,0,11.49,8.99,-2.5,0.782419,30.0,30.0,0.0,1.0,935,519.0,0.35508,0.300578,1,1,0,1,266.333336,19.466666,23.299999,0.0,15.533334,-1.129553,-2.823893,0.0,333,255,-78,16,0.588235,16.155125,2,0,0,10,2,18,17.0,20,20,113.8,5.69,1
3,26938,2089,19025,44663,0,11.49,9.49,-2.0,0.825936,30.0,30.0,0.0,1.0,935,717.0,0.35508,0.359833,1,1,0,1,83.333336,11.666667,-3.33,3.33,-6.666667,-0.320533,-0.801333,1.67,333,264,-69,16,0.588235,16.155125,2,2,1,10,2,18,17.0,20,20,113.8,5.69,1
4,14151,16086,16966,10916,0,2.49,7.49,5.0,3.008032,30.0,7.0,-23.0,0.233333,44,285.0,0.204545,0.389474,1,1,0,0,57.0,-8.6,-6.67,39.570002,1.6,2.1,5.25,10.97,656,180,-476,7,0.631206,18.219048,3,0,0,5,1,7,18.0,19,19,131.01,6.895263,1


In [34]:
[x for x in data.columns if 'id' in x]

['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id']

In [35]:
id_cols = ['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id']

In [36]:
target = 'label'

In [37]:
features = [x for x in data.columns if x not in id_cols + [target]]

In [38]:
data.shape, data.drop_duplicates(subset = id_cols).shape

((10633049, 47), (10633049, 47))

In [39]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [40]:
train, test = train_test_split(data, train_size = 0.7, random_state = 42)

In [41]:
train.shape, test.shape

((7443134, 47), (3189915, 47))

In [42]:
train_key = train[id_cols]
test_key = test[id_cols]
X_train = train.drop(columns=id_cols + [target])
y_train = train[target] 
X_test = test.drop(columns=id_cols + [target]) 
y_test = test[target] 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 
print('scaling done')

scaling done


### RF, LR, LightGBM,CatBoost

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [44]:
def print_classification_metrics(model_name, y_true, y_pred, y_proba=None):
    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, average="binary")  # change to 'macro' if multi-class
    print(f"{model_name} - Accuracy: {acc:.4f}, F1: {f1:.4f}", end="")
    if y_proba is not None:
        try:
            auc = roc_auc_score(y_true, y_proba)
            print(f", ROC-AUC: {auc:.4f}")
        except ValueError:
            # e.g. if only one class present in y_true
            print(", ROC-AUC: NA (only one class in y_true)")
    else:
        print()

In [45]:

# -------------------------------------------------------------------
# Random Forest Classifier
# -------------------------------------------------------------------
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_split=1000,
    min_samples_leaf=100,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced_subsample'  # helps with class imbalance
)

rf_clf.fit(X_train_scaled, y_train)

y_pred_train = rf_clf.predict(X_train_scaled)
y_pred_test  = rf_clf.predict(X_test_scaled)

# probability of positive class (assumes binary 0/1)
y_proba_test = rf_clf.predict_proba(X_test_scaled)[:, 1]

train_key['pred_rf_class']  = y_pred_train
test_key['pred_rf_class']   = y_pred_test
test_key['pred_rf_proba']   = y_proba_test

print_classification_metrics("RandomForest", y_test, y_pred_test, y_proba_test)
print("RF classifier done")

# -------------------------------------------------------------------
# Logistic Regression
# -------------------------------------------------------------------
log_reg = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced'  # helps if target is imbalanced
)

log_reg.fit(X_train_scaled, y_train)

y_pred_train = log_reg.predict(X_train_scaled)
y_pred_test  = log_reg.predict(X_test_scaled)
y_proba_test = log_reg.predict_proba(X_test_scaled)[:, 1]

train_key['pred_lr_class'] = y_pred_train
test_key['pred_lr_class']  = y_pred_test
test_key['pred_lr_proba']  = y_proba_test

print_classification_metrics("LogisticRegression", y_test, y_pred_test, y_proba_test)
print("Logistic regression done")

# -------------------------------------------------------------------
# LightGBM Classifier
# -------------------------------------------------------------------
lgbm_clf = LGBMClassifier(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=500,
    objective='binary',
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)

lgbm_clf.fit(X_train_scaled, y_train)

y_pred_train = lgbm_clf.predict(X_train_scaled)
y_pred_test  = lgbm_clf.predict(X_test_scaled)
# LightGBM predict_proba
y_proba_test = lgbm_clf.predict_proba(X_test_scaled)[:, 1]

train_key['pred_lgbm_class'] = y_pred_train
test_key['pred_lgbm_class']  = y_pred_test
test_key['pred_lgbm_proba']  = y_proba_test

print_classification_metrics("LightGBM", y_test, y_pred_test, y_proba_test)
print("LightGBM classifier done")

# -------------------------------------------------------------------
# CatBoost Classifier
# -------------------------------------------------------------------
cat_clf = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=False,
    random_seed=42,
    class_weights=None  # or [w_for_class_0, w_for_class_1] if you want custom
)

cat_clf.fit(X_train_scaled, y_train)

y_pred_train = cat_clf.predict(X_train_scaled)
y_pred_test  = cat_clf.predict(X_test_scaled).astype(int)  # CatBoost returns strings sometimes
y_proba_test = cat_clf.predict_proba(X_test_scaled)[:, 1]

train_key['pred_cat_class'] = y_pred_train
test_key['pred_cat_class']  = y_pred_test
test_key['pred_cat_proba']  = y_proba_test

print_classification_metrics("CatBoost", y_test, y_pred_test, y_proba_test)
print("CatBoost classifier done")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_rf_class']  = y_pred_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_rf_class']   = y_pred_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_rf_proba']   = y_proba_test


RandomForest - Accuracy: 0.8490, F1: 0.5501, ROC-AUC: 0.9350
RF classifier done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_lr_class'] = y_pred_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_lr_class']  = y_pred_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_lr_proba']  = y_proba_test


LogisticRegression - Accuracy: 0.9008, F1: 0.6014, ROC-AUC: 0.9025
Logistic regression done
[LightGBM] [Info] Number of positive: 790655, number of negative: 6652479
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.254085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7028
[LightGBM] [Info] Number of data points in the train set: 7443134, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_lgbm_class'] = y_pred_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_lgbm_class']  = y_pred_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_lgbm_proba']  = y_proba_test


LightGBM - Accuracy: 0.8759, F1: 0.6033, ROC-AUC: 0.9530
LightGBM classifier done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_cat_class'] = y_pred_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_cat_class']  = y_pred_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_cat_proba']  = y_proba_test


CatBoost - Accuracy: 0.9398, F1: 0.6660, ROC-AUC: 0.9479
CatBoost classifier done


In [46]:
train_key['pred_rf_proba']   = rf_clf.predict_proba(X_train_scaled)[:, 1]
train_key['pred_lr_proba']  = log_reg.predict_proba(X_train_scaled)[:, 1]
train_key['pred_lgbm_proba']  = lgbm_clf.predict_proba(X_train_scaled)[:, 1]
train_key['pred_cat_proba']  = cat_clf.predict_proba(X_train_scaled)[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_rf_proba']   = rf_clf.predict_proba(X_train_scaled)[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_lr_proba']  = log_reg.predict_proba(X_train_scaled)[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_lgbm_proba']  = lgbm_clf.predict_proba(X_

In [47]:
op = (train_key[id_cols + ['pred_rf_proba', 'pred_lr_proba', 'pred_lgbm_proba', 'pred_cat_proba']],
      test_key[id_cols + ['pred_rf_proba', 'pred_lr_proba', 'pred_lgbm_proba', 'pred_cat_proba']],
      rf_clf, log_reg, lgbm_clf, cat_clf)

In [48]:
import pickle

# Save the dictionary to a binary file
with open('model_op_ml.pkl', 'wb') as f:
    pickle.dump(op, f)

In [49]:
combined_op = pd.concat([
    op[0],
    op[1]
          ])


In [50]:
op[0].columns

Index(['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id',
       'pred_rf_proba', 'pred_lr_proba', 'pred_lgbm_proba', 'pred_cat_proba'],
      dtype='object')

In [51]:
op[1].columns

Index(['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id',
       'pred_rf_proba', 'pred_lr_proba', 'pred_lgbm_proba', 'pred_cat_proba'],
      dtype='object')

In [60]:
combined_op.to_csv('../data/supervised_learning_predictions.csv', index = False)

In [65]:
combined_op.head()

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,pred_rf_proba,pred_lr_proba,pred_lgbm_proba,pred_cat_proba
9335088,154565,1367172,2962,22089,63,0.511289,0.46765,0.322032,0.072946
9944958,151073,2857952,46045,27606,75,0.16733,0.251689,0.006941,0.001396
2704721,150638,790551,45401,15468,16,0.124237,0.192894,0.063928,0.005806
2492248,101433,1387828,33731,4799,15,0.759782,0.914044,0.921196,0.631459
4682778,164774,1971432,25890,30446,35,0.128468,0.181311,0.014037,0.001061


### XGB - Optuna Hyperparameter tuning

In [18]:
from xgboost import XGBClassifier
import optuna
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    log_loss, accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
from datetime import datetime
import pickle

In [None]:


# Keep your existing scaled data & keys:
# X_train_scaled, X_test_scaled, y_train, y_test, train_key, test_key

# Train/validation split (same pattern as your snippet)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.3, random_state=42, stratify=y_train
)

n_classes = len(np.unique(y_train))
is_binary = (n_classes == 2)

def objective(trial):
    params = {
        # ---- Objective & eval metric ----
        "objective": "binary:logistic" if is_binary else "multi:softprob",
        "num_class": None if is_binary else n_classes,
        "tree_method": "hist",
        "eval_metric": "logloss" if is_binary else "mlogloss",

        # ---- Learning rate & trees ----
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.2, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 300, 900),

        # ---- Tree complexity ----
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "min_child_weight": trial.suggest_float("min_child_weight", 3.0, 80.0, log=True),

        # ---- Row & feature subsampling ----
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.8),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),

        # ---- Regularization ----
        "gamma": trial.suggest_float("gamma", 1.0, 10.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 20.0, log=True),

        # ---- Histogram granularity ----
        "max_bin": trial.suggest_int("max_bin", 128, 512),

        "random_state": 42,
        "n_jobs": -1,
        # tip: for heavy class imbalance (binary), consider
        # "scale_pos_weight": (neg/pos) — you can add as a suggested_float/int
    }

    # Fit
    model = XGBClassifier(**params)
    model.fit(X_tr, y_tr)

    # Validate via log loss (works for binary & multiclass)
    proba = model.predict_proba(X_val)
    return log_loss(y_val, proba, labels=np.unique(y_train))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, show_progress_bar=False)


[I 2025-11-30 15:29:20,160] A new study created in memory with name: no-name-310dfd60-6865-4d22-99ec-00af1d2ca038
[I 2025-11-30 15:30:42,261] Trial 0 finished with value: 0.1467475774758256 and parameters: {'learning_rate': 0.042098121232082494, 'n_estimators': 777, 'max_depth': 7, 'min_child_weight': 29.132577815217093, 'subsample': 0.6147968374729996, 'colsample_bytree': 0.485724487160419, 'colsample_bylevel': 0.9494368507946758, 'gamma': 3.520562738827442, 'reg_alpha': 0.14211414025518554, 'reg_lambda': 1.1299537693237707, 'max_bin': 416}. Best is trial 0 with value: 0.1467475774758256.
[I 2025-11-30 15:31:28,110] Trial 1 finished with value: 0.15602432639815 and parameters: {'learning_rate': 0.04399109506258741, 'n_estimators': 689, 'max_depth': 4, 'min_child_weight': 6.865728559813745, 'subsample': 0.7376031405137949, 'colsample_bytree': 0.5091304974244799, 'colsample_bylevel': 0.9254974234816152, 'gamma': 5.46456337360755, 'reg_alpha': 0.592458647894528, 'reg_lambda': 10.32284575

[I 2025-11-30 15:43:24,762] Trial 17 finished with value: 0.14582657720041295 and parameters: {'learning_rate': 0.16151245095428227, 'n_estimators': 493, 'max_depth': 7, 'min_child_weight': 10.425680565011305, 'subsample': 0.841098092708128, 'colsample_bytree': 0.696042531489659, 'colsample_bylevel': 0.9066740394939384, 'gamma': 6.714749037235559, 'reg_alpha': 0.5048116274121143, 'reg_lambda': 0.5495457944568578, 'max_bin': 467}. Best is trial 9 with value: 0.14417424514953284.
[I 2025-11-30 15:43:57,502] Trial 18 finished with value: 0.1508722041261947 and parameters: {'learning_rate': 0.1111563335371216, 'n_estimators': 403, 'max_depth': 5, 'min_child_weight': 8.665613693324678, 'subsample': 0.7739391476866416, 'colsample_bytree': 0.5392889491273387, 'colsample_bylevel': 0.8082938759898792, 'gamma': 4.213061419335467, 'reg_alpha': 2.697923784235733, 'reg_lambda': 4.794017065286828, 'max_bin': 376}. Best is trial 9 with value: 0.14417424514953284.
[I 2025-11-30 15:44:28,511] Trial 19 

[I 2025-11-30 15:55:48,789] Trial 34 finished with value: 0.15151271516047285 and parameters: {'learning_rate': 0.05065836511838557, 'n_estimators': 739, 'max_depth': 5, 'min_child_weight': 5.223510217491775, 'subsample': 0.7009924343209415, 'colsample_bytree': 0.518571402168277, 'colsample_bylevel': 0.9620793900202779, 'gamma': 1.0086056391641087, 'reg_alpha': 3.325912893674448, 'reg_lambda': 0.2663895963895437, 'max_bin': 176}. Best is trial 24 with value: 0.14226851980169183.
[I 2025-11-30 15:56:38,296] Trial 35 finished with value: 0.14240346618349997 and parameters: {'learning_rate': 0.19896752419515884, 'n_estimators': 806, 'max_depth': 6, 'min_child_weight': 5.9230560467311495, 'subsample': 0.819620487342055, 'colsample_bytree': 0.7797485486956824, 'colsample_bylevel': 0.9183041186795611, 'gamma': 3.529210390923677, 'reg_alpha': 0.7334023513482986, 'reg_lambda': 0.10037944481909725, 'max_bin': 271}. Best is trial 24 with value: 0.14226851980169183.
[I 2025-11-30 15:57:26,621] Tr

In [19]:

# print("Best LogLoss:", study.best_value)
# print("Best params:", study.best_params)

# # Retrain on full training set with best params
# best_params = {
#     **study.best_params,
#     "objective": "binary:logistic",
#     "num_class": None,
#     "tree_method": "hist",
#     "eval_metric": "logloss",
#     "random_state": 42,
#     "n_jobs": -1,
# }



best_params = {
    'learning_rate': 0.19998570170455632, 'n_estimators': 687, 'max_depth': 7, 'min_child_weight': 5.441759731202355,
    'subsample': 0.780218351263169, 'colsample_bytree': 0.769739120815676, 'colsample_bylevel': 0.9553771482874794,
    'gamma': 4.345945751363333, 'reg_alpha': 2.572047028510029, 'reg_lambda': 0.1560558876938432, 'max_bin': 446,

    "objective": "binary:logistic",
    "num_class": None,
    "tree_method": "hist",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}


# Save best params
with open(f"best_cls_params_{datetime.strftime(datetime.now(), '%Y%m%d%H%M')}_.pkl", "wb") as f:
    pickle.dump(best_params, f)

In [20]:


best_model = XGBClassifier(**best_params)
best_model.fit(X_train_scaled, y_train)

# --- Predictions ---
y_tr_pred = best_model.predict(X_train_scaled)
y_te_pred = best_model.predict(X_test_scaled)

# Probabilities (for AUC/thresholding)
y_tr_proba = best_model.predict_proba(X_train_scaled)
y_te_proba = best_model.predict_proba(X_test_scaled)

# --- Metrics (train) ---
print("\n=== TRAIN METRICS ===")
print("Accuracy:", accuracy_score(y_train, y_tr_pred))
print("Precision (macro):", precision_score(y_train, y_tr_pred, average="macro", zero_division=0))
print("Recall (macro):", recall_score(y_train, y_tr_pred, average="macro", zero_division=0))
print("F1 (macro):", f1_score(y_train, y_tr_pred, average="macro", zero_division=0))
print("ROC-AUC:", roc_auc_score(y_train, y_tr_proba[:, 1]))

print("\nClassification report:\n", classification_report(y_train, y_tr_pred, zero_division=0))

# --- Metrics (test) ---
print("\n=== TEST METRICS ===")
print("Accuracy:", accuracy_score(y_test, y_te_pred))
print("Precision (macro):", precision_score(y_test, y_te_pred, average="macro", zero_division=0))
print("Recall (macro):", recall_score(y_test, y_te_pred, average="macro", zero_division=0))
print("F1 (macro):", f1_score(y_test, y_te_pred, average="macro", zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, y_te_proba[:, 1]))
print("Confusion matrix:\n", confusion_matrix(y_test, y_te_pred))
print("\nClassification report:\n", classification_report(y_test, y_te_pred, zero_division=0))


train_key['pred_class'] = y_tr_pred
train_key['pred_proba'] = y_tr_proba[:, 1]
test_key['pred_class'] = y_te_pred
test_key['pred_proba'] = y_te_proba[:, 1]



=== TRAIN METRICS ===
Accuracy: 0.9451733638007861
Precision (macro): 0.8847743103479765
Recall (macro): 0.8058966485004759
F1 (macro): 0.8394148595180828
ROC-AUC: 0.9634052138657881

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97   6652479
           1       0.81      0.63      0.71    790655

    accuracy                           0.95   7443134
   macro avg       0.88      0.81      0.84   7443134
weighted avg       0.94      0.95      0.94   7443134


=== TEST METRICS ===
Accuracy: 0.9441257839158723
Precision (macro): 0.8814815337721205
Recall (macro): 0.8030803288973039
F1 (macro): 0.836371669218396
ROC-AUC: 0.9601165050716871
Confusion matrix:
 [[2800146   50750]
 [ 127484  211535]]

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97   2850896
           1       0.81      0.62      0.70    339019

    accuracy                       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_class'] = y_tr_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_key['pred_proba'] = y_tr_proba[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_key['pred_class'] = y_te_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [21]:
data_dict_xgb = (train_key,test_key, best_model, best_params)


In [None]:
data_dict_xgb

In [28]:
with open('model_op_xgb.pkl', 'wb') as f:
    pickle.dump(data_dict_xgb, f)

In [22]:

combined_op_xgb = pd.concat([
    data_dict_xgb[0],
    data_dict_xgb[1]
          ])



In [26]:
data_dict_xgb[0]

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,pred_class,pred_proba
9335088,154565,1367172,2962,22089,63,0,0.049592
9944958,151073,2857952,46045,27606,75,0,0.000428
2704721,150638,790551,45401,15468,16,0,0.001799
2492248,101433,1387828,33731,4799,15,1,0.597095
4682778,164774,1971432,25890,30446,35,0,0.001871
...,...,...,...,...,...,...,...
2234489,105158,2360613,13284,24037,14,0,0.009275
4304572,73645,1757942,16290,49612,32,0,0.002715
10081351,122230,2005721,28476,4256,79,0,0.003210
6550634,109541,3306192,13176,37646,40,1,0.554273


In [27]:
combined_op_xgb.rename({'pred_proba' : 'pred_xgb'}, axis = 1).to_csv('../data/preds_train_test_op_xgb.csv', index = False)