### Combining outputs and agg

In [2]:
import pandas as pd

In [3]:
dat1 = pd.read_csv('../data/supervised_learning_predictions.csv')

In [4]:
dat1.shape

(10633049, 9)

In [6]:
dat4 = pd.read_csv('../data/preds_train_test_op_xgb.csv')

In [7]:
dat1.head()

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,pred_rf_proba,pred_lr_proba,pred_lgbm_proba,pred_cat_proba
0,154565,1367172,2962,22089,63,0.511289,0.46765,0.322032,0.072946
1,151073,2857952,46045,27606,75,0.16733,0.251689,0.006941,0.001396
2,150638,790551,45401,15468,16,0.124237,0.192894,0.063928,0.005806
3,101433,1387828,33731,4799,15,0.759782,0.914044,0.921196,0.631459
4,164774,1971432,25890,30446,35,0.128468,0.181311,0.014037,0.001061


In [8]:
dat4.head()

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,pred_class,pred_xgb
0,154565,1367172,2962,22089,63,0,0.049592
1,151073,2857952,46045,27606,75,0,0.000428
2,150638,790551,45401,15468,16,0,0.001799
3,101433,1387828,33731,4799,15,1,0.597095
4,164774,1971432,25890,30446,35,0,0.001871


In [9]:
dat1.shape, dat4.shape

((10633049, 9), (10633049, 7))

In [10]:
dat1.columns

Index(['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id',
       'pred_rf_proba', 'pred_lr_proba', 'pred_lgbm_proba', 'pred_cat_proba'],
      dtype='object')

In [11]:

rf = dat1[['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id', 'pred_rf_proba']].rename({'pred_rf_proba' : 'prediction'}, axis = 1)
lr = dat1[['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id', 'pred_lr_proba']].rename({'pred_lr_proba' : 'prediction'}, axis = 1)
lg = dat1[['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id', 'pred_lgbm_proba']].rename({'pred_lgbm_proba' : 'prediction'}, axis = 1)
ca = dat1[['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id', 'pred_cat_proba']].rename({'pred_cat_proba' : 'prediction'}, axis = 1)


In [12]:
dat4.columns

Index(['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id',
       'pred_class', 'pred_xgb'],
      dtype='object')

In [13]:
xgb = dat4[['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id', 'pred_xgb']].rename({'pred_xgb' : 'prediction'}, axis = 1)

In [17]:
rf['model'] = 'RandomForest'
lr['model'] = 'LogisticRegression'
lg['model'] = 'LightGBM'
ca['model'] = 'CatBoost'
xgb['model'] = 'XGB'

In [18]:
final = pd.concat([rf,lr,lg,ca, xgb])

In [19]:
final.head()

Unnamed: 0,user_id,order_id,product_id,substitute_id,GMM_cluster_id,prediction,model
0,154565,1367172,2962,22089,63,0.511289,RandomForest
1,151073,2857952,46045,27606,75,0.16733,RandomForest
2,150638,790551,45401,15468,16,0.124237,RandomForest
3,101433,1387828,33731,4799,15,0.759782,RandomForest
4,164774,1971432,25890,30446,35,0.128468,RandomForest


In [21]:
final.to_csv('../data/final_model_predictions_combined.csv', index = False)

### Shap values

In [55]:
import numpy as np
import pandas as pd
import shap

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import xgboost as xgb


def mean_abs_shap_per_feature_classifiers(
    models: dict,
    X: np.ndarray,
    feature_names,
    nsample: int = 5000,
    positive_class: int = 1
) -> pd.DataFrame:
    """
    Compute mean(|SHAP|) per feature for a dict of fitted classification models.

    models: dict of fitted classification models, e.g.
            {
                "rf": rf_clf,
                "lr": log_reg,
                "lgbm": lgbm_clf,
                "cat": cat_clf,
                "xgb": xgb_clf
            }
    X:      numpy array of features (same columns used to train models)
    feature_names: list of feature names corresponding to columns of X
    nsample: number of rows to sample for SHAP (for speed)
    positive_class: which class index to use for SHAP in binary/multi-class (usually 1)
    """

    # sample rows from X
    rng = np.random.RandomState(42)
    idx = rng.choice(len(X), size=min(nsample, len(X)), replace=False)
    X_sample = X[idx]

    results = []

    for name, model in models.items():
        print(f"\nComputing SHAP for model: {name}")

        shap_values = None

        # -------------------------------------------------
        # 1. Choose explainer by model type
        # -------------------------------------------------
        if isinstance(model, xgb.XGBModel):
            print("  Using KernelExplainer for XGBoost classifier...")
            bg_idx = rng.choice(len(X_sample), size=min(200, len(X_sample)), replace=False)
            background = X_sample[bg_idx]

            f = lambda data: model.predict_proba(data)[:, positive_class]
            explainer = shap.KernelExplainer(f, background)
            shap_values = explainer.shap_values(X_sample, nsamples=100)

        elif isinstance(model, (RandomForestClassifier, LGBMClassifier, CatBoostClassifier)):
            print("  Using TreeExplainer with raw model output (log-odds)...")
            # Default model_output="raw" avoids the error you saw
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_sample)

        elif isinstance(model, LogisticRegression):
            print("  Using LinearExplainer for LogisticRegression...")
            explainer = shap.LinearExplainer(model, X_sample)
            shap_values = explainer.shap_values(X_sample)

        else:
            print("  Using generic shap.Explainer...")
            explainer = shap.Explainer(model, X_sample)
            shap_values = explainer(X_sample).values

        # -------------------------------------------------
        # 2. Handle multi-class outputs (pick positive_class)
        # -------------------------------------------------
        # Many classifiers return list of arrays: one per class
        if isinstance(shap_values, list):
            # select SHAP for the positive class
            shap_values = np.array(shap_values[positive_class])

        # For shap.Explanation we already took .values above
        # If shape is (n_samples, n_outputs, n_features) → average over outputs
        if shap_values.ndim == 3:
            shap_values = shap_values.mean(axis=1)

        # -------------------------------------------------
        # 3. Compute mean absolute SHAP per feature
        # -------------------------------------------------
        mean_abs = np.mean(np.abs(shap_values), axis=0)

        for feat, val in zip(feature_names, mean_abs):
            results.append({
                "model": name,
                "feature": feat,
                "mean_abs_shap": float(val)
            })

    df_imp = pd.DataFrame(results).sort_values(
        ["model", "mean_abs_shap"], ascending=[True, False]
    )

    return df_imp


In [41]:
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import optuna
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping


In [36]:
data  = pd.read_pickle("sl_final_for_model.pkl")


In [37]:
id_cols = ['user_id', 'order_id', 'product_id', 'substitute_id', 'GMM_cluster_id']
target = 'label'
features = [x for x in data.columns if x not in id_cols + [target]]
train, test = train_test_split(data, train_size = 0.7, random_state = 42)
train_key = train[id_cols]
test_key = test[id_cols]
X_train = train.drop(columns=id_cols + [target])
y_train = train[target] 
X_test = test.drop(columns=id_cols + [target]) 
y_test = test[target] 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test) 
print('scaling done')

scaling done


In [38]:
import pickle

In [45]:
with open('model_op_ml.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [51]:
data_dict[3]

In [43]:
with open('model_op_xgb.pkl', 'rb') as f:
    data_dict_xgb = pickle.load(f)

In [56]:
print('start shap')

df_shap_imp = mean_abs_shap_per_feature_classifiers(
    models= {
        'rf'  : data_dict[2],
        'lr'  : data_dict[3],
        'lgbm': data_dict[4],
        'cat' : data_dict[5],
        'xgb' : data_dict_xgb[2]
    },
    X=X_train_scaled,
    feature_names=X_train.columns.tolist(),
    nsample=5000,
    positive_class=1
)

print('end shap')


start shap

Computing SHAP for model: rf
  Using TreeExplainer with raw model output (log-odds)...

Computing SHAP for model: lr
  Using LinearExplainer for LogisticRegression...

Computing SHAP for model: lgbm
  Using TreeExplainer with raw model output (log-odds)...





Computing SHAP for model: cat
  Using TreeExplainer with raw model output (log-odds)...


Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.



Computing SHAP for model: xgb
  Using KernelExplainer for XGBoost classifier...


100%|██████████| 5000/5000 [02:24<00:00, 34.71it/s]

end shap





In [57]:
df_shap_imp

Unnamed: 0,model,feature,mean_abs_shap
93,cat,sub_product_popularity,1.838349
92,cat,prod_product_popularity,0.448932
117,cat,order_number,0.338783
111,cat,user_total_orders,0.297664
115,cat,user_substitute_frequency_before,0.176090
...,...,...,...
155,xgb,user_product_frequency_before,0.000759
137,xgb,same_department,0.000719
159,xgb,order_dow,0.000663
160,xgb,order_hour_of_day,0.000642


In [58]:
    
df_shap_imp.to_csv('../data/shap_values_ml.csv', index=False)