In [4]:
import sys
import os 
sys.path.append(os.path.dirname(os.getcwd()))
%load_ext autoreload
%autoreload 2
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GroupKFold
import numpy as np
from metrics import bootstrap_mrr_at_k, mrr_at_k, hit_rate_at_k, mrr_at_k_per_experiment, hit_rate_at_k_per_experiment
from models import get_model, get_pooled_dataset
from settings import DATA_FOLDER
from notebooks.experiment_data import get_experiment_data, COLS, CATEGORICAL_COLS, split_experiment_train_test_val_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
data = get_experiment_data()

users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


### Baseline Ranking Experiment: Model Training


In [None]:
mlflow.set_experiment("Ranking Candidate Model")

model_params = {
    'iterations': 400,
    'depth': 3,
    'learning_rate': 0.03,
    'l2_leaf_reg': 3,
    'random_seed': 42,
    'subsample': 1.0,
    'rsm': 1.0, 
    'random_strength': 0.5,
    'bagging_temperature': 0.25
}
with mlflow.start_run(run_name=f"ranking_baseline_experiment"):
    train_data, _, test_data = split_experiment_train_test_val_data(data, n_last_test=4, n_last_val=0)


    n_splits = 5
    # Use "EXPERIMENT_ID" to group
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = train_data[["EXPERIMENT_ID", "RECIPIENT_ID"]].apply(lambda x: f"{x['EXPERIMENT_ID']}_{x['RECIPIENT_ID']}", axis=1)

    cv_results = []
    for fold, (train_idx, val_idx) in enumerate(group_kfold.split(train_data, groups=groups)):
        fold_train_data = train_data.iloc[train_idx]
        fold_val_data = train_data.iloc[val_idx]

        # Prepare pools and datasets per fold
        train_df, train_pool, train_group_ids, X_train, y_train = get_pooled_dataset(
            fold_train_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS
        )
        val_df, val_pool, val_group_ids, X_val, y_val = get_pooled_dataset(
            fold_val_data, pos_neg_ratio=0, cols=COLS, cat_cols=CATEGORICAL_COLS
        )
        cat_features = train_pool.get_cat_feature_indices()

        # Fit the model
        ranker = get_model("ranker", cat_features, model_params)
        ranker.fit(train_pool, eval_set=val_pool, use_best_model=True, plot=True)

        # Validation scoring
        scores = ranker.predict(X_val)
        preds = val_df.assign(
            PRED=scores
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = val_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        mrr_at_5_catboost = ranker.eval_metrics(val_pool, metrics=['MRR'])
        mrr_at_5 = mrr_at_k(preds, y_true, 5, prefix=f"cvfold{fold}_")
        hit_rate_1 = hit_rate_at_k(preds, y_true, 1, prefix=f"cvfold{fold}_")
        cv_results.append(
            {
                "fold": fold,
                "mrr_at_5": mrr_at_5,
                "mrr_at_5_catboost": mrr_at_5_catboost,
                "hit_rate_1": hit_rate_1,
            }
        )
        print(cv_results)
    mrr_at_5_values = [fold_result["mrr_at_5"] for fold_result in cv_results]
    hit_rate_1_values = [fold_result["hit_rate_1"] for fold_result in cv_results]

    mean_mrr_at_5 = np.mean(mrr_at_5_values)
    std_mrr_at_5 = np.std(mrr_at_5_values)
    mean_hit_rate_1 = np.mean(hit_rate_1_values)
    std_hit_rate_1 = np.std(hit_rate_1_values)

    mlflow.log_metric("cv_mean_mrr_at_5", mean_mrr_at_5)
    mlflow.log_metric("cv_std_mrr_at_5", std_mrr_at_5)
    mlflow.log_metric("cv_mean_hit_rate_1", mean_hit_rate_1)
    mlflow.log_metric("cv_std_hit_rate_1", std_hit_rate_1)
    print("CV results:", cv_results)

    # Train on all data and predict on test data
    train_df, train_pool, train_group_ids, X_train, y_train = get_pooled_dataset(train_data, cols=COLS, cat_cols=CATEGORICAL_COLS) 
    test_df, test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(test_data, cols=COLS, cat_cols=CATEGORICAL_COLS)
    
    cat_features = train_pool.get_cat_feature_indices()
    ranker = get_model("ranker", cat_features, model_params)
    ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

    scores = ranker.predict(X_test)
    preds = test_df.assign(PRED=scores)[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
    y_true = test_df[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]].query("CLICK==1") 

    mrr_at_k_per_experiment(preds, y_true, 5, prefix="test_")
    hit_rate_at_k_per_experiment(preds, y_true, 1, prefix="test_")
    bootstrap_mrr_at_k(preds, y_true, 5, bootstrap_samples=100, random_state=42, prefix="test_")


In [None]:
ranker.save_model("./ranking_candidate_model.cbm")

In [12]:
from metrics import hit_rate_at_k_per_experiment, mrr_at_k_per_experiment
scores = ranker.predict(test_pool)
preds = test_df.assign(
    PRED=scores
)[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
y_true = test_df[
    ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
].query("CLICK==1")

print("MRR@5 per experiment:", mrr_at_k_per_experiment(preds, y_true, 5)[0], ", the mean uplift is", mrr_at_k_per_experiment(preds, y_true, 5)[2])
print("Hit rate@1 per experiment:", hit_rate_at_k_per_experiment(preds, y_true, 1)[0], ", the mean uplift is", hit_rate_at_k_per_experiment(preds, y_true, 1)[1])

MRR@5 per experiment: 0.5139099684501969 , the mean uplift is 5.056276950990513
Hit rate@1 per experiment: 0.26654884842163334 , the mean uplift is 18.41693534831873
