### Data Preparation

In [2]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))
%load_ext autoreload
%autoreload 2

import os 
import mlflow
import pandas as pd
import numpy as np
from catboost import cv
from experiment_data import get_experiment_data, split_experiment_train_test_val_data, CATEGORICAL_COLS, COLS
from models import get_pooled_dataset, get_catboost_classifier
from metrics import mrr_at_k_per_experiment, hit_rate_at_k_per_experiment, bootstrap_mrr_at_k

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the experiment dataset consumed by the split/training cell below.
# The call reports users_df row counts before and after small experiments are
# removed (see the cell output).
data = get_experiment_data()

users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [8]:
# --- Split -------------------------------------------------------------------
# n_last_test=4 / n_last_val=0: presumably the last 4 experiments are held out
# for testing and no validation split is produced (confirm in experiment_data);
# the middle return value is therefore discarded.
train_data, _, test_data = split_experiment_train_test_val_data(data, n_last_test=4, n_last_val=0)
# Keep only rows with a definite binary label (CLICK is exactly 0 or 1).
train_data = train_data.query("CLICK==1 or CLICK==0")

# --- Pooled datasets ---------------------------------------------------------
# Subsample to include all recipients with click=1, and equal number from click=0
train_df, train_pool, _, train_X, train_y = get_pooled_dataset(train_data, COLS, CATEGORICAL_COLS, pos_neg_ratio=1)
# The test set is pooled once here, without subsampling, and reused for
# evaluation inside the MLflow run below.
test_df, test_pool, _, test_X, test_y = get_pooled_dataset(test_data, COLS, CATEGORICAL_COLS)

# --- Model / CV configuration ------------------------------------------------
n_folds = 5

model_params = {
    'iterations': 50,
    'learning_rate': 0.05,
    'eval_metric': 'AUC',
    'early_stopping_rounds': 20,
    'loss_function': 'Logloss',
    'random_seed': 42,
    'verbose': False
}

mlflow.set_experiment("Pointwise Ranking: Classification")

with mlflow.start_run(run_name="test_run") as run:
    # Cross-validate on the training pool to choose the number of boosting
    # iterations by mean test AUC across folds.
    cv_results = cv(
        params=model_params,
        pool=train_pool,
        fold_count=n_folds,
        shuffle=True,
        partition_random_seed=42,
        verbose=True,
        stratified=False,
        early_stopping_rounds=20,
        as_pandas=True
    )

    # idxmax() returns the row label of the best mean AUC, i.e. the 0-based
    # iteration index. Cast to a plain int so downstream consumers never see a
    # numpy integer.
    best_iter = int(cv_results['test-AUC-mean'].idxmax())
    best_auc = cv_results.loc[best_iter, 'test-AUC-mean']

    mlflow.log_param("best_iteration", best_iter)
    mlflow.log_metric("best_cv_auc", float(best_auc))

    # Train CatBoostClassifier with best number of iterations found in CV.
    # `iterations` is a count of trees while best_iter is a 0-based index, so
    # the model that matches CV iteration `best_iter` has best_iter + 1 trees.
    # (Passing best_iter directly trains one tree too few and yields an invalid
    # iterations=0 when the first iteration is the best one.)
    model_params.update(iterations=best_iter + 1)
    model = get_catboost_classifier(
        CATEGORICAL_COLS,
        model_params,
    )

    model.fit(train_pool)

    # Predict probability for the test set (probability that CLICK=1)
    scores = model.predict_proba(test_pool)[:, 1]
    preds = test_df.assign(PRED=scores)[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
    # Ground truth for the ranking metrics: only the clicked rows.
    y_true = test_df[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]].query("CLICK==1")

    # NOTE(review): the return values are discarded; presumably these helpers
    # record their results themselves (e.g. to the active MLflow run, using
    # `prefix`) -- confirm in metrics.py.
    mrr_at_k_per_experiment(preds, y_true, 5, prefix="test_")
    hit_rate_at_k_per_experiment(preds, y_true, 1, prefix="test_")
    bootstrap_mrr_at_k(preds, y_true, 5, bootstrap_samples=100, random_state=42, prefix="test_")

    print(f"Best mean AUC across {n_folds} folds: {best_auc:.4f} at iteration {best_iter}")


Training on fold [0/5]
0:	test: 0.5706248	best: 0.5706248 (0)	total: 33.2ms	remaining: 1.63s
1:	test: 0.5786835	best: 0.5786835 (1)	total: 59.3ms	remaining: 1.42s
2:	test: 0.5814499	best: 0.5814499 (2)	total: 88.2ms	remaining: 1.38s
3:	test: 0.5849329	best: 0.5849329 (3)	total: 117ms	remaining: 1.34s
4:	test: 0.5896289	best: 0.5896289 (4)	total: 146ms	remaining: 1.31s
5:	test: 0.5890622	best: 0.5896289 (4)	total: 174ms	remaining: 1.28s
6:	test: 0.5903551	best: 0.5903551 (6)	total: 204ms	remaining: 1.25s
7:	test: 0.5904940	best: 0.5904940 (7)	total: 234ms	remaining: 1.23s
8:	test: 0.5896802	best: 0.5904940 (7)	total: 258ms	remaining: 1.17s
9:	test: 0.5895720	best: 0.5904940 (7)	total: 290ms	remaining: 1.16s
10:	test: 0.5898872	best: 0.5904940 (7)	total: 316ms	remaining: 1.12s
11:	test: 0.5896066	best: 0.5904940 (7)	total: 343ms	remaining: 1.09s
12:	test: 0.5893384	best: 0.5904940 (7)	total: 372ms	remaining: 1.06s
13:	test: 0.5893409	best: 0.5904940 (7)	total: 387ms	remaining: 996ms
14:	