In [1]:
import sys
# NOTE(review): hardcoded absolute path into one user's home directory, so this
# cell only works on that machine. Presumably it is the project root that makes the
# local modules imported in this cell importable -- consider deriving it from the
# notebook location or installing the project as a package instead.
sys.path.append('/Users/broniy/Desktop/CreativeRank/')
%load_ext autoreload
%autoreload 2

import mlflow
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from metrics import bootstrap_mrr_at_k, mrr_at_k, hit_rate_at_k
from models import get_model, get_pooled_dataset
from settings import DATA_FOLDER
from notebooks.experiment_data import get_experiment_data, COLS, CATEGORICAL_COLS, split_experiment_train_test_val_data

In [2]:
# Load the experiment dataset through the project helper. Per the cell output it
# reports the row count before and after removing small experiments.
data = get_experiment_data()
# Number of distinct experiments; used further down to bound the size of the
# expanding training window.
n_experiments = data["EXPERIMENT_ID"].nunique()

users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [3]:
# Distinct variation count for every (date, experiment) pair, in date order.
(
    data.sort_values("EXPERIMENT_DATE")
    .groupby(["EXPERIMENT_DATE", "EXPERIMENT_ID"])
    .agg(VARIATION_ID=("VARIATION_ID", "nunique"))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,VARIATION_ID
EXPERIMENT_DATE,EXPERIMENT_ID,Unnamed: 2_level_1
2025-07-01,8ea67496-0fb3-4efd-8cea-4b8d88351b8e,5
2025-07-10,81ae4870-e57d-4bc4-a2d7-48ffa5411707,5
2025-07-11,c5288ca2-3928-4364-8f08-bebc1036dd87,5
2025-07-14,3ecf34fc-1f15-4b32-970f-4061544da763,5
2025-07-17,00bb26ff-6fe3-4465-ac77-12bfc33aa6df,5
2025-07-22,6f506df9-be60-452d-b914-8230c29c2ff1,5
2025-07-26,44d26695-cdf2-41a4-b161-393fdaf964bc,5
2025-07-28,002deaf7-331f-4b5e-866b-f6dad60e4a79,5
2025-08-02,e4b4a349-3b14-439e-946f-f716101dac69,4
2025-08-07,e627d7f0-46c4-4894-872e-59a2fc108c30,5


In [4]:
# Sanity-check one split: show the number of clicks per experiment in the train,
# validation and test partitions (in that order).
train_data, val_data, test_data = split_experiment_train_test_val_data(
    data,
    n_last_test=4,
    n_last_val=0,
    n_last_train=2,
)
for split_frame in (train_data, val_data, test_data):
    clicks_per_experiment = split_frame.groupby(
        ["EXPERIMENT_DATE", "EXPERIMENT_ID"]
    ).agg({"CLICK": lambda clicks: (clicks == 1).sum()})
    display(clicks_per_experiment)


Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK
EXPERIMENT_DATE,EXPERIMENT_ID,Unnamed: 2_level_1
2025-09-23,f93bf2bd-1d50-4131-9ec2-223a4d9987e8,2921
2025-09-25,43d750b5-8698-4cf0-9ea2-f705f4f196ed,1968


Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK
EXPERIMENT_DATE,EXPERIMENT_ID,Unnamed: 2_level_1


Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK
EXPERIMENT_DATE,EXPERIMENT_ID,Unnamed: 2_level_1
2025-09-27,823158da-7b0a-4c19-8189-663c22a3ae38,2286
2025-09-29,cd4a656f-290a-41e2-be1d-bf62ad85757d,2186
2025-10-02,78a802ae-d6cd-4f39-aecb-138668fa2607,1139
2025-10-06,0ef6d2e9-7601-4df6-a215-83e6e79aa24e,1293


In [13]:
# Expanding-window experiment: keep the test window at the last `n_last_test`
# experiments and grow the training window two experiments at a time, training a
# fresh ranker and reporting ranking metrics for every window size.
mlflow.set_experiment("Expanding Window")

experiment_name = "expanding_window"

# NOTE(review): the recorded output of this cell shows CatBoost stopping early with
# "degenerate solution ... too small l2-regularization, try to increase it" for
# several window sizes, so those models were trained for far fewer than
# `iterations` rounds and the window sizes are not strictly comparable. Consider
# setting `l2_leaf_reg` explicitly and/or lowering `learning_rate` before drawing
# conclusions; values are left unchanged here so existing results stay reproducible.
model_params = {
    "learning_rate": 0.5,
    "depth": 6,
    "iterations": 1000,
    # "l2_leaf_reg": [1, 3, 5]
}
# Passed to get_pooled_dataset for the training set only (the test set is pooled
# without it).
pos_neg_ratio = 4

with mlflow.start_run(run_name=experiment_name):
    n_last_test = 4
    n_last_val = 0
    # Window sizes 2, 4, ... up to the number of experiments not reserved for
    # test/validation.
    for n_last_train in range(2, n_experiments - n_last_test - n_last_val + 1, 2):
        print(
            f"Training on the last {n_last_train} experiments, testing on the last {n_last_test} experiments, validating on the last {n_last_val} experiments"
        )

        train_data, _, test_data = split_experiment_train_test_val_data(
            data,
            n_last_test=n_last_test,
            n_last_val=n_last_val,
            n_last_train=n_last_train,
        )

        # Keep only the (experiment, recipient) groups whose maximum CLICK is 1.
        # A vectorised group-wise max replaces the per-group Python lambda; the
        # selected rows and their order are the same.
        group_max_click = test_data.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"])[
            "CLICK"
        ].transform("max")
        test_data = test_data[group_max_click == 1]

        train_df, train_pool, _, X_train, y_train = get_pooled_dataset(
            train_data, cols=COLS, cat_cols=CATEGORICAL_COLS, pos_neg_ratio=pos_neg_ratio
        )
        test_df, test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(
            test_data, cols=COLS, cat_cols=CATEGORICAL_COLS
        )
        cat_features = train_pool.get_cat_feature_indices()

        # Train CatBoost model

        print("Training CatBoost model...")
        model = get_model("ranker", cat_features, params=model_params)
        model.fit(train_pool)

        # Score every test row, then build the prediction / ground-truth frames the
        # metric helpers take.
        scores = model.predict(X_test)
        preds = test_df.assign(PRED=scores)[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]
        ]
        y_true = test_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        # `step=n_last_train` is forwarded to the metric helpers -- presumably they
        # log to the active MLflow run under that step; verify in metrics.py.
        hit_rate, uplift_hit_rate = hit_rate_at_k(preds, y_true, k=1, step=n_last_train)
        avg_mrr, std_mrr, uplift_mrr = mrr_at_k(preds, y_true, 5, step=n_last_train)
        bootstrap_mrr, bootstrap_std_mrr = bootstrap_mrr_at_k(
            preds, y_true, 5, step=n_last_train
        )
        # The bootstrap estimates were previously computed and silently dropped;
        # report them alongside the point estimates.
        print(
            "hit_rate:",
            hit_rate,
            "uplift_hit_rate:",
            uplift_hit_rate,
            "mrr:",
            avg_mrr,
            "std_mrr:",
            std_mrr,
            "uplift_mrr:",
            uplift_mrr,
            "bootstrap_mrr:",
            bootstrap_mrr,
            "bootstrap_std_mrr:",
            bootstrap_std_mrr,
        )

Trianing on the last 2 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.25488621115148613 uplift_hit_rate: 11.761023967950187 mrr: 0.510549203114311 std_mrr: 0.06497647784604835 uplift_mrr: 4.177031837382585
Trianing on the last 4 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.24326191250171453 uplift_hit_rate: 7.4554675146556395 mrr: 0.5007317310460562 std_mrr: 0.0487894789893083 uplift_mrr: 2.3422332045532057
Trianing on the last 6 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.24114212214294767 uplift_hit_rate: 5.715150912971964 mrr: 0.49895603076902395 std_mrr: 0.060828343673218606 uplift_mrr: 1.823675314154026
Trianing on the last 8 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...


Training has stopped (degenerate solution on iteration 529, probably too small l2-regularization, try to increase it)


hit_rate: 0.22613944334670372 uplift_hit_rate: 0.2957656041007257 mrr: 0.4891532486235861 std_mrr: 0.03931750787108187 uplift_mrr: 0.06466549781381693
Trianing on the last 10 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.2446490326863886 uplift_hit_rate: 8.220103237438774 mrr: 0.503692453278835 std_mrr: 0.04505355608298388 uplift_mrr: 3.0094009782179088
Trianing on the last 12 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.22399740415517366 uplift_hit_rate: -1.167179990169121 mrr: 0.4879766433752797 std_mrr: 0.04881886881571608 uplift_mrr: -0.2876074239517954
Trianing on the last 14 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...


Training has stopped (degenerate solution on iteration 233, probably too small l2-regularization, try to increase it)


hit_rate: 0.22720657608235934 uplift_hit_rate: 0.6558612607919858 mrr: 0.4861010499753641 std_mrr: 0.045356523885046705 uplift_mrr: -0.605387903434262
Trianing on the last 16 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...


Training has stopped (degenerate solution on iteration 840, probably too small l2-regularization, try to increase it)


hit_rate: 0.21628766452757642 uplift_hit_rate: -4.092396854032011 mrr: 0.4802077889172069 std_mrr: 0.04214062456266775 uplift_mrr: -1.8124153085704064
Trianing on the last 18 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.21071098318311288 uplift_hit_rate: -6.343010574832855 mrr: 0.4810364655868169 std_mrr: 0.036297378300742776 uplift_mrr: -1.573282529380919
Trianing on the last 20 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...
hit_rate: 0.2446404605846044 uplift_hit_rate: 7.834840264143552 mrr: 0.5011737951440576 std_mrr: 0.05534685749649128 uplift_mrr: 2.3495109196266606
Trianing on the last 22 experiments, testing on the last 4 experiments, validating on the last 0 experiments
Training CatBoost model...


Training has stopped (degenerate solution on iteration 904, probably too small l2-regularization, try to increase it)


hit_rate: 0.22767439027312264 uplift_hit_rate: 0.6462065826745582 mrr: 0.49031671809929905 std_mrr: 0.04466438044618185 uplift_mrr: 0.2330610086251898
