### Data Preparation

In [16]:
import sys
sys.path.append('/Users/broniy/Desktop/CreativeRank/')
%load_ext autoreload
%autoreload 2

import os 
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
import numpy as np
from metrics import bootstrap_mrr_at_k, mrr_at_k, hit_rate_at_k
from models import get_model, get_pooled_dataset
from settings import DATA_FOLDER
from notebooks.experiment_data import get_experiment_data, COLS, CATEGORICAL_COLS, split_experiment_train_test_val_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
data = get_experiment_data()

users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [14]:
data.sort_values(["EXPERIMENT_DATE"]).query("CLICK.isin([0, 1])").groupby(
    ["EXPERIMENT_DATE", "EXPERIMENT_ID"]
).agg({"CLICK": ["sum", "count"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK,CLICK
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count
EXPERIMENT_DATE,EXPERIMENT_ID,Unnamed: 2_level_2,Unnamed: 3_level_2
2025-07-01,8ea67496-0fb3-4efd-8cea-4b8d88351b8e,1766.0,11144
2025-07-10,81ae4870-e57d-4bc4-a2d7-48ffa5411707,2113.0,21852
2025-07-11,c5288ca2-3928-4364-8f08-bebc1036dd87,2626.0,28807
2025-07-14,3ecf34fc-1f15-4b32-970f-4061544da763,1835.0,20187
2025-07-17,00bb26ff-6fe3-4465-ac77-12bfc33aa6df,1787.0,19663
2025-07-22,6f506df9-be60-452d-b914-8230c29c2ff1,1622.0,17838
2025-07-26,44d26695-cdf2-41a4-b161-393fdaf964bc,2122.0,23348
2025-07-28,002deaf7-331f-4b5e-866b-f6dad60e4a79,1355.0,14905
2025-08-02,e4b4a349-3b14-439e-946f-f716101dac69,853.0,9410
2025-08-07,e627d7f0-46c4-4894-872e-59a2fc108c30,356.0,3951


### Debugging

In [8]:
from metrics import hit_rate_at_k, mrr_at_k, bootstrap_mrr_at_k
from models import get_model, get_pooled_dataset

users_df.head()

train_data = users_all_variations[users_all_variations["EXPERIMENT_ID"]=='823158da-7b0a-4c19-8189-663c22a3ae38']
test_data = users_all_variations[users_all_variations["EXPERIMENT_ID"]=='1d6dbba7-dcc5-46f4-a4aa-aef3124a8fcf']

train_df, train_pool, group_ids, X_train, y_train = get_pooled_dataset(train_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS)
test_df, test_pool, group_ids, X_test, y_test = get_pooled_dataset(test_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS)

ranker = get_model("ranker", train_pool.get_cat_feature_indices(), params={"depth": 6, "learning_rate": 0.5, "iterations": 1000})
ranker.fit(train_pool)

scores = ranker.predict(test_pool)


In [12]:
preds = test_df.assign(
    PRED=scores, GT=y_test
)[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]

y_true = test_df[
    ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
].query("CLICK==1")

bootstraped_mrr_at_5, std_mrr_at_5 = bootstrap_mrr_at_k(preds, y_true, 5)
print(bootstraped_mrr_at_5, std_mrr_at_5)

0.47392530202484273 0.007637761761350472


### Ranking Results

In [None]:
mlflow.set_experiment("Baseline")

# Define grid of hyperparameters
model_params = dict(
    iterations=200,
    learning_rate=0.03,
    depth=4,
    l2_leaf_reg=10,
    random_strength=2,
    bagging_temperature=0.5,
    subsample=0.7,
    rsm=0.7,  # feature subsampling
)




with mlflow.start_run(run_name=f"ranking"):

    train_data, _, test_data = split_experiment_train_test_val_data(data, n_last_test=4, n_last_val=0)

    from sklearn.model_selection import GroupKFold

    n_splits = 5
    # Use "EXPERIMENT_ID" to group
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = train_data["EXPERIMENT_ID"]

    cv_results = []
    for fold, (train_idx, val_idx) in enumerate(group_kfold.split(train_data, groups=groups)):
        fold_train_data = train_data.iloc[train_idx]
        fold_val_data = train_data.iloc[val_idx]

        # Prepare pools and datasets per fold
        train_df, train_pool, _, X_train, y_train = get_pooled_dataset(
            fold_train_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS
        )
        val_df, val_pool, _, X_val, y_val = get_pooled_dataset(
            fold_val_data, cols=COLS, cat_cols=CATEGORICAL_COLS
        )
        cat_features = train_pool.get_cat_feature_indices()

        # Fit the model
        ranker = get_model("ranker", cat_features, model_params)
        ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

        # Validation scoring
        scores = ranker.predict(X_val)
        preds = val_df.assign(
            PRED=scores
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = val_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        avg_mrr_at_5, _, _ = mrr_at_k(preds, y_true, 5, prefix=f"cvfold{fold}_")
        avg_hit_rate_1, _ = hit_rate_at_k(preds, y_true, 1, prefix=f"cvfold{fold}_")

        cv_results.append(
            {
                "fold": fold,
                "avg_mrr_at_5": avg_mrr_at_5,
                "avg_hit_rate_1": avg_hit_rate_1,
            }
        )

    print("CV results:", cv_results)

    train_df, train_pool, _, X_train, y_train = get_pooled_dataset(train_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS)
    val_df, val_pool, _, X_val, y_val = get_pooled_dataset(val_data, cols=COLS, cat_cols=CATEGORICAL_COLS)
    cat_features = train_pool.get_cat_feature_indices()

    ranker = get_model("ranker", cat_features, model_params)
    ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

    test_df, test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(test_data, cols=COLS, cat_cols=CATEGORICAL_COLS)

    for prefix, df, X in [("test_", test_df, X_test), ("val_", val_df, X_val)]:
        scores = ranker.predict(X)
        preds = df.assign(
            PRED=scores
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        avg_mrr_at_5, _, _  = mrr_at_k(preds, y_true, 5, prefix=prefix)
        avg_hit_rate_1, _ = hit_rate_at_k(preds, y_true, 1, prefix=prefix)
        bootstraped_mrr_at_5, std_mrr_at_5 = bootstrap_mrr_at_k(preds, y_true, 5, prefix=prefix)


**Grid Search**

In [20]:
mlflow.set_experiment("RankerGridSearch")


# Define grid of hyperparameters
param_grid = {
    'iterations': [200, 400, 600],
    'learning_rate': [0.02, 0.03, 0.05],
    'depth': [3, 4, 5],
    'l2_leaf_reg': [3, 5, 10, 20],
    'random_strength': [0.5, 1, 2],
    'bagging_temperature': [0.25, 0.5, 1],
    'rsm': [0.6, 0.8, 1.0],  # feature subsampling
    'loss_function': ['YetiRank', 'PairLogit'],
    'bootstrap_type': ['Bayesian', 'Bernoulli'],
    'subsample': [0.6, 0.8, 1.0],
}

grid = list(ParameterGrid(param_grid))

for i, params in enumerate(grid):
    print(f"Running grid search {i+1} of {params}")
    with mlflow.start_run(run_name=f"ranker_grid_search_{i}"):
        # Ensure experiment_date is datetime
        users_df["EXPERIMENT_DATE"] = pd.to_datetime(users_df["EXPERIMENT_DATE"])

        variations_per_experimen_df = users_df[
            ["EXPERIMENT_ID", "VARIATION_ID"]
        ].drop_duplicates()

        # Sort unique experiments by date
        experiment_order = (
            users_df[["EXPERIMENT_ID", "EXPERIMENT_DATE"]]
            .sort_values("EXPERIMENT_DATE")
            .drop_duplicates()
            .reset_index(drop=True)
        )

        n_last_test = 4
        n_last_val = 2
        n_last_train = n_last_test + n_last_val

        # Get last two for test, others for train
        test_experiments = experiment_order.tail(n_last_test)["EXPERIMENT_ID"]
        val_experiments = experiment_order.iloc[-n_last_train:-n_last_test][
            "EXPERIMENT_ID"
        ]
        train_experiments = experiment_order.iloc[:-n_last_train]["EXPERIMENT_ID"]

        print(f"Number of train experiments: {len(train_experiments)}")
        print(f"Number of validation experiments: {len(val_experiments)}")
        print(f"Number of test experiments: {len(test_experiments)}")
        assert len(train_experiments) + len(val_experiments) + len(
            test_experiments
        ) == len(experiment_order)
        # Join users_df with variation_df on EXPERIMENT_ID and VARIATION_ID

        users_all_variations = pd.merge(
            users_df.drop(columns=["VARIATION_ID"]),
            variations_per_experimen_df,
            how="left",
            left_on="EXPERIMENT_ID",
            right_on="EXPERIMENT_ID",
        )
        # Assign the click to the correct variation
        users_all_variations["CLICK"] = (
            users_all_variations.set_index(
                ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
            )
            .index.map(
                users_df.drop_duplicates(
                    ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
                ).set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])["CLICK"]
            )
            .fillna(0.5)
        )

        users_all_variations = users_all_variations.merge(
            variations_df,
            left_on=["VARIATION_ID"],
            right_on=["VARIATION_ID"],
            how="left",
        )

        # Select rows for train/test
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_experiments)
        ]
        # For validation set
        val_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(val_experiments)
        ]
        val_df = val_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        # For test set
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_experiments)
        ]
        test_df = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio=1)
        val_pool, _, X_val, y_val = get_pooled_dataset(val_df)
        cat_features = train_pool.get_cat_feature_indices()

        ranker = get_model("ranker", cat_features, params)
        ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(val_df)
        scores = ranker.predict(X_test)

        preds = val_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores, GT=y_test
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = val_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        mrr_at_5, uplift_mrr_at_5 = mrr_at_k(preds, y_true, 5)
        hit_rate_at_1, uplift_hit_rate_at_1 = hit_rate_at_k(preds, y_true, 1)

        mlflow.log_metric("mrr_at_5", mrr_at_5)
        mlflow.log_metric("mrr_at_5_uplift", uplift_mrr_at_5)
        mlflow.log_metric("hit_rate_at_1", hit_rate_at_1)
        mlflow.log_metric("hit_rate_at_1_uplift", uplift_hit_rate_at_1)

Running grid search 1 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.03}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 2 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.1}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 3 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.5}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 4 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.03}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 5 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.1}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 6 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.5}
Number of train experiments: 20
Nu

### Train catboost via expanding window approach

In [12]:
variations_per_experimen_df = users_df[
    ["EXPERIMENT_ID", "VARIATION_ID"]
].drop_duplicates()

users_all_variations = pd.merge(
    users_df.drop(columns=["VARIATION_ID"]),
    variations_per_experimen_df,
    how="left",
    left_on="EXPERIMENT_ID",
    right_on="EXPERIMENT_ID",
)
# Assign the click to the correct variation
users_all_variations["CLICK"] = (
    users_all_variations.set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])
    .index.map(
        users_df.drop_duplicates(
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
        ).set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])["CLICK"]
    )
    .fillna(0.5)
)

users_all_variations = users_all_variations.merge(
    variations_df,
    left_on=["VARIATION_ID"],
    right_on=["VARIATION_ID"],
    how="left",
)

In [16]:
from sklearn.model_selection import TimeSeriesSplit

# Start an mlflow experiment
mlflow.set_experiment("Catboost_Ranker")

pos_neg_ratio = 1
experiment_name = f"pn_ratio_{pos_neg_ratio}_all_feats"

with mlflow.start_run(run_name=f"{experiment_name}"):
    tscv = TimeSeriesSplit(n_splits=5, test_size=4)
    idx = (
        users_df[["EXPERIMENT_ID", "EXPERIMENT_DATE"]]
        .sort_values("EXPERIMENT_DATE")
        .drop_duplicates()
        .reset_index(drop=True)
    )


    for train_idx, test_idx in tscv.split(idx):
        print(f"Train idx: {train_idx}, Test idx: {test_idx}")
        train_idx = idx.iloc[train_idx]["EXPERIMENT_ID"].values
        test_idx = idx.iloc[test_idx]["EXPERIMENT_ID"].values

        # prepare train data
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_idx)
        ]

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio)

        # prepare test data
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_idx)
        ]
        test_df = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(test_df)
        cat_features = train_pool.get_cat_feature_indices()

        # Train CatBoost model

        print("Training CatBoost model...")
        model = get_model("ranker", cat_features)
        model.fit(train_pool)

        scores = model.predict(X_test)
        preds = test_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores, GT=y_test
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = test_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        hit_rate, uplift_hit_rate = hit_rate_at_k(preds, y_true, k=1)
        mrr, uplift_mrr = mrr_at_k(preds, y_true, 5)

        # Gather the metrics for this split. You could add more metrics if needed.
        mlflow.log_metric("avg_hit_rate_at_1", hit_rate, step=len(train_idx))
        mlflow.log_metric("avg_mrr_at_5", mrr, step=len(train_idx))
        mlflow.log_metric(
            "avg_uplift_hit_rate_at_1", uplift_hit_rate, step=len(train_idx)
        )
        mlflow.log_metric("avg_uplift_mrr_at_5", uplift_mrr, step=len(train_idx))

        print(
            "hit_rate:",
            hit_rate,
            "uplift_hit_rate:",
            uplift_hit_rate,
            "mrr:",
            mrr,
            "uplift_mrr:",
            uplift_mrr,
        )

Train idx: [0 1 2 3 4 5], Test idx: [6 7 8 9]
Training CatBoost model...
hit_rate: 0.1975163715843296 uplift_hit_rate: -6.663854067155237 mrr: 0.4609974182112353 uplift_mrr: -2.468728386497536
Train idx: [0 1 2 3 4 5 6 7 8 9], Test idx: [10 11 12 13]
Training CatBoost model...
hit_rate: 0.18878523085612545 uplift_hit_rate: -5.607384571937282 mrr: 0.4500056619439219 uplift_mrr: -1.4586141728638173
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13], Test idx: [14 15 16 17]
Training CatBoost model...
hit_rate: 0.2081134850303727 uplift_hit_rate: 4.0567425151863405 mrr: 0.46701370542355036 uplift_mrr: 2.265774910266509
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17], Test idx: [18 19 20 21]
Training CatBoost model...
hit_rate: 0.17698985806504441 uplift_hit_rate: -11.505070967477801 mrr: 0.4419719346477357 uplift_mrr: -3.2178245296929013
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21], Test idx: [22 23 24 25]
Training CatBoost model...
