### Data Preparation

In [15]:
import sys
sys.path.append('/Users/broniy/Desktop/CreativeRank/')
%load_ext autoreload
%autoreload 2

import os 
import mlflow
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import ParameterGrid
import numpy as np
from catboost import CatBoostClassifier, CatBoostRanker, Pool
from typing import List
from settings import DATA_FOLDER

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
clicked_df = pd.read_csv(
    DATA_FOLDER / "processed/clicked.csv"
)
non_clicked_df = pd.read_csv(
    DATA_FOLDER / "processed/non_clicked_large.csv"
)

variations_df = pd.read_csv(
    DATA_FOLDER / "processed/feats_df.csv"
).rename(columns={"id": "VARIATION_ID"}).fillna("UNK")
variations_df = variations_df[~variations_df['error'].isna()].drop(columns=['error'])



In [17]:
users_df = (
    pd.concat([clicked_df, non_clicked_df], axis=0)
    .assign(
        CLICK=lambda x: (x["CLICK_COUNT"] > 0).astype(int),
        EXPERIMENT_DATE=lambda x: pd.to_datetime(
            {
                "year": 2025,
                "month": x["MONTH"],
                "day": x["DAY"],
            }
        ),
    )
    .dropna(subset=["CLICK_COUNT"])
    .drop(columns=["RN"])
    .fillna(
        value={
            "TOTAL_ORDERS_VALUE": 0,
            "AVG_ORDER_VALUE": 0,
            "LAST_ORDER_VALUE": 0,
            "COUNTRY": "UNK",
            "REGION": "UNK",
            "LATEST_CLICK_CLIENT_TYPE": "UNK",
            "LATEST_CLICK_CLIENT_NAME": "UNK",
            "LATEST_CLICK_CLIENT_OS_FAMILY": "UNK",
            "FIRST_UTM_SOURCE": "UNK",
            "FIRST_UTM_CONTENT": "UNK",
            "FIRST_UTM_CAMPAIGN": "UNK",
            "LAST_UTM_SOURCE": 'UNK', "LAST_UTM_CONTENT": 'UNK', "LAST_UTM_CAMPAIGN": 'UNK',
            "CITY": "UNK",
            "TIMEZONE": "UNK",
        }
    )
)
# Convert FIRST_ACTIVE_TS to datetime
users_df["FIRST_ACTIVE_TS_dt"] = pd.to_datetime(users_df["FIRST_ACTIVE_TS"])

# Compute months between today and FIRST_ACTIVE_TS
today = pd.Timestamp(datetime.today())

# Compute years and months difference and convert to total months
users_df["MONTHS_SINCE_FIRST_ACTIVE"] = (
    today.year - users_df["FIRST_ACTIVE_TS_dt"].dt.year
) * 12 + (today.month - users_df["FIRST_ACTIVE_TS_dt"].dt.month)


users_df = users_df[users_df["VARIATION_ID"].isin(variations_df["VARIATION_ID"])]
users_df = users_df.drop_duplicates()

# Print the size of users_df before removal
print(f"users_df size before removing small experiments: {users_df.shape[0]} rows")
# Remove experiments with less than 100 participants
experiment_counts = users_df.groupby("EXPERIMENT_ID")["RECIPIENT_ID"].nunique()
valid_experiments = experiment_counts[experiment_counts >= 100].index
users_df = users_df[users_df["EXPERIMENT_ID"].isin(valid_experiments)]
# Print the size of users_df after removal
print(f"users_df size after removing small experiments: {users_df.shape[0]} rows")


users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [18]:
users_df.groupby(["EXPERIMENT_ID", "EXPERIMENT_DATE"]).agg({"CLICK": ["sum", "count"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK,CLICK
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count
EXPERIMENT_ID,EXPERIMENT_DATE,Unnamed: 2_level_2,Unnamed: 3_level_2
002deaf7-331f-4b5e-866b-f6dad60e4a79,2025-07-28,1355,14905
00bb26ff-6fe3-4465-ac77-12bfc33aa6df,2025-07-17,1787,19657
0ef6d2e9-7601-4df6-a215-83e6e79aa24e,2025-10-06,1293,14223
11c49e5d-21ac-4d6d-88c3-f211562a8e07,2025-09-17,1156,12647
1d6dbba7-dcc5-46f4-a4aa-aef3124a8fcf,2025-09-05,1959,21549
2a3f341e-1807-4eb3-9d8d-202c32d52632,2025-08-25,1075,11825
3ecf34fc-1f15-4b32-970f-4061544da763,2025-07-14,1835,20179
43d750b5-8698-4cf0-9ea2-f705f4f196ed,2025-09-25,1968,21585
44d26695-cdf2-41a4-b161-393fdaf964bc,2025-07-26,2122,23342
49c33d7c-ef04-43a7-bbd0-783489c64849,2025-09-06,1757,19327


In [19]:
USER_COLS = [
    "RECIPIENT_ID",
    "COUNTRY",
    "REGION",
    "LATEST_CLICK_CLIENT_TYPE",
    "LATEST_CLICK_CLIENT_NAME",
    "LATEST_CLICK_CLIENT_OS_FAMILY",
    "TOTAL_ORDERS_VALUE",
    "AVG_ORDER_VALUE",
    "LAST_ORDER_VALUE",
    "MONTHS_SINCE_FIRST_ACTIVE",
    "CLICK",
    "FIRST_UTM_SOURCE", "FIRST_UTM_CONTENT", "FIRST_UTM_CAMPAIGN",
    'LAST_UTM_SOURCE', 'LAST_UTM_CONTENT', 'LAST_UTM_CAMPAIGN',
    "CITY",
    "TIMEZONE"
]
VARIATION_COLS = [
   'Q1_CREATIVE', 'Q2_CREATIVE', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q1_SBL', 'Q2_SBL' 
]

CATEGORICAL_COLS = [
    "COUNTRY",
    "REGION",
    "CITY",
    "TIMEZONE",
    "LATEST_CLICK_CLIENT_TYPE",
    "LATEST_CLICK_CLIENT_NAME",
    "LATEST_CLICK_CLIENT_OS_FAMILY",
    "FIRST_UTM_SOURCE", "FIRST_UTM_CONTENT", "FIRST_UTM_CAMPAIGN",
    'LAST_UTM_SOURCE', 'LAST_UTM_CONTENT', 'LAST_UTM_CAMPAIGN',
    'Q1_CREATIVE', 'Q2_CREATIVE', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q1_SBL', 'Q2_SBL' 
]

NUMERICAL_COLS = [
    "TOTAL_ORDERS_VALUE",
    "AVG_ORDER_VALUE",
    "LAST_ORDER_VALUE",
    "MONTHS_SINCE_FIRST_ACTIVE"
]
COLS = CATEGORICAL_COLS + NUMERICAL_COLS

### Debugging

In [None]:
from metrics import hit_rate_at_k, mrr_at_k
from models import get_model, get_pooled_dataset

users_df.head()
train_data = get_pooled_dataset(users_df, pos_neg_ratio=1)

Unnamed: 0,EXPERIMENT_ID,VARIATION_ID,MONTH,DAY,HOUR,LAST_CLICKED_VARIATION_ID,RECIPIENT_ID,CITY,COUNTRY,REGION,...,FIRST_UTM_CONTENT,FIRST_UTM_CAMPAIGN,LAST_UTM_SOURCE,LAST_UTM_CONTENT,LAST_UTM_CAMPAIGN,CLICK_COUNT,CLICK,EXPERIMENT_DATE,FIRST_ACTIVE_TS_dt,MONTHS_SINCE_FIRST_ACTIVE
0,e697ab50-0abb-42d3-92a0-43f1ed597476,0cd88d89-da2d-4ff2-a223-189af1cdf1b8,8,29,14,,01JJNJR6V68XNN1WTS6710V6GZ,Auckland,New Zealand,Auckland,...,UNK,724059624219,UNK,UNK,UNK,2,1,2025-08-29,2025-01-28 04:28:50,9.0
2,44d26695-cdf2-41a4-b161-393fdaf964bc,2968d945-ec27-4bbb-8a6b-4a92db7266de,7,26,13,,01JWDZHGKRGH94A8X8BRCCWV58,Uzès,France,Gard,...,120223221813940318,tRoas 454 ASC 1.5 Campaign - Full Coverage Com...,Klaviyo,UNK,em - new just dropped 445 lace bra Lilas - Thu...,1,1,2025-07-26,2025-05-29 12:16:17,5.0
3,e697ab50-0abb-42d3-92a0-43f1ed597476,71995c0d-3a29-4098-aec0-505734948e83,8,29,12,,01JVZ51X6G5MBB5WCNJ05VBZ7D,Marana,United States,Arizona,...,120207224081300318,Bid Caps Bra,Klaviyo,UNK,em - End of summer sale 457 27.29 - Wed 17 Sep...,1,1,2025-08-29,2025-04-29 20:38:19,6.0
4,91eee220-fee7-488b-952a-c96aa8e493db,5238222f-cb69-4156-871e-321b669fe1e5,8,14,12,,01K1Y919AGBK33GCT4TFTVZJQP,Melbourne,Australia,Victoria,...,The Comfort Shaping Bra,true,UNK,UNK,UNK,1,1,2025-08-14,2025-08-05 23:28:08,2.0
5,00bb26ff-6fe3-4465-ac77-12bfc33aa6df,2ebafe74-b5a8-4ff9-890e-a76f96db1741,7,17,13,,01JW87NBGVQHE7XD4YH2NNQR1E,Wesley Chapel,United States,Florida,...,UNK,UNK,UNK,UNK,UNK,1,1,2025-07-17,2025-05-27 06:56:58,5.0


### Ranking Results

**Grid Search**

In [20]:
mlflow.set_experiment("RankerGridSearch")


# Define grid of hyperparameters
param_grid = {
    "learning_rate": [0.03, 0.1, 0.5],
    "depth": [3, 6, 10],
    "iterations": [200, 500, 1000],
    # "l2_leaf_reg": [1, 3, 5]
}

grid = list(ParameterGrid(param_grid))

for i, params in enumerate(grid):
    print(f"Running grid search {i+1} of {params}")
    with mlflow.start_run(run_name=f"ranker_grid_search_{i}"):
        # Ensure experiment_date is datetime
        users_df["EXPERIMENT_DATE"] = pd.to_datetime(users_df["EXPERIMENT_DATE"])

        variations_per_experimen_df = users_df[
            ["EXPERIMENT_ID", "VARIATION_ID"]
        ].drop_duplicates()

        # Sort unique experiments by date
        experiment_order = (
            users_df[["EXPERIMENT_ID", "EXPERIMENT_DATE"]]
            .sort_values("EXPERIMENT_DATE")
            .drop_duplicates()
            .reset_index(drop=True)
        )

        n_last_test = 4
        n_last_val = 2
        n_last_train = n_last_test + n_last_val

        # Get last two for test, others for train
        test_experiments = experiment_order.tail(n_last_test)["EXPERIMENT_ID"]
        val_experiments = experiment_order.iloc[-n_last_train:-n_last_test][
            "EXPERIMENT_ID"
        ]
        train_experiments = experiment_order.iloc[:-n_last_train]["EXPERIMENT_ID"]

        print(f"Number of train experiments: {len(train_experiments)}")
        print(f"Number of validation experiments: {len(val_experiments)}")
        print(f"Number of test experiments: {len(test_experiments)}")
        assert len(train_experiments) + len(val_experiments) + len(
            test_experiments
        ) == len(experiment_order)
        # Join users_df with variation_df on EXPERIMENT_ID and VARIATION_ID

        users_all_variations = pd.merge(
            users_df.drop(columns=["VARIATION_ID"]),
            variations_per_experimen_df,
            how="left",
            left_on="EXPERIMENT_ID",
            right_on="EXPERIMENT_ID",
        )
        # Assign the click to the correct variation
        users_all_variations["CLICK"] = (
            users_all_variations.set_index(
                ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
            )
            .index.map(
                users_df.drop_duplicates(
                    ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
                ).set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])["CLICK"]
            )
            .fillna(0.5)
        )

        users_all_variations = users_all_variations.merge(
            variations_df,
            left_on=["VARIATION_ID"],
            right_on=["VARIATION_ID"],
            how="left",
        )

        # Select rows for train/test
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_experiments)
        ]
        # For validation set
        val_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(val_experiments)
        ]
        val_df = val_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        # For test set
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_experiments)
        ]
        test_df = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio=1)
        val_pool, _, X_val, y_val = get_pooled_dataset(val_df)
        cat_features = train_pool.get_cat_feature_indices()

        ranker = get_model("ranker", cat_features, params)
        ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(val_df)
        scores = ranker.predict(X_test)

        preds = val_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores, GT=y_test
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = val_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        mrr_at_5, uplift_mrr_at_5 = mrr_at_k(preds, y_true, 5)
        hit_rate_at_1, uplift_hit_rate_at_1 = hit_rate_at_k(preds, y_true, 1)

        mlflow.log_metric("mrr_at_5", mrr_at_5)
        mlflow.log_metric("mrr_at_5_uplift", uplift_mrr_at_5)
        mlflow.log_metric("hit_rate_at_1", hit_rate_at_1)
        mlflow.log_metric("hit_rate_at_1_uplift", uplift_hit_rate_at_1)

Running grid search 1 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.03}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 2 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.1}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 3 of {'depth': 3, 'iterations': 200, 'learning_rate': 0.5}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 4 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.03}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 5 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.1}
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
Running grid search 6 of {'depth': 3, 'iterations': 500, 'learning_rate': 0.5}
Number of train experiments: 20
Nu

**PN Ratio**

In [12]:
mlflow.set_experiment("PN Ratio")

# Define grid of hyperparameters
model_params = {
    "learning_rate": 0.5,
    "depth": 6,
    "iterations": 1000,
    # "l2_leaf_reg": [1, 3, 5]
}

pn_grid = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

for i, pn_ratio in enumerate(pn_grid):
    print(f"Running grid search {i+1} of pn_ratio: {pn_ratio}")
    with mlflow.start_run(run_name=f"ranker_pn_search_{i}"):
        # Ensure experiment_date is datetime
        users_df["EXPERIMENT_DATE"] = pd.to_datetime(users_df["EXPERIMENT_DATE"])

        variations_per_experimen_df = users_df[
            ["EXPERIMENT_ID", "VARIATION_ID"]
        ].drop_duplicates()

        # Sort unique experiments by date
        experiment_order = (
            users_df[["EXPERIMENT_ID", "EXPERIMENT_DATE"]]
            .sort_values("EXPERIMENT_DATE")
            .drop_duplicates()
            .reset_index(drop=True)
        )

        n_last_test = 4
        n_last_val = 2
        n_last_train = n_last_test + n_last_val

        # Get last two for test, others for train
        test_experiments = experiment_order.tail(n_last_test)["EXPERIMENT_ID"]
        val_experiments = experiment_order.iloc[-n_last_train:-n_last_test][
            "EXPERIMENT_ID"
        ]
        train_experiments = experiment_order.iloc[:-n_last_train]["EXPERIMENT_ID"]

        print(f"Number of train experiments: {len(train_experiments)}")
        print(f"Number of validation experiments: {len(val_experiments)}")
        print(f"Number of test experiments: {len(test_experiments)}")
        assert len(train_experiments) + len(val_experiments) + len(
            test_experiments
        ) == len(experiment_order)
        # Join users_df with variation_df on EXPERIMENT_ID and VARIATION_ID

        users_all_variations = pd.merge(
            users_df.drop(columns=["VARIATION_ID"]),
            variations_per_experimen_df,
            how="left",
            left_on="EXPERIMENT_ID",
            right_on="EXPERIMENT_ID",
        )
        # Assign the click to the correct variation
        users_all_variations["CLICK"] = (
            users_all_variations.set_index(
                ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
            )
            .index.map(
                users_df.drop_duplicates(
                    ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
                ).set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])["CLICK"]
            )
            .fillna(0.5)
        )

        users_all_variations = users_all_variations.merge(
            variations_df,
            left_on=["VARIATION_ID"],
            right_on=["VARIATION_ID"],
            how="left",
        )

        # Select rows for train/test
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_experiments)
        ]
        # For validation set
        val_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(val_experiments)
        ]
        val_df = val_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        # For test set
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_experiments)
        ]
        test_df = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio=pn_ratio)
        val_pool, _, X_val, y_val = get_pooled_dataset(val_df)
        cat_features = train_pool.get_cat_feature_indices()

        ranker = get_model("ranker", cat_features, model_params)
        ranker.fit(train_pool, eval_set=val_pool, use_best_model=True)

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(val_df)
        scores = ranker.predict(X_test)

        preds = val_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores, GT=y_test
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = val_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        mrr_at_5, uplift_mrr_at_5 = mrr_at_k(preds, y_true, 5)
        hit_rate_at_1, uplift_hit_rate_at_1 = hit_rate_at_k(preds, y_true, 1)

        mlflow.log_metric("mrr_at_5", mrr_at_5)
        mlflow.log_metric("mrr_at_5_uplift", uplift_mrr_at_5)
        mlflow.log_metric("hit_rate_at_1", hit_rate_at_1)
        mlflow.log_metric("hit_rate_at_1_uplift", uplift_hit_rate_at_1)

2025/10/22 19:57:40 INFO mlflow.tracking.fluent: Experiment with name 'PN Ratio' does not exist. Creating a new experiment.


Running grid search 1 of pn_ratio: 0
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  176967
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 2 of pn_ratio: 1
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  354542
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 3 of pn_ratio: 2
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  532277
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 4 of pn_ratio: 3
Number of train experiments: 20
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  709938
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 5 of pn_ratio: 4

Training has stopped (degenerate solution on iteration 557, probably too small l2-regularization, try to increase it)


The size of the dataset is:  24445
Running grid search 7 of pn_ratio: 6
Number of train experiments: 21
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  1242822
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 8 of pn_ratio: 7
Number of train experiments: 21
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  1420560
The size of the dataset is:  24445
The size of the dataset is:  24445
Running grid search 9 of pn_ratio: 8
Number of train experiments: 21
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  1597962
The size of the dataset is:  24445


Training has stopped (degenerate solution on iteration 624, probably too small l2-regularization, try to increase it)


The size of the dataset is:  24445
Running grid search 10 of pn_ratio: 9
Number of train experiments: 21
Number of validation experiments: 2
Number of test experiments: 4
The size of the dataset is:  1775746
The size of the dataset is:  24445


Training has stopped (degenerate solution on iteration 576, probably too small l2-regularization, try to increase it)


The size of the dataset is:  24445


### Train catboost via expanding window approach

In [12]:
variations_per_experimen_df = users_df[
    ["EXPERIMENT_ID", "VARIATION_ID"]
].drop_duplicates()

users_all_variations = pd.merge(
    users_df.drop(columns=["VARIATION_ID"]),
    variations_per_experimen_df,
    how="left",
    left_on="EXPERIMENT_ID",
    right_on="EXPERIMENT_ID",
)
# Assign the click to the correct variation
users_all_variations["CLICK"] = (
    users_all_variations.set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])
    .index.map(
        users_df.drop_duplicates(
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]
        ).set_index(["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"])["CLICK"]
    )
    .fillna(0.5)
)

users_all_variations = users_all_variations.merge(
    variations_df,
    left_on=["VARIATION_ID"],
    right_on=["VARIATION_ID"],
    how="left",
)

In [16]:
from sklearn.model_selection import TimeSeriesSplit

# Start an mlflow experiment
mlflow.set_experiment("Catboost_Ranker")

pos_neg_ratio = 1
experiment_name = f"pn_ratio_{pos_neg_ratio}_all_feats"

with mlflow.start_run(run_name=f"{experiment_name}"):
    tscv = TimeSeriesSplit(n_splits=5, test_size=4)
    idx = (
        users_df[["EXPERIMENT_ID", "EXPERIMENT_DATE"]]
        .sort_values("EXPERIMENT_DATE")
        .drop_duplicates()
        .reset_index(drop=True)
    )


    for train_idx, test_idx in tscv.split(idx):
        print(f"Train idx: {train_idx}, Test idx: {test_idx}")
        train_idx = idx.iloc[train_idx]["EXPERIMENT_ID"].values
        test_idx = idx.iloc[test_idx]["EXPERIMENT_ID"].values

        # prepare train data
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_idx)
        ]

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio)

        # prepare test data
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_idx)
        ]
        test_df = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"]).filter(
            lambda g: g["CLICK"].max() == 1
        )

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(test_df)
        cat_features = train_pool.get_cat_feature_indices()

        # Train CatBoost model

        print("Training CatBoost model...")
        model = get_model("ranker", cat_features)
        model.fit(train_pool)

        scores = model.predict(X_test)
        preds = test_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores, GT=y_test
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = test_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        hit_rate, uplift_hit_rate = hit_rate_at_k(preds, y_true, k=1)
        mrr, uplift_mrr = mrr_at_k(preds, y_true, 5)

        # Gather the metrics for this split. You could add more metrics if needed.
        mlflow.log_metric("avg_hit_rate_at_1", hit_rate, step=len(train_idx))
        mlflow.log_metric("avg_mrr_at_5", mrr, step=len(train_idx))
        mlflow.log_metric(
            "avg_uplift_hit_rate_at_1", uplift_hit_rate, step=len(train_idx)
        )
        mlflow.log_metric("avg_uplift_mrr_at_5", uplift_mrr, step=len(train_idx))

        print(
            "hit_rate:",
            hit_rate,
            "uplift_hit_rate:",
            uplift_hit_rate,
            "mrr:",
            mrr,
            "uplift_mrr:",
            uplift_mrr,
        )

Train idx: [0 1 2 3 4 5], Test idx: [6 7 8 9]
Training CatBoost model...
hit_rate: 0.1975163715843296 uplift_hit_rate: -6.663854067155237 mrr: 0.4609974182112353 uplift_mrr: -2.468728386497536
Train idx: [0 1 2 3 4 5 6 7 8 9], Test idx: [10 11 12 13]
Training CatBoost model...
hit_rate: 0.18878523085612545 uplift_hit_rate: -5.607384571937282 mrr: 0.4500056619439219 uplift_mrr: -1.4586141728638173
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13], Test idx: [14 15 16 17]
Training CatBoost model...
hit_rate: 0.2081134850303727 uplift_hit_rate: 4.0567425151863405 mrr: 0.46701370542355036 uplift_mrr: 2.265774910266509
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17], Test idx: [18 19 20 21]
Training CatBoost model...
hit_rate: 0.17698985806504441 uplift_hit_rate: -11.505070967477801 mrr: 0.4419719346477357 uplift_mrr: -3.2178245296929013
Train idx: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21], Test idx: [22 23 24 25]
Training CatBoost model...
