### Data Preparation

In [None]:
import sys
# Add the project checkout to the module search path so that project-local
# modules can be imported from this notebook (presumably this is what makes
# `from settings import DATA_FOLDER` below resolvable — confirm).
# NOTE(review): this is an absolute, machine-specific path; on any other machine
# the directory will not exist and project imports will fail unless it is edited.
sys.path.append('/Users/broniy/Desktop/CreativeRank/')
%load_ext autoreload
%autoreload 2

import os 
import mlflow
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import ParameterGrid
import numpy as np
from catboost import CatBoostClassifier, CatBoostRanker, Pool
from typing import List
from settings import DATA_FOLDER

In [None]:
# Raw interaction exports: one file for recipients that clicked, one for those
# that did not.
clicked_df = pd.read_csv(DATA_FOLDER / "processed/clicked.csv")
non_clicked_df = pd.read_csv(DATA_FOLDER / "processed/non_clicked_large.csv")

# Per-variation feature table; its `id` column is renamed to VARIATION_ID so it
# can be joined against the interaction data, and every missing value (in every
# column) is replaced by the string "UNK".
variations_df = pd.read_csv(DATA_FOLDER / "processed/feats_df.csv")
variations_df = variations_df.rename(columns={"id": "VARIATION_ID"})
variations_df = variations_df.fillna("UNK")

# NOTE(review): because the blanket fillna above already replaced every NaN in
# `error` with "UNK", this mask is True for all rows and the filter keeps
# everything; only the column drop has an effect. If the intent was to discard
# variations whose feature extraction failed, the filter has to run before the
# fillna — and the direction of the test (rows with vs. without an error value)
# should be confirmed at the same time.
has_error_value = variations_df["error"].notna()
variations_df = variations_df.loc[has_error_value].drop(columns=["error"])



In [None]:
# The raw exports only carry MONTH and DAY columns, so the calendar year of the
# experiments has to be supplied here. Update it when loading another year.
EXPERIMENT_YEAR = 2025

# Experiments with fewer distinct recipients than this are discarded.
MIN_PARTICIPANTS_PER_EXPERIMENT = 100

# Build one interaction table from the clicked and non-clicked exports.
# ignore_index=True gives the combined frame a fresh, unique index instead of
# carrying over the overlapping row labels of the two source frames.
users_df = (
    pd.concat([clicked_df, non_clicked_df], axis=0, ignore_index=True)
    .assign(
        # Binary target: 1 when the row recorded at least one click.
        CLICK=lambda x: (x["CLICK_COUNT"] > 0).astype(int),
        # Assemble the experiment date from its month/day parts.
        EXPERIMENT_DATE=lambda x: pd.to_datetime(
            {
                "year": EXPERIMENT_YEAR,
                "month": x["MONTH"],
                "day": x["DAY"],
            }
        ),
    )
    # Rows without a click count carry no usable label.
    .dropna(subset=["CLICK_COUNT"])
    .drop(columns=["RN"])
    # Missing order values become 0; missing categorical attributes become "UNK".
    .fillna(
        value={
            "TOTAL_ORDERS_VALUE": 0,
            "AVG_ORDER_VALUE": 0,
            "LAST_ORDER_VALUE": 0,
            "COUNTRY": "UNK",
            "REGION": "UNK",
            "LATEST_CLICK_CLIENT_TYPE": "UNK",
            "LATEST_CLICK_CLIENT_NAME": "UNK",
            "LATEST_CLICK_CLIENT_OS_FAMILY": "UNK",
            "FIRST_UTM_SOURCE": "UNK",
            "FIRST_UTM_CONTENT": "UNK",
            "FIRST_UTM_CAMPAIGN": "UNK",
            "LAST_UTM_SOURCE": "UNK",
            "LAST_UTM_CONTENT": "UNK",
            "LAST_UTM_CAMPAIGN": "UNK",
            "CITY": "UNK",
            "TIMEZONE": "UNK",
        }
    )
)

# Convert FIRST_ACTIVE_TS to datetime
users_df["FIRST_ACTIVE_TS_dt"] = pd.to_datetime(users_df["FIRST_ACTIVE_TS"])

# Whole calendar months between first activity and the experiment date.
# The reference point is the row's own EXPERIMENT_DATE rather than the wall
# clock: measuring against "today" made the feature change on every run of the
# notebook and inflated it for older experiments by the time elapsed since they
# were sent.
first_active = users_df["FIRST_ACTIVE_TS_dt"].dt
experiment_date = users_df["EXPERIMENT_DATE"].dt
users_df["MONTHS_SINCE_FIRST_ACTIVE"] = (
    experiment_date.year - first_active.year
) * 12 + (experiment_date.month - first_active.month)

# Keep only interactions whose variation has a row in the feature table, then
# remove exact duplicate rows.
users_df = users_df[users_df["VARIATION_ID"].isin(variations_df["VARIATION_ID"])]
users_df = users_df.drop_duplicates()

# Print the size of users_df before removal
print(f"users_df size before removing small experiments: {users_df.shape[0]} rows")
# Remove experiments with fewer than MIN_PARTICIPANTS_PER_EXPERIMENT distinct recipients
experiment_counts = users_df.groupby("EXPERIMENT_ID")["RECIPIENT_ID"].nunique()
valid_experiments = experiment_counts[
    experiment_counts >= MIN_PARTICIPANTS_PER_EXPERIMENT
].index
users_df = users_df[users_df["EXPERIMENT_ID"].isin(valid_experiments)]
# Print the size of users_df after removal
print(f"users_df size after removing small experiments: {users_df.shape[0]} rows")


users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [None]:
# Per experiment and experiment date: sum of CLICK (rows with a click) and
# count of CLICK (rows with a non-null label).
(
    users_df
    .groupby(["EXPERIMENT_ID", "EXPERIMENT_DATE"])[["CLICK"]]
    .agg(["sum", "count"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CLICK,CLICK
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count
EXPERIMENT_ID,EXPERIMENT_DATE,Unnamed: 2_level_2,Unnamed: 3_level_2
002deaf7-331f-4b5e-866b-f6dad60e4a79,2025-07-28,1355,14905
00bb26ff-6fe3-4465-ac77-12bfc33aa6df,2025-07-17,1787,19657
0ef6d2e9-7601-4df6-a215-83e6e79aa24e,2025-10-06,1293,14223
11c49e5d-21ac-4d6d-88c3-f211562a8e07,2025-09-17,1156,12647
1d6dbba7-dcc5-46f4-a4aa-aef3124a8fcf,2025-09-05,1959,21549
2a3f341e-1807-4eb3-9d8d-202c32d52632,2025-08-25,1075,11825
3ecf34fc-1f15-4b32-970f-4061544da763,2025-07-14,1835,20179
43d750b5-8698-4cf0-9ea2-f705f4f196ed,2025-09-25,1968,21585
44d26695-cdf2-41a4-b161-393fdaf964bc,2025-07-26,2122,23342
49c33d7c-ef04-43a7-bbd0-783489c64849,2025-09-06,1757,19327


In [None]:
# Columns taken from the user/interaction table (includes the RECIPIENT_ID key
# and the CLICK label alongside the user attributes).
USER_COLS = [
    "RECIPIENT_ID",
    "COUNTRY",
    "REGION",
    "LATEST_CLICK_CLIENT_TYPE",
    "LATEST_CLICK_CLIENT_NAME",
    "LATEST_CLICK_CLIENT_OS_FAMILY",
    "TOTAL_ORDERS_VALUE",
    "AVG_ORDER_VALUE",
    "LAST_ORDER_VALUE",
    "MONTHS_SINCE_FIRST_ACTIVE",
    "CLICK",
    "FIRST_UTM_SOURCE",
    "FIRST_UTM_CONTENT",
    "FIRST_UTM_CAMPAIGN",
    "LAST_UTM_SOURCE",
    "LAST_UTM_CONTENT",
    "LAST_UTM_CAMPAIGN",
    "CITY",
    "TIMEZONE",
]

# Columns taken from the variation feature table.
VARIATION_COLS = [
    "Q1_CREATIVE",
    "Q2_CREATIVE",
    "Q3",
    "Q4",
    "Q5",
    "Q6",
    "Q7",
    "Q8",
    "Q9",
    "Q10",
    "Q1_SBL",
    "Q2_SBL",
]

# Categorical features: the categorical user attributes followed by every
# variation column.
CATEGORICAL_COLS = [
    "COUNTRY",
    "REGION",
    "CITY",
    "TIMEZONE",
    "LATEST_CLICK_CLIENT_TYPE",
    "LATEST_CLICK_CLIENT_NAME",
    "LATEST_CLICK_CLIENT_OS_FAMILY",
    "FIRST_UTM_SOURCE",
    "FIRST_UTM_CONTENT",
    "FIRST_UTM_CAMPAIGN",
    "LAST_UTM_SOURCE",
    "LAST_UTM_CONTENT",
    "LAST_UTM_CAMPAIGN",
    *VARIATION_COLS,
]

# Numerical features.
NUMERICAL_COLS = [
    "TOTAL_ORDERS_VALUE",
    "AVG_ORDER_VALUE",
    "LAST_ORDER_VALUE",
    "MONTHS_SINCE_FIRST_ACTIVE",
]

# Full feature list: categorical columns first, numerical columns after.
COLS = [*CATEGORICAL_COLS, *NUMERICAL_COLS]

In [None]:
### Train catboost via expanding window approach
# NOTE(review): get_pooled_dataset, get_model, hit_rate_at_k and mrr_at_k are not
# defined or imported in the cells visible here; they must already be in scope
# (presumably project helpers) before this cell is run.
from sklearn.model_selection import TimeSeriesSplit

# Key that identifies one (experiment, recipient, variation) pair.
PAIR_KEYS = ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID"]

# Label assigned to pairs that have no row in users_df.
# NOTE(review): presumably a neutral "unknown" relevance between no-click (0)
# and click (1) — confirm this is what the ranker is meant to learn from.
UNOBSERVED_CLICK_LABEL = 0.5

# Distinct variations of every experiment.
variations_per_experimen_df = users_df[
    ["EXPERIMENT_ID", "VARIATION_ID"]
].drop_duplicates()

# Pair every user row with every variation of the experiment it belongs to.
users_all_variations = pd.merge(
    users_df.drop(columns=["VARIATION_ID"]),
    variations_per_experimen_df,
    how="left",
    on="EXPERIMENT_ID",
)

# Assign the click to the correct variation: look the label up by
# (experiment, recipient, variation); pairs without a row in users_df get
# UNOBSERVED_CLICK_LABEL.
observed_clicks = users_df.drop_duplicates(PAIR_KEYS).set_index(PAIR_KEYS)["CLICK"]
users_all_variations["CLICK"] = (
    users_all_variations.set_index(PAIR_KEYS)
    .index.map(observed_clicks)
    .fillna(UNOBSERVED_CLICK_LABEL)
)

# Attach the variation features.
users_all_variations = users_all_variations.merge(
    variations_df,
    on=["VARIATION_ID"],
    how="left",
)

# Start an mlflow experiment
mlflow.set_experiment("Catboost_Ranker")

pos_neg_ratio = 1
experiment_name = f"pn_ratio_{pos_neg_ratio}_all_feats"

with mlflow.start_run(run_name=experiment_name):
    tscv = TimeSeriesSplit(n_splits=5, test_size=4)

    # Chronological list of experiments, exactly one row per experiment.
    # Using the earliest date of each experiment guarantees that an experiment
    # whose rows span several days is still a single entry and therefore can
    # never be placed in both the train and the test side of a split.
    # EXPERIMENT_ID is the secondary sort key so ties on the date are ordered
    # deterministically.
    idx = (
        users_df.groupby("EXPERIMENT_ID", as_index=False)["EXPERIMENT_DATE"]
        .min()
        .sort_values(["EXPERIMENT_DATE", "EXPERIMENT_ID"])
        .reset_index(drop=True)
    )

    for train_pos, test_pos in tscv.split(idx):
        print(f"Train idx: {train_pos}, Test idx: {test_pos}")
        # Translate positional split indices into experiment ids.
        train_idx = idx.iloc[train_pos]["EXPERIMENT_ID"].values
        test_idx = idx.iloc[test_pos]["EXPERIMENT_ID"].values

        # prepare train data
        train_df = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(train_idx)
        ]

        train_pool, _, X_train, y_train = get_pooled_dataset(train_df, pos_neg_ratio)

        # prepare test data: keep only (experiment, recipient) groups that
        # contain a click. The group-wise maximum is computed with a vectorised
        # transform instead of a Python callback per group; the selected rows
        # and their order are the same.
        test_df_raw = users_all_variations[
            users_all_variations["EXPERIMENT_ID"].isin(test_idx)
        ]
        group_max_click = test_df_raw.groupby(["EXPERIMENT_ID", "RECIPIENT_ID"])[
            "CLICK"
        ].transform("max")
        test_df = test_df_raw[group_max_click == 1]

        test_pool, test_group_ids, X_test, y_test = get_pooled_dataset(test_df)
        cat_features = train_pool.get_cat_feature_indices()

        # Train CatBoost model

        print("Training CatBoost model...")
        model = get_model("ranker", cat_features)
        model.fit(train_pool)

        scores = model.predict(X_test)
        # NOTE(review): the scores are attached positionally, so this relies on
        # get_pooled_dataset returning X_test in the same row order as this
        # sort by (EXPERIMENT_ID, RECIPIENT_ID) — confirm against the helper.
        preds = test_df.sort_values(["EXPERIMENT_ID", "RECIPIENT_ID"]).assign(
            PRED=scores
        )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        # Ground truth: the clicked variation(s) of every test recipient.
        y_true = test_df[
            ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
        ].query("CLICK==1")

        hit_rate, uplift_hit_rate = hit_rate_at_k(preds, y_true, k=1)
        mrr, uplift_mrr = mrr_at_k(preds, y_true, 5)

        # Log the metrics of this split; the step is the number of experiments
        # in the training window, so the series shows the expanding window.
        n_train_experiments = len(train_idx)
        mlflow.log_metric("avg_hit_rate_at_1", hit_rate, step=n_train_experiments)
        mlflow.log_metric("avg_mrr_at_5", mrr, step=n_train_experiments)
        mlflow.log_metric(
            "avg_uplift_hit_rate_at_1", uplift_hit_rate, step=n_train_experiments
        )
        mlflow.log_metric("avg_uplift_mrr_at_5", uplift_mrr, step=n_train_experiments)

        print(
            "hit_rate:",
            hit_rate,
            "uplift_hit_rate:",
            uplift_hit_rate,
            "mrr:",
            mrr,
            "uplift_mrr:",
            uplift_mrr,
        )
