In [6]:
import sys
# Extend the import search path; presumably this is the project root that provides
# the `metrics`, `models`, `settings` and `notebooks` modules imported in this cell.
# NOTE(review): hard-coded, machine-specific absolute path — this cell fails on any
# other checkout. Consider deriving the root from the notebook location or
# installing the project as a package.
sys.path.append('/Users/broniy/Desktop/CreativeRank/')
# Enable autoreload (mode 2) so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os 
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GroupKFold
import numpy as np
from metrics import bootstrap_mrr_at_k, mrr_at_k, hit_rate_at_k, mrr_at_k_per_experiment, hit_rate_at_k_per_experiment
from models import get_model, get_pooled_dataset
from settings import DATA_FOLDER
from notebooks.experiment_data import get_experiment_data, COLS, CATEGORICAL_COLS, split_experiment_train_test_val_data

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the experiment dataset once; the grid-search cell splits `data` into
# train and test partitions.
# NOTE(review): execution counts in this notebook are out of order
# (In [6], In [3], In [14]) — restart the kernel and run all cells to confirm
# nothing depends on hidden state.
data = get_experiment_data()

users_df size before removing small experiments: 501008 rows
users_df size after removing small experiments: 500953 rows


In [14]:
# Hyperparameter search space for the ranker: each key maps to the list of
# candidate values that the grid search expands into combinations.
param_grid = dict(
    iterations=[10, 200, 400, 600],
    learning_rate=[0.02, 0.03, 0.05],
    depth=[3, 4, 5],
    l2_leaf_reg=[3, 5, 10, 20],
    random_strength=[0.5, 1, 2],
    bagging_temperature=[0.25, 0.5, 1],
    rsm=[0.6, 0.8, 1.0],  # feature subsampling
    # loss_function=['YetiRank', 'PairLogit'],
    subsample=[0.6, 0.8, 1.0],
)



In [None]:
# Exhaustive grid search over `param_grid`. For every hyperparameter combination:
#   1. open one MLflow run,
#   2. cross-validate on the training partition with GroupKFold,
#   3. log mean/std of MRR@5 and hit-rate@1 across folds,
#   4. refit on the whole training partition and evaluate on the held-out test partition.
mlflow.set_experiment("Fine Grid Search")

# Number of cross-validation folds used for every hyperparameter combination.
n_splits = 5

# The train/test split and the CV group labels do not depend on the hyperparameters,
# so they are computed once instead of once per grid point.
# NOTE(review): assumes split_experiment_train_test_val_data is deterministic and that
# get_pooled_dataset does not mutate the frame it receives — confirm.
train_data, _, test_data = split_experiment_train_test_val_data(data, n_last_test=4, n_last_val=0)

# Group by the (EXPERIMENT_ID, RECIPIENT_ID) pair so that all rows belonging to one
# recipient within one experiment land in the same fold.
groups = train_data[["EXPERIMENT_ID", "RECIPIENT_ID"]].apply(lambda x: f"{x['EXPERIMENT_ID']}_{x['RECIPIENT_ID']}", axis=1)
group_kfold = GroupKFold(n_splits=n_splits)

grid = list(ParameterGrid(param_grid))
for i, model_params in enumerate(grid):
    print(f"Running grid search {i+1} of {len(grid)}: {model_params}")
    with mlflow.start_run(run_name=f"ranker_grid_search_{i}"):
        cv_results = []
        for fold, (train_idx, val_idx) in enumerate(group_kfold.split(train_data, groups=groups)):
            fold_train_data = train_data.iloc[train_idx]
            fold_val_data = train_data.iloc[val_idx]

            # Prepare pools and datasets per fold.
            # NOTE(review): fold training passes pos_neg_ratio=1 while the final fit
            # further down does not — confirm this difference is intended.
            _, fold_train_pool, _, _, _ = get_pooled_dataset(
                fold_train_data, pos_neg_ratio=1, cols=COLS, cat_cols=CATEGORICAL_COLS
            )
            val_df, fold_val_pool, _, X_val, _ = get_pooled_dataset(
                fold_val_data, cols=COLS, cat_cols=CATEGORICAL_COLS
            )
            cat_features = fold_train_pool.get_cat_feature_indices()

            # Fit the model.
            # NOTE(review): the fold's validation pool drives best-iteration selection
            # and is also the data scored just below, so CV metrics may be optimistic.
            ranker = get_model("ranker", cat_features, model_params)
            ranker.fit(fold_train_pool, eval_set=fold_val_pool, use_best_model=True)

            # Validation scoring
            scores = ranker.predict(X_val)
            preds = val_df.assign(
                PRED=scores
            )[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
            y_true = val_df[
                ["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]
            ].query("CLICK==1")

            mrr_at_5 = mrr_at_k(preds, y_true, 5, prefix=f"cvfold{fold}_")
            hit_rate_1 = hit_rate_at_k(preds, y_true, 1, prefix=f"cvfold{fold}_")
            cv_results.append(
                {
                    "fold": fold,
                    "mrr_at_5": mrr_at_5,
                    "hit_rate_1": hit_rate_1,
                }
            )

        # Aggregate the per-fold metrics (population std, numpy's default ddof=0).
        mrr_at_5_values = [fold_result["mrr_at_5"] for fold_result in cv_results]
        hit_rate_1_values = [fold_result["hit_rate_1"] for fold_result in cv_results]

        mlflow.log_metric("cv_mean_mrr_at_5", np.mean(mrr_at_5_values))
        mlflow.log_metric("cv_std_mrr_at_5", np.std(mrr_at_5_values))
        mlflow.log_metric("cv_mean_hit_rate_1", np.mean(hit_rate_1_values))
        mlflow.log_metric("cv_std_hit_rate_1", np.std(hit_rate_1_values))
        print("CV results:", cv_results)

        # Train on all training data and predict on test data.
        _, full_train_pool, _, _, _ = get_pooled_dataset(train_data, cols=COLS, cat_cols=CATEGORICAL_COLS)
        test_df, _, _, X_test, _ = get_pooled_dataset(test_data, cols=COLS, cat_cols=CATEGORICAL_COLS)

        cat_features = full_train_pool.get_cat_feature_indices()
        ranker = get_model("ranker", cat_features, model_params)
        # No eval_set here: the split above requests no validation partition
        # (n_last_val=0), and the last CV fold's validation pool is a subset of the
        # rows being trained on, so using it for best-model selection would leak
        # training data. use_best_model is disabled explicitly because it has no
        # meaning without an eval set.
        ranker.fit(full_train_pool, use_best_model=False)

        scores = ranker.predict(X_test)
        preds = test_df.assign(PRED=scores)[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "PRED"]]
        y_true = test_df[["EXPERIMENT_ID", "RECIPIENT_ID", "VARIATION_ID", "CLICK"]].query("CLICK==1")

        # Return values are intentionally discarded; presumably these helpers record
        # their results under the given prefix — verify in `metrics`.
        mrr_at_k_per_experiment(preds, y_true, 5, prefix="test_")
        hit_rate_at_k_per_experiment(preds, y_true, 1, prefix="test_")
        bootstrap_mrr_at_k(preds, y_true, 5, bootstrap_samples=100, random_state=42, prefix="test_")
 

Running grid search 1 of {'bagging_temperature': 0.25, 'depth': 3, 'iterations': 10, 'l2_leaf_reg': 3, 'learning_rate': 0.02, 'random_strength': 0.5, 'rsm': 0.6, 'subsample': 0.6}
Groupwise loss function. OneHotMaxSize set to 10
0:	learn: 0.1009644	test: 0.0189946	best: 0.0189946 (0)	total: 111ms	remaining: 997ms
1:	learn: 0.1072276	test: 0.0189946	best: 0.0189946 (0)	total: 175ms	remaining: 700ms
2:	learn: 0.1072276	test: 0.0189946	best: 0.0189946 (0)	total: 253ms	remaining: 591ms
3:	learn: 0.1086548	test: 0.0192617	best: 0.0192617 (3)	total: 313ms	remaining: 469ms
4:	learn: 0.1190959	test: 0.0213621	best: 0.0213621 (4)	total: 367ms	remaining: 367ms
5:	learn: 0.1190983	test: 0.0213621	best: 0.0213621 (4)	total: 453ms	remaining: 302ms
6:	learn: 0.1328305	test: 0.0213621	best: 0.0213621 (4)	total: 507ms	remaining: 217ms
7:	learn: 0.1358430	test: 0.0213621	best: 0.0213621 (4)	total: 562ms	remaining: 140ms
8:	learn: 0.1364864	test: 0.0214925	best: 0.0214925 (8)	total: 615ms	remaining: 68.