# Full Experiment with CatBoost Models

This notebook runs a complete experiment using all available features with both CatBoostClassifier and CatBoostRanker models.

In [2]:
import os

# Expose only GPU indices 0 and 1 to CUDA-based libraries used by this kernel.
VISIBLE_GPU_IDS = ("0", "1")
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(VISIBLE_GPU_IDS)

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import time

from lavka_recsys import Config, Experiment, setup_logging

In [4]:
# Set up logging via the lavka_recsys helper. The call returns the package
# logger, which the notebook echoes as this cell's output.
setup_logging()

<Logger lavka_recsys (DEBUG)>

## Baseline Experiments

First, let's see how well a model that outputs random predictions does.

In [None]:
# Random baseline: the only feature generator is 'random_noise' and the model
# type is 'random_baseline'.
config = (
    Config.load('default_config.yaml')
    .set('feature_generators', ['random_noise'])
    .set('model.type', 'random_baseline')
)
random_experiment = Experiment("random_baseline", config)

# Time setup + run together. perf_counter() is monotonic, so the elapsed value
# cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
random_experiment.setup()
random_results = random_experiment.run()
random_time = time.perf_counter() - start_time

# Report every metric returned by the run.
print(f"\nRandom Baseline Results (completed in {random_time:.2f} seconds):")
for metric, value in random_results['metrics'].items():
    print(f"  {metric}: {value:.6f}")

In [None]:
# Uncomment to create a Kaggle submission from the random baseline:
# random_experiment.create_submission()

Kaggle Public Score = `0.30071`.

Now let's try a baseline that ranks products purely by popularity (i.e. `product_total_purchases`).

In [None]:
# Popularity baseline: the 'single_feature' model is pointed at
# 'product_total_purchases' (from the 'product_stats' generator) with desc=True.
config = (
    Config.load('default_config.yaml')
    .set('feature_generators', ['product_stats'])
    .set('model.type', 'single_feature')
    .set('model.config.single_feature.feature_name', 'product_total_purchases')
    .set('model.config.single_feature.desc', True)
)
pop_experiment = Experiment("popularity_baseline", config)

# Time setup + run together. perf_counter() is monotonic, so the elapsed value
# cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
pop_experiment.setup()
pop_results = pop_experiment.run()
pop_time = time.perf_counter() - start_time

# Report every metric returned by the run.
print(f"\nPopularity Baseline Results (completed in {pop_time:.2f} seconds):")
for metric, value in pop_results['metrics'].items():
    print(f"  {metric}: {value:.6f}")

In [None]:
# Uncomment to create a Kaggle submission from the popularity baseline:
# pop_experiment.create_submission()

Kaggle Public Score = `0.33188`.

## CatBoost Classifier Experiment

Run an experiment with CatBoost Classifier.

In [None]:
# Feature generators passed to the classifier config.
# Entries that are commented out are currently disabled.
classifier_feature_generators = [
    "source_type",
    "count_purchase_user_product",
    "count_purchase_user_store",
    # "count_purchase_user_category",
    "ctr_product",
    "cart_to_purchase_rate",
    "purchase_view_ratio",
    "recency_user_product",
    "recency_user_category",
    "user_stats",
    "product_stats",
    "store_stats",
    "city_stats",
    "product_temporal_patterns",
    "recency_user_store",
    "time_features_cycl",
    "time_window_user_product",
    "session_features",
    "frequency_features",
    "product_popularity_trend",
    "cross_features",
    "user_segments",
    "russian_holiday",
    # Collaborative filtering
    "memory-based-cf",
    "npmi-cf",
    "puresvd-cf",
    "svd-cf",
    # Implicit item2item
    "bpr-popular",
    # NLP features
    "product_embeddings",
    "category_embeddings",
    # "user_product_similarity",  # Weighted similarity between user history and product
    "text_similarity_cluster",  # Product clusters based on text similarity
    "text_diversity_features",  # How different a product is from user's history
]

# Classifier configuration: default config with the model type, target and
# feature generators overridden.
config = (
    Config.load('default_config.yaml')
    .set('model.type', 'catboost_classifier')
    .set('target', 'CartUpdate_Purchase_vs_View')
    .set('feature_generators', classifier_feature_generators)
)

In [None]:
# Build, set up and run the classifier experiment from the `config` defined in
# the previous cell.
classifier_experiment = Experiment("catboost_class_added_features", config)
classifier_experiment.setup()
classifier_results = classifier_experiment.run()

# Report every metric returned by the run.
print("\nCatBoost Classifier Results:")
for metric_name, metric_value in classifier_results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.6f}")

In [None]:
# List the 30 highest-importance classifier features, when the run reported any.
if 'feature_importance' in classifier_results:
    print("\nTop Important Features:")
    ranked_importances = sorted(
        classifier_results['feature_importance'].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_features = ranked_importances[:30]
    for rank, (feature, importance) in enumerate(top_features, start=1):
        print(f"{rank:>3}:\t {feature:<30}: {importance:.6f}")

In [None]:
# Create the submission from the classifier experiment run above.
classifier_experiment.create_submission()

Kaggle Public Score: `NDCG@10 = 0.40962`.

## CatBoost Ranker Experiment

Run an experiment with CatBoost Ranker.

### Basic

In [5]:
# Feature generators passed to the basic ranker config.
# Entries that are commented out are currently disabled.
ranker_feature_generators = [
    "time_windows",
    "random_noise",
    "source_type",
    "count_purchase_user_product",
    "count_purchase_user_store",
    "count_purchase_user_category",
    "recency_user_product",
    "recency_user_store",
    "user_stats",
    "product_stats",
    "store_stats",
    "city_stats",
    "product_temporal_patterns",
    "category_temporal_patterns",
    "time_features_cycl_v2",
    "session_features_v2",
    "frequency_features",
    "product_popularity_trend",
    "category_popularity_trend",
    "cross_features",
    "user_segments",
    "russian_holiday",
    # Collaborative filtering
    "memory-based-cf",
    "npmi-cf",
    "puresvd-cf",
    "svd-cf",
    # Implicit item2item
    "bpr-popular",
    # NLP features
    "product_embeddings",
    "category_embeddings",
    # "user_product_similarity",  # Weighted similarity between user history and product
    "text_similarity_cluster",  # Product clusters based on text similarity
    "text_diversity_features",  # How different a product is from user's history
]

# Basic ranker configuration: target cleaning and feature selection are
# switched off, and the CatBoost ranker hyper-parameters are set explicitly.
# NOTE(review): 'target' is set to a string and then the nested key
# 'target.cleaning.enabled' is set — confirm Config.set keeps both values.
config = (
    Config.load('default_config.yaml')
    .set('target', 'Weighted')
    .set('target.cleaning.enabled', False)
    .set('feature_selector.enabled', False)
    .set('model.type', 'catboost_ranker')
    .set('model.config.catboost_ranker.loss_function', 'YetiRankPairwise')
    .set('model.config.catboost_ranker.eval_metric', 'NDCG:top=10')
    .set('model.config.catboost_ranker.task_type', 'GPU')
    .set('model.config.catboost_ranker.early_stopping_rounds', 50)
    .set('model.config.catboost_ranker.learning_rate', 0.1)
    .set('model.config.catboost_ranker.random_strength', 50)
    .set('model.config.catboost_ranker.iterations', 500)
    .set('model.config.catboost_ranker.score_function', "Cosine")
    .set('feature_generators', ranker_feature_generators)
)

In [7]:
# Build, set up and run the basic ranker experiment from the `config` defined
# in the previous cell.
ranker_experiment = Experiment("catboost_ranker", config)
ranker_experiment.setup()
ranker_results = ranker_experiment.run()

# Report every metric returned by the run.
print("\nCatBoost Ranker Results:")
for metric_name, metric_value in ranker_results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.6f}")

# Create the submission from this run.
ranker_experiment.create_submission()

2025-05-21 22:24:26,392 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Initialized experiment: catboost_ranker_83ba29
2025-05-21 22:24:26,415 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Config saved: results/catboost_ranker_83ba29_config.json
2025-05-21 22:24:26,423 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Setting up experiment environment...
2025-05-21 22:24:26,425 - lavka_recsys.DataLoader - INFO - Loading training data from ../../data/lavka/train.parquet
2025-05-21 22:24:26,586 - lavka_recsys.DataLoader - INFO - Loading test data from ../../data/lavka/test.parquet
2025-05-21 22:24:26,764 - lavka_recsys.DataLoader - INFO - Holdout Split:
2025-05-21 22:24:26,792 - lavka_recsys.DataLoader - INFO -   train:	2022-12-31 18:46:42 → 2024-01-03 17:31:52 (15_070_276 rows, 367 days)
2025-05-21 22:24:26,796 - lavka_recsys.DataLoader - INFO -   holdout:	2024-01-03 17:56:48 → 2024-02-02 17:34:51 (1_438_338 rows, 29 days)
2025-05-21 22:24:26,796 - lavka

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 22:26:18,080 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Training completed in 96.82s
2025-05-21 22:26:18,094 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Evaluating on holdout.
2025-05-21 22:26:18,095 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular, product_embeddings, category_embeddings, text_similarity_cluster, text_diversity_features
2025-05-21 22:26:21,964 - lavka_recsys.CachedFeatureFactory - INFO - Using cached 'time_window

Kaggle:
* Public: 0.41585
* Private: 0.41611

### Feature selection

Now let's try feature selection:

In [None]:
# Re-run the ranker with feature selection enabled (correlation threshold 0.9).
#
# The derived config and the results get their own names instead of being
# written back to `config` / `ranker_results`. Rebinding those made the basic
# ranker cells non-idempotent and let the feature-importance comparison further
# down pick up whichever experiment happened to run last.
# NOTE(review): if Config.set mutates in place rather than returning a copy,
# `config` is still modified here — confirm against Config's implementation.
featselect_config = (
    config
    .set('feature_selector.enabled', True)
    .set('feature_selector.correlation_threshold', 0.9)
)

ranker_featselect_experiment = Experiment("catboost_ranker_featselect", featselect_config)
ranker_featselect_experiment.setup()

ranker_featselect_results = ranker_featselect_experiment.run()

# Report every metric returned by the run.
print("\nCatBoost Ranker With Feature Selection Results:")
for metric, value in ranker_featselect_results['metrics'].items():
    print(f"  {metric}: {value:.6f}")

ranker_featselect_experiment.create_submission()

2025-05-21 22:37:01,310 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Initialized experiment: catboost_ranker_featselect_6e2bdd
2025-05-21 22:37:01,319 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Config saved: results/catboost_ranker_featselect_6e2bdd_config.json
2025-05-21 22:37:01,321 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Setting up experiment environment...
2025-05-21 22:37:01,322 - lavka_recsys.DataLoader - INFO - Loading training data from ../../data/lavka/train.parquet


2025-05-21 22:37:01,553 - lavka_recsys.DataLoader - INFO - Loading test data from ../../data/lavka/test.parquet
2025-05-21 22:37:01,824 - lavka_recsys.DataLoader - INFO - Holdout Split:
2025-05-21 22:37:01,852 - lavka_recsys.DataLoader - INFO -   train:	2022-12-31 18:46:42 → 2024-01-03 17:31:52 (15_070_276 rows, 367 days)
2025-05-21 22:37:01,856 - lavka_recsys.DataLoader - INFO -   holdout:	2024-01-03 17:56:48 → 2024-02-02 17:34:51 (1_438_338 rows, 29 days)
2025-05-21 22:37:01,857 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Setup complete.
2025-05-21 22:37:01,858 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Starting experiment run...
2025-05-21 22:37:02,176 - lavka_recsys.DataLoader - INFO - Validation Split:
2025-05-21 22:37:02,198 - lavka_recsys.DataLoader - INFO -   train_history:	2022-12-31 18:46:42 → 2023-11-04 17:16:23 (12_082_523 rows, 307 days)
2025-05-21 22:37:02,202 - lavka_recsys.DataLoader - INFO -   train_target:	2023-11-0

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 22:38:43,704 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Training completed in 88.22s
2025-05-21 22:38:43,712 - lavka_recsys.Experiment(catboost_ranker_featselect_6e2bdd) - INFO - Evaluating on holdout.
2025-05-21 22:38:43,713 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular, product_embeddings, category_embeddings, text_similarity_cluster, text_diversity_features
2025-05-21 22:38:47,094 - lavka_recsys.CachedFeatureFactory - INFO - Usi



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 22:40:29,732 - lavka_recsys.Experiment(catboost_ranker_83ba29) - INFO - Training completed in 75.35s
2025-05-21 22:40:29,744 - lavka_recsys.DataLoader - INFO - Holdout data merged back into training data.
2025-05-21 22:40:29,747 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular, product_embeddings, category_embeddings, text_similarity_cluster, text_diversity_features
2025-05-21 22:40:30,494 - lavka_recsys.CachedFeatureFactory - INFO - Using cached 'time_windows'

index,request_id
u32,u64
190848,17920534181872757907
280356,17920534181872757907
168750,3207820721621783769
227713,14005521114543408356
277950,14005521114543408356
…,…
86112,3238852072925852923
16108,16986836868971557517
268591,18082910854666416347
220254,16227801852798084958


Kaggle:
* Public Score: 0.41621
* Private score: 0.41715

So feature selection improved the score (public: 0.41585 → 0.41621, private: 0.41611 → 0.41715). Let's try increasing its strength (a `correlation_threshold` of 0.8 instead of 0.9).

In [12]:
# Same experiment with a stricter correlation threshold (0.8 instead of 0.9).
#
# Config, experiment handle and results get their own names so this run does
# not overwrite `config`, `ranker_results` or the handle of the 0.9 run.
# Reusing those names made earlier cells non-idempotent and hid which
# experiment later cells were reading.
# NOTE(review): if Config.set mutates in place rather than returning a copy,
# `config` is still modified here — confirm against Config's implementation.
strict_featselect_config = (
    config
    .set('feature_selector.enabled', True)
    .set('feature_selector.correlation_threshold', 0.8)
)

ranker_featselect_strict_experiment = Experiment(
    "catboost_ranker_featselect", strict_featselect_config
)
ranker_featselect_strict_experiment.setup()

ranker_featselect_strict_results = ranker_featselect_strict_experiment.run()

# Report every metric returned by the run.
print("\nCatBoost Ranker With Feature Selection Results:")
for metric, value in ranker_featselect_strict_results['metrics'].items():
    print(f"  {metric}: {value:.6f}")

ranker_featselect_strict_experiment.create_submission()

2025-05-21 22:49:10,185 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Initialized experiment: catboost_ranker_featselect_a2186b
2025-05-21 22:49:10,195 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Config saved: results/catboost_ranker_featselect_a2186b_config.json
2025-05-21 22:49:10,217 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Setting up experiment environment...
2025-05-21 22:49:10,219 - lavka_recsys.DataLoader - INFO - Loading training data from ../../data/lavka/train.parquet
2025-05-21 22:49:10,472 - lavka_recsys.DataLoader - INFO - Loading test data from ../../data/lavka/test.parquet
2025-05-21 22:49:10,703 - lavka_recsys.DataLoader - INFO - Holdout Split:
2025-05-21 22:49:10,731 - lavka_recsys.DataLoader - INFO -   train:	2022-12-31 18:46:42 → 2024-01-03 17:31:52 (15_070_276 rows, 367 days)
2025-05-21 22:49:10,735 - lavka_recsys.DataLoader - INFO -   holdout:	2024-01-03 17:56:48 → 2024-02-02 17:34:51 (1



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 22:50:41,915 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Training completed in 76.94s
2025-05-21 22:50:41,918 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Evaluating on holdout.
2025-05-21 22:50:41,919 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular, product_embeddings, category_embeddings, text_similarity_cluster, text_diversity_features
2025-05-21 22:50:45,286 - lavka_recsys.CachedFeatureFactory - INFO - Usi



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 22:52:11,317 - lavka_recsys.Experiment(catboost_ranker_featselect_a2186b) - INFO - Training completed in 59.92s
2025-05-21 22:52:11,328 - lavka_recsys.DataLoader - INFO - Holdout data merged back into training data.
2025-05-21 22:52:11,330 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular, product_embeddings, category_embeddings, text_similarity_cluster, text_diversity_features
2025-05-21 22:52:12,071 - lavka_recsys.CachedFeatureFactory - INFO - Using cached 'ti

index,request_id
u32,u64
190848,17920534181872757907
280356,17920534181872757907
260253,18076993598489998182
208560,18076993598489998182
92135,15466705642107833257
…,…
52788,1328861593636280476
72658,1328861593636280476
263989,1328861593636280476
212429,1328861593636280476


Kaggle:
* Public Score: 0.41550
* Private score: 0.41572

Okay, overly aggressive feature elimination decreases the score, so the threshold has to be tuned carefully.

### No text features

In [15]:
# Ablation: the basic ranker setup with every text/NLP feature generator
# disabled (the commented-out entries at the end of the list).
notext_feature_generators = [
    "time_windows",
    "random_noise",
    "source_type",
    "count_purchase_user_product",
    "count_purchase_user_store",
    "count_purchase_user_category",
    "recency_user_product",
    "recency_user_store",
    "user_stats",
    "product_stats",
    "store_stats",
    "city_stats",
    "product_temporal_patterns",
    "category_temporal_patterns",
    "time_features_cycl_v2",
    "session_features_v2",
    "frequency_features",
    "product_popularity_trend",
    "category_popularity_trend",
    "cross_features",
    "user_segments",
    "russian_holiday",
    "memory-based-cf",       # Collaborative filtering
    "npmi-cf",               # Collaborative filtering
    "puresvd-cf",            # Collaborative filtering
    "svd-cf",                # Collaborative filtering
    "bpr-popular",           # Implicit item2item
    # "product_embeddings",    # NLP feature
    # "category_embeddings",   # NLP feature
    # "user_product_similarity",  # Weighted similarity between user history and product
    # "text_similarity_cluster", # Product clusters based on text similarity
    # "text_diversity_features", # How different a product is from user's history
]

# Feature selection is disabled here; the commented-out threshold line is kept
# so the cell can be re-run with selection enabled.
config = (
    Config.load('default_config.yaml')
    .set('target', 'Weighted')
    .set('target.cleaning.enabled', False)
    .set('feature_selector.enabled', False)
    # .set('feature_selector.correlation_threshold', 0.9)
    .set('model.type', 'catboost_ranker')
    .set('model.config.catboost_ranker.loss_function', 'YetiRankPairwise')
    .set('model.config.catboost_ranker.eval_metric', 'NDCG:top=10')
    .set('model.config.catboost_ranker.task_type', 'GPU')
    .set('model.config.catboost_ranker.early_stopping_rounds', 50)
    .set('model.config.catboost_ranker.learning_rate', 0.1)
    .set('model.config.catboost_ranker.random_strength', 50)
    .set('model.config.catboost_ranker.iterations', 500)
    .set('model.config.catboost_ranker.score_function', "Cosine")
    .set('feature_generators', notext_feature_generators)
)

ranker_notextfeats_experiment = Experiment("catboost_ranker_notextfeats", config)
ranker_notextfeats_experiment.setup()

# Stored under its own name so the basic ranker's `ranker_results` is not
# overwritten by this ablation.
ranker_notextfeats_results = ranker_notextfeats_experiment.run()

# The label used to read "With Feature Selection", a copy-paste leftover that
# mislabelled this run's metrics.
print("\nCatBoost Ranker Without Text Features Results:")
for metric, value in ranker_notextfeats_results['metrics'].items():
    print(f"  {metric}: {value:.6f}")

ranker_notextfeats_experiment.create_submission()

2025-05-21 23:02:52,257 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Initialized experiment: catboost_ranker_notextfeats_a2c979
2025-05-21 23:02:52,264 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Config saved: results/catboost_ranker_notextfeats_a2c979_config.json
2025-05-21 23:02:52,265 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Setting up experiment environment...
2025-05-21 23:02:52,266 - lavka_recsys.DataLoader - INFO - Loading training data from ../../data/lavka/train.parquet
2025-05-21 23:02:52,531 - lavka_recsys.DataLoader - INFO - Loading test data from ../../data/lavka/test.parquet
2025-05-21 23:02:52,790 - lavka_recsys.DataLoader - INFO - Holdout Split:
2025-05-21 23:02:52,818 - lavka_recsys.DataLoader - INFO -   train:	2022-12-31 18:46:42 → 2024-01-03 17:31:52 (15_070_276 rows, 367 days)
2025-05-21 23:02:52,822 - lavka_recsys.DataLoader - INFO -   holdout:	2024-01-03 17:56:48 → 2024-02-02 17:34:



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 23:04:28,547 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Training completed in 84.18s
2025-05-21 23:04:28,551 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Evaluating on holdout.
2025-05-21 23:04:28,552 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular
2025-05-21 23:04:31,734 - lavka_recsys.CachedFeatureFactory - INFO - Using cached 'time_windows' (key=51fedc8096ec71aa367ee5999e542586)
2025-05-21 23:04:31,787 -



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because PFound, PrecisionAt, RecallAt, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PrecisionAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric RecallAt:top=10 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


2025-05-21 23:06:02,019 - lavka_recsys.Experiment(catboost_ranker_notextfeats_a2c979) - INFO - Training completed in 65.33s
2025-05-21 23:06:02,029 - lavka_recsys.DataLoader - INFO - Holdout data merged back into training data.
2025-05-21 23:06:02,031 - lavka_recsys.CachedFeatureFactory - INFO - Invoking feature generators: time_windows, random_noise, source_type, count_purchase_user_product, count_purchase_user_store, count_purchase_user_category, recency_user_product, recency_user_store, user_stats, product_stats, store_stats, city_stats, product_temporal_patterns, category_temporal_patterns, time_features_cycl_v2, session_features_v2, frequency_features, product_popularity_trend, category_popularity_trend, cross_features, user_segments, russian_holiday, memory-based-cf, npmi-cf, puresvd-cf, svd-cf, bpr-popular
2025-05-21 23:06:02,742 - lavka_recsys.CachedFeatureFactory - INFO - Using cached 'time_windows' (key=6685ea32d615adf0ec6a455aa0ca6c4b)
2025-05-21 23:06:02,784 - lavka_recsys.

index,request_id
u32,u64
260253,18076993598489998182
67254,15282462387105874540
214523,15282462387105874540
208560,18076993598489998182
190848,17920534181872757907
…,…
120053,13508208304049367032
104475,3532830511186774554
247272,3532830511186774554
175839,16137940155032071510


Kaggle (with the best feature selection config)
* Public Score: 0.41389
* Private score: 0.41360

Kaggle (no feature selection)
* Public Score: 0.41536
* Private score: 0.41477

As we see, text features are quite important.

## Feature Importance Comparison

In [None]:
# Compare feature importance between the classifier and the ranker.
# Reads `classifier_results` and `ranker_results` produced by earlier cells and
# does nothing unless both contain a 'feature_importance' mapping.
if 'feature_importance' in classifier_results and 'feature_importance' in ranker_results:
    classifier_importance = classifier_results['feature_importance']
    ranker_importance = ranker_results['feature_importance']

    # Union of the features seen by either model. Sorting it, together with the
    # stable sort below, makes the order of tied rows identical on every run
    # (plain set iteration order is not).
    all_features = sorted(set(classifier_importance) | set(ranker_importance))

    # A feature missing from one model counts as importance 0 for that model.
    fi_comparison = {
        'Feature': all_features,
        'Classifier Importance': [
            classifier_importance.get(feature, 0) for feature in all_features
        ],
        'Ranker Importance': [
            ranker_importance.get(feature, 0) for feature in all_features
        ],
    }

    # Convert to DataFrame and sort by average importance
    fi_df = pd.DataFrame(fi_comparison)
    fi_df['Average Importance'] = (fi_df['Classifier Importance'] + fi_df['Ranker Importance']) / 2
    fi_df = (
        fi_df
        .sort_values('Average Importance', ascending=False, kind='stable')
        .reset_index(drop=True)
    )

    # Save to results directory (disabled; `results_dir` is not defined in the
    # visible part of this notebook).
    # fi_df.to_csv(f"{results_dir}/feature_importance_comparison.csv", index=False)

    # Display top features
    print("Top 20 Features by Average Importance:")
    display(fi_df.head(20))

    # Plot top 15 features as paired horizontal bars.
    top_n = 15
    top_features = fi_df.head(top_n)

    x = range(len(top_features))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh([p - width/2 for p in x], top_features['Classifier Importance'],
            height=width, label='Classifier', color='#3498db')
    ax.barh([p + width/2 for p in x], top_features['Ranker Importance'],
            height=width, label='Ranker', color='#e74c3c')

    ax.set_yticks(list(x))
    ax.set_yticklabels(top_features['Feature'])
    # barh draws position 0 at the bottom; flip the axis so the highest-ranked
    # feature is the first row of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance')
    ax.set_title(f'Top {top_n} Feature Importance Comparison')
    ax.legend()
    fig.tight_layout()
    plt.show()

Kaggle Public Score `NDCG@10 = 0.41441`.

## CatBoost Ranker with hparam optimization

In [None]:
import optuna
import wandb

# Log in to Weights & Biases before the hyper-parameter search below.
wandb.login()

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: train one CatBoost ranker and return its NDCG@10.

    Samples CatBoost hyperparameters from ``trial``, applies them on top of the
    base ranker configuration, runs a full ``Experiment`` and logs the metrics
    (plus feature importances, when present) to Weights & Biases.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``'ndcg@10'`` entry of the experiment metrics.
    """
    run_name = f"optuna_trial_{trial.number}"

    # Search space; every key is written to model.config.catboost_ranker.<key>.
    params_to_tune = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "iterations": trial.suggest_int("iterations", 200, 1000),
        "random_strength": trial.suggest_float("random_strength", 1e-8, 1.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "loss_function": trial.suggest_categorical("loss_function", [
            "YetiRankPairwise", "PairLogitPairwise",
        ]),
    }

    # Fixed (non-tuned) settings. loss_function and random_strength are not set
    # here because the tuned values applied below always overwrite them.
    config = (Config.load('default_config.yaml')
        .set('model.type', 'catboost_ranker')
        .set('target', 'Weighted')
        .set('model.config.catboost_ranker.eval_metric', 'NDCG:top=10')
        .set('model.config.catboost_ranker.task_type', 'GPU')
        # NOTE(review): the notebook's first cell sets CUDA_VISIBLE_DEVICES="0,1";
        # confirm that device ids '4,5' are valid under that mask.
        .set('model.config.catboost_ranker.devices', '4,5')
        .set('model.config.catboost_ranker.early_stopping_rounds', 50)
        .set('model.config.catboost_ranker.verbose', 0)  # Keep Optuna trials quiet
        .set('feature_generators', [
            "source_type",
            "count_purchase_user_product",
            "count_purchase_user_store",
            "count_purchase_user_category",
            "ctr_product",
            "cart_to_purchase_rate",
            "purchase_view_ratio",
            "recency_user_product",
            "user_stats",
            "product_stats",
            "store_stats",
            "city_stats",
            "product_temporal_patterns",
            "recency_user_store",
            "time_features_cycl",
            "time_window_user_product",
            "session_features",
            "frequency_features",
            "product_popularity_trend",
            "cross_features",
            "user_segments",
            "russian_holiday",
            "memory-based-cf",       # Collaborative filtering
            "npmi-cf",               # Collaborative filtering
            "puresvd-cf",            # Collaborative filtering
            "svd-cf",                # Collaborative filtering
            "bpr-popular",           # Implicit item2item
            "product_embeddings",    # NLP feature
            "category_embeddings",   # NLP feature
            #  "user_product_similarity" # Weighted similarity between user history and product
            "text_similarity_cluster",  # Product clusters based on text similarity
            "text_diversity_features",  # How different a product is from user's history
        ])
    )

    # Update config with Optuna's suggested params
    for p_name, p_value in params_to_tune.items():
        config = config.set(f'model.config.catboost_ranker.{p_name}', p_value)

    # dict.update() returns None, so passing its result to wandb.init would log
    # an empty run config (and mutate params_to_tune). Build the merged mapping
    # explicitly instead; on a key clash the full config wins, as the original
    # update() call intended.
    wandb_config = {**params_to_tune, **config.to_dict()}

    # Create and run a new experiment; the timestamp keeps experiment names
    # unique across repeated studies.
    exp_name = f"optuna_trial_{trial.number}_{int(time.time())}"
    with wandb.init(
        project="rs25_catboost_ranker_optuna",
        name=run_name,
        config=wandb_config,
        reinit=True,
        save_code=True
    ):
        trial_experiment = Experiment(exp_name, config)
        trial_experiment.setup()
        results = trial_experiment.run()

        ndcg_score = results['metrics']['ndcg@10']
        wandb.log(results['metrics'])
        if 'feature_importance' in results and results['feature_importance']:
            # Dots are replaced so feature names do not contain '.' in wandb keys.
            fi_to_log = {f"fi_{feat.replace('.', '_')}": val for feat, val in results['feature_importance'].items()}
            wandb.log(fi_to_log)

    return ndcg_score

In [None]:
import joblib

# Search for the hyperparameters that maximize the objective's return value
# (NDCG@10) over 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Summarize the search.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
best_trial = study.best_trial
best_params_optuna = best_trial.params

print("  Value (NDCG@10): ", best_trial.value)
print("  Params: ")
for param_name, param_value in best_params_optuna.items():
    print(f"    {param_name}: {param_value}")

# Persist the whole study so it can be reloaded without re-running the search.
joblib.dump(study, 'catboost_ranker_hparam_study.joblib')

In [None]:
from optuna import visualization

# 1. Reload the study from the joblib file written after optimization.
study = joblib.load("catboost_ranker_hparam_study.joblib")

# 2. Report the best trial: objective value and the parameters that produced it.
best = study.best_trial
print("Best trial:")
print(f"  Value: {best.value:.5f}")
print("  Params:")
for name, val in best.params.items():
    print(f"    {name}: {val}")

# 3. Flatten every trial into a DataFrame restricted to the listed attributes.
trial_attrs = (
    "number",
    "value",
    "params",
    "state",
    "datetime_start",
    "datetime_complete",
)
df = study.trials_dataframe(attrs=trial_attrs)
print("\nTrials DataFrame head:")
print(df.head())

# Keep a CSV copy for offline analysis.
df.to_csv("optuna_trials.csv", index=False)

# 4. Plots. Each helper returns a Plotly figure; .show() renders it inline.

# Objective value over the course of the study
fig_history = visualization.plot_optimization_history(study)
fig_history.show()

# Relative importance of each hyperparameter
fig_importances = visualization.plot_param_importances(study)
fig_importances.show()

# Contour plot of pairwise interactions (disabled)
# fig_contour = visualization.plot_contour(
#     study,
#     params=["iterations", "learning_rate"]
# )
# fig_contour.show()

# Objective value against a single parameter at a time
fig_slice = visualization.plot_slice(study, params=["iterations", "learning_rate"])
fig_slice.show()

# All parameters at once, one axis per parameter
fig_parallel = visualization.plot_parallel_coordinate(study)
fig_parallel.show()

In [None]:
# Feature generators used by the final ranker.
best_ranker_features = [
    "source_type",
    "count_purchase_user_product",
    "count_purchase_user_store",
    "count_purchase_user_category",
    "ctr_product",
    "cart_to_purchase_rate",
    "purchase_view_ratio",
    "recency_user_product",
    "user_stats",
    "product_stats",
    "store_stats",
    "city_stats",
    "product_temporal_patterns",
    "recency_user_store",
    "time_features_cycl",
    "time_window_user_product",
    "session_features",
    "frequency_features",
    "product_popularity_trend",
    "cross_features",
    "user_segments",
    "russian_holiday",
    "memory-based-cf",          # Collaborative filtering
    "npmi-cf",                  # Collaborative filtering
    "puresvd-cf",               # Collaborative filtering
    "svd-cf",                   # Collaborative filtering
    "bpr-popular",              # Implicit item2item
    "product_embeddings",       # NLP feature
    "category_embeddings",      # NLP feature
    #  "user_product_similarity" # Weighted similarity between user history and product
    "text_similarity_cluster",  # Product clusters based on text similarity
    "text_diversity_features",  # How different a product is from user's history
]

# CatBoost ranker settings, written to model.config.catboost_ranker.<key> in
# this order. The numeric hyperparameters are hard-coded here rather than read
# from the Optuna study object.
best_ranker_params = {
    'loss_function': 'YetiRankPairwise',
    'eval_metric': 'NDCG:top=10',
    'task_type': 'GPU',
    # NOTE(review): the notebook's first cell sets CUDA_VISIBLE_DEVICES="0,1";
    # confirm that device ids '4,5' are valid under that mask.
    'devices': '4,5',
    'early_stopping_rounds': 100,
    'random_strength': 0.38582798604652235,
    'learning_rate': 0.27126067682541705,
    'depth': 10,
    'iterations': 700,
    'border_count': 166,
    'bagging_temperature': 0.030778861057174645,
}

config = Config.load('default_config.yaml')
config = config.set('model.type', 'catboost_ranker')
config = config.set('target', 'Weighted')
for setting_name, setting_value in best_ranker_params.items():
    config = config.set(f'model.config.catboost_ranker.{setting_name}', setting_value)
config = config.set('feature_generators', best_ranker_features)

In [None]:
# Build, set up and run the ranker experiment from the `config` object
# currently in scope.
ranker_experiment = Experiment("catboost_ranker_best_hparams", config)
ranker_experiment.setup()
ranker_results = ranker_experiment.run()

# Report every metric returned by the run with six decimal places.
print("\nCatBoost Ranker Results:")
for metric_name, metric_value in ranker_results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.6f}")

In [None]:
# Produce the submission from the tuned ranker experiment
# (presumably the file behind the Kaggle score reported below; TODO confirm).
ranker_experiment.create_submission()

Kaggle Public Score `NDCG@10 = 0.41410`.