In [1]:
# NOTE(review): hardcoded absolute path — the notebook only runs on a machine
# that has this exact directory. Consider a configurable/relative location.
%cd /data/majoroval/jupyter/RS-25/homework/week02

/data/majoroval/jupyter/RS-25/homework/week02


In [2]:
import requests

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from io import BytesIO
from textwrap import wrap
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score
from pprint import pprint

In [3]:
from framework import Config, DataLoader, FeatureFactory, ModelFactory

In [4]:
# Build the shared configuration and load the train/test data through the
# project DataLoader (the log below reports the row counts).
config = Config()
dl = DataLoader(config)
# Trailing `;` keeps the call's return value out of the cell output.
dl.load_data();

2025-04-12 14:16:25,617 - lavka_recsys.DataLoader - INFO - Loaded train data: 14954417 rows
2025-04-12 14:16:25,631 - lavka_recsys.DataLoader - INFO - Normalized timestamps
2025-04-12 14:16:25,633 - lavka_recsys.DataLoader - INFO - Loaded test data: 565231 rows
2025-04-12 14:16:25,636 - lavka_recsys.DataLoader - INFO - Normalized timestamps


In [5]:
@FeatureFactory.register('count_purchase_product')
def generate_count_purchase_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Count past purchases for every user-product pair of the target.

    Args:
        history_df: Interaction log; must contain `user_id`, `product_id`
            and `action_type` columns.
        target_df: Rows to attach the feature to; must contain `user_id`
            and `product_id` columns.

    Returns:
        A tuple of (all `target_df` rows with a `count_purchase_product`
        column, empty list). Pairs without any purchase in the history get 0.
        The empty list is presumably the categorical columns added by this
        feature — confirm against FeatureFactory.
    """
    purchase_counts = history_df.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id', 'product_id'
    ).agg(
        pl.len().alias('count_purchase_product')
    )
    # The right join keeps every target row. Zero-fill ONLY the new feature
    # column: a frame-wide fill_null(0) would also overwrite genuine nulls in
    # any numeric column that target_df carries along.
    return purchase_counts.join(
        target_df,
        on=['user_id', 'product_id'],
        how='right'
    ).with_columns(
        pl.col('count_purchase_product').fill_null(0)
    ), []

In [6]:
@FeatureFactory.register('count_purchase_store')
def generate_count_purchase_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Count past purchases for every user-store pair of the target.

    Args:
        history_df: Interaction log; must contain `user_id`, `store_id`
            and `action_type` columns.
        target_df: Rows to attach the feature to; must contain `user_id`
            and `store_id` columns.

    Returns:
        A tuple of (all `target_df` rows with a `count_purchase_store`
        column, empty list). Pairs without any purchase in the history get 0.
        The empty list is presumably the categorical columns added by this
        feature — confirm against FeatureFactory.
    """
    purchase_counts = history_df.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id', 'store_id'
    ).agg(
        pl.len().alias('count_purchase_store')
    )
    # The right join keeps every target row. Zero-fill ONLY the new feature
    # column: a frame-wide fill_null(0) would also overwrite genuine nulls in
    # any numeric column that target_df carries along.
    return purchase_counts.join(
        target_df,
        on=['user_id', 'store_id'],
        how='right'
    ).with_columns(
        pl.col('count_purchase_store').fill_null(0)
    ), []

In [7]:
@FeatureFactory.register('ctr_product')
def generate_ctr_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach a per-product click-through rate (clicks / views) to the target.

    Args:
        history_df: Interaction log; must contain `product_id` and
            `action_type` columns.
        target_df: Rows to attach the feature to; must contain `product_id`.

    Returns:
        A tuple of (`target_df` plus a `ctr_product` column, empty list).
        The rate is null for products that lack either clicks or views in
        the history, because the two counts are combined with an inner join.
    """
    per_action = history_df.group_by('action_type', 'product_id').agg(pl.len())

    click_counts = per_action.filter(
        pl.col('action_type') == "AT_Click"
    ).select('product_id', pl.col('len').alias('n_clicks'))
    view_counts = per_action.filter(
        pl.col('action_type') == "AT_View"
    ).select('product_id', pl.col('len').alias('n_views'))

    # Default (inner) join: only products having both clicks and views survive.
    ctr = click_counts.join(view_counts, on='product_id').select(
        'product_id',
        (pl.col('n_clicks') / pl.col('n_views')).alias('ctr_product'),
    )
    return target_df.join(ctr, on=['product_id'], how='left'), []

In [8]:
@FeatureFactory.register('ctr_store')
def generate_ctr_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach a per-store click-through rate (clicks / views) to the target.

    Args:
        history_df: Interaction log; must contain `store_id` and
            `action_type` columns.
        target_df: Rows to attach the feature to; must contain `store_id`.

    Returns:
        A tuple of (`target_df` plus a `ctr_store` column, empty list).
        The rate is null for stores that lack either clicks or views in
        the history, because the two counts are combined with an inner join.
    """
    per_action = history_df.group_by('action_type', 'store_id').agg(pl.len())

    click_counts = per_action.filter(
        pl.col('action_type') == "AT_Click"
    ).select('store_id', pl.col('len').alias('n_clicks'))
    view_counts = per_action.filter(
        pl.col('action_type') == "AT_View"
    ).select('store_id', pl.col('len').alias('n_views'))

    # Default (inner) join: only stores having both clicks and views survive.
    ctr = click_counts.join(view_counts, on='store_id').select(
        'store_id',
        (pl.col('n_clicks') / pl.col('n_views')).alias('ctr_store'),
    )
    return target_df.join(ctr, on=['store_id'], how='left'), []

In [9]:
@FeatureFactory.register('recency_user_product')
def generate_recency_user_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach last-interaction recency per user-product pair to the target.

    Adds two columns: `last_interaction_u_p` (the pair's latest history
    timestamp) and `days_since_interaction_u_p` (gap between the newest
    timestamp in the whole history and that value, divided by 86400).
    Pairs absent from the history get nulls.

    Args:
        history_df: Interaction log; must contain `user_id`, `product_id`
            and `timestamp`. The division by 24*60*60 presumably assumes
            timestamps in seconds — confirm against DataLoader.
        target_df: Rows to attach the features to.

    Returns:
        A tuple of (`target_df` plus the two recency columns, empty list).
    """
    seconds_per_day = 24 * 60 * 60
    newest_ts = history_df['timestamp'].max()

    last_seen = history_df.group_by(['user_id', 'product_id']).agg(
        pl.col('timestamp').max().alias('last_interaction_u_p')
    )
    recency = last_seen.with_columns(
        (
            (newest_ts - pl.col('last_interaction_u_p')) / seconds_per_day
        ).alias('days_since_interaction_u_p')
    )
    return target_df.join(
        recency, on=['user_id', 'product_id'], how='left'
    ), []

In [10]:
@FeatureFactory.register('recency_user_store')
def generate_recency_user_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach last-interaction recency per user-store pair to the target.

    Adds two columns: `last_interaction_u_s` (the pair's latest history
    timestamp) and `days_since_interaction_u_s` (gap between the newest
    timestamp in the whole history and that value, divided by 86400).
    Pairs absent from the history get nulls.

    Args:
        history_df: Interaction log; must contain `user_id`, `store_id`
            and `timestamp`. The division by 24*60*60 presumably assumes
            timestamps in seconds — confirm against DataLoader.
        target_df: Rows to attach the features to.

    Returns:
        A tuple of (`target_df` plus the two recency columns, empty list).
    """
    seconds_per_day = 24 * 60 * 60
    newest_ts = history_df['timestamp'].max()

    last_seen = history_df.group_by(['user_id', 'store_id']).agg(
        pl.col('timestamp').max().alias('last_interaction_u_s')
    )
    recency = last_seen.with_columns(
        (
            (newest_ts - pl.col('last_interaction_u_s')) / seconds_per_day
        ).alias('days_since_interaction_u_s')
    )
    return target_df.join(
        recency, on=['user_id', 'store_id'], how='left'
    ), []

In [11]:
@FeatureFactory.register('user_stats')
def generate_user_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach per-user activity aggregates from the history to the target.

    Added columns: `user_total_interactions` (row count),
    `user_total_purchases` / `user_total_views` (rows whose `action_type`
    is 'AT_Purchase' / 'AT_View') and `user_unique_products` (distinct
    `product_id` values). Users absent from the history get nulls.

    Returns:
        A tuple of (`target_df` left-joined with the aggregates on
        `user_id`, empty list).
    """
    action = pl.col('action_type')
    per_user = history_df.group_by('user_id').agg(
        user_total_interactions=pl.len(),
        user_total_purchases=(action == 'AT_Purchase').sum(),
        user_total_views=(action == 'AT_View').sum(),
        user_unique_products=pl.col('product_id').n_unique(),
    )
    enriched = target_df.join(per_user, on=['user_id'], how='left')
    return enriched, []

In [12]:
@FeatureFactory.register('product_stats')
def generate_product_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach per-product activity aggregates from the history to the target.

    Added columns: `product_total_interactions` (row count),
    `product_total_purchases` / `product_total_views` (rows whose
    `action_type` is 'AT_Purchase' / 'AT_View') and `product_unique_users`
    (distinct `user_id` values). Products absent from the history get nulls.

    Returns:
        A tuple of (`target_df` left-joined with the aggregates on
        `product_id`, empty list).
    """
    action = pl.col('action_type')
    per_product = history_df.group_by('product_id').agg(
        product_total_interactions=pl.len(),
        product_total_purchases=(action == 'AT_Purchase').sum(),
        product_total_views=(action == 'AT_View').sum(),
        product_unique_users=pl.col('user_id').n_unique(),
    )
    enriched = target_df.join(per_product, on=['product_id'], how='left')
    return enriched, []

In [13]:
@FeatureFactory.register('store_stats')
def generate_store_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach per-store activity aggregates from the history to the target.

    Added columns: `store_total_interactions` (row count),
    `store_total_purchases` / `store_total_views` (rows whose `action_type`
    is 'AT_Purchase' / 'AT_View') and `store_unique_products` (distinct
    `product_id` values). Stores absent from the history get nulls.

    Returns:
        A tuple of (`target_df` left-joined with the aggregates on
        `store_id`, empty list).
    """
    action = pl.col('action_type')
    per_store = history_df.group_by('store_id').agg(
        store_total_interactions=pl.len(),
        store_total_purchases=(action == 'AT_Purchase').sum(),
        store_total_views=(action == 'AT_View').sum(),
        store_unique_products=pl.col('product_id').n_unique(),
    )
    enriched = target_df.join(per_store, on=['store_id'], how='left')
    return enriched, []

In [14]:
@FeatureFactory.register_target('CartUpdate_vs_View')
def generate_target(history_df: pl.DataFrame, target_df: pl.DataFrame) -> pl.Series:
    """Build a binary label: 0 for 'AT_View', 1 for 'AT_CartUpdate'.

    Any other (or missing) action type yields a null label.

    Args:
        history_df: Not used by this target.
        target_df: Rows to label; must contain an `action_type` column.

    Returns:
        An Int64 Series named `target` with one value per row of `target_df`.
    """
    action = pl.col('action_type')
    # Native when/then expression instead of a per-row Python lambda through
    # map_elements: same labels, but evaluated inside the Polars engine.
    label = (
        pl.when(action == 'AT_CartUpdate').then(1)
        .when(action == 'AT_View').then(0)
        .otherwise(None)
        .cast(pl.Int64)
    )
    return target_df.select(label.alias('target')).to_series()

In [15]:
@FeatureFactory.register_target('CartUpdate_Purchase_vs_View')
def generate_target(history_df: pl.DataFrame, target_df: pl.DataFrame) -> pl.Series:
    """Build a binary label: 0 for 'AT_View', 1 for 'AT_CartUpdate' or 'AT_Purchase'.

    Any other (or missing) action type yields a null label.

    Args:
        history_df: Not used by this target.
        target_df: Rows to label; must contain an `action_type` column.

    Returns:
        An Int64 Series named `target` with one value per row of `target_df`.
    """
    # NOTE(review): this def reuses the name `generate_target` from the
    # 'CartUpdate_vs_View' cell, so the notebook-level name now points here.
    # Presumably both stay reachable through their registry keys — confirm.
    action = pl.col('action_type')
    # Native when/then expression instead of a per-row Python lambda through
    # map_elements: same labels, but evaluated inside the Polars engine.
    label = (
        pl.when(action.is_in(['AT_CartUpdate', 'AT_Purchase'])).then(1)
        .when(action == 'AT_View').then(0)
        .otherwise(None)
        .cast(pl.Int64)
    )
    return target_df.select(label.alias('target')).to_series()

In [16]:
# Ask the loader for 2 validation folds (the log below reports them as temporal).
folds = dl.create_validation_splits(2)

2025-04-12 14:16:52,346 - lavka_recsys.DataLoader - INFO - Created 2 temporal validation folds


In [26]:
# Work with the first fold only; each fold unpacks into
# (history, train_df, test_df).
history, train_df, test_df = folds[0]

In [27]:
# Feature factory that runs the generators registered in the cells above.
ff = FeatureFactory(config)

In [28]:
# Names of the registered feature generators to run; commented-out entries
# are registered above but excluded from this experiment.
requested_features = [
    'count_purchase_product',
    # 'count_purchase_store',
    'ctr_product',
    # 'ctr_store',
    # 'recency_user_product',
    'user_stats',
    'store_stats',
]

In [29]:
# Features for the training rows are computed from `history` only; the two
# trailing return values of generate_batch are not needed here.
train, target, _, _ = ff.generate_batch(history, train_df, requested_features)

2025-04-12 14:18:03,487 - lavka_recsys.FeatureFactory - INFO - Generating features: count_purchase_product, ctr_product, user_stats, store_stats
2025-04-12 14:18:04,239 - lavka_recsys.FeatureFactory - INFO - Joined features
2025-04-12 14:18:04,240 - lavka_recsys.FeatureFactory - INFO - All column names: {'user_total_interactions', 'user_total_views', 'store_total_views', 'count_purchase_product', 'ctr_product', 'user_total_purchases', 'store_unique_products', 'store_total_purchases', 'user_unique_products', 'store_total_interactions'}
2025-04-12 14:18:04,241 - lavka_recsys.FeatureFactory - INFO - All categorical column names: set()


In [30]:
# Sanity check: (rows, feature columns) of the training matrix.
train.shape

(3307808, 10)

In [31]:
# For the validation rows the history is extended with the training window
# before the same feature set is generated.
val_history = pl.concat([history, train_df])
val, val_target, _, _ = ff.generate_batch(val_history, test_df, requested_features)

2025-04-12 14:18:07,976 - lavka_recsys.FeatureFactory - INFO - Generating features: count_purchase_product, ctr_product, user_stats, store_stats
2025-04-12 14:18:09,758 - lavka_recsys.FeatureFactory - INFO - Joined features
2025-04-12 14:18:09,759 - lavka_recsys.FeatureFactory - INFO - All column names: {'user_total_interactions', 'user_total_views', 'store_total_views', 'count_purchase_product', 'ctr_product', 'user_total_purchases', 'store_unique_products', 'store_total_purchases', 'user_unique_products', 'store_total_interactions'}
2025-04-12 14:18:09,760 - lavka_recsys.FeatureFactory - INFO - All categorical column names: set()


In [32]:
# Sanity check: (rows, feature columns) of the validation matrix.
val.shape

(3653832, 10)

In [65]:
# NOTE(review): the saved output (2 columns, 4590416 rows) disagrees with the
# train.shape cell above (3307808 rows, 10 columns) and the execution count is
# out of order — this output is stale; re-run via Restart & Run All.
print(train.columns, train.height)

['count_purchase_product', 'ctr_product'] 4590416


In [None]:
# NOTE(review): exact duplicate of the previous cell with output kept from
# another run (4590416 rows vs 3307808 in train.shape); keep only one copy.
print(train.columns, train.height)

['user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'] 4590416


In [66]:
# NOTE(review): the saved output (2 columns, 5727182 rows) disagrees with the
# val.shape cell above (3653832 rows, 10 columns) — stale output; re-run via
# Restart & Run All.
print(val.columns, val.height)

['count_purchase_product', 'ctr_product'] 5727182


In [61]:
# NOTE(review): exact duplicate of the previous cell, executed out of order
# (In [61] after In [66]) with stale output; keep only one copy.
print(val.columns, val.height)

['user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'] 5727182


In [76]:
# Check the dtypes of the validation feature columns before training.
val.dtypes

[UInt32,
 UInt32,
 UInt32,
 UInt32,
 Float64,
 UInt32,
 UInt32,
 UInt32,
 UInt32,
 UInt32]

In [77]:
# Build the model through the project factory; the training cell's output
# shows it is a framework CatBoostModel wrapper.
mf = ModelFactory(config)
model = mf.create_model()

In [78]:
# Fit on the training features/labels; the validation pair is passed as
# eval_set, and the log below prints a per-iteration `test:` metric.
model.train(
    train,
    target,
    eval_set=(val, val_target)
)

0:	test: 0.5463692	best: 0.5463692 (0)	total: 387ms	remaining: 1m 55s
1:	test: 0.5617999	best: 0.5617999 (1)	total: 764ms	remaining: 1m 53s
2:	test: 0.5619614	best: 0.5619614 (2)	total: 1.14s	remaining: 1m 52s
3:	test: 0.5612056	best: 0.5619614 (2)	total: 1.49s	remaining: 1m 50s
4:	test: 0.5606505	best: 0.5619614 (2)	total: 1.87s	remaining: 1m 50s
5:	test: 0.5797054	best: 0.5797054 (5)	total: 2.23s	remaining: 1m 49s
6:	test: 0.5829886	best: 0.5829886 (6)	total: 2.59s	remaining: 1m 48s
7:	test: 0.5835265	best: 0.5835265 (7)	total: 2.95s	remaining: 1m 47s
8:	test: 0.5837335	best: 0.5837335 (8)	total: 3.3s	remaining: 1m 46s
9:	test: 0.5829920	best: 0.5837335 (8)	total: 3.65s	remaining: 1m 45s
10:	test: 0.5854915	best: 0.5854915 (10)	total: 4.01s	remaining: 1m 45s
11:	test: 0.5852344	best: 0.5854915 (10)	total: 4.37s	remaining: 1m 44s
12:	test: 0.6193729	best: 0.6193729 (12)	total: 4.73s	remaining: 1m 44s
13:	test: 0.6233774	best: 0.6233774 (13)	total: 5.09s	remaining: 1m 43s
14:	test: 0.6

<framework.model_factory.CatBoostModel at 0x7f584b61a410>