In [2]:
# Switch the kernel's working directory to the homework folder.
# NOTE(review): this is an absolute, machine-specific path; the later
# `from framework import ...` presumably relies on it — confirm before moving the notebook.
%cd /data/majoroval/jupyter/RS-25/homework/week02

/data/majoroval/jupyter/RS-25/homework/week02


In [3]:
import requests

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from io import BytesIO
from textwrap import wrap
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score
from pprint import pprint

In [4]:
from framework import Config, DataLoader, FeatureFactory, ModelFactory

In [5]:
# Build the framework configuration and load the datasets through DataLoader.
# The trailing `;` keeps the notebook from echoing load_data()'s return value.
config = Config()
dl = DataLoader(config)
dl.load_data();

2025-04-12 13:52:25,484 - lavka_recsys.DataLoader - INFO - Loaded train data: 14954417 rows
2025-04-12 13:52:25,497 - lavka_recsys.DataLoader - INFO - Normalized timestamps
2025-04-12 13:52:25,500 - lavka_recsys.DataLoader - INFO - Loaded test data: 565231 rows
2025-04-12 13:52:25,503 - lavka_recsys.DataLoader - INFO - Normalized timestamps


In [6]:
@FeatureFactory.register('count_purchase_product')
def generate_count_purchase_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Count purchases per (user_id, product_id) pair and attach them to target_df.

    Args:
        history_df: Interaction log; reads `action_type`, `user_id`, `product_id`.
        target_df: Rows to annotate; must contain `user_id` and `product_id`.

    Returns:
        A tuple of (joined frame, []). The frame keeps every row of
        `target_df` (right join) and adds `count_purchase_product`, which is 0
        for pairs with no "AT_Purchase" row in `history_df`. The empty list is
        presumably the categorical feature names expected by FeatureFactory —
        TODO confirm against the framework.
    """
    purchase_counts = history_df.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id', 'product_id'
    ).agg(
        pl.len().alias('count_purchase_product')
    )
    return purchase_counts.join(
        target_df,
        on=['user_id', 'product_id'],
        how='right'
    ).with_columns(
        # Fill only the new feature: a frame-wide fill_null(0) would also
        # overwrite genuine nulls in unrelated numeric columns of target_df.
        pl.col('count_purchase_product').fill_null(0)
    ), []

In [7]:
@FeatureFactory.register('count_purchase_store')
def generate_count_purchase_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Count purchases per (user_id, store_id) pair and attach them to target_df.

    Args:
        history_df: Interaction log; reads `action_type`, `user_id`, `store_id`.
        target_df: Rows to annotate; must contain `user_id` and `store_id`.

    Returns:
        A tuple of (joined frame, []). The frame keeps every row of
        `target_df` (right join) and adds `count_purchase_store`, which is 0
        for pairs with no "AT_Purchase" row in `history_df`. The empty list is
        presumably the categorical feature names expected by FeatureFactory —
        TODO confirm against the framework.
    """
    purchase_counts = history_df.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id', 'store_id'
    ).agg(
        pl.len().alias('count_purchase_store')
    )
    return purchase_counts.join(
        target_df,
        on=['user_id', 'store_id'],
        how='right'
    ).with_columns(
        # Fill only the new feature: a frame-wide fill_null(0) would also
        # overwrite genuine nulls in unrelated numeric columns of target_df.
        pl.col('count_purchase_store').fill_null(0)
    ), []

In [8]:
@FeatureFactory.register('ctr_product')
def generate_ctr_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Calculate CTR (Click-Through Rate) for products.

    CTR is the number of "AT_Click" rows divided by the number of "AT_View"
    rows of a product in `history_df`.

    Args:
        history_df: Interaction log; reads `action_type` and `product_id`.
        target_df: Rows to annotate; must contain `product_id`.

    Returns:
        A tuple of (`target_df` left-joined with `ctr_product`, []).
        `ctr_product` is null for products without any view in `history_df`
        and 0.0 for products that were viewed but never clicked. The empty
        list is presumably the categorical feature names expected by
        FeatureFactory — TODO confirm against the framework.
    """
    action = pl.col('action_type')
    feature = history_df.group_by('product_id').agg(
        action.eq('AT_Click').sum().alias('clicks'),
        action.eq('AT_View').sum().alias('views'),
    ).filter(
        # Without views the ratio is undefined; leave those products null.
        pl.col('views') > 0
    ).select(
        'product_id',
        # Counting both actions in one aggregation keeps viewed-but-never-
        # clicked products (CTR 0) instead of dropping them in an inner join.
        (pl.col('clicks') / pl.col('views')).alias('ctr_product'),
    )
    return target_df.join(
        feature,
        on=['product_id'],
        how='left'
    ), []

In [9]:
@FeatureFactory.register('ctr_store')
def generate_ctr_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Calculate CTR (Click-Through Rate) for stores.

    CTR is the number of "AT_Click" rows divided by the number of "AT_View"
    rows of a store in `history_df`.

    Args:
        history_df: Interaction log; reads `action_type` and `store_id`.
        target_df: Rows to annotate; must contain `store_id`.

    Returns:
        A tuple of (`target_df` left-joined with `ctr_store`, []).
        `ctr_store` is null for stores without any view in `history_df` and
        0.0 for stores that were viewed but never clicked. The empty list is
        presumably the categorical feature names expected by FeatureFactory —
        TODO confirm against the framework.
    """
    action = pl.col('action_type')
    feature = history_df.group_by('store_id').agg(
        action.eq('AT_Click').sum().alias('clicks'),
        action.eq('AT_View').sum().alias('views'),
    ).filter(
        # Without views the ratio is undefined; leave those stores null.
        pl.col('views') > 0
    ).select(
        'store_id',
        # Counting both actions in one aggregation keeps viewed-but-never-
        # clicked stores (CTR 0) instead of dropping them in an inner join.
        (pl.col('clicks') / pl.col('views')).alias('ctr_store'),
    )
    return target_df.join(
        feature,
        on=['store_id'],
        how='left'
    ), []

In [10]:
@FeatureFactory.register('recency_user_product')
def generate_recency_user_product(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach recency of the last user-product interaction to target_df.

    Two columns are added through a left join on (user_id, product_id):
    `last_interaction_u_p`, the maximum `timestamp` of the pair in
    `history_df`, and `days_since_interaction_u_p`, the gap between the newest
    timestamp in `history_df` and that value divided by 24 * 60 * 60
    (assumes timestamps are numeric seconds — TODO confirm). Pairs absent
    from `history_df` get nulls. The second tuple element is always [].
    """
    seconds_per_day = 24 * 60 * 60
    newest_ts = history_df.get_column('timestamp').max()

    last_seen = history_df.group_by('user_id', 'product_id').agg(
        pl.col('timestamp').max().alias('last_interaction_u_p')
    )
    recency = last_seen.with_columns(
        ((newest_ts - pl.col('last_interaction_u_p')) / seconds_per_day)
        .alias('days_since_interaction_u_p')
    )
    enriched = target_df.join(recency, on=['user_id', 'product_id'], how='left')
    return enriched, []

In [11]:
@FeatureFactory.register('recency_user_store')
def generate_recency_user_store(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach recency of the last user-store interaction to target_df.

    Two columns are added through a left join on (user_id, store_id):
    `last_interaction_u_s`, the maximum `timestamp` of the pair in
    `history_df`, and `days_since_interaction_u_s`, the gap between the newest
    timestamp in `history_df` and that value divided by 24 * 60 * 60
    (assumes timestamps are numeric seconds — TODO confirm). Pairs absent
    from `history_df` get nulls. The second tuple element is always [].
    """
    seconds_per_day = 24 * 60 * 60
    newest_ts = history_df.get_column('timestamp').max()

    last_seen = history_df.group_by('user_id', 'store_id').agg(
        pl.col('timestamp').max().alias('last_interaction_u_s')
    )
    recency = last_seen.with_columns(
        ((newest_ts - pl.col('last_interaction_u_s')) / seconds_per_day)
        .alias('days_since_interaction_u_s')
    )
    enriched = target_df.join(recency, on=['user_id', 'store_id'], how='left')
    return enriched, []

In [12]:
@FeatureFactory.register('user_stats')
def generate_user_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach per-user activity aggregates from history_df to target_df.

    Columns added by a left join on `user_id` (null for users that do not
    occur in `history_df`):
      - user_total_interactions: number of history rows of the user
      - user_total_purchases: rows whose action_type is 'AT_Purchase'
      - user_total_views: rows whose action_type is 'AT_View'
      - user_unique_products: distinct product_id values

    The second tuple element is always an empty list.
    """
    action = pl.col('action_type')
    per_user = history_df.group_by('user_id').agg(
        pl.len().alias('user_total_interactions'),
        (action == 'AT_Purchase').sum().alias('user_total_purchases'),
        (action == 'AT_View').sum().alias('user_total_views'),
        pl.col('product_id').n_unique().alias('user_unique_products'),
    )
    enriched = target_df.join(per_user, on='user_id', how='left')
    return enriched, []

In [13]:
@FeatureFactory.register('product_stats')
def generate_product_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Generate product-level statistics and attach them to target_df.

    Columns added by a left join on `product_id` (null for products that do
    not occur in `history_df`):
      - product_total_interactions: number of history rows of the product
      - product_total_purchases: rows whose action_type is 'AT_Purchase'
      - product_total_views: rows whose action_type is 'AT_View'
      - product_unique_users: distinct user_id values

    Returns:
        A tuple of (annotated frame, []). The empty list is presumably the
        categorical feature names expected by FeatureFactory — TODO confirm.
    """
    features = history_df.group_by('product_id').agg([
        pl.len().alias('product_total_interactions'),
        pl.col('action_type').eq('AT_Purchase').sum().alias('product_total_purchases'),
        # .sum() turns the boolean mask into a count; without it the
        # aggregation yields a list of booleans per product.
        pl.col('action_type').eq('AT_View').sum().alias('product_total_views'),
        pl.n_unique('user_id').alias('product_unique_users')
    ])
    return target_df.join(
        features,
        on=['product_id'],
        how='left'
    ), []

In [None]:
@FeatureFactory.register('store_stats')
def generate_store_stats(
    history_df: pl.DataFrame, target_df: pl.DataFrame
) -> tuple[pl.DataFrame, list[str]]:
    """Attach per-store activity aggregates from history_df to target_df.

    Columns added by a left join on `store_id` (null for stores that do not
    occur in `history_df`):
      - store_total_interactions: number of history rows of the store
      - store_total_purchases: rows whose action_type is 'AT_Purchase'
      - store_total_views: rows whose action_type is 'AT_View'
      - store_unique_products: distinct product_id values

    The second tuple element is always an empty list.
    """
    action = pl.col('action_type')
    per_store = history_df.group_by('store_id').agg(
        pl.len().alias('store_total_interactions'),
        (action == 'AT_Purchase').sum().alias('store_total_purchases'),
        (action == 'AT_View').sum().alias('store_total_views'),
        pl.col('product_id').n_unique().alias('store_unique_products'),
    )
    enriched = target_df.join(per_store, on='store_id', how='left')
    return enriched, []

In [15]:
@FeatureFactory.register_target('CartUpdate_vs_View')
def generate_target(history_df: pl.DataFrame, target_df: pl.DataFrame) -> pl.Series:
    """Assign 0 for 'AT_View' and 1 for 'AT_CartUpdate'.

    Args:
        history_df: Unused; kept so the signature matches the other
            registered generators.
        target_df: Frame with an `action_type` column to label.

    Returns:
        An Int64 series named `target`, one value per row of `target_df`.
        Any other (or null) action type yields null.
    """
    action = pl.col("action_type")
    # Vectorized when/then instead of a per-row Python lambda via
    # map_elements: same labels, evaluated inside the polars engine.
    label = (
        pl.when(action == 'AT_View').then(0)
        .when(action == 'AT_CartUpdate').then(1)
        .otherwise(None)
        .cast(pl.Int64)
        .alias('target')
    )
    return target_df.select(label).to_series()

In [16]:
@FeatureFactory.register_target('CartUpdate_Purchase_vs_View')
def generate_target(history_df: pl.DataFrame, target_df: pl.DataFrame) -> pl.Series:
    """Assign 0 for 'AT_View' and 1 for 'AT_CartUpdate' and 'AT_Purchase'.

    Args:
        history_df: Unused; kept so the signature matches the other
            registered generators.
        target_df: Frame with an `action_type` column to label.

    Returns:
        An Int64 series named `target`, one value per row of `target_df`.
        Any other (or null) action type yields null.
    """
    # NOTE(review): this rebinds the module-level name `generate_target`
    # that the 'CartUpdate_vs_View' generator also uses; presumably only the
    # registered key matters to FeatureFactory — confirm before renaming.
    action = pl.col("action_type")
    is_positive = (action == 'AT_CartUpdate') | (action == 'AT_Purchase')
    # Vectorized when/then instead of a per-row Python lambda via
    # map_elements: same labels, evaluated inside the polars engine.
    label = (
        pl.when(action == 'AT_View').then(0)
        .when(is_positive).then(1)
        .otherwise(None)
        .cast(pl.Int64)
        .alias('target')
    )
    return target_df.select(label).to_series()

In [17]:
# Unpack the single validation fold into its (history, train_df, test_df)
# frames — the pattern fails if the loader returns more or fewer than one
# fold — and create the feature factory from the shared config.
[(history, train_df, test_df)] = dl.create_validation_splits()
ff = FeatureFactory(config)

2025-04-12 13:52:28,669 - lavka_recsys.DataLoader - INFO - Created 1 temporal validation folds


In [68]:
# Registered feature names to compute for this experiment; commented-out
# entries are currently disabled.
requested_features = [
    'count_purchase_product',
    # 'count_purchase_store',
    'ctr_product',
    # 'ctr_store',
    # 'recency_user_product',
    'user_stats',
    'store_stats',
]

In [69]:
# Build the training batch: features are generated from `history` for the
# rows of `train_df` (presumably history_df/target_df of the generators —
# TODO confirm). The last two return values are discarded.
train = train_df
train, target, _, _ = ff.generate_batch(history, train, requested_features)

2025-04-12 14:08:54,377 - lavka_recsys.FeatureFactory - INFO - Generating features: count_purchase_product, ctr_product, user_stats, store_stats
2025-04-12 14:09:04,248 - lavka_recsys.FeatureFactory - INFO - Joined features
2025-04-12 14:09:04,250 - lavka_recsys.FeatureFactory - INFO - All column names: {'user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'}
2025-04-12 14:09:04,251 - lavka_recsys.FeatureFactory - INFO - All categorical column names: set()


In [70]:
# Build the validation batch: the feature source is `history` concatenated
# with `train_df`, and the rows to annotate come from `test_df`.
val_history = pl.concat(
    [history, train_df]
)
val = test_df
val, val_target, _, _ = ff.generate_batch(val_history, val, requested_features)

2025-04-12 14:09:31,333 - lavka_recsys.FeatureFactory - INFO - Generating features: count_purchase_product, ctr_product, user_stats, store_stats


2025-04-12 14:09:53,636 - lavka_recsys.FeatureFactory - INFO - Joined features
2025-04-12 14:09:53,638 - lavka_recsys.FeatureFactory - INFO - All column names: {'user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'}
2025-04-12 14:09:53,639 - lavka_recsys.FeatureFactory - INFO - All categorical column names: set()


In [65]:
# Inspect the feature column names and row count of the training batch.
print(train.columns, train.height)

['count_purchase_product', 'ctr_product'] 4590416


In [None]:
# Inspect the feature column names and row count of the training batch.
# NOTE(review): identical to another inspection cell in this notebook whose
# recorded output lists a different column set, so at least one is stale.
print(train.columns, train.height)

['user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'] 4590416


In [66]:
# Inspect the feature column names and row count of the validation batch.
print(val.columns, val.height)

['count_purchase_product', 'ctr_product'] 5727182


In [61]:
# Inspect the feature column names and row count of the validation batch.
# NOTE(review): identical to another inspection cell in this notebook whose
# recorded output lists a different column set, so at least one is stale.
print(val.columns, val.height)

['user_unique_products', 'store_total_purchases', 'user_total_views', 'store_total_interactions', 'ctr_product', 'user_total_purchases', 'store_total_views', 'count_purchase_product', 'user_total_interactions', 'store_unique_products'] 5727182


In [67]:
# Show the dtype of every column in the validation batch.
val.dtypes

[UInt32, Float64]

In [58]:
# Instantiate the model through the framework's ModelFactory, using the
# same config object as the loader and the feature factory.
mf = ModelFactory(config)
model = mf.create_model()

In [None]:
# Fit the model on the training batch, passing the validation batch as eval_set.
# NOTE(review): the recorded run of this cell panicked with "Polars' maximum
# length reached"; the cause is inside model.train, which is not visible here.
model.train(
    train,
    target,
    eval_set=(val, val_target)
)


thread '<unnamed>' panicked at crates/polars-core/src/chunked_array/ops/chunkops.rs:152:13:
Polars' maximum length reached. Consider installing 'polars-u64-idx'.
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: Polars' maximum length reached. Consider installing 'polars-u64-idx'.