# Lavka Recommender System Example Usage

This notebook demonstrates how to use the recommender system with the unified experiment interface. It includes examples of new features like GPU acceleration, conversion-related features, and improved feature selection.

In [None]:
import os
import polars as pl
import matplotlib.pyplot as plt

from lavka_recsys.config import Config
from lavka_recsys.experiment import Experiment, ExperimentType

## 1. Load Configuration

You can either load configuration from a YAML file or create it programmatically.

In [ ]:
# Prefer the YAML file when it is present; otherwise assemble the same
# configuration inline, one section at a time.
CONFIG_PATH = 'default_config.yaml'

if not os.path.exists(CONFIG_PATH):
    # Experiment-level switches
    experiment_section = {
        "type": "standard",
        "use_hyperparameter_tuning": False,
        "evaluation": {
            "perform_kaggle_simulation": True,
            "create_submission": True,
        },
    }

    # Model choice and hardware options
    model_section = {
        "type": "catboost",
        "use_gpu": False,        # Flip to True to enable GPU acceleration
        "gpu_devices": "0",      # GPU device ids (comma-separated for multiple GPUs)
        "thread_count": -1,      # CPU threads; -1 means auto
        "config": {
            "catboost": {
                "iterations": 300,
                "learning_rate": 0.1,
                "depth": 6,
                "l2_leaf_reg": 3.0,
            },
        },
    }

    # Feature selection is off by default
    feature_selection_section = {
        "enabled": False,        # Enable/disable feature selection
        "method": "importance",  # Feature selection method
        "n_features": 10,        # Number of top features to select
    }

    # Feature generators to use
    feature_names = [
        "count_purchase_user_product",
        "count_purchase_user_store",
        "ctr_product",
        "cart_to_purchase_rate",  # New: cart-to-purchase conversion rate
        "purchase_view_ratio",    # New: purchase-view ratio
        "recency_user_product",
        "user_stats",
        "product_stats",
        "store_stats",
    ]

    # Input data locations
    data_section = {
        "train_path": "data/train.parquet",
        "test_path": "data/test.parquet",
        "sample_size": 50000,    # Smaller dataset for faster execution
    }

    # Train/validation split settings
    training_section = {
        "split_type": "standard",  # Standard split for a quick demonstration
        "target_days": 7,
        "validation_days": 7,
    }

    config = Config({
        "experiment": experiment_section,
        "model": model_section,
        "feature_selection": feature_selection_section,
        "features": feature_names,
        "target": "CartUpdate_Purchase_vs_View",
        "data": data_section,
        "training": training_section,
    })
else:
    # A config file exists next to the notebook: load it as-is
    config = Config.from_file(CONFIG_PATH)

## 2. Create and Setup Experiment

Create an experiment and set it up by loading the data and initializing components.

In [None]:
# Name under which this experiment is created
EXPERIMENT_NAME = "example_experiment"

# Bind the configuration from the previous section to a named experiment
experiment = Experiment(EXPERIMENT_NAME, config)

# Prepare the experiment before running it (load data, initialize components)
experiment.setup()

## 3. Run Standard Experiment

Run a standard experiment with a single train/validation split.

In [None]:
# Switch the already set-up experiment to the STANDARD type
experiment.experiment_type = ExperimentType.STANDARD

# Run the experiment and keep its results dictionary
results = experiment.run()

# Report every metric returned by the run
print("Standard Experiment Metrics:")
for metric, value in results['metrics'].items():
    print(f"  {metric}: {value:.4f}")

# Feature importance is treated as optional elsewhere in this notebook, so
# guard the lookup here too instead of failing with KeyError when it is absent.
if 'feature_importance' in results:
    print("\nTop 5 Important Features:")
    top_features = sorted(
        results['feature_importance'].items(),
        key=lambda item: item[1],  # rank by importance value
        reverse=True,
    )[:5]
    for feature, importance in top_features:
        print(f"  {feature}: {importance:.6f}")

## 4. Full History Experiment

Run a full history experiment with time-aware training.

In [None]:
# Switch the same experiment object to the FULL_HISTORY type
experiment.experiment_type = ExperimentType.FULL_HISTORY

# Run the experiment (this will take longer); note that `results` now holds
# the full-history run and replaces the value from the standard run.
results = experiment.run()

# Report every metric returned by the run
print("Full History Experiment Metrics:")
for metric, value in results['metrics'].items():
    print(f"  {metric}: {value:.4f}")

# Feature importance is treated as optional elsewhere in this notebook, so
# guard the lookup here too instead of failing with KeyError when it is absent.
if 'feature_importance' in results:
    print("\nTop 5 Important Features:")
    top_features = sorted(
        results['feature_importance'].items(),
        key=lambda item: item[1],  # rank by importance value
        reverse=True,
    )[:5]
    for feature, importance in top_features:
        print(f"  {feature}: {importance:.6f}")

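## 5. GPU Acceleration and Feature Selection (with Kaggle Evaluation)

Run an experiment with GPU acceleration and feature selection enabled. If `perform_kaggle_simulation` and `create_submission` are enabled in the configuration, the run is also expected to evaluate the model on a simulated Kaggle test set and create a submission.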

In [ ]:
# Work on a copy of the base configuration and apply the GPU /
# feature-selection overrides to it.
gpu_config = config.copy()
for option, option_value in (
    ('model.use_gpu', True),                 # Enable GPU
    ('feature_selection.enabled', True),     # Enable feature selection
    ('feature_selection.n_features', 8),     # Select top 8 features
):
    gpu_config.set(option, option_value)

# Build, set up and run a dedicated experiment for this configuration
print("Running experiment with GPU acceleration and feature selection...")
gpu_experiment = Experiment("gpu_feature_selection_example", gpu_config)
gpu_experiment.setup()
gpu_results = gpu_experiment.run()

# Metrics
print("\nGPU-accelerated Experiment Metrics:")
for metric_name, metric_value in gpu_results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Selected features, when the run reports them
if 'selected_features' in gpu_results:
    print("\nSelected Features:")
    for feature in gpu_results['selected_features']:
        print(f"  {feature}")

# Feature importance, restricted to the selected features when any were reported
if 'feature_importance' in gpu_results:
    print("\nFeature Importance:")
    selected_features = set(gpu_results.get('selected_features', []))

    # An empty selection means "no filter": keep every importance entry
    filtered_importances = {}
    for name, score in gpu_results['feature_importance'].items():
        if not selected_features or name in selected_features:
            filtered_importances[name] = score

    # Highest importance first
    sorted_features = list(filtered_importances.items())
    sorted_features.sort(key=lambda pair: pair[1], reverse=True)
    for feature, importance in sorted_features:
        print(f"  {feature}: {importance:.6f}")

## 6. Hyperparameter Tuning Example

Run an experiment with hyperparameter tuning.

In [None]:
# Apply the tuning settings to a copy rather than to `config` itself:
# `config` is copied again later in the notebook as a starting point, and
# mutating it here would silently turn those later runs into tuning runs.
tuning_config = config.copy()
tuning_config.set('experiment.type', 'tuning')
tuning_config.set('experiment.use_hyperparameter_tuning', True)
tuning_config.set('hyperparameter_tuning.n_trials', 5)  # Low number for demonstration

# Create and set up a dedicated experiment for tuning
tuning_experiment = Experiment("tuning_example", tuning_config)
tuning_experiment.setup()

# Run tuning (this will take longer)
tuning_results = tuning_experiment.run()

# Report the metrics of the tuning run
print("Tuning Experiment Metrics:")
for metric, value in tuning_results['metrics'].items():
    print(f"  {metric}: {value:.4f}")

# Report the best hyperparameters found
print("\nBest Parameters:")
for param, value in tuning_results['best_params'].items():
    print(f"  {param}: {value}")

## 7. Examining Conversion Features

Let's focus on the new conversion features: `cart_to_purchase_rate` and `purchase_view_ratio`.

In [ ]:
# Start from a copy of the current configuration and restrict the feature
# list to the conversion features plus two reference features.
conversion_features = [
    "cart_to_purchase_rate",
    "purchase_view_ratio",
    "ctr_product",  # Kept for comparison
    "source_type",  # A categorical feature
]
conversion_config = config.copy()
conversion_config.set('features', conversion_features)

# Build, set up and run the experiment
conversion_experiment = Experiment("conversion_features", conversion_config)
conversion_experiment.setup()
conversion_results = conversion_experiment.run()

# Metrics
print("Conversion Features Experiment Metrics:")
for metric_name, metric_value in conversion_results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Feature importance, highest first, when the run reports it
if 'feature_importance' in conversion_results:
    print("\nFeature Importance:")
    sorted_features = list(conversion_results['feature_importance'].items())
    sorted_features.sort(key=lambda pair: pair[1], reverse=True)
    for feature, importance in sorted_features:
        print(f"  {feature}: {importance:.6f}")

# One histogram panel per feature: (column name, panel title, x-axis label)
histogram_panels = [
    ('cart_to_purchase_rate', 'Cart-to-Purchase Rate', 'Rate'),
    ('purchase_view_ratio', 'Purchase-View Ratio', 'Ratio'),
    ('ctr_product', 'Product CTR', 'CTR'),  # for comparison
]

# Best-effort visualization: any failure is reported rather than raised
print("\nAttempting to visualize feature distributions...")
try:
    # Feature frame exposed by the experiment
    features_df = conversion_experiment.last_features

    if features_df is None or features_df.is_empty():
        print("No feature data available for visualization")
    else:
        # Convert to pandas for easier plotting
        pd_df = features_df.to_pandas()

        plt.figure(figsize=(15, 5))
        for position, (column, title, x_label) in enumerate(histogram_panels, start=1):
            plt.subplot(1, 3, position)
            if column not in pd_df.columns:
                # Leave an empty, labelled panel when the column is missing
                plt.title(f'{title} (Not Available)')
                continue
            plt.hist(pd_df[column].dropna(), bins=20, alpha=0.7)
            plt.title(title)
            plt.xlabel(x_label)
            plt.ylabel('Count')

        plt.tight_layout()
        plt.show()
except Exception as error:
    print(f"Could not visualize features: {str(error)}")
    print("This is expected if using a cached model")

## 8. Visualize Comparison

Compare metrics across different strategies.

In [None]:
# Bar chart comparing AUC and nDCG@10 across runs.
#
# `split_configs` and `metrics` are not defined by any earlier cell of this
# notebook, so on a fresh kernel the original lookup raised NameError. Use them
# when some other cell has defined them; otherwise fall back to the results of
# the experiments run above.
try:
    comparison = {split: metrics[split] for split in split_configs.keys()}
    x_label = 'Time Splitting Strategy'
    chart_title = 'Performance Comparison of Time Splitting Strategies'
except NameError:
    # Chart label -> name of the results variable assigned by an earlier cell.
    # `results` holds the full-history run, the last value assigned to it.
    fallback_results = {
        'full_history': 'results',
        'gpu_feature_selection': 'gpu_results',
        'tuning': 'tuning_results',
        'conversion_features': 'conversion_results',
    }
    # Only include runs whose cell was actually executed in this kernel
    comparison = {
        label: globals()[variable]['metrics']
        for label, variable in fallback_results.items()
        if variable in globals()
    }
    x_label = 'Experiment'
    chart_title = 'Performance Comparison of Experiments'

if not comparison:
    print("No experiment metrics available for comparison")
else:
    labels = list(comparison.keys())
    # Missing metrics are plotted as 0 rather than failing
    auc_values = [comparison[label].get('auc', 0) for label in labels]
    ndcg_values = [comparison[label].get('ndcg@10', 0) for label in labels]

    x = range(len(labels))
    width = 0.35

    plt.figure(figsize=(10, 6))
    plt.bar(x, auc_values, width, label='AUC')
    # Shift the second series by one bar width so the bars sit side by side
    plt.bar([i + width for i in x], ndcg_values, width, label='nDCG@10')

    plt.xlabel(x_label)
    plt.ylabel('Metric Value')
    plt.title(chart_title)
    # Center each tick between the two bars of its group
    plt.xticks([i + width/2 for i in x], labels)
    plt.legend()
    plt.tight_layout()
    plt.show()