# 02: Data Preprocessing

This notebook runs the preprocessing pipeline to:
1. Merge datasets (ratings + metadata via links)
2. Clean and validate data
3. Filter sparse users/items
4. Create time-based train/val/test splits
5. Create cold-start evaluation sets
6. Save processed data and mappings

In [None]:
# Imports
import sys
sys.path.append('..')

import json
import pandas as pd
from src.preprocessing import DataPreprocessor
from src.config import config

# Make sure the project directories managed by config.paths exist before any
# later cell reads from or writes to them.
# NOTE(review): presumably ensure_dirs() only creates missing directories and is
# safe to re-run — confirm in src.config.
config.paths.ensure_dirs()

## Run Preprocessing Pipeline

In [None]:
# Build a DataPreprocessor with its default arguments and execute the whole
# pipeline in one call; the return value of run() (if any) is not used here.
# NOTE(review): the cells below read a statistics JSON and pickled splits from
# paths on config.paths, so run() is presumably what writes them — confirm in
# src.preprocessing. If so, this cell must finish before those cells are run.
preprocessor = DataPreprocessor()
preprocessor.run()

## Load and Verify Processed Data

In [None]:
# Load the statistics JSON from config.paths.statistics_path and print a
# three-part report: counts before filtering, counts after filtering, and the
# number of ratings in each split.


def _format_stat(value):
    """Return `value` formatted for the report.

    Numbers get thousands separators (the `,` format spec). Anything that does
    not support that spec (strings, None, nested containers, ...) falls back to
    `str(value)` so one unexpected entry cannot abort the whole report.
    """
    try:
        return f"{value:,}"
    except (TypeError, ValueError):
        return str(value)


def _print_stats_section(title, entries, suffix=""):
    """Print a `--- title ---` header followed by one `name: value` line per entry.

    Args:
        title: Section heading shown between the dashes.
        entries: Mapping of statistic name to value, printed in mapping order.
        suffix: Text appended after every formatted value (e.g. a unit).
    """
    print(f"\n--- {title} ---")
    for name, value in entries.items():
        print(f"{name}: {_format_stat(value)}{suffix}")


# JSON is UTF-8 by specification; pass the encoding explicitly so the file is
# not decoded with the platform's locale encoding.
with open(config.paths.statistics_path, 'r', encoding='utf-8') as f:
    stats = json.load(f)

print("=" * 60)
print("PREPROCESSING RESULTS")
print("=" * 60)

# A missing section still raises KeyError, exactly as before: a statistics file
# without these keys means the pipeline output is not what this notebook expects.
_print_stats_section("Before Filtering", stats['before_filtering'])
_print_stats_section("After Filtering", stats['after_filtering'])
_print_stats_section("Data Splits", stats['split_counts'], suffix=" ratings")

## Load Data Splits

In [None]:
# Read the four pickled splits back from the locations on config.paths, in the
# order train, validation, test, cold-start test.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; these
# files are presumably the ones written by the preprocessing run above — only
# load pickles from a source you trust.
train_df, val_df, test_df, cold_start_df = (
    pd.read_pickle(split_path)
    for split_path in (
        config.paths.train_path,
        config.paths.val_path,
        config.paths.test_path,
        config.paths.cold_start_test_path,
    )
)

# Report the (rows, columns) shape of every split as a quick sanity check.
print("\nData splits loaded successfully!")
for split_label, split_frame in (
    ("Train", train_df),
    ("Val", val_df),
    ("Test", test_df),
    ("Cold-start test", cold_start_df),
):
    print(f"{split_label}: {split_frame.shape}")

## Examine Processed Data

In [None]:
# Preview the first rows of the training split. display() is the notebook's
# rich renderer (available in Jupyter/IPython kernels without an import), so
# the frame is shown as a formatted table instead of the plain-text dump that
# print() produces.
print("\nTraining data sample:")
display(train_df.head())

# Column dtypes come back as a Series, which renders as compact plain text
# anyway, so print() is kept for this one.
print("\nData types:")
print(train_df.dtypes)

## Summary

In [None]:
# Closing banner: where the processed data lives and what to run next.
banner = "=" * 60
next_steps = (
    "Train baseline models (GMF, MLP, NeuMF)",
    "Train NeuMF+ with content features",
    "Run ablation study",
    "Evaluate on cold-start set",
)

print(f"\n{banner}")
print("PREPROCESSING COMPLETE!")
print(banner)

print("\nProcessed data saved to:")
print(f"  {config.paths.DATA_DIR}")

# Numbered from 1, two-space indent, matching the list layout used above.
print("\nNext steps:")
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"  {step_number}. {step_text}")