# Colab Data Preparation Launcher

This notebook is designed for Google Colab. It will:

1. Clone the project repo
2. Install dependencies
3. Run the MovieLens baseline demo (optional)
4. Run the Census-Income (KDD) preparation script (PLE) and sanity-check DataLoaders

In [None]:
# Detect Colab, clone the repo, install dependencies, and set up paths
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'

import os, sys
if IN_COLAB:
    if not os.path.exists(repo_dir):
        !git clone $repo_url
    %cd $repo_dir
    # Install base and PLE experiment requirements
    !pip -q install -r requirements.txt
    if os.path.exists('ple_experiment/requirements.txt'):
        !pip -q install -r ple_experiment/requirements.txt
    # Make src importable
    src_path = os.path.abspath('src')
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
else:
    # Local usage: ensure src is importable
    repo_root = os.getcwd()
    src_path = os.path.join(repo_root, 'src')
    if src_path not in sys.path:
        sys.path.insert(0, src_path)


## Optional: Quick MovieLens baseline demo

In [None]:
# Optional demo: split MovieLens-100K and report the ten most popular movie IDs
# computed from the training portion only
import pandas as pd
from sklearn.model_selection import train_test_split
from data.movielens import load_movielens_100k
from models.popularity import get_top_n

df = load_movielens_100k()

# Fixed seed keeps the 80/20 split reproducible between runs
split_options = {'test_size': 0.2, 'random_state': 42}
train_df, test_df = train_test_split(df, **split_options)

top10 = get_top_n(train_df, n=10)
print('Top-10 movie IDs by popularity (train):', top10)


## Prepare Census-Income (KDD) for PLE

In [None]:
# Runs the PLE prep script to produce train/val/test arrays and feature metadata
output_dir = './data/census_kdd'
!python ple_experiment/prepare_census_income.py \
+        --output_dir $output_dir \
+        --test_size 0.15 \
+        --val_size 0.10 \
+        --batch_size 4096 \
+        --num_workers 4 \
+        --onehot_min_freq 10 \
+        --seed 42


In [None]:
# Sanity check: list what is in output_dir, then load one split and one batch
import os, json, numpy as np
from ple_experiment.dataset import CensusKDDDataset, make_dataloaders

# Show at most the first ten directory entries, in sorted order
print('Files in output_dir:')
artifact_names = sorted(os.listdir(output_dir))
print(artifact_names[:10])

# Open the training split and report the shapes of its X, y_income and y_never attributes
ds = CensusKDDDataset(output_dir, split='train')
train_shapes = (ds.X.shape, ds.y_income.shape, ds.y_never.shape)
print('Train shapes:', *train_shapes)

# Build the loaders and pull a single batch from the 'train' loader
loaders = make_dataloaders(output_dir, num_workers=2, batch_size=1024)
batch = next(iter(loaders['train']))
batch_shapes = [batch[key].shape for key in ('x', 'y_income', 'y_never_married')]
print('Batch shapes:', *batch_shapes)
