# TIGER SemanticID on Amazon Beauty — Experiment Plan

Goal: Implement Semantic IDs via RQ-VAE and a compact seq2seq Transformer for generative retrieval on Amazon Beauty 5-core; produce metrics and visualizations validating paper claims.

Datasets: Amazon Product Reviews (Beauty, 5-core).

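Key steps: download and preprocess the data; compute Sentence-T5 item embeddings; train an RQ-VAE (3 levels, codebook size K=256) that maps each item to a 3-tuple of codes, plus a fourth code c4 that disambiguates items whose 3-tuples collide; visualize the codes (c1↔category distribution, c1→c2 hierarchy); train a seq2seq model for generative retrieval; report Recall@5/10, NDCG@5/10, and the invalid-ID rate; run ablations (Random/LSH IDs); run a mini cold-start probe.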

Artifacts: save to /content/artifacts. Keep configs modest for Colab; add knobs for smoke tests.

In [None]:
# Clone the repo and check out the experiment branch when running on Colab.
# (Dependency installation and sys.path setup happen in the next cell.)
try:
    # Importing google.colab only succeeds inside a Colab runtime.
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'
branch_name = '20250908_tiger_dev'

import os
if IN_COLAB:
    # Start from a clean checkout: remove any existing clone first.
    # `repo_dir` is resolved against the current working directory.
    if os.path.exists(repo_dir):
        ! rm -rf $repo_dir
    !git clone $repo_url
    # `%cd` changes the notebook's working directory to the repo, so the
    # relative paths used by later cells resolve against the repo root.
    %cd $repo_dir
    !git fetch --all
    # Fall back to the default branch (with a message) if checkout fails.
    !git checkout $branch_name || echo 'Branch not found; staying on default.'


In [None]:
# Runtime & installs: report GPU availability, install the module's
# requirements, then seed RNGs and create the data/artifact directories.
import os, sys, subprocess, torch
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))

# Install module dependencies (Colab). The requirements path is relative, so
# this expects the working directory to be the repo root.
!pip -q install -r tiger_semantic_id_amazon_beauty/requirements.txt

# Make src importable: prepend its absolute path to sys.path (once).
src_path = os.path.abspath('tiger_semantic_id_amazon_beauty/src')
if src_path not in sys.path: sys.path.insert(0, src_path)

# NOTE(review): this import uses the full package path, so it presumably
# resolves via the working directory (repo root) rather than src_path — confirm.
from tiger_semantic_id_amazon_beauty.src.utils import set_seed, ensure_dirs, Paths
set_seed(42)
ensure_dirs(Paths.data_dir, Paths.artifacts_dir)


In [None]:
# Config dataclass
from dataclasses import dataclass

@dataclass
class Config:
    """Hyperparameters for the TIGER Semantic-ID experiment in this notebook.

    Fields are grouped by pipeline stage: data, item embeddings, RQ-VAE,
    seq2seq retrieval model, and evaluation. Defaults are kept modest so the
    notebook can run on Colab.

    Raises:
        ValueError: If ``seq2seq_heads`` is not a positive divisor of
            ``seq2seq_d_model``.
    """

    # --- Data ---
    dataset_name: str = 'Beauty'
    min_user_interactions: int = 5
    max_hist_len: int = 20  # passed to Seq2SeqConfig / TIGERSeqDataset

    # --- Item embeddings ---
    embed_model_name: str = 'sentence-t5-base'

    # --- RQ-VAE ---
    rqvae_latent_dim: int = 32
    rqvae_levels: int = 3
    rqvae_codebook_size: int = 256
    rqvae_beta: float = 0.25
    rqvae_epochs: int = 20  # Reduced from 50 for faster testing
    rqvae_batch_size: int = 1024
    rqvae_lr: float = 1e-3  # FIXED: Reduced from 4e-1 to 1e-3 for stability

    # --- Seq2seq Transformer ---
    seq2seq_d_model: int = 128
    seq2seq_ff: int = 1024
    seq2seq_heads: int = 8  # Changed from 6 to 8 so it divides 128 evenly
    seq2seq_layers_enc: int = 4
    seq2seq_layers_dec: int = 4
    seq2seq_dropout: float = 0.1
    seq2seq_batch_size: int = 256
    seq2seq_steps: int = 20000
    seq2seq_lr: float = 1e-2
    user_vocab_hash: int = 2000  # passed to VocabConfig as user_vocab_hash

    # --- Evaluation ---
    topk_list: tuple = (5, 10)

    def __post_init__(self) -> None:
        """Fail fast on a head count that cannot split the model width evenly."""
        if self.seq2seq_heads <= 0 or self.seq2seq_d_model % self.seq2seq_heads != 0:
            raise ValueError(
                f"seq2seq_d_model ({self.seq2seq_d_model}) must be divisible by "
                f"seq2seq_heads ({self.seq2seq_heads})"
            )


cfg = Config()
cfg

In [None]:
# Fix for Python dict format in metadata files BEFORE any data imports
def _parse_python_dict_lines(path: str):
    """Parse a line-delimited file of JSON objects or Python dict literals.

    Each non-blank line is first tried as strict JSON (so ``true``/``false``/
    ``null`` are handled) and, if that fails, as a Python literal via
    ``ast.literal_eval`` (so single-quoted dict reprs are handled).

    Args:
        path: File to read. Paths ending in ``.gz`` are opened with gzip;
            anything else is opened as plain text. Text is decoded as UTF-8.

    Returns:
        A list with one parsed object per successfully parsed line, in file
        order. Blank lines and lines that parse as neither JSON nor a Python
        literal are skipped silently (best-effort parsing).
    """
    import ast
    import gzip
    import json

    opener = gzip.open if path.endswith(".gz") else open
    rows = []
    with opener(path, "rt", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
                continue
            except (ValueError, RecursionError):
                # Not strict JSON (JSONDecodeError is a ValueError); fall
                # through to the Python-literal parser.
                pass
            try:
                # literal_eval only evaluates literals, never arbitrary code.
                rows.append(ast.literal_eval(line))
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Skip malformed lines
                continue
    return rows

# Apply the fix BEFORE importing data functions: rebind the data module's
# `_parse_json_lines` attribute to the dict-literal parser defined above.
# NOTE(review): this only takes effect if the loaders look `_parse_json_lines`
# up on the module at call time — presumably they do; confirm in data.py.
from tiger_semantic_id_amazon_beauty.src import data
data._parse_json_lines = _parse_python_dict_lines
print("✓ Applied Python dict parser fix")

# Now import data functions and download data
from tiger_semantic_id_amazon_beauty.src.data import SNAP_REVIEWS, SNAP_META
from tiger_semantic_id_amazon_beauty.src.utils import Paths
# NOTE(review): `!` commands run in a separate shell process, so this `cd`
# does not change the notebook's working directory (`%cd` would).
!cd /content 2>/dev/null || true
!mkdir -p {Paths.data_dir}
# Download both archives quietly; `{...}` is filled in from Python by IPython.
!wget -q -O {Paths.data_dir}/reviews_Beauty_5.json.gz {SNAP_REVIEWS}
!wget -q -O {Paths.data_dir}/meta_Beauty.json.gz {SNAP_META}
# Integrity check: prints 'gz ok' only if both archives pass `gzip -t`.
!gzip -t {Paths.data_dir}/reviews_Beauty_5.json.gz && gzip -t {Paths.data_dir}/meta_Beauty.json.gz && echo 'gz ok'
# Peek at the first two records of each file to confirm the line format.
!zcat -f {Paths.data_dir}/reviews_Beauty_5.json.gz | head -n 2
!zcat -f {Paths.data_dir}/meta_Beauty.json.gz | head -n 2

In [None]:
# Parse and preprocess: load reviews/metadata, split the interactions, map raw
# user/item IDs to integer indices, and build the per-item metadata table
# (`items`) that later cells read.
import pandas as pd
from tiger_semantic_id_amazon_beauty.src.data import (
    BeautyConfig,
    load_reviews_df,
    load_meta_df,
    filter_and_split,
    build_id_maps,
    apply_id_maps,
    save_mappings,
)
reviews = load_reviews_df(f"{Paths.data_dir}/reviews_Beauty_5.json.gz")
meta = load_meta_df(f"{Paths.data_dir}/meta_Beauty.json.gz")
# Merge item_idx later after mapping
train_df, val_df, test_df = filter_and_split(reviews, BeautyConfig())
user2id, item2id = build_id_maps([train_df, val_df, test_df])
save_mappings(Paths.artifacts_dir, user2id, item2id)
train_df = apply_id_maps(train_df, user2id, item2id)
val_df = apply_id_maps(val_df, user2id, item2id)
test_df = apply_id_maps(test_df, user2id, item2id)
# Robust merge: ensure metadata has 'item_id' even if source used 'asin'
meta_merge = meta.copy()
if 'item_id' not in meta_merge.columns and 'asin' in meta_merge.columns:
    meta_merge = meta_merge.rename(columns={'asin': 'item_id'})
if 'item_id' in meta_merge.columns:
    # Keep one metadata row per item: duplicates would add rows to the left
    # merge below, so `items` would no longer have exactly one row per
    # item2id entry.
    meta_merge = meta_merge.drop_duplicates(subset='item_id')
print("Meta columns:", meta.columns.tolist())
print("Meta shape:", meta.shape)
# Left merge keeps every mapped item, in item2id order, even without metadata.
items = pd.DataFrame({'item_id': list(item2id.keys()), 'item_idx': list(item2id.values())}).merge(meta_merge, on='item_id', how='left')
print('Shapes:', train_df.shape, val_df.shape, test_df.shape, items.shape)


In [None]:
# Turn item metadata into text, embed it with Sentence-T5, and persist the result.
import torch
from tiger_semantic_id_amazon_beauty.src.embeddings import build_item_text, encode_items

# One text description per row of `items`
texts = build_item_text(items)
print(f"Built {len(texts)} item text descriptions")

# Pick the device and a matching encode batch size (larger on GPU)
if torch.cuda.is_available():
    device, encode_batch_size = "cuda", 256
else:
    device, encode_batch_size = "cpu", 128
print(f"Encoding items with device: {device}")

item_emb = encode_items(texts, model_name=cfg.embed_model_name, batch_size=encode_batch_size, device=device)

# Persist the embeddings next to the other artifacts
emb_path = f"{Paths.artifacts_dir}/item_embeddings.pt"
torch.save(item_emb, emb_path)
print(f"Saved embeddings to {emb_path}")
print(f"Embeddings device: {item_emb.device}")

# Debug: sanity-check that the input embeddings are not degenerate (e.g. all
# identical) before they are fed to the RQ-VAE. Works on up to the first 10
# items and tolerates smaller smoke-test datasets.
print("\n=== INPUT DATA ANALYSIS ===")
sample_items = item_emb[:10]
n_sample = len(sample_items)
if n_sample >= 2:
    first_two_same = torch.allclose(sample_items[0], sample_items[1])
    all_same = all(torch.allclose(sample_items[0], emb) for emb in sample_items[1:])
    print(f"First two item embeddings identical? {first_two_same}")
    print(f"All {n_sample} sampled embeddings identical? {all_same}")
else:
    print(f"Only {n_sample} item embedding(s) available; skipping identity checks")

# Check actual values
for i, emb in enumerate(sample_items[:3]):
    print(f"Item {i} first 10 dims: {emb[:10]}")

# Check if there's variance within each embedding
for i, emb in enumerate(sample_items[:5]):
    print(f"Item {i} internal variance: {emb.var():.6f}")

print(f"\nFinal embedding shape: {item_emb.shape}")
print(f"Embedding device: {item_emb.device}")

In [None]:
# The improved RQ-VAE architecture is now integrated into the main RQVAE class in rqvae.py.
# No separate ImprovedRQVAE class or train_rqvae patches are needed; this cell only prints a notice.
print("✓ Using improved RQ-VAE architecture from rqvae.py")

In [None]:
# RQ-VAE setup: pick the device, build the model from cfg, and make sure the
# item embeddings live on the same device as the model.
import torch
from tiger_semantic_id_amazon_beauty.src.rqvae import RQVAE, RQVAEConfig, train_rqvae, encode_codes

# Setup device for GPU acceleration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"RQ-VAE training will use device: {device}")

# Create RQ-VAE configuration; input_dim is the embedding width
rqcfg = RQVAEConfig(input_dim=item_emb.shape[1], latent_dim=cfg.rqvae_latent_dim, levels=cfg.rqvae_levels, codebook_size=cfg.rqvae_codebook_size)
rqcfg.beta = cfg.rqvae_beta

print("=== CREATING IMPROVED RQ-VAE MODEL ===")
model = RQVAE(rqcfg).to(device)
print(f"Model created with {sum(p.numel() for p in model.parameters())} parameters")

# Move embeddings to the same device the training will run on.
# Comparing against torch.device("cuda") directly would report a move even for
# a tensor already on "cuda:0" (the index differs), so compare the tensor's
# device before and after `.to()`, which is a no-op when nothing has to move.
source_device = item_emb.device
item_emb = item_emb.to(device)
if item_emb.device != source_device:
    print(f"Moving embeddings from {source_device} to {device}")
else:
    print(f"Embeddings already on {device}")

print("\n=== TESTING MODEL DIVERSITY (PRE-TRAINING) ===") 
# Probe the untrained model: distinct inputs should yield distinct encodings
# and distinct code tuples.
with torch.no_grad():
    # Test encoder diversity: mean distance between distinct encoded items.
    encoded = model.encoder(model.normalize(item_emb[:20]))
    probe = encoded[:10]
    dists = torch.cdist(probe, probe)
    # Exclude the zero self-distances with a mask. Filling the diagonal with
    # inf and then averaging would make the reported mean always inf.
    off_diag = ~torch.eye(probe.shape[0], dtype=torch.bool, device=dists.device)
    print(f"Encoder diversity: mean pairwise distance = {dists[off_diag].mean():.6f}")
    
    # Test quantization diversity: count distinct code rows among the sample.
    z = model.encoder(model.normalize(item_emb[:100]))
    q, codes = model.codebook(z)
    unique_codes = len(torch.unique(codes, dim=0))
    print(f"Quantization diversity: {unique_codes} unique codes out of {len(z)} items")
    print(f"Sample codes: {codes[:5].tolist()}")

# Train the RQ-VAE on the item embeddings and persist its weights.
print(f"\n=== STARTING RQ-VAE TRAINING ON {device.upper()} ===")
print(f"Training parameters: epochs={cfg.rqvae_epochs}, batch_size={cfg.rqvae_batch_size}, lr={cfg.rqvae_lr}")

# Off-GPU the batch size is capped at 512; on CUDA the configured value is used as-is.
training_batch_size = cfg.rqvae_batch_size
if device != "cuda" and training_batch_size > 512:
    training_batch_size = 512
    print(f"Adjusted batch size for {device}: {cfg.rqvae_batch_size} -> {training_batch_size}")

train_kwargs = dict(
    epochs=cfg.rqvae_epochs,
    batch_size=training_batch_size,
    lr=cfg.rqvae_lr,
    device=device,  # explicit device specification
)
model = train_rqvae(model, item_emb, **train_kwargs)

# Save the trained weights alongside the other artifacts
rqvae_path = f"{Paths.artifacts_dir}/rqvae.pt"
torch.save(model.state_dict(), rqvae_path)
print(f"Saved trained model to {rqvae_path}")

# Encode every item into its code tuple and report how many tuples are distinct.
print("\n=== GENERATING SEMANTIC CODES ===")
codes = encode_codes(model, item_emb, device=device)
n_items = len(codes)
final_unique = len(torch.unique(codes, dim=0))
print(f"Final codes shape: {codes.shape}")
print(f"Final unique codes: {final_unique} out of {n_items} items ({100*final_unique/n_items:.1f}% diversity)")
print(f"Sample final codes: {codes[:5].tolist()}")

# Grade the share of unique code rows: >80% excellent, >50% good, otherwise warn.
if final_unique > n_items * 0.8:
    verdict = "✅ Excellent code diversity preserved through training!"
elif final_unique > n_items * 0.5:
    verdict = "✅ Good code diversity maintained"
else:
    verdict = "⚠️  Code diversity may need improvement"
print(verdict)

In [None]:
# Turn the RQ-VAE codes into Semantic IDs; the helper also receives the artifacts dir.
import numpy as np
from tiger_semantic_id_amazon_beauty.src.semantic_id import assign_semantic_ids
semantic_id_outputs = assign_semantic_ids(
    codes,
    Paths.artifacts_dir,
    codebook_size=cfg.rqvae_codebook_size,
)
sid, sid_to_items, prefix_to_items = semantic_id_outputs
# Cell output: shape of the ID array and number of distinct Semantic IDs
(sid.shape, len(sid_to_items))


In [None]:
# Visualizations: c1 vs category, and hierarchy
from tiger_semantic_id_amazon_beauty.src.visualize import plot_c1_category_distribution, plot_hierarchy_c1_c2
# Convert once to a host-side NumPy array: Tensor.numpy() raises for CUDA
# tensors, and `codes` may live on the GPU when training ran there.
codes_np = codes.detach().cpu().numpy()
fig1 = plot_c1_category_distribution(codes_np, items)
fig1.savefig(f"{Paths.artifacts_dir}/figs_c1_category.png")
# The three most frequent first-level codes drive the hierarchy plot.
c1_vals = list(pd.Series(codes_np[:, 0]).value_counts().head(3).index)
fig2 = plot_hierarchy_c1_c2(codes_np, items, c1_vals)
fig2.savefig(f"{Paths.artifacts_dir}/figs_hierarchy.png")
fig1, fig2


In [None]:
# Build per-user, time-ordered item histories and wrap them in the seq2seq dataset.
from collections import defaultdict
from tiger_semantic_id_amazon_beauty.src.seq2seq import TIGERSeqDataset, VocabConfig, Seq2SeqConfig

# user_idx -> item_idx list, ordered by (user_idx, ts)
user_hist = defaultdict(list)
ordered_train = train_df.sort_values(['user_idx','ts'])
for uid, iid in zip(ordered_train['user_idx'], ordered_train['item_idx']):
    user_hist[int(uid)].append(int(iid))

# levels comes from cfg.rqvae_levels so the vocab matches the RQ-VAE configuration (3 levels)
vocab_cfg = VocabConfig(
    codebook_size=cfg.rqvae_codebook_size,
    levels=cfg.rqvae_levels,
    user_vocab_hash=cfg.user_vocab_hash,
)
seq_cfg = Seq2SeqConfig(
    d_model=cfg.seq2seq_d_model,
    ff=cfg.seq2seq_ff,
    heads=cfg.seq2seq_heads,
    layers_enc=cfg.seq2seq_layers_enc,
    layers_dec=cfg.seq2seq_layers_dec,
    dropout=cfg.seq2seq_dropout,
    max_hist_len=cfg.max_hist_len,
    batch_size=cfg.seq2seq_batch_size,
    lr=cfg.seq2seq_lr,
)
train_ds = TIGERSeqDataset(
    user_hist,
    sid,
    user_hash_size=vocab_cfg.user_vocab_hash,
    codebook_size=vocab_cfg.codebook_size,
    max_hist_len=seq_cfg.max_hist_len,
)
# Cell output: dataset size plus a peek at the first example
(len(train_ds), train_ds[0][1][:8], train_ds[0][2])

In [None]:
# Seq2Seq model & training (compact)
import torch
from torch.utils.data import DataLoader
from tiger_semantic_id_amazon_beauty.src.seq2seq import TinyTransformer, collate_batch
V = 1 + vocab_cfg.semantic_vocab + vocab_cfg.user_vocab_hash + 2  # PAD=0, BOS=1, then others
model = TinyTransformer(vocab_size=V, d_model=seq_cfg.d_model, ff=seq_cfg.ff, heads=seq_cfg.heads, layers_enc=seq_cfg.layers_enc, layers_dec=seq_cfg.layers_dec, dropout=seq_cfg.dropout)
use_cuda = torch.cuda.is_available()
model = model.cuda() if use_cuda else model
opt = torch.optim.Adam(model.parameters(), lr=seq_cfg.lr)
loader = DataLoader(train_ds, batch_size=seq_cfg.batch_size, shuffle=True, collate_fn=collate_batch)

# Step budget: honour cfg.seq2seq_steps but cap at 1000 as the Colab runtime knob.
max_steps = min(cfg.seq2seq_steps, 1000)
steps = 0
# Loop over epochs until the budget is spent. A single pass over the loader
# would stop early whenever the dataset has fewer than max_steps batches.
while steps < max_steps:
    steps_at_epoch_start = steps
    for src, tgt in loader:
        if use_cuda:
            src, tgt = src.cuda(), tgt.cuda()
        # Teacher forcing: feed tgt[:, :-1], predict tgt[:, 1:]; PAD (0) is ignored.
        logits = model(src, tgt[:, :-1])
        loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), tgt[:, 1:].reshape(-1), ignore_index=0)
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        steps += 1
        if steps % 200 == 0:
            print('steps', steps, 'loss', float(loss))
        if steps >= max_steps:
            break
    if steps == steps_at_epoch_start:
        break  # empty loader: avoid spinning forever
torch.save(model.state_dict(), f"{Paths.artifacts_dir}/seq2seq.pt")
steps
