# TIGER SemanticID on Amazon Beauty — Experiment Plan

Goal: Implement Semantic IDs via RQ-VAE and a compact seq2seq Transformer for generative retrieval on Amazon Beauty 5-core; produce metrics and visualizations validating paper claims.

Datasets: Amazon Product Reviews (Beauty, 5-core).

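Key steps: download and preprocess the data; compute Sentence-T5 item embeddings; train an RQ-VAE (3 levels, K=256 codes per level) that maps each item to a 3-tuple of codes, plus a fourth code c4 used to resolve collisions; visualize the codes (c1 ↔ category, c1 → c2 hierarchy); train a seq2seq model for generative retrieval; report Recall@5/10, NDCG@5/10, and the invalid-ID rate; run ablations (Random/LSH IDs); run a mini cold-start probe.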

Artifacts: save to /content/artifacts. Keep configs modest for Colab; add knobs for smoke tests.

In [None]:
# Clone repo, install dependencies, and make src importable (Colab-friendly)
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'
branch_name = '20250908_tiger_dev'

import os
if IN_COLAB:
    if not os.path.exists(repo_dir):
        !git clone $repo_url
    %cd $repo_dir
    !git fetch --all
    !git checkout $branch_name || echo 'Branch not found; staying on default.'


In [None]:
# Runtime & installs
import os, sys, subprocess, torch
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))

# Install module dependencies (Colab).
!pip -q install -r tiger_semantic_id_amazon_beauty/requirements.txt

# Make src importable
src_path = os.path.abspath('tiger_semantic_id_amazon_beauty/src')
if src_path not in sys.path: sys.path.insert(0, src_path)

from tiger_semantic_id_amazon_beauty.src.utils import set_seed, ensure_dirs, Paths
set_seed(42)
ensure_dirs(Paths.data_dir, Paths.artifacts_dir)


In [None]:
# Config dataclass
from dataclasses import dataclass

@dataclass
class Config:
    """Hyperparameters for this notebook's experiment.

    Field prefixes name the stage a knob belongs to: ``rqvae_*`` values are
    forwarded to ``RQVAEConfig`` / the RQ-VAE training call, ``seq2seq_*``
    values are forwarded to ``Seq2SeqConfig``.
    """
    # --- Data ---
    # NOTE(review): dataset_name and min_user_interactions are not read by any
    # cell visible in this notebook section — confirm they are wired in elsewhere.
    dataset_name: str = 'Beauty'
    min_user_interactions: int = 5
    # Forwarded to Seq2SeqConfig / TIGERSeqDataset as max_hist_len.
    max_hist_len: int = 20
    # --- Item text embeddings ---
    # Passed to encode_items(model_name=...).
    embed_model_name: str = 'sentence-t5-base'
    # --- RQ-VAE ---
    rqvae_latent_dim: int = 32
    rqvae_levels: int = 3
    rqvae_codebook_size: int = 256
    # Weight of the VQ loss terms (assigned to RQVAEConfig.beta).
    rqvae_beta: float = 0.25
    rqvae_epochs: int = 20  # Reduced from 50 for faster testing
    rqvae_batch_size: int = 1024
    rqvae_lr: float = 1e-3  # Lowered from 4e-1 because training was unstable at that rate
    # --- Seq2seq Transformer ---
    seq2seq_d_model: int = 128
    seq2seq_ff: int = 1024
    seq2seq_heads: int = 8  # Changed from 6 to 8 so it divides 128 evenly
    seq2seq_layers_enc: int = 4
    seq2seq_layers_dec: int = 4
    seq2seq_dropout: float = 0.1
    seq2seq_batch_size: int = 256
    # NOTE(review): the training cell in this notebook caps steps with its own
    # hard-coded limit and does not read this value — confirm intended usage.
    seq2seq_steps: int = 20000
    seq2seq_lr: float = 1e-2
    # Forwarded to VocabConfig.user_vocab_hash.
    user_vocab_hash: int = 2000
    # Cut-offs for the Recall@K / NDCG@K metrics named in the plan; not read
    # by any cell visible in this section.
    topk_list: tuple = (5, 10)

# Instantiate with defaults; the bare expression displays the repr as cell output.
cfg = Config()
cfg

In [None]:
# Fix for Python dict format in metadata files BEFORE any data imports
def _parse_python_dict_lines(path: str):
    """Parse Python dict lines (not JSON) from a gzipped file using ast.literal_eval."""
    import ast
    import gzip
    
    opener = gzip.open if path.endswith(".gz") else open
    rows = []
    with opener(path, "rt") as f:
        for raw in f:
            try:
                line = raw.strip()
                if line:
                    # Use ast.literal_eval to safely parse Python dict strings
                    data = ast.literal_eval(line)
                    rows.append(data)
            except (ValueError, SyntaxError, MemoryError):
                # Skip malformed lines
                continue
    return rows

# Apply the fix BEFORE importing data functions
# Rebinding the module attribute means code inside `data` that looks up
# `_parse_json_lines` as a module global at call time will use the
# literal_eval-based parser instead.
# NOTE(review): presumably load_reviews_df / load_meta_df call it that way — confirm.
from tiger_semantic_id_amazon_beauty.src import data
data._parse_json_lines = _parse_python_dict_lines
print("✓ Applied Python dict parser fix")

# Now import data functions and download data
from tiger_semantic_id_amazon_beauty.src.data import SNAP_REVIEWS, SNAP_META
from tiger_semantic_id_amazon_beauty.src.utils import Paths
# NOTE(review): each `!` command runs in its own subshell, so this `cd` does
# not change the notebook's working directory; it is effectively a no-op.
!cd /content 2>/dev/null || true
!mkdir -p {Paths.data_dir}
# Fetch the 5-core reviews and the item metadata into the data directory.
!wget -q -O {Paths.data_dir}/reviews_Beauty_5.json.gz {SNAP_REVIEWS}
!wget -q -O {Paths.data_dir}/meta_Beauty.json.gz {SNAP_META}
# Integrity check: prints 'gz ok' only if both archives pass `gzip -t`.
!gzip -t {Paths.data_dir}/reviews_Beauty_5.json.gz && gzip -t {Paths.data_dir}/meta_Beauty.json.gz && echo 'gz ok'
# Preview the first two records of each file to eyeball the line format.
!zcat -f {Paths.data_dir}/reviews_Beauty_5.json.gz | head -n 2
!zcat -f {Paths.data_dir}/meta_Beauty.json.gz | head -n 2

In [None]:
# Parse and preprocess
import pandas as pd
from tiger_semantic_id_amazon_beauty.src.data import load_reviews_df, load_meta_df, filter_and_split, build_id_maps, apply_id_maps, save_mappings
# Same class the original cell fetched through an inline __import__(...) call.
from tiger_semantic_id_amazon_beauty.src.data import BeautyConfig

reviews = load_reviews_df(f"{Paths.data_dir}/reviews_Beauty_5.json.gz")
meta = load_meta_df(f"{Paths.data_dir}/meta_Beauty.json.gz")

# Split first, then build the id maps over all three splits so that every
# user/item appearing in any split gets an index.
# NOTE(review): BeautyConfig() is built with its own defaults;
# cfg.min_user_interactions is not forwarded — confirm that is intended.
train_df, val_df, test_df = filter_and_split(reviews, BeautyConfig())
user2id, item2id = build_id_maps([train_df, val_df, test_df])
save_mappings(Paths.artifacts_dir, user2id, item2id)
train_df = apply_id_maps(train_df, user2id, item2id)
val_df = apply_id_maps(val_df, user2id, item2id)
test_df = apply_id_maps(test_df, user2id, item2id)

# Robust merge: ensure metadata has 'item_id' even if source used 'asin'
meta_merge = meta.copy()
print("Meta columns:", meta.columns.tolist())
print("Meta shape:", meta.shape)
if 'item_id' not in meta_merge.columns and 'asin' in meta_merge.columns:
    meta_merge = meta_merge.rename(columns={'asin': 'item_id'})
# One metadata row per item: duplicated ids would otherwise duplicate rows in
# the left merge and shift every later row relative to its item_idx.
meta_merge = meta_merge.drop_duplicates(subset='item_id')

# Row i of `items` is ordered by item_idx so that per-row outputs computed
# from it (texts, embeddings, codes) line up with item indices.
# NOTE(review): assumes item2id values are contiguous from 0 — confirm.
items = (
    pd.DataFrame({'item_id': list(item2id.keys()), 'item_idx': list(item2id.values())})
    .merge(meta_merge, on='item_id', how='left')
    .sort_values('item_idx')
    .reset_index(drop=True)
)
print('Shapes:', train_df.shape, val_df.shape, test_df.shape, items.shape)


In [None]:
# Build item text & embed with Sentence-T5
import torch
from tiger_semantic_id_amazon_beauty.src.embeddings import build_item_text, encode_items
# Build one text per item, embed it, and persist the embedding matrix.
texts = build_item_text(items)
item_emb = encode_items(texts, model_name=cfg.embed_model_name, batch_size=256)
torch.save(item_emb, f"{Paths.artifacts_dir}/item_embeddings.pt")

# Debug: Check if the input embeddings themselves are diverse
print("=== INPUT DATA ANALYSIS ===")
sample_items = item_emb[:10]
# Bound every index by the real sample size so small smoke-test catalogs
# do not raise IndexError.
n_sample = len(sample_items)

if n_sample >= 2:
    # The first check compares items 0 and 1 only; the label now says so.
    print(f"First 2 item embeddings are identical? {torch.allclose(sample_items[0], sample_items[1])}")
    print(f"All {n_sample} embeddings identical? {all(torch.allclose(sample_items[0], sample_items[i]) for i in range(1, n_sample))}")
else:
    print(f"Only {n_sample} item embedding(s) available; skipping identity checks")

# Check actual values
for i in range(min(3, n_sample)):
    print(f"Item {i} first 10 dims: {sample_items[i][:10]}")

# Check if there's variance within each embedding
for i in range(min(5, n_sample)):
    print(f"Item {i} internal variance: {sample_items[i].var():.6f}")

item_emb.shape

In [None]:
# Fix for RQ-VAE tensor dimension mismatch AND training stability + Encoder Initialization
def improved_init_weights(m):
    """Re-initialize a ``torch.nn.Linear`` module in place; ignore anything else.

    Weights are drawn with Kaiming-uniform initialization (``fan_out`` mode,
    ReLU gain) and the bias, when present, is zeroed. Intended for use with
    ``module.apply(improved_init_weights)``.
    """
    if not isinstance(m, torch.nn.Linear):
        return
    torch.nn.init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')
    bias = m.bias
    if bias is not None:
        torch.nn.init.zeros_(bias)

def fixed_train_rqvae(
    model, data, epochs=50, batch_size=1024, lr=1e-3,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Train an RQ-VAE on per-feature standardized inputs and return it.

    Steps: standardize ``data`` column-wise, re-initialize every Linear layer
    with ``improved_init_weights``, seed the codebook via
    ``model.codebook.kmeans_init`` on encoded samples, then run Adam with
    gradient-norm clipping (max norm 0.5).

    Args:
        model: Module exposing ``encoder``, ``codebook.kmeans_init`` and a
            forward pass that returns a 4-tuple whose second element is the
            scalar training loss.
        data: 2-D tensor of training inputs, one row per item.
        epochs: Maximum number of passes over ``data``.
        batch_size: Mini-batch size; also the sample size for k-means init.
        lr: Adam learning rate.
        device: Device on which the model and data are placed.

    Returns:
        The trained model (moved to ``device``). The standardization
        statistics are attached as ``model.input_mean`` and
        ``model.input_std`` so that inputs can be transformed the same way
        at encoding time.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    import math

    if data.shape[0] == 0:
        raise ValueError("fixed_train_rqvae: `data` is empty; nothing to train on")

    model = model.to(device)
    data = data.to(device)

    # Fix 1: Normalize input data to prevent collapse.
    data_mean = data.mean(dim=0, keepdim=True)
    if data.shape[0] > 1:
        data_std = data.std(dim=0, keepdim=True)
    else:
        # The sample std of a single row is NaN; treat it as zero spread.
        data_std = torch.zeros_like(data_mean)
    data_std = data_std + 1e-8  # epsilon avoids division by zero on constant columns
    data = (data - data_mean) / data_std

    # The model is trained on standardized inputs, so keep the statistics
    # with it. Plain attributes: they do not alter the state_dict.
    # NOTE(review): callers that encode raw embeddings with the trained model
    # should apply (x - input_mean) / input_std first — confirm downstream use.
    model.input_mean = data_mean
    model.input_std = data_std

    # Fix 2: Improved weight initialization for better diversity.
    model.apply(improved_init_weights)
    model.train()

    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # K-means init: the codebook lives in latent space, so it is seeded from
    # encoded samples rather than from raw inputs.
    with torch.no_grad():
        sample = data[torch.randperm(data.shape[0])[: min(batch_size, data.shape[0])]]
        encoded_sample = model.encoder(sample)
        model.codebook.kmeans_init(encoded_sample)

    N = data.shape[0]
    for ep in range(1, epochs + 1):
        perm = torch.randperm(N, device=device)
        total = 0.0
        for i in range(0, N, batch_size):
            xb = data[perm[i : i + batch_size]]
            _, loss, _, _ = model(xb)
            opt.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            opt.step()
            # Weight by batch size so the epoch average is per-sample.
            total += loss.item() * xb.size(0)
        avg_loss = total / N
        if ep % 5 == 0 or ep == 1:
            print(f"[RQVAE] epoch {ep}/{epochs} loss={avg_loss:.4f}")
        # A NaN/inf loss cannot recover; further epochs would only burn time.
        if not math.isfinite(avg_loss):
            print(f"[RQVAE] Stopping at epoch {ep}: loss is not finite ({avg_loss})")
            break
        # Early stopping if loss becomes too small (indicating collapse)
        if avg_loss < 1e-6:
            print(f"[RQVAE] Early stopping at epoch {ep} due to loss collapse")
            break
    return model

# Alternative: Create a better encoder architecture
class ImprovedRQVAE(torch.nn.Module):
    """RQ-VAE variant with a three-layer MLP encoder/decoder and Dropout.

    The encoder is ``input_dim -> 256 -> 128 -> latent_dim`` with ReLU and
    Dropout(0.1) after each hidden layer; the decoder mirrors it without
    Dropout. Both are plain ``Sequential`` stacks (no skip connections).
    Quantization is delegated to the project's ``RQCodebook``.
    """

    def __init__(self, cfg):
        """Build encoder, decoder and codebook from ``cfg``.

        ``cfg`` must provide ``input_dim``, ``latent_dim``, ``levels``,
        ``codebook_size`` and (for ``forward``) ``beta``.
        """
        super().__init__()
        from tiger_semantic_id_amazon_beauty.src.rqvae import RQCodebook

        nn = torch.nn
        self.cfg = cfg

        # Encoder: each hidden stage is Linear -> ReLU -> Dropout.
        enc_layers = []
        for d_in, d_out in ((cfg.input_dim, 256), (256, 128)):
            enc_layers.extend([nn.Linear(d_in, d_out), nn.ReLU(), nn.Dropout(0.1)])
        enc_layers.append(nn.Linear(128, cfg.latent_dim))
        self.encoder = nn.Sequential(*enc_layers)

        # Decoder: mirror of the encoder widths, without Dropout.
        dec_layers = []
        for d_in, d_out in ((cfg.latent_dim, 128), (128, 256)):
            dec_layers.extend([nn.Linear(d_in, d_out), nn.ReLU()])
        dec_layers.append(nn.Linear(256, cfg.input_dim))
        self.decoder = nn.Sequential(*dec_layers)

        self.codebook = RQCodebook(cfg.levels, cfg.codebook_size, cfg.latent_dim)

        # Overwrite the default Linear initialization across all submodules.
        self.apply(improved_init_weights)

    def forward(self, x):
        """Encode, quantize and decode ``x``.

        Returns:
            ``(x_hat, loss, recon, codes)``: the reconstruction, the total
            loss, the reconstruction MSE alone, and the codes returned by the
            codebook.
        """
        F = torch.nn.functional
        latent = self.encoder(x)
        quantized, codes = self.codebook(latent)
        x_hat = self.decoder(quantized)

        recon = F.mse_loss(x_hat, x)
        # Stop-gradient on the latent: this term can only move the quantized
        # output (codebook side).
        codebook_term = F.mse_loss(latent.detach(), quantized)
        # Stop-gradient on the quantized output: this term can only move the
        # encoder (commitment side).
        commitment_term = F.mse_loss(latent, quantized.detach())
        # NOTE(review): beta scales both terms here; the usual VQ-VAE
        # objective applies beta to the commitment term only — confirm intent.
        loss = recon + self.cfg.beta * (codebook_term + commitment_term)
        return x_hat, loss, recon, codes

# Replace the train_rqvae function
# Rebinding the attribute on the module object means any later
# `from ...rqvae import train_rqvae` resolves to fixed_train_rqvae. Names that
# were already bound by an earlier from-import keep pointing at the original
# function, so this cell must run before such imports.
from tiger_semantic_id_amazon_beauty.src import rqvae
rqvae.train_rqvae = fixed_train_rqvae

print("✓ Applied improved RQ-VAE initialization and architecture fixes")

In [None]:
# RQ-VAE training with improved architecture
import torch
from tiger_semantic_id_amazon_beauty.src.rqvae import RQVAE, RQVAEConfig, train_rqvae, encode_codes
rqcfg = RQVAEConfig(
    input_dim=item_emb.shape[1],
    latent_dim=cfg.rqvae_latent_dim,
    levels=cfg.rqvae_levels,
    codebook_size=cfg.rqvae_codebook_size,
)
rqcfg.beta = cfg.rqvae_beta


def _offdiag_distance_stats(dists):
    """Return (mean, min, max) over the off-diagonal entries of a square matrix.

    The diagonal (self-distances, always 0) is masked out rather than filled
    with inf in place: an in-place inf fill makes both the mean and the max
    come out as inf. Returns NaNs when there are fewer than two rows.
    """
    mask = ~torch.eye(dists.shape[0], dtype=torch.bool, device=dists.device)
    pairs = dists[mask]
    if pairs.numel() == 0:
        nan = float('nan')
        return nan, nan, nan
    return pairs.mean(), pairs.min(), pairs.max()


# TEST: Compare original vs improved model
print("=== COMPARING ORIGINAL VS IMPROVED MODEL ===")

# Original model
print("\n--- ORIGINAL MODEL ---")
model_orig = RQVAE(rqcfg)
model_orig.eval()  # measure without Dropout noise
with torch.no_grad():
    encoded_orig = model_orig.encoder(item_emb[:20])
model_orig.train()  # restore the freshly-constructed (training) mode
dists_orig = torch.cdist(encoded_orig[:10], encoded_orig[:10])
mean_d, min_d, max_d = _offdiag_distance_stats(dists_orig)
print(f"Original encoder output diversity:")
print(f"  Mean pairwise distance: {mean_d:.6f}")
print(f"  Min distance: {min_d:.6f}")
print(f"  Max distance: {max_d:.6f}")

# Improved model
print("\n--- IMPROVED MODEL ---")
model_improved = ImprovedRQVAE(rqcfg)
model_improved.eval()  # its encoder contains Dropout layers
with torch.no_grad():
    encoded_improved = model_improved.encoder(item_emb[:20])
model_improved.train()
dists_improved = torch.cdist(encoded_improved[:10], encoded_improved[:10])
mean_d, min_d, max_d = _offdiag_distance_stats(dists_improved)
print(f"Improved encoder output diversity:")
print(f"  Mean pairwise distance: {mean_d:.6f}")
print(f"  Min distance: {min_d:.6f}")
print(f"  Max distance: {max_d:.6f}")

# Test quantization diversity
print("\n--- QUANTIZATION DIVERSITY TEST ---")
def test_quantization_diversity(model, data, name):
    codes = []
    with torch.no_grad():
        encoded = model.encoder(data)
        q, code_batch = model.codebook(encoded)
        codes = code_batch
    
    unique_codes = torch.unique(codes, dim=0)
    print(f"{name} - Unique code combinations: {len(unique_codes)} out of {len(data)}")
    print(f"  Sample codes: {codes[:5].tolist()}")
    return len(unique_codes)

orig_diversity = test_quantization_diversity(model_orig, item_emb[:100], "Original")
improved_diversity = test_quantization_diversity(model_improved, item_emb[:100], "Improved")

# Use the better model (ties go to the original)
if improved_diversity > orig_diversity:
    print(f"\n✓ Using improved model (diversity: {improved_diversity} vs {orig_diversity})")
    model = model_improved
else:
    print(f"\n✓ Using original model (diversity: {orig_diversity} vs {improved_diversity})")
    model = model_orig

print("\n=== FINAL MODEL ANALYSIS ===")
# Final detailed analysis of chosen model, measured with Dropout disabled.
model.eval()
with torch.no_grad():
    sample_encoded = model.encoder(item_emb[:10])
    final_codes = encode_codes(model, item_emb[:50])
model.train()  # leave the model ready for the (currently disabled) training step

# Average only the off-diagonal distances: filling the diagonal with inf
# before taking the mean always yields inf.
sample_dists = torch.cdist(sample_encoded, sample_encoded)
off_diag = sample_dists[~torch.eye(sample_dists.shape[0], dtype=torch.bool, device=sample_dists.device)]
mean_dist = off_diag.mean() if off_diag.numel() else float('nan')

print(f"Final encoder diversity: mean pairwise distance = {mean_dist:.6f}")
print(f"Final quantization: {len(torch.unique(final_codes, dim=0))} unique codes out of 50 items")
print(f"Sample final codes: {final_codes[:10]}")

# Skip training for now to test architecture
# model = train_rqvae(model, item_emb, epochs=cfg.rqvae_epochs, batch_size=cfg.rqvae_batch_size, lr=cfg.rqvae_lr)
# torch.save(model.state_dict(), f"{Paths.artifacts_dir}/rqvae.pt")
# NOTE(review): `codes` covers only the first 50 items and comes from an
# untrained model; later cells that look up codes by item index presumably
# need codes for the full catalog — confirm before a real run.
codes = final_codes[:50]  # Use for testing
print(f"codes.shape {codes.shape}")

In [None]:
# Assign Semantic IDs & save maps
import numpy as np
from tiger_semantic_id_amazon_beauty.src.semantic_id import assign_semantic_ids
# Turn the per-item RQ-VAE codes into Semantic IDs. The artifacts directory is
# passed through to the helper (per the cell title, the maps are saved there).
# NOTE(review): `sid` presumably holds one row per row of `codes`, extended
# with the collision-resolving code from the plan — confirm in semantic_id.py.
sid, sid_to_items, prefix_to_items = assign_semantic_ids(codes, Paths.artifacts_dir, codebook_size=cfg.rqvae_codebook_size)
# Cell output: shape of the ID array and number of entries in sid_to_items.
sid.shape, len(sid_to_items)


In [None]:
# Visualizations: c1 vs category, and hierarchy
from tiger_semantic_id_amazon_beauty.src.visualize import plot_c1_category_distribution, plot_hierarchy_c1_c2
# Convert the code tensor once and reuse the array for both figures.
codes_np = codes.numpy()

# Figure 1: distribution of item categories per first-level code (c1).
fig1 = plot_c1_category_distribution(codes_np, items)
fig1.savefig(f"{Paths.artifacts_dir}/figs_c1_category.png")

# Figure 2: c1 -> c2 hierarchy for the three most frequent c1 values.
c1_counts = pd.Series(codes[:,0].numpy()).value_counts()
c1_vals = list(c1_counts.head(3).index)
fig2 = plot_hierarchy_c1_c2(codes_np, items, c1_vals)
fig2.savefig(f"{Paths.artifacts_dir}/figs_hierarchy.png")

fig1, fig2


In [None]:
# Sequence dataset for generative retrieval
from collections import defaultdict
from tiger_semantic_id_amazon_beauty.src.seq2seq import TIGERSeqDataset, VocabConfig, Seq2SeqConfig
# Per-user item history in chronological order (sorted by user, then timestamp).
ordered_train = train_df.sort_values(['user_idx','ts'])
user_hist = defaultdict(list)
for uid, iid in zip(ordered_train['user_idx'], ordered_train['item_idx']):
    user_hist[int(uid)].append(int(iid))

# The vocabulary uses cfg.rqvae_levels so that it matches the RQ-VAE
# configuration (3 levels).
vocab_cfg = VocabConfig(
    codebook_size=cfg.rqvae_codebook_size,
    levels=cfg.rqvae_levels,
    user_vocab_hash=cfg.user_vocab_hash,
)
seq_cfg = Seq2SeqConfig(
    d_model=cfg.seq2seq_d_model,
    ff=cfg.seq2seq_ff,
    heads=cfg.seq2seq_heads,
    layers_enc=cfg.seq2seq_layers_enc,
    layers_dec=cfg.seq2seq_layers_dec,
    dropout=cfg.seq2seq_dropout,
    max_hist_len=cfg.max_hist_len,
    batch_size=cfg.seq2seq_batch_size,
    lr=cfg.seq2seq_lr,
)
train_ds = TIGERSeqDataset(
    user_hist,
    sid,
    user_hash_size=vocab_cfg.user_vocab_hash,
    codebook_size=vocab_cfg.codebook_size,
    max_hist_len=seq_cfg.max_hist_len,
)

# Cell output: dataset size plus a peek at the first example.
len(train_ds), train_ds[0][1][:8], train_ds[0][2]

In [None]:
# Seq2Seq model & training (compact)
import torch
from torch.utils.data import DataLoader
from tiger_semantic_id_amazon_beauty.src.seq2seq import TinyTransformer, collate_batch
from itertools import islice

V = 1 + vocab_cfg.semantic_vocab + vocab_cfg.user_vocab_hash + 2  # PAD=0, BOS=1, then others

# Resolve the device once instead of re-querying CUDA for every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TinyTransformer(
    vocab_size=V,
    d_model=seq_cfg.d_model,
    ff=seq_cfg.ff,
    heads=seq_cfg.heads,
    layers_enc=seq_cfg.layers_enc,
    layers_dec=seq_cfg.layers_dec,
    dropout=seq_cfg.dropout,
).to(device)
opt = torch.optim.Adam(model.parameters(), lr=seq_cfg.lr)
loader = DataLoader(train_ds, batch_size=seq_cfg.batch_size, shuffle=True, collate_fn=collate_batch)

# Step budget: honor cfg.seq2seq_steps (so smoke tests can lower it) but keep
# the 1000-step ceiling used as the Colab runtime knob. With the default
# config this is still 1000. A single pass over the loader is made, so the
# run may end earlier when the dataset has fewer batches.
max_steps = min(cfg.seq2seq_steps, 1000)

model.train()
steps = 0
for src, tgt in islice(loader, max_steps):
    src, tgt = src.to(device), tgt.to(device)
    # Teacher forcing: feed tgt[:, :-1], predict tgt[:, 1:]; index 0 (PAD) is
    # excluded from the loss.
    logits = model(src, tgt[:, :-1])
    loss = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)),
        tgt[:, 1:].reshape(-1),
        ignore_index=0,
    )
    opt.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    steps += 1
    if steps % 200 == 0:
        print('steps', steps, 'loss', float(loss))
torch.save(model.state_dict(), f"{Paths.artifacts_dir}/seq2seq.pt")
steps
