# TIGER SemanticID on Amazon Beauty — Experiment Plan

Goal: Implement Semantic IDs via RQ-VAE and a compact seq2seq Transformer for generative retrieval on Amazon Beauty 5-core; produce metrics and visualizations validating paper claims.

Datasets: Amazon Product Reviews (Beauty, 5-core).

Key steps: download and preprocess the data; compute Sentence-T5 item embeddings; train an RQ-VAE (3 levels, codebook size K=256) that maps each item to a 3-tuple of codes, plus a fourth code c4 that disambiguates collisions; visualize the codes (c1↔category, hierarchy); train a seq2seq model for generative retrieval; report Recall@5/10, NDCG@5/10, and the invalid-ID rate; run ablations (Random/LSH); run a mini cold-start probe.

Artifacts: save to /content/artifacts. Keep configs modest for Colab; add knobs for smoke tests.

In [None]:
# Clone repo, install dependencies, and make src importable (Colab-friendly)
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'
branch_name = '20250908_tiger_dev'

import os
if IN_COLAB:
    if not os.path.exists(repo_dir):
        !git clone $repo_url
    %cd $repo_dir
    !git fetch --all
    !git checkout $branch_name || echo 'Branch not found; staying on default.'


In [None]:
# Runtime & installs
import os, sys, subprocess, torch
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))

# Install module dependencies (Colab).
!pip -q install -r tiger_semantic_id_amazon_beauty/requirements.txt

# Make src importable
src_path = os.path.abspath('tiger_semantic_id_amazon_beauty/src')
if src_path not in sys.path: sys.path.insert(0, src_path)

from tiger_semantic_id_amazon_beauty.src.utils import set_seed, ensure_dirs, Paths
set_seed(42)
ensure_dirs(Paths.data_dir, Paths.artifacts_dir)


In [None]:
# Config dataclass
from dataclasses import dataclass

@dataclass
class Config:
    """Hyperparameters for this notebook, gathered in one place.

    Later cells read these values through the module-level ``cfg`` instance
    created right after the class definition.
    """
    # --- dataset / preprocessing ---
    dataset_name: str = 'Beauty'
    min_user_interactions: int = 5
    max_hist_len: int = 20  # forwarded as max_hist_len to Seq2SeqConfig
    # --- item text embedding ---
    embed_model_name: str = 'sentence-t5-base'  # passed to encode_items as model_name
    # --- RQ-VAE (forwarded to RQVAEConfig / train_rqvae) ---
    rqvae_latent_dim: int = 32
    rqvae_levels: int = 3
    rqvae_codebook_size: int = 256
    rqvae_beta: float = 0.25
    rqvae_epochs: int = 50
    rqvae_batch_size: int = 1024
    rqvae_lr: float = 4e-1
    # --- seq2seq Transformer (forwarded to Seq2SeqConfig) ---
    seq2seq_d_model: int = 128
    seq2seq_ff: int = 1024
    # NOTE(review): 128 is not divisible by 6 — confirm TinyTransformer does not
    # require d_model % heads == 0.
    seq2seq_heads: int = 6
    seq2seq_layers_enc: int = 4
    seq2seq_layers_dec: int = 4
    seq2seq_dropout: float = 0.1
    seq2seq_batch_size: int = 256
    seq2seq_steps: int = 20000  # presumably the target number of optimizer steps — TODO confirm
    seq2seq_lr: float = 1e-2
    # --- vocabulary / evaluation ---
    user_vocab_hash: int = 2000  # forwarded to VocabConfig as user_vocab_hash
    topk_list: tuple = (5, 10)  # presumably the K values for Recall@K / NDCG@K — TODO confirm

# Single shared instance; the bare expression below displays it as the cell output.
cfg = Config()
cfg


In [None]:
# Download data
# SNAP_REVIEWS / SNAP_META are the source URLs interpolated into the wget commands below.
from tiger_semantic_id_amazon_beauty.src.data import SNAP_REVIEWS, SNAP_META
from tiger_semantic_id_amazon_beauty.src.utils import Paths
# NOTE(review): each `!` line appears to run in its own subshell, so this `cd`
# likely has no lasting effect on later commands — confirm whether it is needed.
!cd /content 2>/dev/null || true
# Make sure the data directory exists before writing into it.
!mkdir -p {Paths.data_dir}
# Fetch the reviews and metadata archives quietly (-q) to fixed file names (-O).
!wget -q -O {Paths.data_dir}/reviews_Beauty_5.json.gz {SNAP_REVIEWS}
!wget -q -O {Paths.data_dir}/meta_Beauty.json.gz {SNAP_META}
# Integrity-check both archives; 'gz ok' is printed only when both pass.
!gzip -t {Paths.data_dir}/reviews_Beauty_5.json.gz && gzip -t {Paths.data_dir}/meta_Beauty.json.gz && echo 'gz ok'
# Peek at the first two records of each file as a sanity check.
!zcat -f {Paths.data_dir}/reviews_Beauty_5.json.gz | head -n 2
!zcat -f {Paths.data_dir}/meta_Beauty.json.gz | head -n 2


In [None]:
# Parse and preprocess
import pandas as pd
from tiger_semantic_id_amazon_beauty.src.data import (
    BeautyConfig,
    load_reviews_df,
    load_meta_df,
    filter_and_split,
    build_id_maps,
    apply_id_maps,
    save_mappings,
)

reviews = load_reviews_df(f"{Paths.data_dir}/reviews_Beauty_5.json.gz")
meta = load_meta_df(f"{Paths.data_dir}/meta_Beauty.json.gz")

# Split using the data module's default BeautyConfig (imported normally rather
# than through a dynamic __import__ call).
# NOTE(review): cfg.min_user_interactions is not forwarded here — confirm that
# BeautyConfig's defaults match the notebook config.
train_df, val_df, test_df = filter_and_split(reviews, BeautyConfig())

# Build contiguous user/item index maps over all splits, persist them, and
# apply them to every split.
user2id, item2id = build_id_maps([train_df, val_df, test_df])
save_mappings(Paths.artifacts_dir, user2id, item2id)
train_df = apply_id_maps(train_df, user2id, item2id)
val_df = apply_id_maps(val_df, user2id, item2id)
test_df = apply_id_maps(test_df, user2id, item2id)

# Attach item_idx to the metadata. Downstream cells build embeddings, codes and
# semantic IDs from the row order of `items` and then index them by item_idx,
# so `items` must hold exactly one row per mapped item, ordered by item_idx:
#   * drop duplicate metadata rows so the right join cannot multiply rows;
#   * sort by item_idx so row position and item_idx agree (assumes item_idx
#     values are 0..N-1 — TODO confirm against build_id_maps).
# Items without metadata are kept (right join) with missing metadata fields.
item_index = pd.DataFrame({'item_id': list(item2id.keys()), 'item_idx': list(item2id.values())})
items = (
    meta.drop_duplicates(subset='item_id')
    .merge(item_index, on='item_id', how='right')
    .sort_values('item_idx')
    .reset_index(drop=True)
)
print('Shapes:', train_df.shape, val_df.shape, test_df.shape, items.shape)


In [None]:
# Build item text & embed with Sentence-T5
import torch
from tiger_semantic_id_amazon_beauty.src.embeddings import build_item_text, encode_items
# Derive the text inputs from `items`; which fields are used is decided inside build_item_text.
texts = build_item_text(items)
# Encode with the model named in cfg.
# Presumably row i of item_emb corresponds to row i of `items` — TODO confirm.
item_emb = encode_items(texts, model_name=cfg.embed_model_name, batch_size=256)
# Persist the embeddings next to the other artifacts so later runs can reuse them.
torch.save(item_emb, f"{Paths.artifacts_dir}/item_embeddings.pt")
# Display the embedding shape as the cell output.
item_emb.shape


In [None]:
# RQ-VAE training
import torch
from tiger_semantic_id_amazon_beauty.src.rqvae import RQVAE, RQVAEConfig, train_rqvae, encode_codes

# Model configuration: the input width comes from the embeddings, the rest from cfg.
rqcfg = RQVAEConfig(
    input_dim=item_emb.shape[1],
    latent_dim=cfg.rqvae_latent_dim,
    levels=cfg.rqvae_levels,
    codebook_size=cfg.rqvae_codebook_size,
)
# beta is set on the config object after construction, before the model is built.
rqcfg.beta = cfg.rqvae_beta

# Build, train, and checkpoint the quantizer.
model = RQVAE(rqcfg)
model = train_rqvae(
    model,
    item_emb,
    epochs=cfg.rqvae_epochs,
    batch_size=cfg.rqvae_batch_size,
    lr=cfg.rqvae_lr,
)
torch.save(model.state_dict(), f"{Paths.artifacts_dir}/rqvae.pt")

# Encode every item embedding into its codes and display the resulting shape.
codes = encode_codes(model, item_emb)
codes.shape


In [None]:
# Assign Semantic IDs & save maps
import numpy as np
from tiger_semantic_id_amazon_beauty.src.semantic_id import assign_semantic_ids
# Turn the RQ-VAE codes into semantic IDs. The artifacts directory is passed in,
# presumably so the helper can write its lookup maps there — TODO confirm.
# Returns the per-item IDs plus two lookup maps (full ID -> items, prefix -> items).
sid, sid_to_items, prefix_to_items = assign_semantic_ids(codes, Paths.artifacts_dir, codebook_size=cfg.rqvae_codebook_size)
# Display the ID array shape and the number of distinct semantic IDs as the cell output.
sid.shape, len(sid_to_items)


In [None]:
# Visualizations: c1 vs category, and hierarchy
from tiger_semantic_id_amazon_beauty.src.visualize import plot_c1_category_distribution, plot_hierarchy_c1_c2

# Convert the codes to NumPy once and reuse the array for both plots.
codes_np = codes.numpy()

# Figure 1: how first-level codes (c1) relate to item categories.
fig1 = plot_c1_category_distribution(codes_np, items)
fig1.savefig(f"{Paths.artifacts_dir}/figs_c1_category.png")

# Figure 2: c1 -> c2 hierarchy for the three most frequent c1 values.
c1_counts = pd.Series(codes_np[:, 0]).value_counts()
c1_vals = list(c1_counts.head(3).index)
fig2 = plot_hierarchy_c1_c2(codes_np, items, c1_vals)
fig2.savefig(f"{Paths.artifacts_dir}/figs_hierarchy.png")

# Show both figures as the cell output.
fig1, fig2


In [None]:
# Sequence dataset for generative retrieval
from collections import defaultdict
from tiger_semantic_id_amazon_beauty.src.seq2seq import TIGERSeqDataset, VocabConfig, Seq2SeqConfig

# Per-user interaction history as item indices, in (user, timestamp) order.
user_hist = defaultdict(list)
for r in train_df.sort_values(['user_idx', 'ts']).itertuples(index=False):
    user_hist[int(r.user_idx)].append(int(r.item_idx))

# Each semantic ID has one token per RQ-VAE level plus the collision code, so
# derive the level count from the config instead of hard-coding 4 (the value is
# unchanged for the default rqvae_levels=3).
# NOTE(review): assumes assign_semantic_ids always appends exactly one code — confirm.
vocab_cfg = VocabConfig(
    codebook_size=cfg.rqvae_codebook_size,
    levels=cfg.rqvae_levels + 1,
    user_vocab_hash=cfg.user_vocab_hash,
)
seq_cfg = Seq2SeqConfig(
    d_model=cfg.seq2seq_d_model,
    ff=cfg.seq2seq_ff,
    heads=cfg.seq2seq_heads,
    layers_enc=cfg.seq2seq_layers_enc,
    layers_dec=cfg.seq2seq_layers_dec,
    dropout=cfg.seq2seq_dropout,
    max_hist_len=cfg.max_hist_len,
    batch_size=cfg.seq2seq_batch_size,
    lr=cfg.seq2seq_lr,
)
train_ds = TIGERSeqDataset(
    user_hist,
    sid,
    user_hash_size=vocab_cfg.user_vocab_hash,
    codebook_size=vocab_cfg.codebook_size,
    max_hist_len=seq_cfg.max_hist_len,
)

# Display the dataset size and a peek at the first example as the cell output.
len(train_ds), train_ds[0][1][:8], train_ds[0][2]


In [None]:
# Seq2Seq model & training (compact)
import torch
from torch.utils.data import DataLoader
from tiger_semantic_id_amazon_beauty.src.seq2seq import TinyTransformer, collate_batch

# Knob for Colab runtime: train for at most this many optimizer steps, never
# more than the configured budget. Raise the 1000 cap for a full run.
max_steps = min(cfg.seq2seq_steps, 1000)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

V = 1 + vocab_cfg.semantic_vocab + vocab_cfg.user_vocab_hash + 2  # PAD=0, BOS=1, then others
# NOTE: `model` is rebound here and no longer refers to the RQ-VAE from the earlier cell.
model = TinyTransformer(
    vocab_size=V,
    d_model=seq_cfg.d_model,
    ff=seq_cfg.ff,
    heads=seq_cfg.heads,
    layers_enc=seq_cfg.layers_enc,
    layers_dec=seq_cfg.layers_dec,
    dropout=seq_cfg.dropout,
)
model = model.to(device)
model.train()  # make sure dropout is active even if the cell is re-run after evaluation
opt = torch.optim.Adam(model.parameters(), lr=seq_cfg.lr)
loader = DataLoader(train_ds, batch_size=seq_cfg.batch_size, shuffle=True, collate_fn=collate_batch)

steps = 0
# Cycle through the loader for as many epochs as needed: a single pass would
# silently stop early whenever the dataset yields fewer than max_steps batches.
while steps < max_steps:
    steps_at_epoch_start = steps
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        # Teacher forcing: feed the target without its last token and predict
        # the target shifted left by one.
        logits = model(src, tgt[:, :-1])
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            tgt[:, 1:].reshape(-1),
            ignore_index=0,  # PAD positions do not contribute to the loss
        )
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        steps += 1
        if steps % 200 == 0:
            print('steps', steps, 'loss', float(loss))
        if steps >= max_steps:
            break
    if steps == steps_at_epoch_start:
        break  # the loader produced no batches; avoid looping forever

torch.save(model.state_dict(), f"{Paths.artifacts_dir}/seq2seq.pt")
steps
