# Data EDA - Amazon Beauty Dataset

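This notebook investigates why every item text comes out identical in the TIGER SemanticID pipeline, by inspecting the raw Amazon Beauty metadata, the merged items table, and the output of `build_item_text`.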

In [None]:
# Clone repo, install dependencies, and make src importable (Colab-friendly)
try:
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'
branch_name = '20250908_tiger_dev'

import os
if IN_COLAB:
    if not os.path.exists(repo_dir):
        !git clone $repo_url
    %cd $repo_dir
    !git fetch --all
    !git checkout $branch_name || echo 'Branch not found; staying on default.'


In [None]:
# Runtime & setup - mirrors the main notebook
import os, sys, subprocess
import pandas as pd
import numpy as np

# Put the project's src directory at the front of sys.path (only once).
# NOTE(review): the imports below use the package-qualified module path
# `tiger_semantic_id_amazon_beauty.src...`, which resolves from the directory
# that contains the package rather than from `src_path` - confirm whether this
# sys.path entry is still needed.
src_path = os.path.abspath('tiger_semantic_id_amazon_beauty/src')
if src_path not in sys.path:
    sys.path[:0] = [src_path]

# Utilities come first: they seed the RNGs and create the working directories
# that the later cells read from and write to.
from tiger_semantic_id_amazon_beauty.src.utils import (
    set_seed,
    ensure_dirs,
    Paths,
)

SEED = 42
set_seed(SEED)
ensure_dirs(Paths.data_dir, Paths.artifacts_dir)

print(
    f'Data directory: {Paths.data_dir}',
    f'Artifacts directory: {Paths.artifacts_dir}',
    sep='\n',
)

# Data-processing and text-building helpers used by the cells below.
from tiger_semantic_id_amazon_beauty.src.data import (
    load_reviews_df,
    load_meta_df,
    filter_and_split,
    build_id_maps,
    apply_id_maps,
)
from tiger_semantic_id_amazon_beauty.src.embeddings import build_item_text

In [None]:
# Load raw data
print("=== LOADING RAW DATA ===")
reviews = load_reviews_df(f"{Paths.data_dir}/reviews_Beauty_5.json.gz")
meta = load_meta_df(f"{Paths.data_dir}/meta_Beauty.json.gz")

print(f"Reviews shape: {reviews.shape}")
print(f"Meta shape: {meta.shape}")
print(f"Reviews columns: {reviews.columns.tolist()}")
print(f"Meta columns: {meta.columns.tolist()}")

In [None]:
# Download data if needed
from tiger_semantic_id_amazon_beauty.src.data import SNAP_REVIEWS, SNAP_META

# Check if data files exist, download if not
reviews_path = f"{Paths.data_dir}/reviews_Beauty_5.json.gz"
meta_path = f"{Paths.data_dir}/meta_Beauty.json.gz"

if not os.path.exists(reviews_path):
    print("Downloading reviews data...")
    !wget -q -O {reviews_path} {SNAP_REVIEWS}
    
if not os.path.exists(meta_path):
    print("Downloading meta data...")
    !wget -q -O {meta_path} {SNAP_META}
    
print("Data files ready!")

In [None]:
# Overview of the raw metadata frame: a preview, the column dtypes and the
# per-column null counts, each printed under its own heading.
print("=== META DATA ANALYSIS ===")
meta_reports = [
    ("Meta data sample (first 5 rows):", meta.head()),
    ("\nMeta data dtypes:", meta.dtypes),
    ("\nNull values:", meta.isnull().sum()),
]
for heading, report in meta_reports:
    print(heading)
    print(report)

In [None]:
# Check the columns that are expected to carry item information.
# For each one that exists, report its distinct-value count, null count and
# the first three non-null values; otherwise report it as missing.
print("=== CHECKING KEY COLUMNS ===")
key_cols = ['title', 'description', 'category', 'categories', 'brand', 'price']
for col in key_cols:
    if col not in meta.columns:
        print(f"\n{col} column: NOT FOUND")
        continue

    column = meta[col]
    try:
        n_unique = column.nunique()
    except TypeError:
        # nunique() hashes every value and raises TypeError when cells hold
        # lists or dicts. NOTE(review): whether load_meta_df keeps such cells
        # (e.g. nested category lists) is not visible here; falling back to
        # repr-based counting keeps the EDA from crashing if it does.
        n_unique = column.dropna().map(repr).nunique()

    print(f"\n{col} column:")
    print(f"  Unique values: {n_unique}")
    print(f"  Null values: {column.isnull().sum()}")
    print(f"  Sample values: {column.dropna().head(3).tolist()}")

In [None]:
# Reproduce the main notebook's preprocessing: filter/split the reviews,
# build the user and item id maps from all three splits, then remap each split.
print("=== DATA PROCESSING ===")
from tiger_semantic_id_amazon_beauty.src.data import BeautyConfig

train_df, val_df, test_df = filter_and_split(reviews, BeautyConfig())
user2id, item2id = build_id_maps([train_df, val_df, test_df])

# Apply the same maps to every split, in train / val / test order.
train_df, val_df, test_df = [
    apply_id_maps(split_df, user2id, item2id)
    for split_df in (train_df, val_df, test_df)
]

n_mapped_items = len(item2id)
sample_item_ids = list(item2id.keys())[:5]
print(f"Number of unique items in item2id: {n_mapped_items}")
print(f"Sample item_ids: {sample_item_ids}")

In [None]:
# Build the items table the same way the main notebook does: one row per
# mapped item id, left-joined with the metadata on 'item_id' so that items
# without a metadata match are still kept.
print("=== ITEMS DATAFRAME CREATION ===")
item_index = pd.DataFrame({
    'item_id': list(item2id.keys()),
    'item_idx': list(item2id.values()),
})
items = pd.merge(item_index, meta, on='item_id', how='left')

print(f"Items shape: {items.shape}")
print(f"Items columns: {items.columns.tolist()}")
print("\nFirst 5 items:")
print(items.head())

In [None]:
# Check whether the merged items actually carry diverse metadata.
# For each text-bearing column, print how many distinct non-null values it
# has; list them all when there are at most 10, otherwise show the first 5.
print("=== ITEMS DIVERSITY CHECK ===")
for col in ['title', 'description', 'category', 'categories', 'brand']:
    if col not in items.columns:
        print(f"{col}: Column not found")
        continue

    present = items[col].dropna()
    try:
        unique_vals = present.nunique()
        dedup_keys = None
    except TypeError:
        # nunique()/unique() hash every value and raise TypeError when cells
        # hold lists or dicts. NOTE(review): whether the metadata loader keeps
        # such cells is not visible here; de-duplicating on repr() keeps this
        # check from crashing if it does.
        dedup_keys = present.map(repr)
        unique_vals = dedup_keys.nunique()

    total_vals = len(items)
    print(f"{col}: {unique_vals} unique values out of {total_vals} items")
    if unique_vals <= 10:  # Show actual values if small number
        if dedup_keys is None:
            distinct = present.unique().tolist()
        else:
            # Keep the first occurrence of each distinct repr.
            distinct = present[~dedup_keys.duplicated()].tolist()
        print(f"  Values: {distinct}")
    else:
        print(f"  Sample: {present.head(5).tolist()}")

In [None]:
# Run build_item_text on the first 10 items and check whether every
# generated text equals the first one.
print("=== TESTING build_item_text FUNCTION ===")
sample_items = items.head(10)
texts = build_item_text(sample_items)

texts_all_identical = all(texts[0] == text for text in texts)

print(f"Generated {len(texts)} texts")
print(f"All texts identical? {texts_all_identical}")
print("\nFirst 3 generated texts:")
for position, generated in enumerate(texts[:3]):
    # repr form makes empty strings and stray whitespace visible
    print(f"Text {position}: {generated!r}")

In [None]:
# Print the source code of build_item_text so its logic can be read inline.
print("=== EXAMINING build_item_text IMPLEMENTATION ===")
import inspect

build_item_text_source = inspect.getsource(build_item_text)
print("Function source:")
print(build_item_text_source)

In [None]:
# Dump the raw field values of the first 5 items to see what is available
# for building item text.
print("=== MANUAL TEXT BUILDING DEBUG ===")
sample_items = items.head(5)
debug_cols = ['item_id', 'title', 'description', 'category', 'categories', 'brand']

print("Raw item data:")
for position, (_, row) in enumerate(sample_items.iterrows()):
    print(f"\nItem {position}:")
    for col in debug_cols:
        # `in` on a row tests its labels, i.e. whether the column exists
        if col not in row:
            print(f"  {col}: NOT FOUND")
            continue
        print(f"  {col}: {row[col]!r}")