# Data EDA - Amazon Beauty Dataset

Investigating why all item texts are identical in the TIGER SemanticID pipeline.

In [None]:
# Colab bootstrap: clone the repository and switch to the working branch.
# NOTE: despite the original description, no dependencies are installed in this
# cell and nothing is added to sys.path here; it only prepares the checkout.
try:
    # The import succeeding is used as the signal that we are running on Colab.
    import google.colab  # type: ignore
    IN_COLAB = True
except Exception:
    # Any failure to import is treated as "not on Colab".
    IN_COLAB = False

# Repository location, local checkout directory, and the branch to inspect.
repo_url = 'https://github.com/allyoushawn/recsys_playground.git'
repo_dir = 'recsys_playground'
branch_name = '20250908_tiger_dev'

import os
if IN_COLAB:
    # Clone only when the checkout directory is not already present.
    if not os.path.exists(repo_dir):
        !git clone $repo_url
    # Every later relative path in the notebook is resolved from inside the checkout.
    %cd $repo_dir
    !git fetch --all
    # If the branch cannot be checked out, print a notice and keep the current branch.
    !git checkout $branch_name || echo 'Branch not found; staying on default.'


In [None]:
# Runtime & setup - same as main notebook
# NOTE(review): `subprocess` and `np` are not referenced in the cells visible
# here; they are kept because other parts of the notebook may rely on them.
import os, sys, subprocess
import pandas as pd
import numpy as np

# Make src importable - same path structure as main notebook
# NOTE(review): the imports below use the package-qualified name
# `tiger_semantic_id_amazon_beauty.src...`, which resolves from the repository
# root rather than from `src_path`; confirm whether this sys.path entry is needed.
src_path = os.path.abspath('tiger_semantic_id_amazon_beauty/src')
if src_path not in sys.path: 
    sys.path.insert(0, src_path)

# Import utilities first to setup paths and ensure directories exist
from tiger_semantic_id_amazon_beauty.src.utils import set_seed, ensure_dirs, Paths
# Fixed seed (42) so repeated runs of the notebook are comparable.
set_seed(42)
# Called with the data and artifacts directories before anything reads from them.
ensure_dirs(Paths.data_dir, Paths.artifacts_dir)

print(f'Data directory: {Paths.data_dir}')
print(f'Artifacts directory: {Paths.artifacts_dir}')

# Now import data processing functions
from tiger_semantic_id_amazon_beauty.src.data import load_reviews_df, load_meta_df, filter_and_split, build_id_maps, apply_id_maps
from tiger_semantic_id_amazon_beauty.src.embeddings import build_item_text

In [None]:
# Look at the downloaded archives directly, before any project loader touches them
import gzip
import json

print("=== RAW FILE INSPECTION ===")

# Expected locations of the two raw archives
reviews_path = f"{Paths.data_dir}/reviews_Beauty_5.json.gz"
meta_path = f"{Paths.data_dir}/meta_Beauty.json.gz"

print(f"Reviews file exists: {os.path.exists(reviews_path)}")
print(f"Meta file exists: {os.path.exists(meta_path)}")

if not os.path.exists(meta_path):
    print("Meta file not found - need to download first")
else:
    print(f"\nMeta file size: {os.path.getsize(meta_path)} bytes")

    # Try strict JSON on the first three lines and report the keys (or the failure)
    print("First 3 lines of raw meta file:")
    with gzip.open(meta_path, 'rt') as f:
        for line_no, raw_line in enumerate(f, start=1):
            if line_no > 3:
                break
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError as err:
                print(f"Line {line_no}: JSON decode error: {err}")
                print(f"  Raw line: {repr(raw_line[:100])}...")
                continue
            print(f"Line {line_no}: {list(record.keys())}")
            if line_no == 1:
                # Only the very first record is dumped in full
                print(f"  Sample data: {record}")

    # Second pass over the archive just to count its lines
    print(f"\nCounting total lines in meta file...")
    line_count = 0
    with gzip.open(meta_path, 'rt') as f:
        for _unused_line in f:
            line_count += 1
    print(f"Total lines in meta file: {line_count}")

In [None]:
# The metadata file holds one Python dict literal per line rather than strict
# JSON, so json.loads rejects it; ast.literal_eval can read it.
import gzip
import ast  # literal_eval evaluates Python literals without executing code

print("=== PARSING WITH ast.literal_eval ===")

# Use meta_Beauty_5.json.gz when it exists, otherwise fall back to meta_Beauty.json.gz
meta_path = f"{Paths.data_dir}/meta_Beauty_5.json.gz"
if not os.path.exists(meta_path):
    meta_path = f"{Paths.data_dir}/meta_Beauty.json.gz"

# Preview the first three records with the literal parser
print("First 3 records using ast.literal_eval:")
with gzip.open(meta_path, 'rt') as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        try:
            data = ast.literal_eval(line.strip())
            # "\n" (not "\\n") so a real blank line separates the records
            print(f"\nRecord {i+1}:")
            print(f"  Keys: {list(data.keys())}")
            print(f"  asin: {data.get('asin', 'N/A')}")
            print(f"  title: {data.get('title', 'N/A')}")
            # str() keeps the preview from raising if the description is not a string
            print(f"  description: {str(data.get('description', 'N/A'))[:100]}...")
            print(f"  brand: {data.get('brand', 'N/A')}")
            print(f"  categories: {data.get('categories', 'N/A')}")
        except (ValueError, SyntaxError) as e:
            print(f"Record {i+1}: Parse error: {e}")
            print(f"  Raw line: {repr(line[:100])}...")

print("\n=== CREATING FIXED PARSER FUNCTION ===")

def _parse_python_dict_lines(path: str):
    """Parse Python dict lines (not JSON) from a gzipped file."""
    import ast
    
    opener = gzip.open if path.endswith(".gz") else open
    rows = []
    with opener(path, "rt") as f:
        for line_num, raw in enumerate(f):
            try:
                line = raw.strip()
                if line:
                    # Use ast.literal_eval to parse Python dict strings safely
                    data = ast.literal_eval(line)
                    rows.append(data)
            except (ValueError, SyntaxError, MemoryError) as e:
                # Skip malformed lines
                continue
                
    return rows

# Run the parser over the metadata file selected above.
# The call parses the entire file, so the message says so (it previously
# claimed to read only the first 10 records).
print("Testing fixed parser on the full meta file...")
sample_data = _parse_python_dict_lines(meta_path)
print(f"Successfully parsed {len(sample_data)} records")

if sample_data:
    # "\n" (not "\\n") so a real blank line precedes the sample
    print(f"\nSample record keys: {list(sample_data[0].keys())}")
    print(f"Sample record: {sample_data[0]}")

# Apply the fix to the data.py functions
print("\n=== PATCHING DATA LOADING FUNCTIONS ===")
from tiger_semantic_id_amazon_beauty.src import data

# Monkeypatch the module-level parser with the Python-dict parser.
# NOTE(review): this presumes the loaders in `data` look up `_parse_json_lines`
# on the module at call time - verify against data.py.
data._parse_json_lines = _parse_python_dict_lines
print("✓ Patched data._parse_json_lines with Python dict parser")

In [None]:
# Load reviews and metadata through the project loaders.
# NOTE(review): this relies on the `data._parse_json_lines` patch from the
# previous cell having been applied - run that cell first.
print("=== LOADING RAW DATA WITH FIXED PARSER ===")
reviews = load_reviews_df(f"{Paths.data_dir}/reviews_Beauty_5.json.gz")
meta = load_meta_df(f"{Paths.data_dir}/meta_Beauty.json.gz")

print(f"Reviews shape: {reviews.shape}")
print(f"Meta shape: {meta.shape}")
print(f"Reviews columns: {reviews.columns.tolist()}")
print(f"Meta columns: {meta.columns.tolist()}")

# The section headers below use "\n" (not "\\n") so they start on a fresh line
# instead of printing a literal backslash-n.
print("\nMeta data sample (first 3 rows):")
print(meta.head(3))

print("\nMeta data dtypes:")
print(meta.dtypes)

print("\nNull values in meta:")
print(meta.isnull().sum())

In [None]:
# Profile the metadata columns expected to carry item information; list-valued columns are handled
print("=== CHECKING KEY COLUMNS ===")
key_cols = [
    'title',
    'description',
    'category',
    'categories',
    'brand',
    'price',
]

def safe_analyze_column(df, col):
    """Print a short profile of one column, tolerating list-valued cells.

    A missing column is reported and nothing else is printed. Otherwise the
    first non-null value decides the report: list columns get non-null and
    null counts, other columns get unique and null counts. Both variants end
    with up to three sample values. Nothing is returned.
    """
    if col not in df.columns:
        print(f"\n{col} column: NOT FOUND")
        return

    print(f"\n{col} column:")

    # Non-null values are reused for the type probe and for the samples
    present = df[col].dropna()
    first_value = None if present.empty else present.iloc[0]
    preview = present.head(3).tolist()

    if isinstance(first_value, list):
        # Lists are unhashable, so nunique() is not attempted here
        print(f"  Non-null values: {present.shape[0]} (contains lists)")
        print(f"  Null values: {df[col].isnull().sum()}")
        print(f"  Sample values: {preview}")
        return

    try:
        report = [
            f"  Unique values: {df[col].nunique()}",
            f"  Null values: {df[col].isnull().sum()}",
            f"  Sample values: {preview}",
        ]
    except Exception as err:
        # Fall back to the error text plus samples when the column cannot be profiled
        report = [
            f"  Analysis error: {err}",
            f"  Sample values: {preview}",
        ]
    for report_line in report:
        print(report_line)

# One report per key metadata column
for column_name in key_cols:
    safe_analyze_column(meta, column_name)

In [None]:
# Rebuild the splits and ID maps following the main notebook's steps
print("=== DATA PROCESSING ===")
from tiger_semantic_id_amazon_beauty.src.data import BeautyConfig

train_df, val_df, test_df = filter_and_split(reviews, BeautyConfig())
user2id, item2id = build_id_maps([train_df, val_df, test_df])

# Apply the same user/item maps to every split, in train/val/test order
train_df, val_df, test_df = [
    apply_id_maps(split_df, user2id, item2id)
    for split_df in (train_df, val_df, test_df)
]

sample_item_ids = list(item2id.keys())[:5]
print(f"Number of unique items in item2id: {len(item2id)}")
print(f"Sample item_ids: {sample_item_ids}")

In [None]:
# Build the per-item table following the main notebook: ID map first, then metadata
print("=== ITEMS DATAFRAME CREATION ===")

# One row per entry in item2id
item_index = pd.DataFrame({
    'item_id': list(item2id.keys()),
    'item_idx': list(item2id.values()),
})
# Left join keeps every mapped item even when no metadata row matches
items = item_index.merge(meta, on='item_id', how='left')

print(f"Items shape: {items.shape}")
print(f"Items columns: {items.columns.tolist()}")
print(f"\nFirst 5 items:")
print(items.head())

In [None]:
# Diversity check on the merged items table; list-valued columns are handled.
# Note: this definition replaces the earlier `safe_analyze_column` for the rest of the notebook.
print("=== ITEMS DIVERSITY CHECK ===")

def safe_analyze_column(df, col):
    """Print how varied one column is, tolerating list-valued cells.

    A missing column is reported and nothing else is printed. The first
    non-null value decides the report: list columns get a non-null count,
    up to three samples and list-length statistics; other columns get a
    unique count plus either all distinct values (10 or fewer) or the first
    five non-null values. Nothing is returned.
    """
    if col not in df.columns:
        print(f"{col}: Column not found")
        return

    # Non-null values are reused for the type probe, samples and statistics
    present = df[col].dropna()
    first_value = None if present.empty else present.iloc[0]
    n_items = len(df)

    if isinstance(first_value, list):
        n_present = present.shape[0]
        print(f"{col}: {n_present} non-null values out of {n_items} items (contains lists)")
        print("  Sample values (first 3):")
        for rank, value in enumerate(present.head(3), start=1):
            print(f"    {rank}: {value}")

        # Summarise list lengths; non-list cells count as length 0
        if n_present > 0:
            lengths = present.apply(lambda cell: len(cell) if isinstance(cell, list) else 0)
            print(f"  List lengths - min: {lengths.min()}, max: {lengths.max()}, mean: {lengths.mean():.1f}")
        return

    try:
        n_unique = df[col].nunique(dropna=True)
        print(f"{col}: {n_unique} unique values out of {n_items} items")

        if n_unique > 10:
            print(f"  Sample: {present.head(5).tolist()}")
        else:
            # Few enough distinct values to list them all
            print(f"  Values: {present.unique().tolist()}")
    except Exception as err:
        print(f"{col}: Analysis error - {err}")
        print(f"  Sample: {present.head(3).tolist()}")

# Text-bearing columns to check on the merged items table
diversity_cols = ['title', 'description', 'category', 'categories', 'brand']
for column_name in diversity_cols:
    safe_analyze_column(items, column_name)

In [None]:
# Run build_item_text on a small slice and check whether every output matches the first one
print("=== TESTING build_item_text FUNCTION ===")
sample_items = items.head(10)  # first 10 items only
texts = build_item_text(sample_items)

text_count = len(texts)
all_identical = all(texts[0] == text for text in texts)
print(f"Generated {text_count} texts")
print(f"All texts identical? {all_identical}")

# repr() makes empty strings and stray whitespace visible
print(f"\nFirst 3 generated texts:")
for i, text in enumerate(texts[:3]):
    print(f"Text {i}: {text!r}")

In [None]:
# Print the source of build_item_text to see which columns it reads
print("=== EXAMINING build_item_text IMPLEMENTATION ===")
import inspect
print("Function source:")
source_text = inspect.getsource(build_item_text)
print(source_text)

In [None]:
# Print the raw field values for a few items to compare against the generated text
print("=== MANUAL TEXT BUILDING DEBUG ===")
sample_items = items.head(5)
debug_fields = ['item_id', 'title', 'description', 'category', 'categories', 'brand']

print("Raw item data:")
for i, (_, item) in enumerate(sample_items.iterrows()):
    print(f"\nItem {i}:")
    for col in debug_fields:
        # Membership is checked on the row itself, so absent columns are reported
        shown = repr(item[col]) if col in item else "NOT FOUND"
        print(f"  {col}: {shown}")