# Item Lookup (ASIN â†’ metadata)

Provide a list of Amazon item IDs (ASINs) and this notebook will fetch their metadata (title, category, brand, price) from the SNAP metadata file.


In [1]:
# Set your ASIN list here (from API output)
ASINS = [
    "B007WTAJTO",
    "B003ES5ZUU",
    "B00DR0PDNE",
    "B0019EHU8G",
    "B002WE6D44",
    "B003ELYQGG",
    "B0002L5R78",
    "B009SYZ8OC",
    "B00BGGDVOO",
    "B002V88HFE",
]
ASINS = list(dict.fromkeys(ASINS))  # de-duplicate preserving order
ASINS


['B007WTAJTO',
 'B003ES5ZUU',
 'B00DR0PDNE',
 'B0019EHU8G',
 'B002WE6D44',
 'B003ELYQGG',
 'B0002L5R78',
 'B009SYZ8OC',
 'B00BGGDVOO',
 'B002V88HFE']

In [2]:
from pathlib import Path
import requests

RAW_META_URL = "https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz"
meta_path = Path("../data/raw/meta_Electronics.json.gz")
meta_path.parent.mkdir(parents=True, exist_ok=True)

if not meta_path.exists():
    print(f"Downloading metadata to {meta_path} ...")
    with requests.get(RAW_META_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        total = int(r.headers.get("Content-Length", 0))
        downloaded = 0
        chunk_size = 1 << 20  # 1 MB
        report_every = 10     # MB
        next_report = report_every
        with open(meta_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if not chunk:
                    continue
                f.write(chunk)
                downloaded += len(chunk)
                if total:
                    mb_done = downloaded // (1 << 20)
                    if mb_done >= next_report or downloaded == total:
                        pct = downloaded / total * 100
                        print(f"Downloaded {mb_done}MB / {total // (1 << 20)}MB ({pct:.1f}%)", flush=True)
                        next_report += report_every
    print("Download complete.")
else:
    print(f"Found existing metadata: {meta_path}")
meta_path


Found existing metadata: ../data/raw/meta_Electronics.json.gz


PosixPath('../data/raw/meta_Electronics.json.gz')

In [3]:
import gzip, json
import pandas as pd

cols_keep = ["asin", "title", "category", "brand", "price"]
rows = []
remaining = set(ASINS)

# Robust line-by-line JSONL reader to avoid parser issues
with gzip.open(meta_path, "rt", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except Exception:
            continue
        asin = obj.get("asin")
        if asin in remaining:
            title = obj.get("title")
            cats = obj.get("category") or obj.get("categories")
            cat_str = None
            if isinstance(cats, list):
                if cats and isinstance(cats[0], list):
                    cats = cats[0]
                try:
                    cat_str = " > ".join(str(c) for c in cats)
                except Exception:
                    cat_str = str(cats)
            brand = obj.get("brand")
            price = obj.get("price")
            rows.append({
                "asin": asin,
                "title": title,
                "category": cat_str,
                "brand": brand,
                "price": price,
            })
            remaining.remove(asin)
            if not remaining:
                break
        if i % 500000 == 0:
            print(f"Scanned {i:,} lines, found {len(rows)} / {len(ASINS)}", flush=True)

if rows:
    df_items = pd.DataFrame(rows, columns=cols_keep)
    # Preserve input order of ASINS
    order = {asin: i for i, asin in enumerate(ASINS)}
    df_items["_ord"] = df_items["asin"].map(order)
    df_items = df_items.sort_values("_ord").drop(columns=["_ord"]).reset_index(drop=True)
else:
    df_items = pd.DataFrame(columns=cols_keep)

print(f"Found {len(df_items)} / {len(ASINS)} items.")
df_items


Found 0 / 10 items.


Unnamed: 0,asin,title,category,brand,price


If some items are missing, they might not exist in the 5-core metadata file. You can also inspect reviews for these ASINs to get summaries.
