In [None]:
# =============================================================
#  Synthetic Data-Augmentation Pipeline (Metadata-driven, no embeddings)
# =============================================================

"""
This script implements a structured data augmentation pipeline tailored for recommender systems research.
Its primary objective is to systematically enrich item metadata with high-quality synthetic textual representations
to support downstream recommendation modeling tasks. It does so through a multi-step workflow:

1. Dataset Acquisition and Harmonization:
   - Downloads and prepares raw datasets (Amazon-2023 or MovieLens).
   - Harmonizes identifiers and metadata schemas.
   - Optionally aggregates user reviews per item with configurable constraints.
   - Applies k-core filtering to ensure data quality and sparsity control.

2. Synthetic Text Generation via LLMs:
   - Generates two types of synthetic item descriptions from metadata only (without any user-generated content):
     - *Dense summaries:* Persuasive, paragraph-style product descriptions.
     - *Aspect-based representations:* Structured bullet lists highlighting product features, pros, and cons.
   - Produces these synthetic representations in three predefined lengths (short, medium, long).

3. (Optional) Quality Assurance (Critic Module):
   - Evaluates all synthetic textual representations using explicit quality metrics:
     - **Perplexity:** Measures the textual coherence and fluency.
     - **Toxicity:** Assesses appropriateness and safety for public-facing recommender scenarios.
     - **Sentiment Analysis:** Provides a sentiment polarity score to maintain balanced product portrayals.
   - Flags low-quality synthetic representations per item based on predefined thresholds.

4. Artifact Generation:
   - Compiles and serializes comprehensive side-information tables for immediate use in recommendation algorithms.

The following structured artifacts are generated and saved by this pipeline:
- `side_all.csv`: Full augmented dataset including synthetic texts, human reviews (if applicable), and metadata.
- `quality_metrics.csv`: Detailed textual quality metrics (perplexity, toxicity, sentiment) for synthetic data.
- `manifest.json`: Run-specific metadata including configuration details and aggregated review counts per item.

These output files provide comprehensive, immediately usable data resources intended to streamline experimental setups for
recommender system model evaluations such as Collaborative Deep Learning (CDL), Collaborative Topic Regression (CTR), and Hierarchical Factorization Trees (HFT).

Example.

The script creates three side-information tables (S, M, L), adds two synthetic
views that never read user reviews, runs a language-quality critic,
and writes:

    side_S.csv   side_M.csv   side_L.csv
    quality_metrics.csv
    manifest.json   (includes review_per_item)
All items are kept; each table contains a *_critic_bad flag.

"""


'\nThis script implements a structured data augmentation pipeline tailored for recommender systems research.\nIts primary objective is to systematically enrich item metadata with high-quality synthetic textual representations\nto support downstream recommendation modeling tasks. It does so through a multi-step workflow:\n\n1. Dataset Acquisition and Harmonization:\n   - Downloads and prepares raw datasets (Amazon-2023 or MovieLens).\n   - Harmonizes identifiers and metadata schemas.\n   - Optionally aggregates user reviews per item with configurable constraints.\n   - Applies k-core filtering to ensure data quality and sparsity control.\n\n2. Synthetic Text Generation via LLMs:\n   - Generates two types of synthetic item descriptions from metadata only (without any user-generated content):\n     - *Dense summaries:* Persuasive, paragraph-style product descriptions.\n     - *Aspect-based representations:* Structured bullet lists highlighting product features, pros, and cons.\n   - Produ

In [None]:
# OpenAI API key used for synthetic text generation; may be left blank.
# The Block-1 cell exports this value to the OPENAI_API_KEY environment variable.
# Do not commit or share the notebook with a real key filled in here.
OPENAI_API_KEY = ""



#  Block-0 : One-shot Installs  


In [None]:
"""
Block-0 – Environment bootstrap
Installs every external dependency exactly once in a fresh runtime.
"""

# ---- Block-0 : one-shot installs  -------------------
!pip -q install openai==0.28.0 nltk sentence-transformers transformers \
                scikit-learn textstat detoxify tqdm seaborn torch


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



#  Block-1 & 2: config/helpers & loaders/k‑core preview




In [None]:
# =============================================================
#  Block‑1 — Configuration & generic helpers  (with safe_sample)
# =============================================================
from __future__ import annotations
from dataclasses import dataclass
import os, contextlib, time, json, gzip, shutil, requests, warnings, random, re, zipfile
from typing import List, Dict, Tuple, Literal
import pandas as pd, numpy as np
from tqdm.auto import tqdm

# Export the key for the OpenAI SDK, but only when one was actually entered:
# a blank value must not clobber a key that is already set in the environment.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# ─── Configuration ────────────────────────────────────────────────────────
SampleMode = Literal["twopass", "reservoir", "per_user"]

@dataclass
class Config:
    """Run-time settings: dataset choice, sampling, filtering, augmentation and verbosity."""
    # ---------- dataset & sampling ----------------------------------------
    # 34 -> MovieLens-100k, 35 -> MovieLens-1M; any other value is used as a
    # position in AMAZON_LIST (see preview_only).
    ds_index:    int        = 17          # 34→ML‑100k, 35→ML‑1M, otherwise Amazon idx
    # Share of lines/rows to keep; values <= 0 or >= 1 disable sampling
    # (see _load_jsonl and safe_sample).
    fraction:    float      = 1.00       # keep this share of lines (0‑1]
    rng_seed:    int        = 42         # reproducible random sample
    sample_mode: SampleMode = "twopass"  # "twopass" | "reservoir" | "per_user"
    # ---------- data quality & augmentation -------------------------------
    # Minimum number of interactions per user and per item (see k_core_filter).
    k_core:      int        = 4
    # Cap on review texts joined per item: None = no cap, 0 = no review text.
    # NOTE(review): harmonise_ids reads a module-level snapshot of this value
    # (MAX_REVIEWS_SENTINEL), so changing it after that line ran has no effect.
    max_reviews_per_item: int | None = 10
    # Not read anywhere in the part of the notebook visible here;
    # presumably consumed by later blocks — TODO confirm.
    include_reviews:      bool = True
    # Number of bullets requested in the aspect prompt / produced by the fallback.
    bullet_points: int   = 5
    # Not read anywhere in the part of the notebook visible here;
    # presumably toggles the critic stage — TODO confirm.
    enable_critic: bool  = True
    # ---------- verbosity --------------------------------------------------
    verbose_progress: bool = True   # timer messages and tqdm progress bars
    verbose_tables:   bool = True   # ui_stats tables printed by preview_only

CFG = Config()

LEN_TOK = dict(short=50, mid=120, long=250)
LENS    = [("S", "short"), ("M", "mid"), ("L", "long")]

@contextlib.contextmanager
def timer(msg: str):
    """
    Context manager that prints a start banner and the elapsed wall-clock time.

    Output is produced only when ``CFG.verbose_progress`` is true.  The flag is
    read once on entry, so toggling it inside the ``with`` body can no longer
    hit an unbound start time.  As before, nothing is printed on exit when the
    body raises.
    """
    verbose = CFG.verbose_progress      # snapshot: entry and exit must agree
    started = time.time()
    if verbose:
        print("▶️", msg)
    yield
    if verbose:
        print(f"⏱️ {msg}: {time.time()-started:.1f}s")

# ─── Reservoir-sampling index helper (used by safe_sample) ────────────────
def _reservoir_indices(n_total:int, k_keep:int, seed:int):
    rnd = random.Random(seed)
    res = list(range(k_keep))
    for i in range(k_keep, n_total):
        j = rnd.randint(0, i)
        if j < k_keep:
            res[j] = i
    return res

# ─── Safe & flexible sampling utility ─────────────────────────────────────
def safe_sample(
    df: pd.DataFrame,
    frac: float,
    *,
    min_per_user: int = 2,
    mode: SampleMode = "twopass",   # "per_user" | "twopass" | "reservoir"
    seed: int = 42,
) -> pd.DataFrame:
    """
    Return a random sample of *df* in which every remaining user has
    *≥ min_per_user* interactions.

    Parameters
    ----------
    df           : interaction table; must contain a ``userId`` column.
    frac         : share of rows to keep.  Values ``<= 0`` or ``>= 1`` disable
                   sampling: an unfiltered copy of *df* is returned.
    min_per_user : minimum number of rows a user needs to stay in the sample.
    mode         : ``"per_user"`` samples inside each user group (never more
                   rows than the group has; users with fewer than
                   *min_per_user* rows are dropped).  ``"twopass"`` draws a
                   global ``df.sample``; any other value uses
                   ``_reservoir_indices``.  Both global modes then drop users
                   left with too few rows.
    seed         : seed of the NumPy generator that drives all random states.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index (the early-exit copy keeps the
    original index).  If five global draws in a row end up empty after
    filtering, a warning is issued and a full copy of *df* is returned.
    """
    if frac >= 1.0 or frac <= 0.0:
        return df.copy()

    rng = np.random.default_rng(seed)

    def _next_state() -> int:
        # One fresh integer seed per sampling call, drawn in a fixed order.
        return int(rng.integers(0, 10**9))

    # ---------- per-user sampling ----------------------------------------
    if mode == "per_user":
        # Iterate over the groups explicitly instead of groupby.apply: this
        # keeps the userId column and the column dtypes regardless of how the
        # installed pandas treats grouping columns or empty frames in apply.
        kept = []
        for _, grp in df.groupby("userId", sort=True):
            m = len(grp)
            if m < min_per_user:
                continue                                   # too small – drop user
            # target sample size, capped at group size
            n_keep = min(m, max(min_per_user, int(round(m * frac))))
            if n_keep == m:
                kept.append(grp)                           # keep all rows
            else:
                kept.append(grp.sample(n=n_keep, random_state=_next_state()))
        out = pd.concat(kept) if kept else df.iloc[0:0]

    # ---------- global sampling (twopass / reservoir) --------------------
    else:
        # retry up to 5× until the filtered sample is non-empty
        for _ in range(5):
            state = _next_state()
            if mode == "twopass":
                cand = df.sample(frac=frac, random_state=state)
            else:                                          # reservoir
                k = int(round(len(df) * frac))
                cand = df.iloc[_reservoir_indices(len(df), k, seed=state)]

            # keep only users that meet the minimum (vectorised size lookup)
            sizes = cand.groupby("userId")["userId"].transform("size")
            out = cand[sizes >= min_per_user]
            if len(out):
                break
        else:
            warnings.warn("safe_sample: fell back to empty sample; "
                          "returning full dataset.")
            return df.copy()

    return out.reset_index(drop=True)

# =============================================================
#  Block‑2 — dataset loaders  + k‑core preview (uses safe_sample)
# =============================================================
# Root URL of the raw Amazon-2023 dumps; load_amazon appends
# "review_categories/<cat>.jsonl.gz" and "meta_categories/meta_<cat>.jsonl.gz".
BASE_AMAZON = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw"
# Category names used to build the download URLs.  Config.ds_index selects an
# entry by 0-based position (see preview_only), so the order matters.
AMAZON_LIST = [
    "All_Beauty","Amazon_Fashion","Appliances","Arts_Crafts_and_Sewing","Automotive",
    "Baby_Products","Beauty_and_Personal_Care","Books","CDs_and_Vinyl",
    "Cell_Phones_and_Accessories","Clothing_Shoes_and_Jewelry","Digital_Music",
    "Electronics","Gift_Cards","Grocery_and_Gourmet_Food","Handmade_Products",
    "Health_and_Household","Health_and_Personal_Care","Home_and_Kitchen",
    "Industrial_and_Scientific","Kindle_Store","Magazine_Subscriptions","Movies_and_TV",
    "Musical_Instruments","Office_Products","Patio_Lawn_and_Garden","Pet_Supplies",
    "Software","Sports_and_Outdoors","Subscription_Boxes","Tools_and_Home_Improvement",
    "Toys_and_Games","Video_Games",
]


# ------------------------------------------------------------------
#  Robust download + gunzip helpers (retry + size check)
# ------------------------------------------------------------------
def _download(url: str, dest: str, label: str = "file", *,
              max_attempts: int = 3) -> None:
    """
    Download *url* to *dest* with a size check and retries.

    Each attempt starts from scratch (an existing *dest* is removed first).
    When the server reports a ``content-length``, the size of the written file
    must match it.  A failed attempt removes the partial file so that a
    truncated download is never mistaken for a complete one.

    Parameters
    ----------
    url          : address to fetch.
    dest         : local path to write.
    label        : text shown in the progress bar.
    max_attempts : number of tries before giving up.

    Raises
    ------
    RuntimeError : after *max_attempts* failures; the last underlying error is
                   attached as ``__cause__``.
    """
    last_err = None
    for attempt in range(1, max_attempts + 1):
        if os.path.exists(dest):
            os.remove(dest)                     # start fresh each attempt
        try:
            # the context manager releases the connection even on errors
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                total = int(r.headers.get("content-length", 0))

                with open(dest, "wb") as f, tqdm(
                    total=total or None,
                    unit="B", unit_scale=True,
                    desc=f"{label} (try {attempt}/{max_attempts})",
                    disable=not CFG.verbose_progress,
                ) as bar:
                    for chunk in r.iter_content(1 << 20):          # 1 MiB
                        f.write(chunk)
                        bar.update(len(chunk))

            # -------- size check -----------------------------------------
            size = os.path.getsize(dest)
            if total and size != total:
                raise IOError(f"size mismatch: got {size:,} B, "
                              f"expected {total:,} B")

            return                                             # success
        except Exception as e:
            last_err = e
            if os.path.exists(dest):
                os.remove(dest)                 # never leave a truncated file
            if attempt < max_attempts:
                warnings.warn(f"download failed ({e}); retrying…")
                time.sleep(1)
            else:
                warnings.warn(f"download failed ({e})")

    raise RuntimeError(
        f"failed to download {url} after {max_attempts} attempts"
    ) from last_err


def _extract_gz(src: str, dst: str, *, max_attempts: int = 3) -> None:
    """Gunzip *src* → *dst*; retry if the .gz is corrupt."""
    if os.path.exists(dst):
        return
    for attempt in range(1, max_attempts + 1):
        try:
            with gzip.open(src, "rb") as s, open(dst, "wb") as d:
                shutil.copyfileobj(s, d)
            return                                         # success
        except EOFError:
            warnings.warn(f"corrupt gzip at {src} (try {attempt}); "
                          "re-downloading…")
            # remove both files and re-download
            for p in (src, dst):
                if os.path.exists(p):
                    os.remove(p)
            _download(src.replace("amazon/", BASE_AMAZON + "/")  # rebuild URL
                      .replace(".jsonl", ".jsonl"),
                      src, label=os.path.basename(src))
    raise RuntimeError(f"failed to extract {src} after {max_attempts} attempts")

# -------------------------------------------------------------------------
#  Robust random-sampling JSONL reader
# -------------------------------------------------------------------------
def _safe_load(line: str, *, path: str, lineno: int) -> Dict | None:
    """Best-effort JSON parse.  Returns None for malformed rows."""
    try:
        return json.loads(line)
    except json.JSONDecodeError as e:
        warnings.warn(
            f"[{os.path.basename(path)}] bad JSON at line {lineno}: {e.msg}"
        )
        return None


def _load_jsonl(
    path: str,
    fraction: float,
    *,
    seed: int | None = None,
    reservoir: bool = False,
) -> pd.DataFrame:
    """
    Read a newline-delimited JSONL file, keeping a *random* fraction of lines.
    Malformed rows are skipped with a warning.
    """
    if fraction >= 1 or fraction <= 0:
        return pd.read_json(path, lines=True)  # full load, trust pandas

    if seed is not None:
        random.seed(seed)

    # ----- choose sample size (need N even for reservoir so we know k) -----
    n_total = sum(1 for _ in open(path, encoding="utf-8"))
    k_keep  = max(1, int(n_total * fraction))

    rows: List[Dict] = []
    bad = 0                                        # malformed counter

    # ----- two-pass --------------------------------------------------------
    if not reservoir:
        target = set(random.sample(range(n_total), k_keep))
        with open(path, encoding="utf-8") as fh, tqdm(
            total=k_keep,
            desc=f"{os.path.basename(path)} (random {fraction:.0%})",
            disable=not CFG.verbose_progress,
        ) as bar:
            for i, line in enumerate(fh):
                if i not in target:
                    continue
                obj = _safe_load(line, path=path, lineno=i + 1)
                if obj is None:
                    bad += 1
                    continue
                rows.append(obj)
                bar.update()
        if bad:
            print(f"⚠️  skipped {bad} malformed lines "
                  f"({bad / (bad + len(rows)):.3%})")
        return pd.DataFrame(rows)

    # ----- reservoir -------------------------------------------------------
    with open(path, encoding="utf-8") as fh, tqdm(
        desc=f"{os.path.basename(path)} (reservoir {fraction:.0%})",
        disable=not CFG.verbose_progress,
    ) as bar:
        for idx, line in enumerate(fh, 1):         # idx starts at 1
            obj = _safe_load(line, path=path, lineno=idx)
            if obj is None:
                bad += 1
                bar.update()
                continue
            if idx <= k_keep:
                rows.append(obj)
            else:
                j = random.randint(1, idx)
                if j <= k_keep:
                    rows[j - 1] = obj
            bar.update()
    if bad:
        print(f"⚠️  skipped {bad} malformed lines "
              f"({bad / (bad + len(rows)):.3%})")
    return pd.DataFrame(rows)


# --------------------------- dataset loaders ------------------------------
def load_amazon(cat: str, fraction: float):
    """
    Return (ratings_df, metadata_df) for a single Amazon category.

    The review and metadata archives are stored under ``amazon/``.  Each one
    is downloaded and extracted only when its extracted ``.jsonl`` file is
    missing; previously every call re-fetched both archives because
    ``_download`` always starts from scratch.

    Parameters
    ----------
    cat      : category name as used in the download URLs (see AMAZON_LIST).
    fraction : share of lines to keep, passed to ``_load_jsonl`` for BOTH
               files.

    NOTE(review): the metadata file is sampled with the same fraction as the
    reviews, independently of which reviews were kept — confirm this is
    intended, since sampled reviews can lose their metadata row.
    """
    os.makedirs("amazon", exist_ok=True)
    rv_gz = f"amazon/{cat}.jsonl.gz"
    md_gz = f"amazon/meta_{cat}.jsonl.gz"
    rv    = f"amazon/{cat}.jsonl"
    md    = f"amazon/meta_{cat}.jsonl"

    # only hit the network when the extracted file is not there yet
    if not os.path.exists(rv):
        _download(f"{BASE_AMAZON}/review_categories/{cat}.jsonl.gz", rv_gz, f"{cat}-rv")
        _extract_gz(rv_gz, rv)
    if not os.path.exists(md):
        _download(f"{BASE_AMAZON}/meta_categories/meta_{cat}.jsonl.gz", md_gz, f"{cat}-md")
        _extract_gz(md_gz, md)

    kw = dict(seed=CFG.rng_seed, reservoir=(CFG.sample_mode == "reservoir"))
    return _load_jsonl(rv, fraction, **kw), _load_jsonl(md, fraction, **kw)


def _dl_unzip(url: str, folder: str) -> str:
    """Download + unzip MovieLens archive; return extraction folder path."""
    os.makedirs(folder, exist_ok=True)
    z = os.path.join(folder, os.path.basename(url))
    _download(url, z, "MovieLens")
    zipfile.ZipFile(z).extractall(folder)
    return folder


def load_movielens(name: str = "100k"):
    """
    Return (ratings_df, metadata_df) for MovieLens 100K or 1M.

    Parameters
    ----------
    name : ``"100k"`` loads ML-100k; any other value loads ML-1M.

    Returns
    -------
    ratings_df  : columns ``user_id, item_id, rating, timestamp, title, text,
                  movie_id``.  ``title``/``text`` are empty strings.
                  ``movie_id`` duplicates ``item_id`` so the ratings share the
                  key that the metadata table and ``harmonise_ids`` use.
    metadata_df : columns ``movie_id, title, features, description`` where
                  ``features`` is a ``|``-joined genre string and
                  ``description`` is empty.

    Both movie tables are read as ISO-8859-1: titles contain accented
    characters that are not valid UTF-8.
    """
    rating_cols = ["user_id", "item_id", "rating", "timestamp"]

    if name == "100k":
        p = _dl_unzip(
            "https://files.grouplens.org/datasets/movielens/ml-100k.zip", "ml"
        ) + "/ml-100k"

        df_r = pd.read_csv(f"{p}/u.data", sep="\t", names=rating_cols)

        # Genre flag columns of u.item, in file order (per the ML-100k README).
        genre_names = [
            "unknown", "Action", "Adventure", "Animation", "Children's",
            "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
            "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
            "War", "Western",
        ]
        df_i = pd.read_csv(
            f"{p}/u.item",
            sep="|",
            encoding="ISO-8859-1",
            names=["movie_id", "title", "r", "v", "imdb"] + genre_names,
        )
        # "Animation|Comedy"-style strings, same format as ML-1M's genre field
        flags = df_i[genre_names].eq(1).to_numpy()
        genres = [
            "|".join(g for g, on in zip(genre_names, row) if on) for row in flags
        ]
        df_m = pd.DataFrame(
            {
                "movie_id": df_i["movie_id"],
                "title": df_i["title"],
                "features": genres,
                "description": "",
            }
        )

    else:  # MovieLens 1M
        p = _dl_unzip(
            "https://files.grouplens.org/datasets/movielens/ml-1m.zip", "ml"
        ) + "/ml-1m"

        df_r = pd.read_csv(
            f"{p}/ratings.dat",
            sep="::",
            engine="python",
            names=rating_cols,
        )

        df_m = pd.read_csv(
            f"{p}/movies.dat",
            sep="::",
            engine="python",
            encoding="ISO-8859-1",
            names=["movie_id", "title", "features"],
        )
        df_m["description"] = ""

    df_r["title"] = df_r["text"] = ""
    df_r["movie_id"] = df_r["item_id"]      # shared key with the metadata table
    return df_r, df_m


# --------------------------- harmonisation -------------------------------
MAX_REVIEWS_SENTINEL = CFG.max_reviews_per_item  # shortcut

def harmonise_ids(df_r: pd.DataFrame, df_m: pd.DataFrame):
    """
    Unify disparate ID schemes across ratings and metadata tables.

    For every row the raw item key is the first usable value among the columns
    ``parent_asin``, ``asin``, ``movie_id``, ``item_id`` (missing columns are
    skipped; NaN/None and empty strings count as "not usable", so a NaN
    ``parent_asin`` now falls back to ``asin``).  Users and items are then
    renumbered 0..n-1 in order of first appearance (ratings first, metadata
    second for items).

    Parameters
    ----------
    df_r : ratings with ``user_id``, ``rating``, ``timestamp``, ``text`` and
           at least one of the key columns above.
    df_m : metadata with ``title``, ``features``, ``description`` and at least
           one of the key columns above.

    Returns
    -------
    inter         : ``userId, itemId, rating, timestamp, text``.
    side          : one row per item with ``itemId, title, features,
                    description, all_reviews_text`` (review texts joined with
                    ``" [SEP] "``; NaN for items without ratings).
    review_counts : ``{itemId: number of non-null review texts}``, capped at
                    ``MAX_REVIEWS_SENTINEL`` (``None`` = no cap, ``0`` = all 0).

    The inputs are not modified.
    """
    df_r, df_m = df_r.copy(), df_m.copy()
    key_columns = ("parent_asin", "asin", "movie_id", "item_id")

    def _item_key(df: pd.DataFrame) -> pd.Series:
        # Column-wise coalesce; replaces a slow row-wise apply over the frame.
        key = np.full(len(df), None, dtype=object)
        for col in key_columns:
            if col not in df.columns:
                continue
            usable = (df[col].notna() & df[col].astype(str).ne("")).to_numpy()
            take = usable & pd.isna(key)          # only fill still-missing keys
            key[take] = df[col].to_numpy(dtype=object)[take]
        return pd.Series(key, index=df.index, dtype=object)

    df_r["combined_item_id"] = _item_key(df_r)
    df_m["combined_item_id"] = _item_key(df_m)

    u_map = {u: i for i, u in enumerate(df_r["user_id"].unique())}
    it_map = {
        it: i
        for i, it in enumerate(
            pd.concat([df_r["combined_item_id"], df_m["combined_item_id"]]).unique()
        )
    }

    df_r["userId"] = df_r["user_id"].map(u_map)
    df_r["itemId"] = df_r["combined_item_id"].map(it_map)
    df_m["itemId"] = df_m["combined_item_id"].map(it_map)

    # ------- aggregate review texts ---------------------------------------
    cap = MAX_REVIEWS_SENTINEL

    def _agg_reviews(series: pd.Series) -> str:
        if cap == 0:
            return ""
        texts = series.dropna().map(str)
        if cap is not None:
            texts = texts.head(cap)
        return " [SEP] ".join(texts)

    by_item = df_r.groupby("itemId")["text"]
    txt = by_item.apply(_agg_reviews)

    n_text = by_item.count()                      # non-null texts per item
    if cap == 0:
        n_text = n_text * 0
    elif cap is not None:
        n_text = n_text.clip(upper=cap)
    review_counts = n_text.to_dict()

    side = (
        df_m[["itemId", "title", "features", "description"]]
        .drop_duplicates("itemId")
        .merge(txt.rename("all_reviews_text"), on="itemId", how="left")
    )

    inter = df_r[["userId", "itemId", "rating", "timestamp", "text"]]
    return inter, side, review_counts


# ------------------------------ utilities -------------------------------
def k_core_filter(df: pd.DataFrame, k: int):
    """
    Repeatedly remove users and items that have fewer than *k* interactions
    until a full pass removes nothing.

    Works on a copy; *df* itself is left untouched.  Within a pass users are
    pruned first, then items (on the already pruned table).
    """
    kept = df.copy()
    while True:
        rows_before = len(kept)
        for axis in ("userId", "itemId"):
            counts = kept[axis].value_counts()
            rare = counts[counts < k].index
            if len(rare):
                kept = kept[~kept[axis].isin(rare)]
        if len(kept) == rows_before:        # stable: nothing was pruned
            return kept


def ui_stats(df: pd.DataFrame, label: str):
    """
    Print a small table with basic statistics of an interaction DataFrame.

    Reports the number of distinct users (``userId``) and items (``itemId``),
    the number of rows, the average rows per user and per item, and the
    density ``rows / (users * items)``.  The last line used to be labelled
    "Sparsity" although the printed value is the density (sparsity would be
    ``1 - density``); the value itself is unchanged.
    """
    u = df["userId"].nunique()
    i = df["itemId"].nunique()
    n = len(df)
    print(f"\n{'=' * 50}")
    print(f"📌 [{label}] Dataset Stats:")
    print(f"{'-' * 50}")
    print(f"• Users           : {u:,}")
    print(f"• Items           : {i:,}")
    print(f"• Ratings         : {n:,}")
    print(f"• Avg ratings/user: {n / u if u else 0:.2f}")
    print(f"• Avg ratings/item: {n / i if i else 0:.2f}")
    print(f"• Density         : {n / (u * i) if u * i else 0:.6f}")
    print(f"{'=' * 50}\n")

def preview_only(cfg: Config = CFG) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load one dataset, harmonise its IDs, sample, apply the k-core filter and
    return ``(interactions, side_information)`` restricted to the surviving
    items.

    ``cfg.ds_index`` 34 / 35 select MovieLens 100k / 1M; any other value is a
    position in ``AMAZON_LIST``.  Statistics tables are printed before
    sampling and after filtering when ``cfg.verbose_tables`` is set.

    NOTE(review): for Amazon data ``cfg.fraction`` is applied at load time and
    again in ``safe_sample`` — confirm the double sampling is intended.
    NOTE(review): ``harmonise_ids`` takes its review cap from a module-level
    value, not from the *cfg* passed here.
    """
    movielens_names = {34: "100k", 35: "1m"}

    with timer("dataset preview"):
        ml_name = movielens_names.get(cfg.ds_index)
        if ml_name is not None:                   # MovieLens shortcuts
            df_r, df_m = load_movielens(ml_name)
        else:                                     # Amazon
            df_r, df_m = load_amazon(AMAZON_LIST[cfg.ds_index], cfg.fraction)

        inter, side, review_counts = harmonise_ids(df_r, df_m)
        side["n_reviews"] = side["itemId"].map(review_counts)

        if cfg.verbose_tables:
            ui_stats(inter, "raw")

        # sample first, then enforce the k-core on what is left
        inter = safe_sample(
            inter,
            cfg.fraction,
            min_per_user=3,
            mode=cfg.sample_mode,
            seed=cfg.rng_seed,
        )
        inter = k_core_filter(inter, cfg.k_core)

        # restrict both tables to items present in each of them
        surviving = set(inter["itemId"]).intersection(side["itemId"])
        inter = inter[inter["itemId"].isin(surviving)]
        side = side[side["itemId"].isin(surviving)].reset_index(drop=True)

        if cfg.verbose_tables:
            ui_stats(inter, f"{cfg.k_core}-core")

    return inter, side

# -------------------------- smoke‑test ------------------------------------
if __name__ == "__main__":
    inter, side = preview_only()
    print("✓ preview OK –", len(inter), "ratings /", len(side), "items")


▶️ dataset preview


Health_and_Personal_Care-rv (try 1/3):   0%|          | 0.00/69.5M [00:00<?, ?B/s]

Health_and_Personal_Care-md (try 1/3):   0%|          | 0.00/23.3M [00:00<?, ?B/s]


📌 [raw] Dataset Stats:
--------------------------------------------------
• Users           : 461,656
• Items           : 60,274
• Ratings         : 494,121
• Avg ratings/user: 1.07
• Avg ratings/item: 8.20
• Sparsity        : 0.000018


📌 [4-core] Dataset Stats:
--------------------------------------------------
• Users           : 613
• Items           : 747
• Ratings         : 4,706
• Avg ratings/user: 7.68
• Avg ratings/item: 6.30
• Sparsity        : 0.010277

⏱️ dataset preview: 29.9s
✓ preview OK – 4706 ratings / 747 items


In [None]:
# ╔════════════════════════════════════════════════════════════╗
# ║ BLOCK-2-SCOUT — quick census over Amazon categories        ║
# ╚════════════════════════════════════════════════════════════╝
"""
Iterates over every Amazon-2023 category listed in AMAZON_LIST (33 entries)

• measures wall-clock load+extract time
• counts items before filtering, after k=2 and k=5
• skips the category if download+extract > 120 s
• returns a summary DataFrame  (and prints nice progress)

⚠️  Insert this cell right after your existing Block-2 definitions
    (it re-uses load_amazon / harmonise_ids / k_core_filter).
"""

import gc, time
from types import SimpleNamespace

def _census_one(cat:str, frac:float=0.05) -> dict|None:
    """Return row-dict with timings & item counts, or None if too slow."""
    t0=time.time()
    try:
        df_r, df_m = load_amazon(cat, frac)
    except Exception as e:
        print(f"⚠️  {cat}: download failure → {e}")
        return None
    wall=time.time()-t0
    if wall>120:
        print(f"⏩ {cat}: skipped (took {wall:.0f}s)")
        return None

    inter, side, _ = harmonise_ids(df_r, df_m)
    raw_items=len(side)

    inter2 = k_core_filter(inter, 2);  items2 = inter2.itemId.nunique()
    inter5 = k_core_filter(inter, 5);  items5 = inter5.itemId.nunique()
    return dict(category=cat, raw_items=raw_items,
                items_k2=items2, items_k5=items5, seconds=round(wall,1))

def scout_amazon(fraction:float=0.05) -> pd.DataFrame:
    """
    Run ``_census_one`` for every category in ``AMAZON_LIST`` and collect the
    results.

    Returns a DataFrame with the columns ``category, raw_items, items_k2,
    items_k5, seconds``, sorted by ``items_k2`` (largest first).  If no
    category produced a row, an empty DataFrame with those columns is returned
    (sorting an entirely empty frame used to raise ``KeyError``).
    """
    columns = ["category", "raw_items", "items_k2", "items_k5", "seconds"]
    rows=[]
    for cat in AMAZON_LIST:
        print(f"▶️ {cat:<35}", end="")
        row=_census_one(cat, fraction)
        if row:
            rows.append(row)
            print(" done")          # terminate the progress line on success
        gc.collect()                # release the category's frames before the next load
    if not rows:
        return pd.DataFrame(columns=columns)
    df=pd.DataFrame(rows).sort_values("items_k2", ascending=False).reset_index(drop=True)
    return df

# ---- run once & cache to CSV --------------------------------
#if "get_ipython" in globals():
#    with timer("Amazon census"):
#        census_df = scout_amazon(0.20)     # 5 % sample is enough to judge scale
#    display(census_df.head(15))
#    census_df.to_csv("amazon_census.csv", index=False)
#    print("✓ wrote amazon_census.csv")


In [None]:
# --- one-click download helpers ---------------------------------
try:
    from google.colab import files               # Colab ↔ your browser
    _colab = True
except ImportError:
    from IPython.display import FileLink, display # classic Jupyter/Lab
    _colab = False

def download_artifacts(paths: dict):
    """
    Offer every file in *paths* (``{label: file path}``) for download.

    Missing files are reported and skipped.  Under Colab each existing file is
    handed to ``files.download``; otherwise a ``FileLink`` is displayed for it.
    """
    for label in paths:
        target = paths[label]

        if os.path.exists(target):
            if _colab:
                # Colab: triggers the browser download
                print(f"⬇️  downloading {label} → {os.path.basename(target)}")
                files.download(target)
            else:
                # Jupyter/Lab: show a clickable link in the cell output
                print(f"➡️  {label}:")
                display(FileLink(target))
        else:
            print(f"⚠️  {label}: file not found → {target}")

# ---------------------------------------------------------------


In [None]:
# ╔════════════════════════════════════════════════════════════╗
# ║ BLOCK-3 — augmentation + critic (patched)                  ║
# ╚════════════════════════════════════════════════════════════╝
"""
Creates synthetic ‘dense’ and ‘aspect’ views in three lengths (S/M/L),
ensures they never rely on user reviews, and runs a lightweight critic
(perplexity, toxicity, sentiment).  Results are added to `side_all`.
"""

import os, json, re, warnings, nltk, torch
from typing import List
import pandas as pd, numpy as np
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from nltk.sentiment import SentimentIntensityAnalyzer
from detoxify import Detoxify
import gzip
import zipfile


# ─── OpenAI chat wrapper (works for ≥1.0 and <1.0 SDKs) ─────────
# `chat(msgs, **kw)` sends a list of chat messages and returns the stripped
# reply text.  It is set to None (offline fallback) when no API key is
# configured, the SDK is missing, or the client cannot be created; the reason
# is reported as a warning instead of being swallowed silently.
try:
    if not os.getenv("OPENAI_API_KEY"):
        # Without a key every request would fail; skip the SDK entirely so
        # callers go straight to their local fallback.
        raise RuntimeError("OPENAI_API_KEY is not set")
    import openai as _oai
    if hasattr(_oai, "OpenAI"):                       # new SDK
        _client = _oai.OpenAI()
        def chat(msgs, **kw):
            kw.setdefault("model", "gpt-4o-mini")
            return _client.chat.completions.create(**kw, messages=msgs)\
                          .choices[0].message.content.strip()
    else:                                             # legacy SDK
        _oai.api_key = os.getenv("OPENAI_API_KEY", "")
        def chat(msgs, **kw):
            kw.setdefault("model", "gpt-3.5-turbo")
            return _oai.ChatCompletion.create(**kw, messages=msgs)\
                       ["choices"][0]["message"]["content"].strip()
except Exception as _chat_err:                        # offline fallback
    warnings.warn(f"OpenAI chat unavailable ({_chat_err}); "
                  "using the local fallback generator.")
    chat = None

# ─── prompts & constants ───────────────────────────────────────
SYSTEM = ("You are an e-commerce copywriter.  Use ONLY the metadata "
          "(title, features, description).  NEVER cite user reviews and "
          "NEVER invent specifications.")
def _prompt_dict(words:int)->dict[str,str]:
    bp = CFG.bullet_points
    return {
        "synth_dense":
            f"Write one persuasive paragraph (~{words} words) summarising the product.",
        "synth_aspect":
            f"Write exactly {bp} bullet-points, ≤20 words each:\n"
            "- **Feature**: …\n- **Pros**: …\n- **Cons**: …"
    }

CACHE_DIR = "aug_cache"; os.makedirs(CACHE_DIR, exist_ok=True)

# ─── text-block helpers ───────────────────────────────────────
def _to_str(x):
    """Join list/tuple into one string; pass through other types."""
    if isinstance(x, (list, tuple)): return " ".join(map(str, x))
    return str(x or "")

def meta_only(row: pd.Series) -> str:
    """
    Build the metadata-only source text for one item (no review text).

    Title, features and description are converted with ``_to_str`` and the
    non-empty pieces are joined with newlines, in that order.
    """
    parts = (
        _to_str(row.title),
        _to_str(row.features),
        _to_str(row.description),
    )
    return "\n".join(part for part in parts if part)

def _pseudo(row: pd.Series, strat: str, words: int) -> str:
    """Local fallback so synthetic columns are never empty."""
    tokens = meta_only(row).split()
    if strat == "synth_dense":
        return " ".join(tokens[:words]).capitalize() + "."
    bullet = lambda tok: "- " + " ".join(tok[:min(15, len(tok))])
    return "\n".join(bullet(tokens[i*15:(i+1)*15]) for i in range(CFG.bullet_points))

def _generate(row: pd.Series, strat: str, words: int) -> str:
    """LLM generation with on-disk cache and safe fallback."""
    cache = f"{CACHE_DIR}/{row.itemId}_{strat}_{words}.json"
    if os.path.exists(cache):
        return json.load(open(cache))["t"]

    if chat:
        try:
            txt = chat(
                [{"role":"system","content":SYSTEM},
                 {"role":"user","content":_prompt_dict(words)[strat] +
                                          "\n\nSOURCE:\n"+meta_only(row)}],
                max_tokens=words*4, temperature=0.25)
            if not txt.strip():
                txt = _pseudo(row, strat, words)
        except Exception as e:
            warnings.warn(f"LLM error ({strat}): {e}")
            txt = _pseudo(row, strat, words)
    else:
        txt = _pseudo(row, strat, words)

    json.dump({"t": txt}, open(cache, "w"))
    return txt

# ------------------------------------------------------------
#  Improved augment()  ·  shows progress bars when verbose=True
# ------------------------------------------------------------
from tqdm.auto import tqdm

def augment(side: pd.DataFrame, *, verbose: bool | None = None) -> pd.DataFrame:
    """
    Enrich *side* with
        • meta_only
        • meta_human
        • six synthetic columns   (s_/m_/l_ × synth_dense, synth_aspect)
        • six combo  columns      (s_/m_/l_ × combo_dense,  combo_aspect)
    and return the augmented DataFrame.

    When *verbose* (or CFG.verbose_progress) is True, a timer and six
    tqdm progress-bars are displayed.
    """
    if verbose is None:
        verbose = globals().get("CFG", type("Tmp",(),{"verbose_progress":True}))\
                           .verbose_progress

    side = side.copy()

    # --- 0. quick helpers ----------------------------------------------------
    def _pb(iterable, desc):
        "helper: wrap *iterable* in tqdm when verbose, else pass through."
        return tqdm(iterable, total=len(side), desc=desc, disable=not verbose)

    # --- 1. metadata scaffolding --------------------------------------------
    side["meta_only"]  = side.apply(meta_only, axis=1)
    side["meta_human"] = (side.meta_only + " " +
                          side.all_reviews_text.fillna("")).str.strip()

    # --- 2. synthetic generation --------------------------------------------
    with timer("augmentation (synthetic text)") if verbose else contextlib.nullcontext():

        for lab, key in LENS:                         # S, M, L
            pref, words = lab.lower()+"_", LEN_TOK[key]

            # 2a) two synthetic columns
            for strat in ("synth_dense", "synth_aspect"):
                col = pref + strat
                side[col] = [
                    _generate(r, strat, words)
                    for _, r in _pb(side.iterrows(), f"{lab}-{strat}")
                ]

            # 2b) the two combi columns that depend on the synthetics above
            side[pref + "combo_dense"]  = (side.meta_human + " " +
                                           side[pref + "synth_dense"]).str.strip()
            side[pref + "combo_aspect"] = (side.meta_human + " " +
                                           side[pref + "synth_aspect"]).str.strip()

    return side

# ─── critic (perplexity / toxicity / sentiment) ───────────────
# Module-level set-up for the critic: everything below runs once at import /
# cell-execution time and may download resources.
# The VADER lexicon is fetched (quietly) before the analyzer is constructed.
nltk.download("vader_lexicon", quiet=True)
# Sentiment scorer; critic() reads the "compound" score from it.
_sia  = SentimentIntensityAnalyzer()
# Toxicity scorer; critic() reads the "toxicity" entry of its predictions.
_detox= Detoxify("original")
# The perplexity model is optional: if loading the tokenizer or the model
# raises, both handles become None and _ppl() returns NaN instead of a score.
# NOTE(review): the failure is swallowed silently, so a run without
# perplexity scores gives no indication why — consider emitting a warning.
try:
    _tok = GPT2TokenizerFast.from_pretrained("distilgpt2")
    _gpt = GPT2LMHeadModel.from_pretrained("distilgpt2").eval()
except Exception:
    _tok = _gpt = None

# ─── critic helpers ────────────────────────────────────────────
# Truncation length used by _ppl(): taken from the loaded model's config,
# or 1024 when the model is unavailable.
_MAX_POS = _gpt.config.n_positions if _gpt else 1024  # distilgpt2 → 1024

def _ppl(txt: str) -> float:
    """Return (truncated) perplexity score or NaN if unavailable."""
    if _tok is None or not txt.strip():
        return np.nan

    # hard-truncate to the model’s maximum context length
    enc = _tok(
        txt,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_POS,
    )
    ids = enc.input_ids

    with torch.no_grad():
        loss = _gpt(input_ids=ids, labels=ids).loss
    return float(torch.exp(loss))

def critic(side: pd.DataFrame, *, max_perplexity: float = 150,
           max_toxicity: float = 0.2) -> pd.DataFrame:
    """Score every synthetic / combo column and flag low-quality items.

    Args:
        side:           Item table; columns matching ``[sml]_(synth|combo)_``
                        are scored.  It is modified in place: one boolean
                        column ``<prefix>_critic_bad`` is added per LENS label.
        max_perplexity: A text with perplexity above this is "bad".
        max_toxicity:   A text with toxicity above this is "bad".

    Returns:
        Long-format metrics frame with the columns itemId, col, perplexity,
        toxicity and sentiment (one row per item × scored column).  The
        columns are present even when there is nothing to score.

    An item is flagged for a length bucket when any of that bucket's texts
    exceeds either threshold.  NaN scores never exceed a threshold.
    """
    synth_cols = [c for c in side.columns if re.match(r"[sml]_(synth|combo)_", c)]
    rows: list[dict] = []
    for _, r in tqdm(side.iterrows(), total=len(side), desc="critic"):
        for col in synth_cols:
            txt = r[col]
            if not isinstance(txt, str):
                # Missing cells (None / NaN) are scored as empty text instead
                # of crashing the string-based scorers.
                txt = ""
            rows.append(dict(itemId=r.itemId, col=col,
                             perplexity=_ppl(txt),
                             toxicity=_detox.predict(txt)["toxicity"] if txt else np.nan,
                             sentiment=_sia.polarity_scores(txt)["compound"]))
    # Explicit columns keep the frame well-formed when `rows` is empty.
    met = pd.DataFrame(rows, columns=["itemId", "col", "perplexity", "toxicity", "sentiment"])

    low_quality = (met["perplexity"] > max_perplexity) | (met["toxicity"] > max_toxicity)

    # flag "bad" items per length bucket
    for lab, _ in LENS:
        pref = lab.lower()
        in_bucket = met["col"].str.startswith(pref)
        # astype(bool) keeps this a boolean mask even for an empty frame.
        bad = met.loc[(in_bucket & low_quality).astype(bool), "itemId"].unique()
        side[f"{pref}_critic_bad"] = side["itemId"].isin(bad)
    return met



def run_pipeline(cfg: Config = CFG) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict, dict]:
    """Run preview → augmentation → (optional) critic and write all artifacts.

    Files are written under ``artifacts/<dataset>_k<k_core>/`` using the same
    ``<dataset>_k<k_core>`` file-name prefix, and the folder is then zipped to
    ``artifacts/<dataset>_k<k_core>.zip``.

    Returns:
        ``(inter, side_all, metrics, manifest, paths)`` where *metrics* is an
        empty DataFrame when ``cfg.enable_critic`` is false and *paths* maps
        artifact names to the files written.
    """
    inter, side = preview_only(cfg)
    side_all = augment(side)
    if cfg.enable_critic:
        metrics = critic(side_all)
    else:
        metrics = pd.DataFrame()

    # Dataset name: indices 34 / 35 select the MovieLens variants, every
    # other index is looked up in AMAZON_LIST.
    if cfg.ds_index == 34:
        ds_name = "movielens100k"
    elif cfg.ds_index == 35:
        ds_name = "movielens1m"
    else:
        ds_name = AMAZON_LIST[cfg.ds_index]

    # Output folder, shared file-name prefix and the resulting artifact paths.
    suffix = f"k{cfg.k_core}"
    out_dir = f"artifacts/{ds_name}_{suffix}"
    os.makedirs(out_dir, exist_ok=True)
    prefix = f"{out_dir}/{ds_name}_{suffix}"
    archive_path = f"{out_dir}.zip"
    paths = {
        "interactions": f"{prefix}_interactions.csv.gz",
        "side_all":     f"{prefix}_side_all.csv.gz",
        "metrics":      f"{prefix}_quality_metrics.csv",
        "manifest":     f"{prefix}_manifest.json",
        "archive":      archive_path,
    }

    # Interactions and the augmented side table go out as gzip-compressed CSV.
    for frame, target in ((inter, paths["interactions"]),
                          (side_all, paths["side_all"])):
        with gzip.open(target, "wt", newline="") as f:
            frame.to_csv(f, index=False)

    metrics.to_csv(paths["metrics"], index=False)

    # The manifest records review counts per item, the full config and the
    # sampling parameters.
    manifest = {
        "review_per_item": side_all.groupby("itemId").n_reviews.first().to_dict(),
        "config":          cfg.__dict__,
        "sampling": {
            "fraction":     cfg.fraction,
            "sample_mode":  cfg.sample_mode,
            "rng_seed":     cfg.rng_seed,
        },
    }
    with open(paths["manifest"], "w") as f:
        json.dump(manifest, f, indent=2)

    # Zip every file in the output folder; entries are stored by bare file name.
    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as z:
        for fname in os.listdir(out_dir):
            z.write(os.path.join(out_dir, fname), arcname=fname)

    print("✓ all artifacts saved")
    return inter, side_all, metrics, manifest, paths


# ---------- Block-5 – demo run -------------------------------
if __name__ == "__main__":
    # Demo run: execute the full pipeline, offer the artifacts for download,
    # then preview the first rows of each resulting table.
    inter, side_all, metrics, manifest, paths = run_pipeline()
    download_artifacts(paths)

    for frame in (inter, side_all, metrics):
        display(frame.head())

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt
100%|██████████| 418M/418M [00:02<00:00, 182MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

▶️ dataset preview


Gift_Cards-rv (try 1/3):   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Gift_Cards-md (try 1/3):   0%|          | 0.00/320k [00:00<?, ?B/s]


📌 [raw] Dataset Stats:
--------------------------------------------------
• Users           : 132,732
• Items           : 1,137
• Ratings         : 152,410
• Avg ratings/user: 1.15
• Avg ratings/item: 134.05
• Sparsity        : 0.001010


📌 [0-core] Dataset Stats:
--------------------------------------------------
• Users           : 132,732
• Items           : 1,137
• Ratings         : 152,410
• Avg ratings/user: 1.15
• Avg ratings/item: 134.05
• Sparsity        : 0.001010

⏱️ dataset preview: 5.6s
▶️ augmentation (synthetic text)


S-synth_dense:   0%|          | 0/1137 [00:00<?, ?it/s]

S-synth_aspect:   0%|          | 0/1137 [00:00<?, ?it/s]

M-synth_dense:   0%|          | 0/1137 [00:00<?, ?it/s]

M-synth_aspect:   0%|          | 0/1137 [00:00<?, ?it/s]

L-synth_dense:   0%|          | 0/1137 [00:00<?, ?it/s]

L-synth_aspect:   0%|          | 0/1137 [00:00<?, ?it/s]

⏱️ augmentation (synthetic text): 7421.3s


critic:   0%|          | 0/1137 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


✓ all artifacts saved
⬇️  downloading interactions → Gift_Cards_k0_interactions.csv.gz


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️  downloading side_all → Gift_Cards_k0_side_all.csv.gz


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️  downloading metrics → Gift_Cards_k0_quality_metrics.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️  downloading manifest → Gift_Cards_k0_manifest.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

⬇️  downloading archive → Gift_Cards_k0.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,userId,itemId,rating,timestamp,text
0,0,0,5,2019-02-11 06:22:38.332,Having Amazon money is always good.
1,1,1,5,2020-09-12 01:45:58.120,Always the perfect gift. I have never given o...
2,1,2,5,2018-09-03 01:58:49.239,When you have a person who is hard to shop for...
3,1,3,5,2014-12-13 02:59:37.000,The tin is a nice touch and pretty large. It'...
4,2,4,1,2021-11-28 03:06:48.115,I bought this pack of Starbucks Gift cards in ...


Unnamed: 0,itemId,title,features,description,all_reviews_text,n_reviews,meta_only,meta_human,s_synth_dense,s_synth_aspect,...,m_synth_aspect,m_combo_dense,m_combo_aspect,l_synth_dense,l_synth_aspect,l_combo_dense,l_combo_aspect,s_critic_bad,m_critic_bad,l_critic_bad
0,729,Amazon.com Gift Card in Gift Tag (Various Desi...,"[Gift Card is affixed inside a gift tag, Gift ...",[Amazon.com Gift Cards are the perfect way to ...,exactly as pictured [SEP] Super cute. A great ...,7,Amazon.com Gift Card in Gift Tag (Various Desi...,Amazon.com Gift Card in Gift Tag (Various Desi...,Give the gift of endless possibilities with an...,- **Feature**: Gift card in gift tag\n- **Pros...,...,- **Feature**: Various designs available\n- **...,Amazon.com Gift Card in Gift Tag (Various Desi...,Amazon.com Gift Card in Gift Tag (Various Desi...,Give the gift of endless possibilities with th...,- **Feature**: Gift card in gift tag\n- **Pros...,Amazon.com Gift Card in Gift Tag (Various Desi...,Amazon.com Gift Card in Gift Tag (Various Desi...,False,False,False
1,678,$25 Mastercard Gift Card (plus $3.95 Purchase ...,[This card is non-reloadable. No expiration of...,[Mastercard Gift Cards are the ideal gift – to...,I got this card as a replacement for the first...,6,$25 Mastercard Gift Card (plus $3.95 Purchase ...,$25 Mastercard Gift Card (plus $3.95 Purchase ...,Give the gift of endless possibilities with th...,- **Feature**: $25 Mastercard Gift Card\n- **P...,...,- **Feature**: $25 Mastercard Gift Card\n- **P...,$25 Mastercard Gift Card (plus $3.95 Purchase ...,$25 Mastercard Gift Card (plus $3.95 Purchase ...,Give the gift of endless possibilities with th...,- **Feature**: $25 Mastercard Gift Card\n- **P...,$25 Mastercard Gift Card (plus $3.95 Purchase ...,$25 Mastercard Gift Card (plus $3.95 Purchase ...,False,False,False
2,45,Tractor Supply Company Gift Card,[Tractor Supply Company is the largest operato...,[Tractor Supply Company is the largest operato...,Not much to review. Perfect fit for Dad's birt...,10,Tractor Supply Company Gift Card\nTractor Supp...,Tractor Supply Company Gift Card\nTractor Supp...,Give the gift of choice with the Tractor Suppl...,"- **Feature**: Redeemable at over 1,500 Tracto...",...,"- **Feature**: Redeemable at over 1,500 Tracto...",Tractor Supply Company Gift Card\nTractor Supp...,Tractor Supply Company Gift Card\nTractor Supp...,Give the gift of choice with the Tractor Suppl...,"- **Feature**: Redeemable at over 1,500 Tracto...",Tractor Supply Company Gift Card\nTractor Supp...,Tractor Supply Company Gift Card\nTractor Supp...,False,False,False
3,392,Staples Gift Card,[Staples Gift Cards - Giving the perfect gift ...,[Giving the perfect gift has never been easier...,Had no issues using this on the Staples.com we...,10,Staples Gift Card\nStaples Gift Cards - Giving...,Staples Gift Card\nStaples Gift Cards - Giving...,Give the gift of choice with a Staples Gift Ca...,- **Feature**: Thousands of choices in one loc...,...,- **Feature**: Thousands of choices in one loc...,Staples Gift Card\nStaples Gift Cards - Giving...,Staples Gift Card\nStaples Gift Cards - Giving...,Give the gift of endless possibilities with a ...,- **Feature**: Thousands of choices in one loc...,Staples Gift Card\nStaples Gift Cards - Giving...,Staples Gift Card\nStaples Gift Cards - Giving...,False,False,False
4,239,Ninety Nine Restaurants Gift Card,[Ninety Nine Restaurants are a comfortable pla...,[The Ninety Nine Restaurants is a comfortable ...,Can't go wrong [SEP] Great and easy Christmas ...,10,Ninety Nine Restaurants Gift Card\nNinety Nine...,Ninety Nine Restaurants Gift Card\nNinety Nine...,Give the gift of a delicious dining experience...,"- **Feature**: Redeemable for food, beverages,...",...,"- **Feature**: Redeemable for food, beverages,...",Ninety Nine Restaurants Gift Card\nNinety Nine...,Ninety Nine Restaurants Gift Card\nNinety Nine...,Give the gift of delicious dining experiences ...,"- **Feature**: Redeemable for food, beverages,...",Ninety Nine Restaurants Gift Card\nNinety Nine...,Ninety Nine Restaurants Gift Card\nNinety Nine...,False,False,False


Unnamed: 0,itemId,col,perplexity,toxicity,sentiment
0,729,s_synth_dense,41.208061,0.000578,0.9607
1,729,s_synth_aspect,28.737604,0.000791,0.9337
2,729,s_combo_dense,45.755905,0.000692,0.9989
3,729,s_combo_aspect,36.212559,0.000761,0.9988
4,729,m_synth_dense,29.805641,0.000572,0.9934
