In [1]:
# ==============================================================
# 🧠 HYBRID FAKE REVIEW DETECTION PIPELINE (CLEAN VERSION)
# Paths: input=./nb2, output=./nb3
# ==============================================================
import pandas as pd
import numpy as np
from pathlib import Path
import json
from tqdm.auto import tqdm
import networkx as nx

In [2]:
# -------------------------------
# 📂 Folder setup
# -------------------------------
BASE_PATH = Path(".")                              # current folder (Text Mining)
DATA_PATH = BASE_PATH / "nb2"            # input data
OUT_PATH  = BASE_PATH / "nb3"                      # output folder
OUT_PATH.mkdir(exist_ok=True, parents=True)

# Input files
HISN_PARQUET  = DATA_PATH / "hisn_final.parquet"
USER_STATS    = DATA_PATH / "user_stats.csv"
ITEM_STATS    = DATA_PATH / "item_stats.csv"
REV_PROCESSED = DATA_PATH / "reviews_processed.parquet"
REV_EMB_NPY   = DATA_PATH / "review_features.npy"

# Output files
OUT_NODES     = OUT_PATH / "nodes.csv"
OUT_EDGES     = OUT_PATH / "edges.csv"
OUT_EDGE_EMB  = OUT_PATH / "edge_emb.npy"
OUT_REPORT    = OUT_PATH / "hisn_report.json"
OUT_TARGETS   = OUT_PATH / "targets.csv"

EDGE_SCALARS = [
    "rating_scaled", "review_length_scaled", "log_helpful_scaled",
    "verified_int_scaled", "dup_count_scaled",
    "sin_day", "cos_day", "sin_hour", "cos_hour"
]

In [3]:
# ==============================================================
# 1️⃣ EXPORT HISN GRAPH (nodes / edges / embeddings)
# ==============================================================

def export_hisn_graph():
    """Build edges, nodes, and aligned embeddings for HISN."""
    print("🚀 Building HISN graph...")

    # --- Load base tables
    df_h = pd.read_parquet(HISN_PARQUET)
    df_rp = pd.read_parquet(REV_PROCESSED) if REV_PROCESSED.exists() else None
    emb = np.load(str(REV_EMB_NPY), mmap_mode="r") if REV_EMB_NPY.exists() else None

    # --- Build stable key for alignment
    def normalize_ts(s):
        ts = pd.to_datetime(s, errors="coerce", utc=True)
        return ts.view("int64") // 1_000_000_000

    for df in [df_h] + ([df_rp] if df_rp is not None else []):
        df["_ts"] = normalize_ts(df.get("timestamp", pd.Timestamp.now()))
        df.sort_values(["user_id", "asin", "_ts"], inplace=True)
        df["_dup"] = df.groupby(["user_id", "asin", "_ts"]).cumcount()
        df["rid_key"] = (
            df["user_id"].astype(str) + "|" +
            df["asin"].astype(str) + "|" +
            df["_ts"].astype(str) + "|" +
            df["_dup"].astype(str)
        )

    # --- Align embeddings
    emb_aligned = None
    if emb is not None and df_rp is not None:
        pos = pd.Series(np.arange(len(df_rp)), index=df_rp["rid_key"])
        idx = pos.reindex(df_h["rid_key"])
        emb_aligned = np.zeros((len(df_h), emb.shape[1]), dtype=np.float32)
        mask = ~idx.isna()
        emb_aligned[mask.to_numpy()] = emb[idx[mask].astype(int).to_numpy()]
        emb_aligned = np.nan_to_num(emb_aligned, nan=0.0)

    # --- Edges
    edges = df_h.rename(columns={"user_id": "src", "asin": "dst"}).copy()
    edges["edge_type"] = "user-reviews-product"
    edge_keep = ["src", "dst", "edge_type"] + [c for c in EDGE_SCALARS if c in edges]
    edges_out = edges[edge_keep]
    edges_out.to_csv(str(OUT_EDGES), index=False)

    # --- Nodes
    users = pd.DataFrame({"node_id": df_h["user_id"].astype(str).unique(), "node_type": "user"})
    items = pd.DataFrame({"node_id": df_h["asin"].astype(str).unique(), "node_type": "product"})
    nodes = pd.concat([users, items], ignore_index=True)

    # --- Merge stats
    if USER_STATS.exists():
        us = pd.read_csv(USER_STATS).rename(columns={"user_id": "node_id"})
        us["node_type"] = "user"
        us = us.rename(columns={c: f"user_{c}" for c in us.columns if c not in ["node_id", "node_type"]})
        nodes = nodes.merge(us, on=["node_id", "node_type"], how="left")

    if ITEM_STATS.exists():
        it = pd.read_csv(ITEM_STATS).rename(columns={"asin": "node_id"})
        it["node_type"] = "product"
        it = it.rename(columns={c: f"item_{c}" for c in it.columns if c not in ["node_id", "node_type"]})
        nodes = nodes.merge(it, on=["node_id", "node_type"], how="left")

    nodes.to_csv(str(OUT_NODES), index=False)

    # --- Save embeddings
    if emb_aligned is not None:
        np.save(str(OUT_EDGE_EMB), emb_aligned.astype(np.float32))
        print(f"✅ Exported HISN graph → {OUT_NODES.name}, {OUT_EDGES.name}, {OUT_EDGE_EMB.name}")
    else:
        print("⚠️ No embeddings found — skipped edge_emb.npy")

In [4]:
# ==============================================================
# 2️⃣ STRUCTURE CHECK / HISN REPORT
# ==============================================================

def build_graph(nodes, edges):
    G = nx.MultiDiGraph()
    for _, r in nodes.iterrows():
        G.add_node(r["node_id"], **r.to_dict())
    for i, r in edges.iterrows():
        G.add_edge(r["src"], r["dst"], key=i, **r.to_dict())
    return G

def run_structure_report():
    """Basic structure summary for sanity checking."""
    print("🧩 Checking HISN structure...")
    nodes = pd.read_csv(OUT_NODES)
    edges = pd.read_csv(OUT_EDGES)
    G = build_graph(nodes, edges)

    report = {
        "num_nodes": int(G.number_of_nodes()),
        "num_edges": int(G.number_of_edges()),
        "node_types": nodes["node_type"].value_counts().to_dict(),
        "edge_types": edges["edge_type"].value_counts().to_dict(),
        "components": nx.number_connected_components(nx.Graph(G)),
    }
    json.dump(report, open(OUT_REPORT, "w"), indent=2)
    print(f"✅ HISN report written → {OUT_REPORT.name}")

In [5]:
# ==============================================================
# 3️⃣ NFS SCORING (suspicious product scoring)
# ==============================================================

def zscore(s):
    s = s.astype(float)
    return (s - s.mean()) / (s.std(ddof=0) + 1e-8)

def entropy_from_counts(arr):
    arr = arr[arr > 0].astype(float)
    p = arr / arr.sum()
    return -float(np.sum(p * np.log(p + 1e-12)))

def resultant_length(sin_vals, cos_vals):
    s, c = sin_vals.mean(), cos_vals.mean()
    return np.sqrt(s*s + c*c)

def compute_nfs():
    """Compute product-level suspiciousness scores."""
    print("📊 Computing NFS (suspiciousness) scores...")
    df = pd.read_csv(OUT_EDGES)
    ucol, pcol = "src", "dst"

    user_deg = df.groupby(ucol).size().rename("user_outdeg")
    gb_prod = df.groupby(pcol)

    # product stats
    prod_deg = gb_prod.size().rename("prod_num_reviews")
    unique_users = gb_prod[ucol].nunique().rename("prod_unique_users")

    def _entropy(g):
        users = g[ucol].values
        degs = user_deg.reindex(users).fillna(1).values
        return entropy_from_counts(degs)

    ent = gb_prod.apply(_entropy).rename("reviewer_activity_entropy")
    burst = gb_prod.apply(lambda g: resultant_length(g["sin_day"], g["cos_day"])).rename("burst_day")

    dup = gb_prod["dup_count_scaled"].mean().rename("dup_mean") if "dup_count_scaled" in df else None
    ver = gb_prod["verified_int_scaled"].mean().rename("verified_mean") if "verified_int_scaled" in df else None
    rating = gb_prod["rating_scaled"].apply(lambda s: s.abs().mean()).rename("rating_abs_mean") if "rating_scaled" in df else None

    feats = [prod_deg, unique_users, ent, burst]
    for opt in [dup, ver, rating]:
        if opt is not None:
            feats.append(opt)

    prod_df = pd.concat(feats, axis=1).fillna(0).reset_index().rename(columns={pcol: "product_id"})

    # z-scores
    z = {
        "z_deg": zscore(prod_df["prod_num_reviews"]),
        "z_entropy": -zscore(prod_df["reviewer_activity_entropy"]),
        "z_burst": zscore(prod_df["burst_day"]),
        "z_dup": zscore(prod_df["dup_mean"]) if "dup_mean" in prod_df else 0,
        "z_ver": -zscore(prod_df["verified_mean"]) if "verified_mean" in prod_df else 0,
        "z_rating": zscore(prod_df["rating_abs_mean"]) if "rating_abs_mean" in prod_df else 0,
    }

    zdf = pd.DataFrame(z)
    nfs = (
        1.0*zdf["z_deg"] +
        1.0*zdf["z_entropy"] +
        1.0*zdf["z_burst"] +
        1.0*zdf["z_dup"] +
        0.5*zdf["z_ver"] +
        0.5*zdf["z_rating"]
    )
    prod_df["NFS_post"] = nfs
    prod_df.sort_values("NFS_post", ascending=False, inplace=True)
    prod_df.to_csv(OUT_TARGETS, index=False)
    print(f"✅ Saved NFS scores → {OUT_TARGETS.name}")

In [6]:
# ==============================================================
# 4️⃣ BUILD HISN BATCHES PER TARGET PRODUCT
# ==============================================================

def build_batches():
    print("🧩 Building per-target HISNs...")
    nodes = pd.read_csv(OUT_NODES)
    edges = pd.read_csv(OUT_EDGES)
    targets = pd.read_csv(OUT_TARGETS)

    out_root = OUT_PATH / "hisn_batch"
    out_root.mkdir(exist_ok=True, parents=True)

    for tgt in tqdm(targets["product_id"].astype(str).tolist(), desc="Targets"):
        e1 = edges[edges["dst"].astype(str) == tgt]
        if e1.empty:
            continue
        users = e1["src"].astype(str).unique()
        e2 = edges[edges["src"].astype(str).isin(users)]
        sub_edges = pd.concat([e1, e2]).drop_duplicates(subset=["src","dst"])
        node_ids = pd.unique(pd.concat([sub_edges["src"], sub_edges["dst"]]))
        sub_nodes = nodes[nodes["node_id"].isin(node_ids)]

        out_dir = out_root / tgt
        out_dir.mkdir(exist_ok=True, parents=True)
        sub_nodes.to_csv(out_dir / "nodes.csv", index=False)
        sub_edges.to_csv(out_dir / "edges.csv", index=False)

    print(f"✅ HISN batches saved → {out_root}")

In [8]:
print("Export HISN graph")
export_hisn_graph()
print("Run structure report")
run_structure_report()
print("Computing NFS scores")
compute_nfs()
print("Building HISN batches")
build_batches()
print("🎉 All pipeline stages complete!")

Export HISN graph
🚀 Building HISN graph...


  return ts.view("int64") // 1_000_000_000
  return ts.view("int64") // 1_000_000_000


✅ Exported HISN graph → nodes.csv, edges.csv, edge_emb.npy
Run structure report
🧩 Checking HISN structure...
✅ HISN report written → hisn_report.json
Computing NFS scores
📊 Computing NFS (suspiciousness) scores...


  ent = gb_prod.apply(_entropy).rename("reviewer_activity_entropy")
  burst = gb_prod.apply(lambda g: resultant_length(g["sin_day"], g["cos_day"])).rename("burst_day")


✅ Saved NFS scores → targets.csv
Building HISN batches
🧩 Building per-target HISNs...


Targets:   0%|          | 0/15881 [00:00<?, ?it/s]

✅ HISN batches saved → nb3\hisn_batch
🎉 All pipeline stages complete!
