In [None]:
pip install pandas nltk gensim pyarrow fastparquet matplotlib


In [None]:
# ========= POS-aware Cleaning Pipeline (NLTK) with lemma fixes =========
import os, re, glob, json
from collections import Counter
import pandas as pd
import nltk

# ---------- Ensure NLTK resources ----------
# Fetch each corpus/tokenizer only when nltk.data.find() cannot locate it.
for pkg in ("stopwords", "punkt", "wordnet"):
    try:
        nltk.data.find(f"corpora/{pkg}" if pkg != "punkt" else "tokenizers/punkt")
    except LookupError:
        nltk.download(pkg)

# Tagger (try new name first, fallback to old one)
try:
    nltk.data.find("taggers/averaged_perceptron_tagger_eng")
except LookupError:
    # nltk.download() reports a failed/unknown package through its boolean
    # return value rather than by raising, so the fallback is keyed on that
    # result. Unexpected exceptions are still treated as "not available" to
    # keep this step best-effort, but KeyboardInterrupt/SystemExit are no
    # longer swallowed as they were with a bare `except:`.
    try:
        tagger_ready = nltk.download("averaged_perceptron_tagger_eng")
    except Exception:
        tagger_ready = False
    if not tagger_ready:
        nltk.download("averaged_perceptron_tagger")

from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag_sents  # batch POS tagging for speed

# ---------- Config ----------
# Raw per-game comment exports are looked up under these roots, in order.
_DATA_DIR = os.path.join("..", "data")
INPUT_ROOTS = [os.path.join(_DATA_DIR, "raw")]

# Every cleaned output / model artifact of this cell is written here.
OUTPUT_DIR = os.path.join(_DATA_DIR, "final")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Game label -> glob pattern of that game's comment parquet file.
GLOB_PATTERNS = dict([
    ("Legend of Zelda",       "Legend*Wild*Comments*Analysis.parquet"),
    ("Baldur's Gate 3",       "Baldur*Gate*3*Comments*Analysis.parquet"),
    ("Elden Ring",            "Elden*Ring*Comments*Analysis.parquet"),
    ("Hollow Knight",         "Hollow*Knight*Comments*Analysis.parquet"),
    ("Red Dead Redemption 2", "Red*Dead*Redemption*2*Comments*Analysis.parquet"),
])

# Phrase thresholds (plus the minimum token count a comment must keep)
BIGRAM_MIN_COUNT, PHRASE_THRESHOLD, MIN_TOKENS_ROW = 5, 8.0, 5

# Dictionary pruning (arguments for Dictionary.filter_extremes)
NO_BELOW, NO_ABOVE, KEEP_N = 5, 0.5, 100_000

# ---------- Stopwords ----------
# English stop words shipped with NLTK.
NLTK_STOP = set(stopwords.words("english"))
# Hand-curated, lowercase, apostrophe-free filler words (chat slang, generic
# verbs, video/channel vocabulary) removed in addition to the NLTK list.
# Its size is recorded in the cleaning metadata JSON, so edits here change that file.
CUSTOM_STOP = {
    'video','game','online','youtube','series','pls','lol','omg','xd','people','thing',
    'play','make','time','love','look','want','think','watch','know','got','use','cant',
    'going','never','ever','part','help','played','getting','doesnt','bad','pretty',
    'show','fuck','talk','went','comment','cool','amazing','seen','best','like','get','one',
    'dont','would','first','really','see','also','dan','way','guy','good','say','back',
    'much','still','even','man','thats','need','bro','new','kid','every','always','could',
    'said','please','youre','actually','didnt','feel','ive','dude','name',
    'keep','gon','watching','everyone','hey','someone','made','come','great',
    'give','well','fun','nice','let','right','day','friend','thought','work','mean','take',
    'vid','lmao','lot','god','something','hope','put','cause','literally','since','next','hate',
    'used','saying','funny','many','vids','episode','playthrough','playing','thank','thanks','sure',
    'two','though','last','stuff','without','everything','maybe','second','around','long','place',
    'point','already','year','little','another','better','fucking','shit','area','found','wait','merg',
    'wouldnt','wouldve','youve','youll','wasnt','aint','couldnt','seems','happens','happened','taking',
    'honestly','definitely','either','looking','looked','open','add','top','full','mine','kept','tried','gave','minute','damn',
    'channel','walkthrough','content','using','done','start'
}
# Names filtered as creator references (presumably the video creators' handles — confirm against the data source).
CREATOR_NAMES = {'arin','jack','brad','delirious','theradbrad','gamegrumps'}
# Union of all three sets; this is the set the cleaning functions test tokens against.
STOP_WORDS = NLTK_STOP.union(CUSTOM_STOP).union(CREATOR_NAMES)

# ---------- Regex & helpers ----------
# @mentions, http:// and https:// links, and bare "www." hosts, up to the next
# whitespace. The scheme alternative is `https?` (optional "s") and the www
# alternative requires the dot, so ordinary words that merely start with
# "www" are left alone.
URL_RE   = re.compile(r"(?:@|https?://|www\.)\S+")
# Real tags only: "<" (optionally "/") followed by a letter, up to the closing
# ">". A stray "<" such as the heart emoticon "<3" no longer swallows the text
# up to the next ">" on the line.
HTML_RE  = re.compile(r"</?[a-zA-Z][^<>]*>")
# Any character that is neither a word character nor whitespace.
PUNC_RE  = re.compile(r"[^\w\s]")
# Runs of digits.
DIGIT_RE = re.compile(r"\d+")
# Runs of whitespace (collapsed to one space by the caller).
WS_RE    = re.compile(r"\s+")

# Corrections for lemmas the lemmatizer gets wrong, keyed by the bad output
# (e.g. 'boss' comes back as 'bos'). Extend as further quirks are spotted.
LEMMA_FIX = dict(bos="boss")

# One shared lemmatizer instance, reused for every token.
LEMM = WordNetLemmatizer()

def _wn_pos(tag: str):
    """Translate a Penn Treebank tag into the matching WordNet POS constant.

    Only the first character of *tag* is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. A falsy tag, or any other
    leading character, falls back to noun.
    """
    if not tag:
        return wn.NOUN
    by_prefix = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
    return by_prefix.get(tag[0], wn.NOUN)

def normalize(text: str) -> str:
    """Lowercase *text* and blank out everything that is not plain words.

    After lowercasing, the module-level patterns are applied in a fixed
    order — URL/mention, HTML tag, punctuation, digits, whitespace runs —
    each match being replaced by a single space. The result is stripped of
    leading/trailing whitespace.
    """
    cleaned = text.lower()
    # Order matters: URLs and tags must go before punctuation is removed,
    # and whitespace is collapsed last.
    for pattern in (URL_RE, HTML_RE, PUNC_RE, DIGIT_RE, WS_RE):
        cleaned = pattern.sub(" ", cleaned)
    return cleaned.strip()

def tokenize_simple(text: str):
    """Split *text* on runs of whitespace and return the pieces as a list."""
    pieces = text.split()
    return pieces

def pos_lemmatize_tokens(tokens):
    """Lemmatize *tokens*, using each token's POS tag to pick the lemma.

    The token list is tagged as a single sentence via ``pos_tag_sents``;
    every (word, tag) pair is then lemmatized with the WordNet POS that
    ``_wn_pos`` derives from the tag. Returns the lemmas in input order.
    """
    tagged_sentence = list(pos_tag_sents([tokens]))[0]
    lemmas = []
    for word, penn_tag in tagged_sentence:
        lemmas.append(LEMM.lemmatize(word, _wn_pos(penn_tag)))
    return lemmas

def apply_lemma_fixes(tokens):
    """Return a new list in which every token found in LEMMA_FIX is replaced
    by its correction; all other tokens pass through unchanged."""
    fixed = []
    for tok in tokens:
        fixed.append(LEMMA_FIX.get(tok, tok))
    return fixed

def clean_text_pos(text: str):
    """Turn one raw comment into a list of cleaned, lemmatized tokens.

    Steps: normalize -> whitespace tokenize -> drop tokens of length <= 2 ->
    POS-aware lemmatization -> known lemma fixes -> drop stop words and
    tokens of length <= 2. Anything that is not a non-empty string, or that
    has no tokens left after the first length filter, yields ``[]``.
    """
    if not (isinstance(text, str) and text):
        return []
    candidates = [tok for tok in tokenize_simple(normalize(text)) if len(tok) > 2]
    if not candidates:
        return []
    # Lemmatize first, then repair known quirks (boss/bos), then filter:
    # stop words are matched against the lemma, not the surface form.
    lemmas = apply_lemma_fixes(pos_lemmatize_tokens(candidates))
    return [tok for tok in lemmas if len(tok) > 2 and tok not in STOP_WORDS]

# ---------- Robust file resolver ----------
def resolve_path(pattern, roots):
    """Return the most recently modified file matching *pattern*.

    *roots* are searched in order; the first root with at least one glob
    match decides the result, and within that root the match with the
    newest modification time wins. Returns ``None`` when no root matches.
    """
    for root in roots:
        candidates = glob.glob(os.path.join(root, pattern))
        if not candidates:
            continue
        return max(candidates, key=os.path.getmtime)
    return None

# ---------- Load, clean, combine ----------
# For every configured game: locate its parquet file, clean the comment text
# into tokens, and collect the per-game frames for concatenation.
cleaned_dfs, missing = [], []
for game, pat in GLOB_PATTERNS.items():
    fpath = resolve_path(pat, INPUT_ROOTS)
    if not fpath:
        print(f"⚠️ No match for {game} with pattern {pat} in {INPUT_ROOTS}")
        missing.append(game)
        continue
    df = pd.read_parquet(fpath)
    if not {'author','text'}.issubset(df.columns):
        print(f"⚠️ Required columns missing in {os.path.basename(fpath)} — skipping.")
        continue
    # Drop incomplete rows and exact duplicate texts BEFORE cleaning: tokens
    # are a pure function of the text, so the surviving rows are the same as
    # when deduplicating afterwards, but duplicates are no longer POS-tagged
    # and lemmatized just to be thrown away.
    df = df.dropna(subset=['author','text']).drop_duplicates(subset=['text']).copy()
    df['tokens'] = df['text'].map(clean_text_pos)
    # Keep only comments that still have at least one token after cleaning.
    df = df[df['tokens'].str.len() > 0].copy()
    df['game'] = game
    cleaned_dfs.append(df)
    print(f"✅ {game}: {len(df)} rows — {os.path.basename(fpath)}")

# Nothing usable was loaded: stop here rather than fail later on an empty concat.
if not cleaned_dfs:
    raise SystemExit("No valid input files loaded.")

story_comments = pd.concat(cleaned_dfs, ignore_index=True)
print("📊 Per-game counts:", story_comments['game'].value_counts().to_dict())

# ---------- Phrase modeling (then refilter stopwords & reapply lemma fixes) ----------
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Learn bigrams on the cleaned tokens, then learn a second model on the
# bigram-transformed stream so that it can join up to three original tokens.
bigram  = Phrases(story_comments['tokens'], min_count=BIGRAM_MIN_COUNT, threshold=PHRASE_THRESHOLD)
# Pass the configured min_count here as well: previously the second model fell
# back to the library default, so changing BIGRAM_MIN_COUNT (which is also what
# the metadata JSON records) would only have affected the first stage.
trigram = Phrases(bigram[story_comments['tokens']], min_count=BIGRAM_MIN_COUNT, threshold=PHRASE_THRESHOLD)
# Frozen phrasers: the objects applied to token lists and saved to disk later.
bigram_phraser  = Phraser(bigram)
trigram_phraser = Phraser(trigram)

def apply_phrases_and_refilter(toks):
    """Merge learned phrases in *toks*, then re-run lemma fixes and filtering.

    The bigram phraser is applied first and the trigram phraser on its
    output. The resulting tokens go through ``apply_lemma_fixes`` again and
    anything that is a stop word or has length <= 2 is dropped.
    """
    merged = bigram_phraser[toks]
    merged = trigram_phraser[merged]
    merged = apply_lemma_fixes(merged)  # catch any phrase-stage quirks
    kept = []
    for tok in merged:
        if len(tok) > 2 and tok not in STOP_WORDS:
            kept.append(tok)
    return kept

# Replace each row's tokens with their phrase-merged, re-filtered version.
story_comments['tokens'] = story_comments['tokens'].apply(apply_phrases_and_refilter)

# ---------- Row-level min-length filter ----------
# Comments with fewer than MIN_TOKENS_ROW tokens are dropped.
initial = len(story_comments)
long_enough = story_comments['tokens'].str.len() >= MIN_TOKENS_ROW
story_comments = story_comments[long_enough]
print(f"✅ Removed {initial - len(story_comments)} short comments (<{MIN_TOKENS_ROW} tokens).")

# ---------- Token peek ----------
# Flatten all remaining tokens for a quick frequency sanity check.
all_tokens = []
for toks in story_comments['tokens']:
    all_tokens.extend(toks)
print("🔹 Top 50 tokens:", Counter(all_tokens).most_common(50))

# ---------- Save cleaned ----------
# Written without the index; the later cells read this file back.
clean_path = os.path.join(OUTPUT_DIR, "Filtered_Combined_SD_Cleaned.parquet")
story_comments.to_parquet(clean_path, index=False)
print(f"💾 Saved cleaned data -> {clean_path}")

# ---------- Dictionary / Corpus (with pruning knobs) ----------
from gensim.corpora import Dictionary
def _artifact(name):
    """Path of an output artifact called *name* inside OUTPUT_DIR."""
    return os.path.join(OUTPUT_DIR, name)

# Build the vocabulary from the final tokens, prune it, then encode every
# comment as a bag-of-words against the pruned vocabulary.
dictionary = Dictionary(story_comments['tokens'])
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
corpus = list(map(dictionary.doc2bow, story_comments['tokens']))
print(f"📚 Dictionary: {len(dictionary)} tokens | Corpus docs: {len(corpus)}")

dictionary.save(_artifact("lda_dictionary_SD.dict"))

# Save phrasers & corpus to avoid recompute later
bigram_phraser.save(_artifact("bigram_SD.pkl"))
trigram_phraser.save(_artifact("trigram_SD.pkl"))
import pickle
with open(_artifact("lda_corpus_SD.pkl"), "wb") as f:
    pickle.dump(corpus, f)

# Save basic metadata (handy for reproducibility)
cleaning_meta = {
    "no_below": NO_BELOW,
    "no_above": NO_ABOVE,
    "keep_n": KEEP_N,
    "bigram_min_count": BIGRAM_MIN_COUNT,
    "phrase_threshold": PHRASE_THRESHOLD,
    "min_tokens_row": MIN_TOKENS_ROW,
    "stopwords_sizes": {
        "nltk": len(NLTK_STOP),
        "custom": len(CUSTOM_STOP),
        "creators": len(CREATOR_NAMES),
    },
    "lemma_fixes": list(LEMMA_FIX.items()),
}
with open(_artifact("cleaning_meta_SD.json"), "w") as f:
    json.dump(cleaning_meta, f, indent=2)

In [None]:
# === LDA K sweep (K = 1..35) — stratified split, CSV, and plots ===
import os, math, json, random
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# ---- Config ----
# Everything this cell reads and writes lives in the same output directory.
OUT_DIR         = os.path.join("..", "data", "final")
INPUT_FILE      = os.path.join(OUT_DIR, "Filtered_Combined_SD_Cleaned.parquet")
DICT_PATH       = os.path.join(OUT_DIR, "lda_dictionary_SD.dict")  # use the name you saved

# Sweep settings
K_GRID          = [*range(1, 36)]     # 1..35 inclusive
RANDOM_STATE    = 11
PASSES          = 5                   # can bump later when retraining best_k
ITERS           = 400
CHUNKSIZE       = 2000
WORKERS         = os.cpu_count()

# Output files
RESULTS_CSV     = os.path.join(OUT_DIR, "lda_k_selection_SD_metrics_1_35.csv")
SPLIT_JSON      = os.path.join(OUT_DIR, "lda_split_SD_stratified.json")
PLOT_COMBINED   = os.path.join(OUT_DIR, "lda_k_sweep_SD_1_35.png")
PLOT_COH_ONLY   = os.path.join(OUT_DIR, "lda_k_sweep_SD_coherence_only.png")
PLOT_LP_ONLY    = os.path.join(OUT_DIR, "lda_k_sweep_SD_logperp_only.png")

os.makedirs(OUT_DIR, exist_ok=True)

# ---- Load ----
print("📂 Loading data/dictionary...")
df = pd.read_parquet(INPUT_FILE)
texts = df["tokens"].tolist()
# Without a "game" column every document falls into one pseudo-game.
if "game" in df.columns:
    games = df["game"].tolist()
else:
    games = ["ALL"] * len(texts)

dictionary = Dictionary.load(DICT_PATH)
corpus = list(map(dictionary.doc2bow, texts))
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

# ---- Stratified train/test split by game (90/10) ----
rng = random.Random(RANDOM_STATE)
by_game = defaultdict(list)
for doc_pos, game_name in enumerate(games):
    by_game[game_name].append(doc_pos)

# Shuffle each game's document positions (in place) and hold out the first
# 10% of them, but never fewer than one document per game.
hold_idx = set()
for positions in by_game.values():
    rng.shuffle(positions)
    n_hold = max(1, int(0.10 * len(positions)))
    hold_idx.update(positions[:n_hold])

# Both index lists are in ascending document order.
train_idx = [pos for pos in range(len(corpus)) if pos not in hold_idx]
test_idx  = sorted(hold_idx)

train_corpus = [corpus[pos] for pos in train_idx]
test_corpus  = [corpus[pos] for pos in test_idx]
train_texts  = [texts[pos] for pos in train_idx]

# Persist the split so another cell can reuse exactly the same partition.
split_record = {"random_state": RANDOM_STATE, "train_idx": train_idx, "test_idx": test_idx}
with open(SPLIT_JSON, "w") as f:
    json.dump(split_record, f)
print(f"🧪 Stratified split — Train: {len(train_corpus)}  Test: {len(test_corpus)}")

# ---- Train/eval helper ----
def train_eval(k: int):
    """Fit an LDA model with *k* topics on the training split and score it.

    Returns ``(model, c_v, log_perplexity)`` where ``c_v`` is the c_v
    coherence computed from the training texts and ``log_perplexity`` is
    ``model.log_perplexity`` evaluated on the held-out test corpus.
    """
    lda_params = dict(
        corpus=train_corpus,
        id2word=dictionary,
        num_topics=k,
        passes=PASSES,
        iterations=ITERS,
        random_state=RANDOM_STATE,
        workers=WORKERS,
        chunksize=CHUNKSIZE,
        eval_every=None,
        # Optional priors to try later:
        # alpha='asymmetric', eta=None
    )
    lda = LdaMulticore(**lda_params)

    # Coherence on TRAIN to avoid leakage
    scorer = CoherenceModel(model=lda, texts=train_texts, dictionary=dictionary, coherence="c_v")
    coherence = scorer.get_coherence()

    # Held-out log_perplexity: higher (less negative) is better
    held_out_bound = lda.log_perplexity(test_corpus)
    return lda, coherence, held_out_bound

# ---- Sweep K ----
# One metrics row per K; `best` tracks the winning model seen so far.
rows = []
best = {"k": None, "c_v": -math.inf, "log_perplexity": -math.inf, "model": None}

for k in K_GRID:
    print(f"\n⏳ Training LDA (k={k}) ...")
    model, c_v, log_perp = train_eval(k)
    print(f"📈 k={k} | c_v={c_v:.4f} | log_perplexity={log_perp:.4f} (higher = better)")
    row = {"k": k, "c_v": c_v, "log_perplexity": log_perp}
    rows.append(row)

    # Best by highest c_v; tie-break by highest log_perplexity
    higher_coherence = c_v > best["c_v"]
    tie_broken = math.isclose(c_v, best["c_v"], rel_tol=1e-6) and log_perp > best["log_perplexity"]
    if higher_coherence or tie_broken:
        best = dict(row, model=model)

# ---- Save metrics table ----
dfm = pd.DataFrame(rows).sort_values("k")
dfm.to_csv(RESULTS_CSV, index=False)
print(f"\n📝 Saved metrics -> {RESULTS_CSV}")

# ---- Save best model & topic terms ----
best_k, best_model = best["k"], best["model"]
best_path = os.path.join(OUT_DIR, f"best_lda_model_SD_k{best_k}.model")
best_model.save(best_path)
print(f"🏆 Best K={best_k} | c_v={best['c_v']:.4f} | log_perplexity={best['log_perplexity']:.4f}")
print(f"💾 Saved best model -> {best_path}")

def dump_topics(model, topn=20, path=None):
    """Tabulate the top terms of every topic in *model*.

    For each topic id in ``range(model.num_topics)`` the first *topn*
    (word, probability) pairs from ``model.show_topic`` become rows with the
    columns ``topic``, ``rank`` (1-based), ``word`` and ``prob``. When *path*
    is truthy the table is also written there as CSV without the index.
    Returns the DataFrame.
    """
    records = [
        {"topic": topic_id, "rank": position, "word": term, "prob": weight}
        for topic_id in range(model.num_topics)
        for position, (term, weight) in enumerate(model.show_topic(topic_id, topn=topn), start=1)
    ]
    table = pd.DataFrame(records)
    if path:
        table.to_csv(path, index=False)
    return table

# Write the 20 highest-probability terms of each topic of the best model.
topics_csv = os.path.join(OUT_DIR, f"best_topics_SD_k{best_k}.csv")
dump_topics(model=best_model, path=topics_csv, topn=20)
print(f"🗂️ Topic top-terms saved -> {topics_csv}")

# ---- Plots ----
def plot_combined(df, best_k, out_path):
    """Plot c_v and log_perplexity against K on twin y-axes and save the figure.

    *df* needs the columns ``k``, ``c_v`` and ``log_perplexity``. Coherence is
    drawn on the left axis, log_perplexity (dashed) on the right axis, and a
    dotted vertical line marks *best_k*. The figure is written to *out_path*
    at 150 dpi and then closed.
    """
    df = df.sort_values("k")
    # The K range in the title comes from the data instead of a hard-coded
    # "1..35", so it stays correct when the sweep grid changes.
    k_lo, k_hi = df["k"].min(), df["k"].max()

    fig, ax1 = plt.subplots(figsize=(9, 5))
    ax1.plot(df["k"], df["c_v"], marker="o", label="c_v")
    ax1.set_xlabel("K (number of topics)")
    ax1.set_ylabel("Coherence (c_v)")
    ax1.grid(True, alpha=0.3)

    # Second y-axis sharing the same x-axis, because the two metrics live on
    # very different scales.
    ax2 = ax1.twinx()
    ax2.plot(df["k"], df["log_perplexity"], marker="s", linestyle="--", label="log_perplexity")
    ax2.set_ylabel("log_perplexity (higher is better)")

    ax1.axvline(best_k, linestyle=":", linewidth=1.5)
    ax1.set_title(f"LDA K Sweep (K={k_lo}..{k_hi}) — Best K={best_k}")

    # Merge the handles of both axes into a single legend.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")

    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"🖼️ Saved combined plot -> {out_path}")

def plot_single(x, y, ylabel, title, out_path, marker="o"):
    """Draw one metric (*y*) against K (*x*) and save the figure.

    The 8x4 inch line plot uses *marker* for its points, *ylabel* and *title*
    for labelling, and is written to *out_path* at 150 dpi before the figure
    is closed.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(x, y, marker=marker)
    ax.set_xlabel("K (number of topics)")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"🖼️ Saved plot -> {out_path}")

# Combined (twin-axis)
plot_combined(dfm, best_k, PLOT_COMBINED)

# Separate per-metric plots: (metric column, y-label, title, output file, marker)
_single_plots = (
    ("c_v", "Coherence (c_v)", "LDA K Sweep — Coherence", PLOT_COH_ONLY, "o"),
    ("log_perplexity", "log_perplexity (higher is better)", "LDA K Sweep — log_perplexity", PLOT_LP_ONLY, "s"),
)
for _column, _ylabel, _title, _target, _marker in _single_plots:
    plot_single(dfm["k"], dfm[_column], _ylabel, _title, _target, marker=_marker)

In [None]:
# === LDA K micro-sweep (K = 2..6) — reuse stratified split, CSV, and plots ===
import os, math, json, random
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# ---- Config ----
# Inputs and outputs share one directory.
OUT_DIR   = os.path.join("..", "data", "final")
INPUT     = os.path.join(OUT_DIR, "Filtered_Combined_SD_Cleaned.parquet")
DICT_PATH = os.path.join(OUT_DIR, "lda_dictionary_SD.dict")

# Sweep settings
K_GRID = [*range(2, 7)]  # 2..6 inclusive
RANDOM_STATE = 11
PASSES = 5               # can bump after picking K
ITERS = 400
CHUNKSIZE = 2000
WORKERS = os.cpu_count()

# Output files (the split file name is the same one the 1..35 sweep writes)
RESULTS_CSV   = os.path.join(OUT_DIR, "lda_k_selection_SD_metrics_2_6.csv")
SPLIT_JSON    = os.path.join(OUT_DIR, "lda_split_SD_stratified.json")
PLOT_COMBINED = os.path.join(OUT_DIR, "lda_k_sweep_SD_2_6.png")
PLOT_COH_ONLY = os.path.join(OUT_DIR, "lda_k_sweep_SD_2_6_coherence.png")
PLOT_LP_ONLY  = os.path.join(OUT_DIR, "lda_k_sweep_SD_2_6_logperp.png")

os.makedirs(OUT_DIR, exist_ok=True)

# ---- Load data/dictionary ----
print("📂 Loading data/dictionary...")
df = pd.read_parquet(INPUT)
texts = df["tokens"].tolist()
# Without a "game" column every document falls into one pseudo-game.
games = df["game"].tolist() if "game" in df.columns else ["ALL"] * len(texts)
dictionary = Dictionary.load(DICT_PATH)
corpus = [dictionary.doc2bow(t) for t in texts]
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

# ---- Load or create stratified split (90/10 by game) ----
# A persisted split is only reused when its two index lists together cover
# exactly the rows of the corpus loaded above. A split written for an older
# version of the cleaned data would otherwise raise IndexError or silently
# leave documents out of both sets. (A stale split that happens to have the
# same number of rows cannot be detected by this check.)
train_idx = test_idx = None
if os.path.exists(SPLIT_JSON):
    with open(SPLIT_JSON, "r") as f:
        split = json.load(f)
    train_idx, test_idx = split["train_idx"], split["test_idx"]
    if sorted(train_idx + test_idx) == list(range(len(corpus))):
        print(f"🔁 Using existing split: {SPLIT_JSON}")
    else:
        print(f"⚠️ Split in {SPLIT_JSON} does not match the current corpus ({len(corpus)} docs) — regenerating.")
        train_idx = test_idx = None

if train_idx is None:
    print("🆕 Creating stratified split (90/10 by game)...")
    rng = random.Random(RANDOM_STATE)
    by_game = defaultdict(list)
    for i, g in enumerate(games):
        by_game[g].append(i)

    # Shuffle each game's document positions and hold out the first 10% of
    # them, but never fewer than one document per game.
    hold_idx = set()
    for g, idxs in by_game.items():
        rng.shuffle(idxs)
        k = max(1, int(0.10 * len(idxs)))
        hold_idx.update(idxs[:k])

    train_idx = [i for i in range(len(corpus)) if i not in hold_idx]
    test_idx  = [i for i in range(len(corpus)) if i in hold_idx]
    with open(SPLIT_JSON, "w") as f:
        json.dump({"random_state": RANDOM_STATE, "train_idx": train_idx, "test_idx": test_idx}, f)

train_corpus = [corpus[i] for i in train_idx]
test_corpus  = [corpus[i] for i in test_idx]
train_texts  = [texts[i] for i in train_idx]
print(f"🧪 Train: {len(train_corpus)}  Test: {len(test_corpus)}")

# ---- Train/eval helper ----
def train_eval(k: int):
    """Fit an LDA model with *k* topics on the training split and score it.

    Returns ``(model, c_v, log_perplexity)``: the c_v coherence is computed
    from the training texts, and ``log_perplexity`` is the model's
    ``log_perplexity`` on the held-out test corpus.
    """
    lda_params = dict(
        corpus=train_corpus,
        id2word=dictionary,
        num_topics=k,
        passes=PASSES,
        iterations=ITERS,
        random_state=RANDOM_STATE,
        workers=WORKERS,
        chunksize=CHUNKSIZE,
        eval_every=None,
        # alpha='asymmetric', eta=None  # optional to try later
    )
    lda = LdaMulticore(**lda_params)
    scorer = CoherenceModel(model=lda, texts=train_texts, dictionary=dictionary, coherence="c_v")
    coherence = scorer.get_coherence()
    held_out_bound = lda.log_perplexity(test_corpus)  # higher (less negative) is better
    return lda, coherence, held_out_bound

# ---- Sweep ----
# One metrics row per K; `best` tracks the winning model seen so far
# (highest c_v, ties broken by the higher log_perplexity).
rows = []
best = {"k": None, "c_v": -math.inf, "log_perplexity": -math.inf, "model": None}

for k in K_GRID:
    print(f"\n⏳ Training LDA (k={k}) ...")
    model, c_v, log_perp = train_eval(k)
    print(f"📈 k={k} | c_v={c_v:.4f} | log_perplexity={log_perp:.4f} (higher = better)")
    row = {"k": k, "c_v": c_v, "log_perplexity": log_perp}
    rows.append(row)
    higher_coherence = c_v > best["c_v"]
    tie_broken = math.isclose(c_v, best["c_v"], rel_tol=1e-6) and log_perp > best["log_perplexity"]
    if higher_coherence or tie_broken:
        best = dict(row, model=model)

# ---- Save metrics ----
dfm = pd.DataFrame(rows).sort_values("k")
dfm.to_csv(RESULTS_CSV, index=False)
print(f"\n📝 Saved metrics -> {RESULTS_CSV}")
print(f"🏆 Best K={best['k']} | c_v={best['c_v']:.4f} | log_perplexity={best['log_perplexity']:.4f}")

# ---- Plots ----
def plot_combined(df, best_k, out_path):
    """Plot c_v and log_perplexity against K on twin y-axes and save the figure.

    *df* needs the columns ``k``, ``c_v`` and ``log_perplexity``. Coherence is
    drawn on the left axis, log_perplexity (dashed) on the right axis, and a
    dotted vertical line marks *best_k*. The figure is written to *out_path*
    at 150 dpi and then closed.
    """
    df = df.sort_values("k")
    # The K range in the title comes from the data instead of a hard-coded
    # "2..6", so it stays correct when the sweep grid changes.
    k_lo, k_hi = df["k"].min(), df["k"].max()

    fig, ax1 = plt.subplots(figsize=(9, 5))
    ax1.plot(df["k"], df["c_v"], marker="o", label="c_v")
    ax1.set_xlabel("K (number of topics)")
    ax1.set_ylabel("Coherence (c_v)")
    ax1.grid(True, alpha=0.3)

    # Second y-axis sharing the same x-axis, because the two metrics live on
    # very different scales.
    ax2 = ax1.twinx()
    ax2.plot(df["k"], df["log_perplexity"], marker="s", linestyle="--", label="log_perplexity")
    ax2.set_ylabel("log_perplexity (higher is better)")

    ax1.axvline(best_k, linestyle=":", linewidth=1.5)
    ax1.set_title(f"LDA K Sweep (K={k_lo}..{k_hi}) — Best K={best_k}")

    # Merge the handles of both axes into a single legend.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")

    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"🖼️ Saved combined plot -> {out_path}")

def plot_single(x, y, ylabel, title, out_path, marker="o"):
    """Draw one metric (*y*) against K (*x*) and save the figure.

    The 8x4 inch line plot uses *marker* for its points, *ylabel* and *title*
    for labelling, and is written to *out_path* at 150 dpi before the figure
    is closed.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(x, y, marker=marker)
    ax.set_xlabel("K (number of topics)")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"🖼️ Saved plot -> {out_path}")

# Twin-axis overview first, then one plot per metric:
# (metric column, y-label, title, output file, marker)
plot_combined(dfm, best["k"], PLOT_COMBINED)
_single_plots = (
    ("c_v", "Coherence (c_v)", "LDA K Sweep — Coherence (2..6)", PLOT_COH_ONLY, "o"),
    ("log_perplexity", "log_perplexity (higher is better)", "LDA K Sweep — log_perplexity (2..6)", PLOT_LP_ONLY, "s"),
)
for _column, _ylabel, _title, _target, _marker in _single_plots:
    plot_single(dfm["k"], dfm[_column], _ylabel, _title, _target, marker=_marker)

In [None]:
# === Finalize: Train the final K=3 model on the FULL corpus and export artifacts ===
import os
import pandas as pd
from collections import defaultdict, Counter
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore

# Inputs and outputs share one directory.
OUT_DIR   = os.path.join("..", "data", "final")
INPUT     = os.path.join(OUT_DIR, "Filtered_Combined_SD_Cleaned.parquet")
DICT_PATH = os.path.join(OUT_DIR, "lda_dictionary_SD.dict")

# Final-model settings: topic count and seed, then the training knobs.
BEST_K, RND = 3, 11
PASSES, ITERS, CHUNKSIZE = 20, 1000, 2000
WORKERS = os.cpu_count()

print("📂 Loading full corpus/dictionary...")
df = pd.read_parquet(INPUT)
texts = df["tokens"].tolist()
# Without a "game" column every document falls into one pseudo-game.
if "game" in df.columns:
    games = df["game"].tolist()
else:
    games = ["ALL"] * len(texts)
dictionary = Dictionary.load(DICT_PATH)
corpus = list(map(dictionary.doc2bow, texts))
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

# The final model is fitted on every document (no held-out split).
print(f"⏳ Training FINAL model on ALL docs (K={BEST_K}) ...")
final_lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=BEST_K,
    passes=PASSES,
    iterations=ITERS,
    random_state=RND,
    workers=WORKERS,
    chunksize=CHUNKSIZE,
    eval_every=None,
    alpha='asymmetric',
    eta='auto',
)
final_model = LdaMulticore(**final_lda_params)

# ---- Save model ----
final_model_path = os.path.join(OUT_DIR, f"final_lda_SD_k{BEST_K}.model")
final_model.save(final_model_path)
print(f"💾 Saved final model -> {final_model_path}")

# ---- Export: topic top terms ----
# One row per (topic, rank): the 25 highest-probability words of each topic,
# ranked from 1.
rows = [
    {"topic": topic_id, "rank": position, "word": term, "prob": weight}
    for topic_id in range(BEST_K)
    for position, (term, weight) in enumerate(final_model.show_topic(topic_id, topn=25), start=1)
]
topics_csv = os.path.join(OUT_DIR, f"final_topics_SD_k{BEST_K}.csv")
pd.DataFrame(rows).to_csv(topics_csv, index=False)
print(f"🗂️ Topic words saved -> {topics_csv}")

# ---- Export: per-game topic prevalence + per-doc dominant topic ----
# Each document is inferred exactly once and BOTH exports are derived from
# that single result. Inferring twice (once per export) doubles the cost and,
# because gensim starts each inference from the model's random state, the two
# passes are not guaranteed to pick the same dominant topic for a document —
# the per-game counts and the per-doc table could then disagree.
doc_top = final_model.get_document_topics
topic_game_counts = defaultdict(Counter)
doc_rows = []
for i, bow in enumerate(corpus):
    dt = doc_top(bow)
    if dt:
        # Dominant topic = the (topic, prob) pair with the highest probability.
        t, p = max(dt, key=lambda x: x[1])
        topic_game_counts[t][games[i]] += 1
    else:
        # No topic reported: keep the document in the per-doc table with a
        # sentinel, but leave it out of the per-game counts.
        t, p = -1, 0.0
    doc_rows.append({
        "doc_id": i,
        "game": games[i],
        "dominant_topic": t,
        "dominant_prob": p
    })

# Per topic: how many of its documents come from each game, and that count as
# a share of all documents assigned to the topic.
pg_rows = []
for t in range(BEST_K):
    total = sum(topic_game_counts[t].values())
    for g, c in topic_game_counts[t].items():
        pg_rows.append({
            "topic": t,
            "game": g,
            "count": c,
            "share_in_topic": (c / total) if total else 0.0
        })
per_game_csv = os.path.join(OUT_DIR, f"final_topic_by_game_SD_k{BEST_K}.csv")
pd.DataFrame(pg_rows).to_csv(per_game_csv, index=False)
print(f"📊 Per-game prevalence saved -> {per_game_csv}")

# ---- Export: per-doc dominant topic (optional but handy) ----
per_doc_csv = os.path.join(OUT_DIR, f"final_doc_topics_SD_k{BEST_K}.csv")
pd.DataFrame(doc_rows).to_csv(per_doc_csv, index=False)
print(f"🧾 Per-doc dominant topics saved -> {per_doc_csv}")

print("✅ Finalization complete.")