# In [None]:
# ========= POS-aware Cleaning Pipeline (FP, adjusted) =========
import os, re, glob, json, pickle, random
from collections import Counter, defaultdict
import pandas as pd
import nltk

# ----- Ensure NLTK resources -----
# Package id -> resource path that nltk.data.find() expects for it.
_NLTK_RESOURCE_PATHS = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
    "wordnet": "corpora/wordnet",
}
for pkg, resource_path in _NLTK_RESOURCE_PATHS.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource not installed locally: fetch it.
        nltk.download(pkg)

# POS tagger: prefer the "_eng" package and fall back to the legacy package name.
try:
    nltk.data.find("taggers/averaged_perceptron_tagger_eng")
except LookupError:
    # nltk.download() normally signals a failed/unknown package by returning
    # False rather than raising, so the fallback must look at the return value
    # as well as at exceptions (e.g. network errors).
    try:
        tagger_fetched = nltk.download("averaged_perceptron_tagger_eng")
    except Exception:
        tagger_fetched = False
    if not tagger_fetched:
        nltk.download("averaged_perceptron_tagger")

from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag_sents

# ----- Config -----
# Directories: raw parquet inputs are searched under INPUT_ROOTS (in order);
# every artifact produced by this cell is written to OUTPUT_DIR.
_DATA_DIR   = os.path.join("..", "data")
INPUT_ROOTS = [os.path.join(_DATA_DIR, "raw")]
OUTPUT_DIR  = os.path.join(_DATA_DIR, "final")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Source label -> glob pattern used to locate that source's comment file.
GLOB_PATTERNS = {
    "Fortnite_Ninja": "Fortnite*Ninja*Comments*Analysis.parquet",
    "Fortnite_SypherPK": "Fortnite*Sypher*Comments*Analysis.parquet",
    "Fortnite_NickEh30": "Fortnite*Nick*Eh*30*Comments*Analysis.parquet",
    "Apex Legends": "Apex*Legends*Comments*Analysis.parquet",
    "Rocket League": "Rocket*League*Comments*Analysis.parquet",
    "DOTA 2": "DOTA*2*Comments*Analysis.parquet",
    "Valorant": "Valorant*Comments*Analysis.parquet",
}

# Phrase detection (gensim Phrases): minimum pair count and score threshold.
BIGRAM_MIN_COUNT = 10
PHRASE_THRESHOLD = 8.0

# Rows with fewer than this many tokens after cleaning are dropped.
MIN_TOKENS_ROW = 5

# Dictionary pruning arguments passed to Dictionary.filter_extremes().
NO_BELOW = 5
NO_ABOVE = 0.50
KEEP_N = 100_000

# When False, franchise names are added to the stopword list for the GLOBAL
# model; set True if topics should explicitly include franchise names.
KEEP_FRANCHISE_TOKENS = False

# ----- Stopwords -----
NLTK_STOP = set(stopwords.words("english"))

# Project-specific stopwords. This MUST be a set: a bare `{}` literal holding
# only a comment is an empty *dict*, and `dict |= set` below raises ValueError.
CUSTOM_STOP = set([
    # ... [same as before, omitted for brevity]
])
if not KEEP_FRANCHISE_TOKENS:
    # Franchise names (including the phrase-joined "rocket_league") are treated
    # as stopwords so they do not dominate topics.
    CUSTOM_STOP |= {'fortnite','apex','valorant','rocket_league','dota'}

STOP_WORDS = NLTK_STOP.union(CUSTOM_STOP)

# ----- Regex/helpers -----
# Strips @mentions and URLs. "https?" already covers http/https, and "www" is
# anchored with a literal dot so ordinary words that merely start with "www"
# are no longer removed.
URL_RE   = re.compile(r"(?:@|https?://|www\.)\S+")
HTML_RE  = re.compile(r"<.*?>")      # non-greedy match of a single <...> tag
PUNC_RE  = re.compile(r"[^\w\s]")    # anything that is not a word char or whitespace
DIGIT_RE = re.compile(r"\d+")
WS_RE    = re.compile(r"\s+")
LEMM     = WordNetLemmatizer()

# Post-phrase corrections: map mangled lemmatized phrases to a canonical token.
LEMMA_FIX = {
    'of_thief': 'sea_of_thieves',
    'sea_of_thief': 'sea_of_thieves',
    'sea_of_thief_sea': 'sea_of_thieves',
}

# Phrases made of two single letters joined by "_" (e.g. "a_b") are discarded.
BAD_PHRASE = re.compile(r'^[a-z]_[a-z]$')

def _wn_pos(tag: str):
    """Map a POS tag to a WordNet POS constant using the tag's first letter.

    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb. An empty/None
    tag or any other first letter falls back to noun.
    """
    if not tag:
        return wn.NOUN
    first_letter_to_pos = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }
    return first_letter_to_pos.get(tag[0], wn.NOUN)

def normalize(text: str) -> str:
    """Lower-case *text* and blank out URLs/mentions, HTML tags, punctuation and digits.

    Each pattern is replaced by a single space, in the order URL, HTML,
    punctuation, digits; whitespace runs are then collapsed to one space and
    the result is stripped.
    """
    cleaned = text.lower()
    # Order matters: URLs and tags must be removed before punctuation is stripped.
    for pattern in (URL_RE, HTML_RE, PUNC_RE, DIGIT_RE, WS_RE):
        cleaned = pattern.sub(" ", cleaned)
    return cleaned.strip()

def tokenize_simple(text: str):
    """Split *text* on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def pos_lemmatize(tokens):
    """POS-tag one token list and return the WordNet lemma of each token.

    The tokens are tagged as a single sentence; each word is lemmatized with
    the WordNet POS derived from its tag by `_wn_pos`. An empty input yields
    an empty list without invoking the tagger.
    """
    lemmas = []
    if tokens:
        # pos_tag_sents takes a batch of sentences; wrap the single sentence
        # and take the first (only) result.
        tagged_sentence = next(iter(pos_tag_sents([tokens])))
        for word, tag in tagged_sentence:
            lemmas.append(LEMM.lemmatize(word, _wn_pos(tag)))
    return lemmas

def resolve_path(pattern, roots):
    """Return the most recently modified file matching *pattern* in the first root that has a match.

    Roots are searched in order; the first root with at least one glob match
    wins, and among its matches the one with the newest modification time is
    returned. Returns None when no root contains a match.
    """
    for root in roots:
        candidates = glob.glob(os.path.join(root, pattern))
        if not candidates:
            continue
        # Newest file by mtime; on ties the earliest glob result is kept.
        return max(candidates, key=os.path.getmtime)
    return None

# ---------- Load, normalize, POS-lemma ----------
# For every configured source: locate its newest parquet file, normalize and
# tokenize the comment text, POS-lemmatize the tokens, and collect the result.
raw_dfs, missing = [], []
for label, pat in GLOB_PATTERNS.items():
    fpath = resolve_path(pat, INPUT_ROOTS)
    if not fpath:
        print(f"⚠️ No match for {label} with pattern {pat} in {INPUT_ROOTS}")
        missing.append(label)
        continue

    df = pd.read_parquet(fpath)
    if not {'author','text'}.issubset(df.columns):
        print(f"⚠️ Required columns missing in {os.path.basename(fpath)} — skipping.")
        continue

    df = df.dropna(subset=['author','text']).copy()
    # Non-string text values are normalized to the empty string.
    df['__norm'] = df['text'].map(lambda t: normalize(t) if isinstance(t,str) else "")
    df['__raw_tokens'] = df['__norm'].map(tokenize_simple)

    # Tag the whole file in ONE pos_tag_sents call instead of one call per row:
    # each call obtains its tagger afresh, so per-row calls repeat that setup
    # cost for every comment. Sentences are still tagged independently, so the
    # resulting lemmas are the same as with row-by-row tagging.
    tagged_rows = pos_tag_sents(df['__raw_tokens'].tolist())
    df['raw_tokens'] = pd.Series(
        [[LEMM.lemmatize(w, _wn_pos(tag)) for (w, tag) in row] for row in tagged_rows],
        index=df.index,
        dtype=object,
    )
    df['creator_or_game'] = label

    raw_dfs.append(df[['author','text','raw_tokens','creator_or_game']])
    print(f"✅ {label}: {len(df)} rows — {os.path.basename(fpath)}")

if not raw_dfs:
    raise SystemExit("No valid FP inputs loaded.")

fp = pd.concat(raw_dfs, ignore_index=True)
print("📊 Per-source counts:", fp['creator_or_game'].value_counts().to_dict())

# ---------- Train phrases on raw tokens (pre-stopwords) ----------
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Bigram model is learned from the raw (pre-stopword) token lists.
bigram = Phrases(
    fp['raw_tokens'],
    min_count=BIGRAM_MIN_COUNT,
    threshold=PHRASE_THRESHOLD,
)
bigram_phraser = Phraser(bigram)

# Trigram model is learned on the bigram-merged stream. Only the threshold is
# passed here, so min_count is left at the library default.
bigram_stream = bigram[fp['raw_tokens']]
trigram = Phrases(bigram_stream, threshold=PHRASE_THRESHOLD)
trigram_phraser = Phraser(trigram)

def apply_phrases_then_filter(toks):
    """Merge phrases in *toks*, then drop junk phrases, stopwords and short tokens.

    Steps, per token of the bigram+trigram-merged sequence:
      1. discard tokens matching BAD_PHRASE (two single letters joined by "_");
      2. apply the LEMMA_FIX canonical-name mapping;
      3. keep the token only if it is not a stopword and is longer than 2 chars.
    Returns the surviving tokens in their original order.
    """
    kept = []
    for candidate in trigram_phraser[bigram_phraser[toks]]:
        if BAD_PHRASE.match(candidate):
            continue
        token = LEMMA_FIX.get(candidate, candidate)
        if len(token) > 2 and token not in STOP_WORDS:
            kept.append(token)
    return kept

# Final tokens: phrase-merged, corrected, stopword- and length-filtered.
fp['tokens'] = fp['raw_tokens'].map(apply_phrases_then_filter)

# ---------- Row-level min-length filter ----------
initial = len(fp)
long_enough = fp['tokens'].str.len() >= MIN_TOKENS_ROW
fp = fp[long_enough]
print(f"✅ Removed {initial - len(fp)} short comments (<{MIN_TOKENS_ROW} tokens).")

# ---------- Peek tokens ----------
all_tokens = []
for toks in fp['tokens']:
    all_tokens.extend(toks)
print("🔹 Top 50 tokens:", Counter(all_tokens).most_common(50))

# ---------- Save cleaned ----------
# index=False: downstream code addresses rows by position in this file.
clean_path = os.path.join(OUTPUT_DIR, "Filtered_Combined_FP_Cleaned.parquet")
fp.to_parquet(clean_path, index=False)
print(f"💾 Saved cleaned data -> {clean_path}")

# ---------- Dictionary / Corpus (with pruning) ----------
from gensim.corpora import Dictionary
dictionary = Dictionary(fp['tokens'])
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in fp['tokens']]
print(f"📚 Dictionary: {len(dictionary)} tokens | Corpus docs: {len(corpus)}")

dict_path = os.path.join(OUTPUT_DIR, "lda_dictionary_FP.dict")
dictionary.save(dict_path)

# Save phrasers & corpus for reuse
for phraser, phraser_file in ((bigram_phraser, "bigram_FP.pkl"), (trigram_phraser, "trigram_FP.pkl")):
    phraser.save(os.path.join(OUTPUT_DIR, phraser_file))
with open(os.path.join(OUTPUT_DIR, "lda_corpus_FP.pkl"), "wb") as f:
    pickle.dump(corpus, f)

# Save basic metadata (the settings this run was produced with)
cleaning_meta = {
    "no_below": NO_BELOW,
    "no_above": NO_ABOVE,
    "keep_n": KEEP_N,
    "bigram_min_count": BIGRAM_MIN_COUNT,
    "phrase_threshold": PHRASE_THRESHOLD,
    "min_tokens_row": MIN_TOKENS_ROW,
    "keep_franchise_tokens": KEEP_FRANCHISE_TOKENS,
    "stopwords_sizes": {"nltk": len(NLTK_STOP), "custom": len(CUSTOM_STOP)},
}
with open(os.path.join(OUTPUT_DIR, "cleaning_meta_FP.json"), "w") as f:
    json.dump(cleaning_meta, f, indent=2)

# ---------- Stratified 90/10 split by source (creator/game) ----------
# Indices are row POSITIONS in the filtered frame (enumerate), matching the
# parquet written above with index=False.
rng_state = 11
by_src = defaultdict(list)
for i, src in enumerate(fp['creator_or_game']):
    by_src[src].append(i)

hold_idx = set()
for src, idxs in by_src.items():
    # A fresh generator seeded with rng_state is used for every source.
    random.Random(rng_state).shuffle(idxs)
    k = max(1, int(0.10 * len(idxs)))  # at least one held-out row per source
    hold_idx.update(idxs[:k])

train_idx, test_idx = [], []
for i in range(len(fp)):
    (test_idx if i in hold_idx else train_idx).append(i)

split_path = os.path.join(OUTPUT_DIR, "lda_split_FP_stratified.json")
with open(split_path, "w") as f:
    json.dump({"random_state": rng_state, "train_idx": train_idx, "test_idx": test_idx}, f, indent=2)

print(f"🧪 Stratified split saved — Train: {len(train_idx)}  Test: {len(test_idx)}")
print(f"✅ Artifacts saved:\n- Cleaned: {clean_path}\n- Dictionary: {dict_path}\n- Split: {split_path}")

# In [None]:
# === LDA K sweep (FP, K = 1..35) — stratified split, CSV, and plots ===
import os, math, json, random
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# ---- Config ----
INPUT_FILE      = os.path.join("..", "data", "final", "Filtered_Combined_FP_Cleaned.parquet")
OUT_DIR         = os.path.join("..", "data", "final")
DICT_PATH       = os.path.join(OUT_DIR, "lda_dictionary_FP.dict")  # FP dictionary
K_GRID          = list(range(1, 36))  # 1..35 inclusive
RANDOM_STATE    = 11
PASSES, ITERS   = 5, 400              # can bump later when retraining best_k
CHUNKSIZE       = 2000
WORKERS         = os.cpu_count()

RESULTS_CSV     = os.path.join(OUT_DIR, "lda_k_selection_FP_metrics_1_35.csv")
SPLIT_JSON      = os.path.join(OUT_DIR, "lda_split_FP_stratified.json")
PLOT_COMBINED   = os.path.join(OUT_DIR, "lda_k_sweep_FP_1_35.png")
PLOT_COH_ONLY   = os.path.join(OUT_DIR, "lda_k_sweep_FP_coherence_only.png")
PLOT_LP_ONLY    = os.path.join(OUT_DIR, "lda_k_sweep_FP_logperp_only.png")

os.makedirs(OUT_DIR, exist_ok=True)

# ---- Load ----
print("📂 Loading data/dictionary...")
df = pd.read_parquet(INPUT_FILE)
texts = df["tokens"].tolist()
# Without a source column every document is put in one "ALL" stratum.
sources = df["creator_or_game"].tolist() if "creator_or_game" in df.columns else ["ALL"] * len(texts)

dictionary = Dictionary.load(DICT_PATH)
corpus = [dictionary.doc2bow(t) for t in texts]
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

# ---- Stratified train/test split by source (90/10) ----
if os.path.exists(SPLIT_JSON):
    print(f"🔁 Using existing split: {SPLIT_JSON}")
    with open(SPLIT_JSON, "r") as f:
        split = json.load(f)
    train_idx, test_idx = split["train_idx"], split["test_idx"]
    # A split saved for a different version of the cleaned data would silently
    # select the wrong documents (or fail later with an IndexError). Require the
    # indices to be an exact partition of the current corpus positions.
    if sorted(train_idx + test_idx) != list(range(len(corpus))):
        raise ValueError(
            f"Split file {SPLIT_JSON} does not match the loaded corpus "
            f"({len(train_idx)} train + {len(test_idx)} test indices vs {len(corpus)} docs); "
            "re-run the cleaning step or delete the split file to regenerate it."
        )
else:
    print("🆕 Creating stratified split (90/10 by source)...")
    by_src = defaultdict(list)
    for i, s in enumerate(sources):
        by_src[s].append(i)

    hold_idx = set()
    for s, idxs in by_src.items():
        # Re-seed per source, exactly as the cleaning step does when it writes
        # SPLIT_JSON, so this fallback reproduces the same split for the same data.
        random.Random(RANDOM_STATE).shuffle(idxs)
        k = max(1, int(0.10 * len(idxs)))  # 10% per source, at least one doc
        hold_idx.update(idxs[:k])

    train_idx = [i for i in range(len(corpus)) if i not in hold_idx]
    test_idx  = [i for i in range(len(corpus)) if i in hold_idx]

    with open(SPLIT_JSON, "w") as f:
        json.dump({"random_state": RANDOM_STATE, "train_idx": train_idx, "test_idx": test_idx}, f)

train_corpus = [corpus[i] for i in train_idx]
test_corpus  = [corpus[i] for i in test_idx]
train_texts  = [texts[i] for i in train_idx]
print(f"🧪 Stratified split — Train: {len(train_corpus)}  Test: {len(test_corpus)}")

# ---- Train/eval helper ----
def train_eval(k: int):
    """Train an LDA model with *k* topics on the train split and score it.

    Returns a tuple ``(model, c_v, log_perp)`` where ``c_v`` is the c_v
    coherence computed on the TRAIN texts (to avoid leakage) and ``log_perp``
    is ``model.log_perplexity`` on the held-out corpus (higher, i.e. less
    negative, is better).
    """
    lda_kwargs = dict(
        corpus=train_corpus,
        id2word=dictionary,
        num_topics=k,
        passes=PASSES,
        iterations=ITERS,
        random_state=RANDOM_STATE,
        workers=WORKERS,
        chunksize=CHUNKSIZE,
        eval_every=None,
        # Optional priors to try later:
        # alpha='asymmetric', eta=None
    )
    model = LdaMulticore(**lda_kwargs)

    coherence_model = CoherenceModel(
        model=model,
        texts=train_texts,
        dictionary=dictionary,
        coherence="c_v",
    )
    c_v = coherence_model.get_coherence()
    log_perp = model.log_perplexity(test_corpus)
    return model, c_v, log_perp

# ---- Sweep K ----
# One metrics row per K; `best` tracks the winning model as the sweep runs.
rows = []
best = {"k": None, "c_v": -math.inf, "log_perplexity": -math.inf, "model": None}

for k in K_GRID:
    print(f"\n⏳ Training LDA (k={k}) ...")
    model, c_v, log_perp = train_eval(k)
    print(f"📈 k={k} | c_v={c_v:.4f} | log_perplexity={log_perp:.4f} (higher = better)")
    rows.append({"k": k, "c_v": c_v, "log_perplexity": log_perp})

    # A NaN coherence can never win a comparison; say so instead of skipping silently.
    if math.isnan(c_v):
        print(f"⚠️ k={k}: c_v is NaN — excluded from best-K selection.")
        continue

    # Best by highest c_v; tie-break by highest log_perplexity
    if (c_v > best["c_v"]) or (math.isclose(c_v, best["c_v"], rel_tol=1e-6) and log_perp > best["log_perplexity"]):
        best.update({"k": k, "c_v": c_v, "log_perplexity": log_perp, "model": model})

# ---- Save metrics table ----
# Written before the best-model check so the metrics survive a failed selection.
dfm = pd.DataFrame(rows).sort_values("k") if rows else pd.DataFrame(columns=["k", "c_v", "log_perplexity"])
dfm.to_csv(RESULTS_CSV, index=False)
print(f"\n📝 Saved metrics -> {RESULTS_CSV}")

# ---- Save best model & topic terms ----
if best["model"] is None:
    # Happens when K_GRID is empty or no K produced a finite c_v; fail with a
    # clear message instead of an AttributeError on None.save().
    raise RuntimeError("No K produced a finite c_v coherence; cannot select a best model.")

best_k = best["k"]
best_model = best["model"]
best_path = os.path.join(OUT_DIR, f"best_lda_model_FP_k{best_k}.model")
best_model.save(best_path)
print(f"🏆 Best K={best_k} | c_v={best['c_v']:.4f} | log_perplexity={best['log_perplexity']:.4f}")
print(f"💾 Saved best model -> {best_path}")

def dump_topics(model, topn=20, path=None):
    """Tabulate the top *topn* words of every topic in *model*.

    Builds a DataFrame with one row per (topic, word) and the columns
    ``topic``, ``rank`` (1-based position within the topic), ``word`` and
    ``prob``. When *path* is truthy the table is also written there as CSV
    (without the index). Returns the DataFrame.
    """
    records = [
        {"topic": topic_id, "rank": position, "word": term, "prob": weight}
        for topic_id in range(model.num_topics)
        for position, (term, weight) in enumerate(model.show_topic(topic_id, topn=topn), start=1)
    ]
    table = pd.DataFrame(records)
    if path:
        table.to_csv(path, index=False)
    return table

# Write the best model's top-20 terms per topic next to the other artifacts.
topics_filename = f"best_topics_FP_k{best_k}.csv"
topics_csv = os.path.join(OUT_DIR, topics_filename)
dump_topics(best_model, path=topics_csv, topn=20)
print(f"🗂️ Topic top-terms saved -> {topics_csv}")

# ---- Plots ----
def plot_combined(df, best_k, out_path):
    """Plot c_v and log_perplexity against K on twin y-axes and save the figure.

    Args:
        df: metrics table with the columns ``k``, ``c_v`` and ``log_perplexity``.
        best_k: selected K, marked with a dotted vertical line.
        out_path: file the figure is written to (150 dpi).
    """
    df = df.sort_values("k")
    fig, ax1 = plt.subplots(figsize=(9, 5))
    # Explicit colours: each axes has its own colour cycle, so without them both
    # curves would be drawn in the same default colour.
    ax1.plot(df["k"], df["c_v"], marker="o", color="tab:blue", label="c_v")
    ax1.set_xlabel("K (number of topics)")
    ax1.set_ylabel("Coherence (c_v)")
    ax1.grid(True, alpha=0.3)

    ax2 = ax1.twinx()
    ax2.plot(df["k"], df["log_perplexity"], marker="s", linestyle="--",
             color="tab:orange", label="log_perplexity")
    ax2.set_ylabel("log_perplexity (higher is better)")

    ax1.axvline(best_k, linestyle=":", linewidth=1.5, color="gray")
    # Title range comes from the data so it stays correct if the K grid changes.
    k_lo, k_hi = df["k"].min(), df["k"].max()
    ax1.set_title(f"LDA K Sweep (K={k_lo}..{k_hi}) — Best K={best_k}")

    # Merge both axes' handles into a single legend.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")

    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)  # close this figure specifically, not just the "current" one
    print(f"🖼️ Saved combined plot -> {out_path}")

def plot_single(x, y, ylabel, title, out_path, marker="o"):
    """Draw a single metric curve (*y* against K values *x*) and save it to *out_path*.

    The x-axis label is fixed to the number of topics; *ylabel*, *title* and
    *marker* customise the rest. The figure is saved at 150 dpi and closed.
    """
    plt.figure(figsize=(8, 4))
    plt.plot(x, y, marker=marker)

    # Labelling
    plt.title(title)
    plt.xlabel("K (number of topics)")
    plt.ylabel(ylabel)
    plt.grid(True, alpha=0.3)

    # Output
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"🖼️ Saved plot -> {out_path}")

# Combined (twin-axis)
plot_combined(dfm, best_k, PLOT_COMBINED)

# Separate per-metric plots: (column, y-label, title, output file, marker)
for column, axis_label, plot_title, out_file, mark in (
    ("c_v", "Coherence (c_v)", "LDA K Sweep — Coherence", PLOT_COH_ONLY, "o"),
    ("log_perplexity", "log_perplexity (higher is better)", "LDA K Sweep — log_perplexity", PLOT_LP_ONLY, "s"),
):
    plot_single(dfm["k"], dfm[column], axis_label, plot_title, out_file, marker=mark)

# In [None]:
import os
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore

# Paths of the artifacts produced by the cleaning step.
OUT_DIR   = os.path.join("..", "data", "final")
INPUT     = os.path.join(OUT_DIR, "Filtered_Combined_FP_Cleaned.parquet")
DICT_PATH = os.path.join(OUT_DIR, "lda_dictionary_FP.dict")

# Final-model hyperparameters (K is fixed by hand here).
BEST_K    = 3
RND       = 11
PASSES    = 20
ITERS     = 1000
CHUNKSIZE = 2000
WORKERS   = os.cpu_count()

print("📂 Loading full FP corpus/dictionary...")
df = pd.read_parquet(INPUT)
texts = df["tokens"].tolist()
dictionary = Dictionary.load(DICT_PATH)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in texts]
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

print(f"⏳ Training FINAL FP model (K={BEST_K}) on ALL docs...")
# NOTE(review): the priors below (asymmetric alpha, auto eta) are set explicitly
# here, whereas the K sweep's train_eval leaves them commented out — confirm
# that selecting K under different priors is intended.
final_lda_kwargs = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=BEST_K,
    passes=PASSES,
    iterations=ITERS,
    random_state=RND,
    workers=WORKERS,
    chunksize=CHUNKSIZE,
    eval_every=None,
    alpha='asymmetric',
    eta='auto',
)
final_model = LdaMulticore(**final_lda_kwargs)

# Save model
final_model_path = os.path.join(OUT_DIR, f"final_lda_FP_k{BEST_K}.model")
final_model.save(final_model_path)
print(f"💾 Saved final model -> {final_model_path}")

# Export top-10 terms per topic: one row per (topic, word), rank is 1-based.
rows = [
    {"topic": topic_id, "rank": position, "word": term, "prob": weight}
    for topic_id in range(BEST_K)
    for position, (term, weight) in enumerate(final_model.show_topic(topic_id, topn=10), start=1)
]
topics_csv = os.path.join(OUT_DIR, f"final_topics_FP_k{BEST_K}.csv")
pd.DataFrame(rows).to_csv(topics_csv, index=False)
print(f"🗂️ Topic words saved -> {topics_csv}")