In [7]:
pip install pandas nltk gensim pyarrow fastparquet matplotlib sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [5]:
# ========= POS-aware Cleaning Pipeline (All Games) =========
import os, re, glob, json, pickle, random
from collections import Counter, defaultdict
import pandas as pd
import nltk

# ----- Ensure NLTK resources -----
# Download each resource only when nltk cannot already locate it on disk.
for pkg in ("stopwords", "punkt", "wordnet"):
    try:
        nltk.data.find(f"corpora/{pkg}" if pkg != "punkt" else "tokenizers/punkt")
    except LookupError:
        nltk.download(pkg)

# Tagger (try new name first, fallback to old)
try:
    nltk.data.find("taggers/averaged_perceptron_tagger_eng")
except LookupError:
    # nltk.download() signals failure (e.g. unknown package id on an older
    # NLTK index) by returning False rather than raising, so the fallback has
    # to test the return value. The narrow except keeps the old safety net
    # without swallowing KeyboardInterrupt/SystemExit.
    try:
        got_new_tagger = nltk.download("averaged_perceptron_tagger_eng")
    except Exception:
        got_new_tagger = False
    if not got_new_tagger:
        nltk.download("averaged_perceptron_tagger")

from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag_sents

# ----- Config -----
# Directories searched (in order) for the per-game comment parquet files.
INPUT_ROOTS = [
    r"C:\Users\colin\Box\2024-colin-viktor\Videogame Scraping Project\data\raw"
]
# Every artifact produced by this cell is written under OUT_DIR (created if absent).
OUT_DIR = r"C:\Users\colin\Box\2024-colin-viktor\Videogame Scraping Project\data\final"
os.makedirs(OUT_DIR, exist_ok=True)

# Flexible matching for *all* titles
# Maps the label stored in the 'game' column -> glob pattern for that game's file.
GLOB_PATTERNS = {
    "Legend of Zelda":        "Legend*Wild*Comments*Analysis.parquet",
    "Fortnite_Ninja":         "Fortnite*Ninja*Comments*Analysis.parquet",
    "Fortnite_SypherPK":      "Fortnite*Sypher*Comments*Analysis.parquet",
    "Fortnite_NickEh30":      "Fortnite*Nick*Eh*30*Comments*Analysis.parquet",
    "Apex Legends":           "Apex*Legends*Comments*Analysis.parquet",
    "Baldur's Gate 3":        "Baldur*Gate*3*Comments*Analysis.parquet",
    "Rocket League":          "Rocket*League*Comments*Analysis.parquet",
    "Elden Ring":             "Elden*Ring*Comments*Analysis.parquet",
    "Hollow Knight":          "Hollow*Knight*Comments*Analysis.parquet",
    "Red Dead Redemption 2":  "Red*Dead*Redemption*2*Comments*Analysis.parquet",
    "DOTA 2":                 "DOTA*2*Comments*Analysis.parquet",
    "Valorant":               "Valorant*Comments*Analysis.parquet",
}

# Phrase thresholds
# Passed to gensim Phrases as min_count / threshold when training the phrase models.
BIGRAM_MIN_COUNT = 5
PHRASE_THRESHOLD = 8.0

# Row min tokens
# Comments with fewer filtered tokens than this are dropped.
MIN_TOKENS_ROW = 5

# Dictionary pruning
# Passed straight to Dictionary.filter_extremes(no_below, no_above, keep_n).
NO_BELOW = 5
NO_ABOVE = 0.50
KEEP_N   = 100_000

# Keep or drop franchise tokens in the GLOBAL model
# When False, the franchise names are added to CUSTOM_STOP further down.
KEEP_FRANCHISE_TOKENS = False  # set True to allow topics anchored on franchise names

# ----- Stopwords -----
# STOP_WORDS (built at the bottom) is NLTK's English list plus the project list below.
NLTK_STOP = set(stopwords.words("english"))
# Set literal: entries repeated across groups collapse to one. Entries with an
# underscore are phrase tokens, so they can only match after phrase detection.
# Entries of one or two characters are also removed by the len(w) > 2 check in
# apply_phrases_then_filter.
CUSTOM_STOP = {
    # general chat/meta
    'video','game','online','youtube','series','pls','lol','omg','xd','people','thing',
    'play','playing','make','time','love','look','want','think','watch','know','got','use','cant',
    'going','never','ever','part','help','played','getting','doesnt','bad','pretty',
    'show','fuck','shit','talk','went','comment','cool','amazing','seen','best','like','get','one',
    'dont','would','first','really','see','also','way','guy','good','say','back','much','still','even',
    'man','thats','need','bro','new','kid','every','always','could','said','please','youre','actually',
    'didnt','feel','ive','dude','name','keep','gon','watching','everyone','hey','someone','made','come',
    'great','give','well','fun','nice','let','right','day','friend','thought','work','mean','take','vid',
    'lmao','lot','god','something','hope','put','cause','literally','since','next','hate','used','saying',
    'funny','many','vids','tbh','wtf','ngl','hell','thank','thanks','maybe','already','oh','real','whole',
    'two','old','hour','minute','top','last','final','big','small','long','short','fast','slow','soon','later',
    'yeah','yall','wanna','wont','idk','guess','sometimes','isnt','easy','point','almost','behind','beginning',
    'true','sure','place','reason','whats','talking','view','stream','watched','bruh','tho','thumbnail',
    # platform/meta chatter
    'sub','channel','content','clip','stream',
    # creator/channel handles
    'ninja','sypher','sypherpk','nick','nickeh','nickeh30','shroud','jonas','zylbrad','brad','arin','dan','delirious',
    # ranked/MMR meta
    'ranked','rank','season','matchmaking','mmr','elo',
    # phrase residues
    'can_t','so_much','feel_like','oh_yeah_oh_yeah','sea_of_thief_sea','of_thief',
    # meme/noise
    'wiggle_wiggle_wiggle_wiggle',
    'episode','gonna','anyone','second','little','probably','without','everything',
    'another','year','stuff','around','wish','life','stop','wait','tell','start',
    'leave','hear','saw','call','change','remember','anyone','probably','maybe',
    'anyway','already','yet','still','even','also','else','whole','point','true',
    'real','finally','little','big','long','short','high','low','fast','slow',
    # vague verbs
    'try','find','get','got','make','take','put','use','using','see','look',
    'watch','watching','know','think','say','said','want','need',
}

# Franchise names are stopped by default so topics are not anchored on the game title.
if not KEEP_FRANCHISE_TOKENS:
    CUSTOM_STOP |= {
        'fortnite','apex','valorant','rocket_league','dota','zelda','elden_ring','hollow_knight',
        'red_dead_redemption','red_dead_redemption_2','baldur','baldur_gate','baldur_gate_3','rdr','rdr2'
    }

# Final lookup set used when filtering tokens.
STOP_WORDS = NLTK_STOP.union(CUSTOM_STOP)

# ----- Regex/helpers -----
# URL_RE: an '@', 'http://', 'https://' or 'www' prefix plus everything up to the
# next whitespace (so @mentions and any token starting with "www" are removed too).
# NOTE(review): `http?` makes the final 'p' optional, so that alternative matches
# 'htt://' as well as 'http://'; `https?\://` alone already covers http and https.
URL_RE   = re.compile(r"(?:\@|http?\://|https?\://|www)\S+")
# HTML_RE: shortest span from '<' to the next '>'.
HTML_RE  = re.compile(r"<.*?>")
# PUNC_RE: any character that is neither a word character nor whitespace
# (underscores count as word characters, so they survive).
PUNC_RE  = re.compile(r"[^\w\s]")
# DIGIT_RE: runs of digits.
DIGIT_RE = re.compile(r"\d+")
# WS_RE: runs of whitespace, collapsed to one space by normalize().
WS_RE    = re.compile(r"\s+")
# Shared lemmatizer instance used by pos_lemmatize().
LEMM     = WordNetLemmatizer()

# Optional: fix odd lemmas if you see them
# Applied token-by-token after phrase detection (lookup by exact token).
LEMMA_FIX = {
    # 'bos':'boss',
}

# Drop junk phrase artifacts (e.g., broken contractions)
# Matches tokens of the form <letter>_<letter>. The first two alternatives
# (x_t and t_x) are special cases of the third.
BAD_PHRASE = re.compile(r'(^[a-z]_t$|^t_[a-z]$|^[a-z]_[a-z]$)')

def _wn_pos(tag: str):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    Only the first character of ``tag`` is inspected: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Everything else, including an empty or
    missing tag, is treated as a noun.
    """
    if not tag:
        return wn.NOUN
    lead = tag[0]
    if lead == 'J':
        return wn.ADJ
    if lead == 'V':
        return wn.VERB
    if lead == 'R':
        return wn.ADV
    # 'N' and every unrecognised tag fall back to noun.
    return wn.NOUN

def normalize(text: str) -> str:
    """Lower-case ``text`` and strip it down to space-separated word characters.

    The substitutions run in a fixed order, each replacing its matches with a
    single space: URLs/mentions, HTML tags, punctuation, digits, and finally
    whitespace runs. Leading and trailing whitespace is trimmed at the end.
    """
    cleaned = text.lower()
    # Order matters: URLs and tags must go before punctuation is removed.
    for pattern in (URL_RE, HTML_RE, PUNC_RE, DIGIT_RE, WS_RE):
        cleaned = pattern.sub(" ", cleaned)
    return cleaned.strip()

def tokenize_simple(text: str):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    pieces = text.split(None)
    return pieces

def pos_lemmatize(tokens):
    """POS-tag ``tokens`` and lemmatize each one with its WordNet POS.

    Args:
        tokens: list of token strings for one comment.

    Returns:
        A list of lemmas, one per input token and in the same order; an empty
        list when ``tokens`` is empty or None.
    """
    if not tokens:
        return []
    # nltk.pos_tag_sents() constructs (and loads from disk) a new perceptron
    # tagger on every call. This function runs once per comment, so build the
    # tagger once and reuse it; tagging with it directly gives the same
    # (word, tag) pairs as pos_tag_sents([tokens])[0].
    tagger = getattr(pos_lemmatize, "_tagger", None)
    if tagger is None:
        from nltk.tag import PerceptronTagger
        tagger = pos_lemmatize._tagger = PerceptronTagger()
    return [LEMM.lemmatize(w, _wn_pos(tag)) for (w, tag) in tagger.tag(tokens)]

def resolve_path(pattern, roots):
    """Find the newest file matching ``pattern`` in the first root that has any match.

    Roots are tried in the order given. Inside the first root with at least
    one match, the file with the most recent modification time is returned.
    Returns None when no root contains a match.
    """
    for base in roots:
        candidates = glob.glob(os.path.join(base, pattern))
        if candidates:
            # Most recently modified wins; on ties the earliest glob hit wins.
            return max(candidates, key=os.path.getmtime)
    return None

# ---------- Load, normalize, POS-lemma ----------
# One frame per game ends up in raw_dfs; games with no matching file are
# recorded in `missing`.
raw_dfs, missing = [], []
for game, pat in GLOB_PATTERNS.items():
    fpath = resolve_path(pat, INPUT_ROOTS)
    if not fpath:
        print(f"⚠️ No match for {game} with pattern {pat} in {INPUT_ROOTS}")
        missing.append(game)
        continue

    fname = os.path.basename(fpath)
    df = pd.read_parquet(fpath)
    if not {'author','text'} <= set(df.columns):
        print(f"⚠️ Required columns missing in {fname} — skipping.")
        continue

    # Drop rows without author/text, then normalize -> tokenize -> POS-lemmatize.
    df = df.dropna(subset=['author','text']).copy()
    df['__norm'] = df['text'].map(lambda t: normalize(t) if isinstance(t,str) else "")
    df['__raw_tokens'] = df['__norm'].map(tokenize_simple)
    df['raw_tokens'] = df['__raw_tokens'].map(pos_lemmatize)
    df['game'] = game

    # Only the four output columns are carried forward; the '__' columns are scratch.
    raw_dfs.append(df[['author','text','raw_tokens','game']])
    print(f"✅ {game}: {len(df)} rows — {fname}")

if len(raw_dfs) == 0:
    raise SystemExit("No valid inputs loaded.")

all_df = pd.concat(raw_dfs, ignore_index=True)
per_game_counts = all_df['game'].value_counts().to_dict()
print("📊 Per-game counts:", per_game_counts)

# ---------- Train phrases on raw tokens ----------
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Two-stage phrase detection: bigrams on the raw lemmas, then a second pass on
# the bigram-merged stream so that longer phrases can form.
bigram  = Phrases(all_df['raw_tokens'], min_count=BIGRAM_MIN_COUNT, threshold=PHRASE_THRESHOLD)
# Pass min_count explicitly: when omitted, Phrases falls back to its own default
# (5), which only matches BIGRAM_MIN_COUNT by coincidence and would diverge
# from the saved metadata as soon as the config value is changed.
trigram = Phrases(bigram[all_df['raw_tokens']], min_count=BIGRAM_MIN_COUNT, threshold=PHRASE_THRESHOLD)
# Frozen versions used for applying (and later saving) the phrase models.
bigram_phraser  = Phraser(bigram)
trigram_phraser = Phraser(trigram)

def apply_phrases_then_filter(toks):
    """Merge detected phrases in ``toks``, then drop junk and stopword tokens.

    Steps per token, after the bigram and trigram phrasers have been applied:
    discard tokens matching BAD_PHRASE, apply any LEMMA_FIX replacement, then
    discard tokens that are in STOP_WORDS or are two characters or shorter.
    Returns the surviving tokens in their original order.
    """
    kept = []
    for tok in trigram_phraser[bigram_phraser[toks]]:
        if BAD_PHRASE.match(tok):
            continue
        tok = LEMMA_FIX.get(tok, tok)
        # Stopword and length checks run on the corrected token.
        if tok in STOP_WORDS or len(tok) <= 2:
            continue
        kept.append(tok)
    return kept

# Final token list per comment: phrases merged, junk and stopwords removed.
all_df['tokens'] = all_df['raw_tokens'].map(apply_phrases_then_filter)

# ---------- Row-level min-length filter ----------
initial = len(all_df)
has_enough_tokens = all_df['tokens'].str.len() >= MIN_TOKENS_ROW
all_df = all_df[has_enough_tokens]
n_removed = initial - len(all_df)
print(f"✅ Removed {n_removed} short comments (<{MIN_TOKENS_ROW} tokens).")

# ---------- Peek tokens ----------
# Flatten every surviving comment into one list to eyeball the most frequent tokens.
all_tokens = []
for toks in all_df['tokens']:
    all_tokens.extend(toks)
print("🔹 Top 50 tokens:", Counter(all_tokens).most_common(50))

# ---------- Save cleaned ----------
# index=False: rows are stored without the pandas index.
clean_path = os.path.join(OUT_DIR, "Filtered_Combined_AllGames_Cleaned.parquet")
all_df.to_parquet(clean_path, index=False)
print(f"💾 Saved cleaned data -> {clean_path}")

# ---------- Dictionary / Corpus (with pruning) ----------
from gensim.corpora import Dictionary
# Build the vocabulary from the filtered tokens, prune it, then turn every
# comment into a bag-of-words vector.
dictionary = Dictionary(all_df['tokens'])
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
corpus = list(map(dictionary.doc2bow, all_df['tokens']))
print(f"📚 Dictionary: {len(dictionary)} tokens | Corpus docs: {len(corpus)}")

dict_path = os.path.join(OUT_DIR, "lda_dictionary_AllGames.dict")
dictionary.save(dict_path)

# Save phrasers & corpus
for phraser, fname in (
    (bigram_phraser, "bigram_AllGames.pkl"),
    (trigram_phraser, "trigram_AllGames.pkl"),
):
    phraser.save(os.path.join(OUT_DIR, fname))

corpus_path = os.path.join(OUT_DIR, "lda_corpus_AllGames.pkl")
with open(corpus_path, "wb") as f:
    pickle.dump(corpus, f)

# Save metadata
# Records the settings this run used so the artifacts can be traced back to them.
cleaning_meta = {
    "no_below": NO_BELOW,
    "no_above": NO_ABOVE,
    "keep_n": KEEP_N,
    "bigram_min_count": BIGRAM_MIN_COUNT,
    "phrase_threshold": PHRASE_THRESHOLD,
    "min_tokens_row": MIN_TOKENS_ROW,
    "keep_franchise_tokens": KEEP_FRANCHISE_TOKENS,
    "stopwords_sizes": {"nltk": len(NLTK_STOP), "custom": len(CUSTOM_STOP)},
}
meta_path = os.path.join(OUT_DIR, "cleaning_meta_AllGames.json")
with open(meta_path, "w") as f:
    json.dump(cleaning_meta, f, indent=2)

# ---------- Stratified 90/10 split BY GAME for later K-sweeps ----------
rng_state = 11

# Group row positions (0..n-1 in the filtered frame) by game label.
by_game = defaultdict(list)
for pos, game_name in enumerate(all_df['game']):
    by_game[game_name].append(pos)

# Per game: shuffle with a generator freshly seeded from rng_state, then hold
# out the first 10% (at least one row).
hold_idx = set()
for idxs in by_game.values():
    random.Random(rng_state).shuffle(idxs)
    n_hold = max(1, int(0.10 * len(idxs)))
    hold_idx.update(idxs[:n_hold])

# Both index lists stay in ascending row order.
train_idx, test_idx = [], []
for pos in range(len(all_df)):
    (test_idx if pos in hold_idx else train_idx).append(pos)

split_path = os.path.join(OUT_DIR, "lda_split_AllGames_stratified.json")
split_payload = {"random_state": rng_state, "train_idx": train_idx, "test_idx": test_idx}
with open(split_path, "w") as f:
    json.dump(split_payload, f, indent=2)

print(f"🧪 Stratified split saved — Train: {len(train_idx)}  Test: {len(test_idx)}")
print(f"✅ Artifacts saved:\n- Cleaned: {clean_path}\n- Dictionary: {dict_path}\n- Split: {split_path}")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Colin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Legend of Zelda: 201465 rows — Legend_of_Zelda_Breath_of_the_Wild_Comments_Analysis.parquet
✅ Fortnite_Ninja: 523501 rows — Fortnite_Ninja_Comments_Analysis.parquet
✅ Fortnite_SypherPK: 115364 rows — Fortnite_SypherPK_Comments_Analysis.parquet
✅ Fortnite_NickEh30: 180346 rows — Fortnite_NickEh30_Comments_Analysis.parquet
✅ Apex Legends: 486200 rows — Apex_Legends_Comments_Analysis.parquet
✅ Baldur's Gate 3: 11097 rows — Baldur's_Gate_3_Comments_Analysis.parquet
✅ Rocket League: 108567 rows — Rocket_League_Comments_Analysis.parquet
✅ Elden Ring: 129966 rows — Elden_Ring_Comments_Analysis.parquet
✅ Hollow Knight: 55429 rows — Hollow_Knight_Comments_Analysis.parquet
✅ Red Dead Redemption 2: 183118 rows — Red_Dead_Redemption_2_Comments_Analysis.parquet
✅ DOTA 2: 10046 rows — DOTA_2_Comments_Analysis.parquet
✅ Valorant: 74291 rows — Valorant_Comments_Analysis.parquet
📊 Per-game counts: {'Fortnite_Ninja': 523501, 'Apex Legends': 486200, 'Legend of Zelda': 201465, 'Red Dead Redemption 2': 1

In [9]:
# === LDA K sweep (ALL GAMES, K = 1..35) — stratified split, CSV, and plots ===
import os, math, json, random
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

# ---- Config ----
# Inputs are the artifacts written by the cleaning cell into OUT_DIR.
OUT_DIR         = r"C:\Users\colin\Box\2024-colin-viktor\Videogame Scraping Project\data\final"
INPUT_FILE      = os.path.join(OUT_DIR, "Filtered_Combined_AllGames_Cleaned.parquet")
DICT_PATH       = os.path.join(OUT_DIR, "lda_dictionary_AllGames.dict")

# Topic counts to try, plus the training settings shared by every run.
K_GRID          = list(range(1, 36))  # 1..35 inclusive
RANDOM_STATE    = 11
PASSES, ITERS   = 5, 400
CHUNKSIZE       = 2000
# One worker per CPU reported by the OS (falls back to 1 when the count is unknown).
# NOTE(review): LdaMulticore's online update cadence appears to depend on the
# worker count, so results may differ between machines with different core
# counts even with a fixed random_state — confirm against the gensim docs.
WORKERS         = max(1, (os.cpu_count() or 1))

# Output locations: metrics table, split file, and the three plots.
RESULTS_CSV     = os.path.join(OUT_DIR, "lda_k_selection_AllGames_metrics_1_35.csv")
SPLIT_JSON      = os.path.join(OUT_DIR, "lda_split_AllGames_stratified.json")
PLOT_COMBINED   = os.path.join(OUT_DIR, "lda_k_sweep_AllGames_1_35.png")
PLOT_COH_ONLY   = os.path.join(OUT_DIR, "lda_k_sweep_AllGames_coherence_only.png")
PLOT_LP_ONLY    = os.path.join(OUT_DIR, "lda_k_sweep_AllGames_logperp_only.png")

os.makedirs(OUT_DIR, exist_ok=True)

# ---- Load ----
print("📂 Loading data/dictionary...")
df = pd.read_parquet(INPUT_FILE)
texts = df["tokens"].tolist()
# Without a 'game' column every document goes into one pseudo-game, "ALL".
if "game" in df.columns:
    games = df["game"].tolist()
else:
    games = ["ALL"] * len(texts)

# Reuse the saved dictionary so token ids match the cleaning step.
dictionary = Dictionary.load(DICT_PATH)
corpus = []
for doc_tokens in texts:
    corpus.append(dictionary.doc2bow(doc_tokens))
print(f"✅ docs={len(corpus)}  vocab={len(dictionary)}")

# ---- Stratified train/test split by game (90/10) ----
def _stratified_split(labels, holdout_frac=0.10, seed=RANDOM_STATE):
    """Return (train_idx, test_idx): a per-label holdout of ``holdout_frac``.

    Each label's positions are shuffled with a generator freshly seeded from
    ``seed`` — the same scheme the cleaning cell uses when it writes the split
    file — and the first ``holdout_frac`` of them (at least one) are held out.
    Both returned lists are in ascending position order.
    """
    by_label = defaultdict(list)
    for pos, label in enumerate(labels):
        by_label[label].append(pos)

    held = set()
    for members in by_label.values():
        random.Random(seed).shuffle(members)
        held.update(members[:max(1, int(holdout_frac * len(members)))])

    train, test = [], []
    for pos in range(len(labels)):
        (test if pos in held else train).append(pos)
    return train, test


def _split_matches(train, test, n_docs):
    """True when train and test together are exactly 0..n_docs-1 with no overlap."""
    return (len(train) + len(test) == n_docs
            and set(train) | set(test) == set(range(n_docs)))


# Prefer the split saved by the cleaning cell, but only if it really
# partitions the documents loaded above. A stale file (from an earlier run
# with a different row count) would otherwise index the wrong rows or raise
# IndexError further down.
train_idx = test_idx = None
if os.path.exists(SPLIT_JSON):
    with open(SPLIT_JSON, "r") as f:
        split = json.load(f)
    train_idx, test_idx = split["train_idx"], split["test_idx"]
    if _split_matches(train_idx, test_idx, len(corpus)):
        print(f"🔁 Using existing split: {SPLIT_JSON}")
    else:
        print(f"⚠️ Split in {SPLIT_JSON} does not cover the {len(corpus)} loaded docs — regenerating.")
        train_idx = test_idx = None

if train_idx is None:
    print("🆕 Creating stratified split (90/10 by game)...")
    train_idx, test_idx = _stratified_split(games, 0.10, RANDOM_STATE)
    with open(SPLIT_JSON, "w") as f:
        json.dump({"random_state": RANDOM_STATE, "train_idx": train_idx, "test_idx": test_idx}, f)

train_corpus = [corpus[i] for i in train_idx]
test_corpus  = [corpus[i] for i in test_idx]
train_texts  = [texts[i] for i in train_idx]
print(f"🧪 Stratified split — Train: {len(train_corpus)}  Test: {len(test_corpus)}")

# ---- Train/eval helper ----
def train_eval(k: int):
    """Train one LDA model with ``k`` topics and score it.

    Returns:
        (model, c_v, log_perp): the trained model, its c_v coherence computed
        on the training texts, and its log_perplexity on the held-out corpus
        (higher, i.e. less negative, is better).
    """
    lda_kwargs = dict(
        corpus=train_corpus,
        id2word=dictionary,
        num_topics=k,
        passes=PASSES,
        iterations=ITERS,
        random_state=RANDOM_STATE,
        workers=WORKERS,
        chunksize=CHUNKSIZE,
        eval_every=None,
        alpha='asymmetric',
        eta='auto',
    )
    model = LdaMulticore(**lda_kwargs)

    # Coherence is scored against the training texts only, never the held-out ones.
    coherence_model = CoherenceModel(
        model=model,
        texts=train_texts,
        dictionary=dictionary,
        coherence="c_v",
    )
    c_v = coherence_model.get_coherence()

    # Held-out score on the test corpus.
    log_perp = model.log_perplexity(test_corpus)
    return model, c_v, log_perp

# ---- Sweep K ----
# One metrics row per K, plus a running record of the best model so far.
# NOTE(review): K_GRID includes K=1 and selection is by c_v alone, so a
# single-topic model can be picked as "best" — confirm that is intended.
rows = []
best = {"k": None, "c_v": -math.inf, "log_perplexity": -math.inf, "model": None}

for k in K_GRID:
    print(f"\n⏳ Training LDA (k={k}) ...")
    model, c_v, log_perp = train_eval(k)
    print(f"📈 k={k} | c_v={c_v:.4f} | log_perplexity={log_perp:.4f} (higher = better)")
    rows.append({"k": k, "c_v": c_v, "log_perplexity": log_perp})

    # Checkpoint the metrics after every K so an interrupted sweep keeps the
    # scores already computed; the final write below produces the same file
    # as a single write at the end would.
    pd.DataFrame(rows).sort_values("k").to_csv(RESULTS_CSV, index=False)

    # Best by highest c_v; tie-break by highest log_perplexity
    if (c_v > best["c_v"]) or (math.isclose(c_v, best["c_v"], rel_tol=1e-6) and log_perp > best["log_perplexity"]):
        best.update({"k": k, "c_v": c_v, "log_perplexity": log_perp, "model": model})

# ---- Save metrics table ----
dfm = pd.DataFrame(rows).sort_values("k")
dfm.to_csv(RESULTS_CSV, index=False)
print(f"\n📝 Saved metrics -> {RESULTS_CSV}")

# ---- Save best model & topic terms ----
best_k = best["k"]
best_model = best["model"]
# `best` is only updated when a c_v compares greater than (or close to) the
# running best. If every c_v was NaN nothing was ever selected, so fail with
# a clear message instead of an AttributeError on None below.
if best_model is None:
    raise RuntimeError("No K produced a comparable c_v score; there is no best model to save.")
# The chosen K is part of the file name.
best_path = os.path.join(OUT_DIR, f"best_lda_model_AllGames_k{best_k}.model")
best_model.save(best_path)
print(f"🏆 Best K={best_k} | c_v={best['c_v']:.4f} | log_perplexity={best['log_perplexity']:.4f}")
print(f"💾 Saved best model -> {best_path}")

def dump_topics(model, topn=20, path=None):
    """Tabulate the top ``topn`` words of every topic in ``model``.

    Returns a DataFrame with one row per (topic, word) and the columns
    'topic', 'rank' (1-based position within the topic), 'word' and 'prob'.
    When ``path`` is given the table is also written there as CSV without the
    index.
    """
    records = [
        {"topic": topic_id, "rank": position, "word": word, "prob": prob}
        for topic_id in range(model.num_topics)
        for position, (word, prob) in enumerate(model.show_topic(topic_id, topn=topn), start=1)
    ]
    dt = pd.DataFrame(records)
    if path:
        dt.to_csv(path, index=False)
    return dt

# Write the best model's top-20 words per topic to a CSV named after the chosen K.
topics_csv = os.path.join(OUT_DIR, f"best_topics_AllGames_k{best_k}.csv")
dump_topics(best_model, topn=20, path=topics_csv)
print(f"🗂️ Topic top-terms saved -> {topics_csv}")

# ---- Plots ----
def plot_combined(df, best_k, out_path):
    """Plot c_v and log_perplexity against K on twin y-axes and save to ``out_path``.

    Args:
        df: metrics table with columns 'k', 'c_v' and 'log_perplexity'.
        best_k: the selected K, marked with a dotted vertical line.
        out_path: destination image path (saved at 150 dpi).

    The K range in the title is read from ``df``, so it stays correct if the
    sweep grid changes; for a 1..35 sweep it reads exactly "K=1..35".
    """
    df = df.sort_values("k")
    fig, ax1 = plt.subplots(figsize=(9, 5))
    ax1.plot(df["k"], df["c_v"], marker="o", label="c_v")
    ax1.set_xlabel("K (number of topics)")
    ax1.set_ylabel("Coherence (c_v)")
    ax1.grid(True, alpha=0.3)

    # Second y-axis: the two metrics are on very different scales.
    ax2 = ax1.twinx()
    ax2.plot(df["k"], df["log_perplexity"], marker="s", linestyle="--", label="log_perplexity")
    ax2.set_ylabel("log_perplexity (higher is better)")

    ax1.axvline(best_k, linestyle=":", linewidth=1.5)
    if len(df):
        k_span = f"K={int(df['k'].min())}..{int(df['k'].max())}"
    else:
        k_span = "K=n/a"
    ax1.set_title(f"LDA K Sweep (All Games, {k_span}) — Best K={best_k}")

    # One legend covering the lines from both axes.
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")

    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    # Close this specific figure rather than whichever one happens to be current.
    plt.close(fig)
    print(f"🖼️ Saved combined plot -> {out_path}")

def plot_single(x, y, ylabel, title, out_path, marker="o"):
    """Draw a single metric against K and save the figure to ``out_path``.

    ``x`` and ``y`` are the K values and metric values; ``ylabel`` and
    ``title`` label the plot; ``marker`` is the point marker. The image is
    saved at 150 dpi and the figure is closed afterwards.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(x, y, marker=marker)
    ax.set_xlabel("K (number of topics)")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"🖼️ Saved plot -> {out_path}")

# Combined (twin-axis)
# Twin-axis overview of both metrics.
plot_combined(dfm, best_k, PLOT_COMBINED)

# One figure per metric, sharing the K axis values.
k_values = dfm["k"]
plot_single(
    k_values,
    dfm["c_v"],
    "Coherence (c_v)",
    "LDA K Sweep — Coherence (All Games)",
    PLOT_COH_ONLY,
    marker="o",
)
plot_single(
    k_values,
    dfm["log_perplexity"],
    "log_perplexity (higher is better)",
    "LDA K Sweep — log_perplexity (All Games)",
    PLOT_LP_ONLY,
    marker="s",
)


📂 Loading data/dictionary...
✅ docs=423032  vocab=40194
🔁 Using existing split: C:\Users\colin\Box\2024-colin-viktor\Videogame Scraping Project\data\final\lda_split_AllGames_stratified.json
🧪 Stratified split — Train: 380734  Test: 42298

⏳ Training LDA (k=1) ...
📈 k=1 | c_v=0.4697 | log_perplexity=-8.7321 (higher = better)

⏳ Training LDA (k=2) ...
📈 k=2 | c_v=0.4547 | log_perplexity=-8.9449 (higher = better)

⏳ Training LDA (k=3) ...
📈 k=3 | c_v=0.4354 | log_perplexity=-9.0958 (higher = better)

⏳ Training LDA (k=4) ...
📈 k=4 | c_v=0.4248 | log_perplexity=-9.2249 (higher = better)

⏳ Training LDA (k=5) ...
📈 k=5 | c_v=0.4270 | log_perplexity=-9.3278 (higher = better)

⏳ Training LDA (k=6) ...
📈 k=6 | c_v=0.4316 | log_perplexity=-9.3860 (higher = better)

⏳ Training LDA (k=7) ...
📈 k=7 | c_v=0.4671 | log_perplexity=-9.4704 (higher = better)

⏳ Training LDA (k=8) ...
📈 k=8 | c_v=0.4593 | log_perplexity=-9.6207 (higher = better)

⏳ Training LDA (k=9) ...
📈 k=9 | c_v=0.4628 | log_perplex

In [None]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# 🔹 Step 1: Load Cleaned Tokenized Data
lda_data_path = r"C:/Users/colin/Box/2024-colin-viktor/Videogame Scraping Project/data/final/Filtered_Combined_AllGames_Cleaned.parquet"
df = pd.read_parquet(lda_data_path)

# 🔹 Step 2: Create Dictionary & Corpus
# Load the pruned dictionary written by the cleaning cell instead of rebuilding
# an unpruned one from the tokens. Rebuilding skips filter_extremes, so these
# models would be trained on a different vocabulary than the K sweep used.
lda_dict_path = r"C:/Users/colin/Box/2024-colin-viktor/Videogame Scraping Project/data/final/lda_dictionary_AllGames.dict"
dictionary = Dictionary.load(lda_dict_path)
corpus = [dictionary.doc2bow(tokens) for tokens in df["tokens"]]
print(f"✅ Dictionary and corpus created with {len(dictionary)} unique tokens.")

# 🔹 Step 3: Define Function to Print Top Words for Each Topic
def print_top_words(lda_model, topn=10):
    """Print the ``topn`` highest-weighted words of each topic in ``lda_model``.

    Emits a header line with the topic count, then one line per topic with
    the words joined by ", ". Topics are numbered from 1 in the output.
    """
    n_topics = lda_model.num_topics
    print(f"\n🔹 Top {topn} Words per Topic (K = {n_topics})")
    for number in range(1, n_topics + 1):
        # The model's topic ids are 0-based; the printed numbers are 1-based.
        top_terms = lda_model.show_topic(number - 1, topn=topn)
        joined = ', '.join(term for term, _weight in top_terms)
        print(f"Topic {number}: {joined}")

# 🔹 Steps 4-5: Run LDA for 4 and then 5 topics
# Same settings for both runs; only the topic count differs.
lda_models = {}
for num_topics in (4, 5):
    print(f"\n🔹 Running LDA for {num_topics} Topics...")
    lda_models[num_topics] = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        iterations=5000,
        random_state=42
    )
    print_top_words(lda_models[num_topics], topn=10)

# Keep the individual model names available for any later cells.
lda_model_4 = lda_models[4]
lda_model_5 = lda_models[5]
