### LDA with multiple topic counts (6, 10, 15)

This script tries multiple topic counts (6, 10, 15), saves a bar chart PNG for each count (to outputs/reports/images/), prints top words per topic (so we can label bars in the report), reports LDA perplexity for quick comparison, handles empty/missing descriptions safely

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# ---------- paths ----------
try:
    BASE = Path(__file__).resolve().parents[2]   # when in a .py under outputs/reports/
except NameError:
    BASE = Path.cwd()
    if BASE.name == "reports":
        BASE = BASE.parents[1]
    elif BASE.name == "outputs":
        BASE = BASE.parent

courses_path = BASE / "backend" / "data" / "interim" / "courses_metadata_clean.csv"
videos_path = BASE / "backend" / "data" / "interim" / "videos_metadata_clean.csv"
books_path = BASE / "backend" / "data" / "interim" / "books_metadata.csv"

img_dir      = BASE / "outputs" / "reports" / "images"
img_dir.mkdir(parents=True, exist_ok=True)


print("[check] Using:", courses_path)
print("[check] Using:", videos_path)
print("[check] Using:", books_path)

df = pd.read_csv(courses_path)

TEXT_COL = "description"  

[check] Using: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\courses_metadata_clean.csv
[check] Using: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\videos_metadata_clean.csv
[check] Using: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\books_metadata.csv


In [None]:

def fit_lda_on_masked(df, text_col, n_topics, random_state=42):
    txt = df[text_col].astype(str).fillna("").str.strip()
    mask = txt.str.len() > 0
    texts = txt[mask]
    if texts.empty:
        raise ValueError(f"No non-empty docs in '{text_col}'")

    vec = CountVectorizer(stop_words="english", max_df=0.95, min_df=2)
    X = vec.fit_transform(texts)
    if X.shape[1] == 0:
        vec = CountVectorizer(stop_words="english", max_df=1.0, min_df=1)
        X = vec.fit_transform(texts)

    lda = LatentDirichletAllocation(
        n_components=n_topics, learning_method="batch", random_state=random_state
    )
    theta = lda.fit_transform(X)  # rows align to texts.index
    dom = pd.Series(theta.argmax(axis=1), index=texts.index)

    # attach 
    col = f"dom_topic_{n_topics}"
    out = df.copy()
    out[col] = np.nan
    out.loc[mask, col] = dom
    return out, lda, vec, X, col

In [4]:
def top_words_per_topic(lda, vectorizer, topn=10):
    terms = np.array(vectorizer.get_feature_names_out())
    tops = []
    for k, comp in enumerate(lda.components_):
        idx = np.argsort(comp)[::-1][:topn]
        words = terms[idx]
        tops.append((k, words))
    return tops

In [5]:
def save_distribution_plot(df, topic_col, n_topics, out_png_path, title_prefix="Courses"):
    counts = pd.Series(pd.Categorical(df[topic_col], categories=range(n_topics))).value_counts().sort_index()
    plt.figure(figsize=(10,5))
    counts.plot(kind="bar")
    plt.title(f"Distribution of Dominant Topics ({title_prefix}) — K={n_topics}")
    plt.xlabel("Topic")
    plt.ylabel("Number of Items")
    plt.tight_layout()
    plt.savefig(out_png_path, dpi=300)
    plt.close()

### Run script for COURSES with several K = 6, 10, 15. 

In [None]:

topic_grid = [6, 10, 15]  
results = []

for K in topic_grid:
    print(f"\n=== Fitting LDA with K={K} ===")
    dfK, ldaK, vecK, XK, colK = fit_lda_on_masked(df, TEXT_COL, n_topics=K, random_state=42)

    # perplexity (lower is better)
    perp = ldaK.perplexity(XK)
    print(f"[metric] Perplexity(K={K}): {perp:,.2f}")

    # top words
    print(f"[topics] Top words per topic (K={K}):")
    for k, words in top_words_per_topic(ldaK, vecK, topn=10):
        print(f"  Topic {k}: {', '.join(words)}")

    # save plot
    png_path = img_dir / f"Courses_topic_distribution_K{K}.png"
    save_distribution_plot(dfK, colK, n_topics=K, out_png_path=png_path, title_prefix="Courses")
    print(f"[save] {png_path}")

    results.append({"K": K, "perplexity": perp, "png": str(png_path), "topic_col": colK})

print("\n[summary]")
for r in results:
    print(f"  K={r['K']}: perplexity={r['perplexity']:.2f} | plot={r['png']}")


=== Fitting LDA with K=6 ===
[metric] Perplexity(K=6): 1,165.97
[topics] Top words per topic (K=6):
  Topic 0: en, google, la, des, les, et, que, para, vous, el
  Topic 1: nan, project, management, ll, program, skills, projects, software, cloud, development
  Topic 2: data, learning, course, specialization, ll, machine, project, skills, learn, ai
  Topic 3: cloud, google, exam, professional, learning, certification, hands, aws, microsoft, new
  Topic 4: data, course, skills, ll, learn, business, project, analytics, program, career
  Topic 5: course, design, specialization, financial, ux, business, learn, new, understand, project
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\courses_topic_distribution_K6.png

=== Fitting LDA with K=10 ===
[metric] Perplexity(K=10): 1,194.22
[topics] Top words per topic (K=10):
  Topic 0: des, google, les, et, learning, vous, la, machine, une, en
  Topic 1: project, ll, skills, management, software, projects, program, cloud, develo

### Run script for videos 

In [None]:
topic_grid = [6, 10, 15] 
results = []

for K in topic_grid:
    print(f"\n=== Fitting LDA with K={K} ===")
    dfK, ldaK, vecK, XK, colK = fit_lda_on_masked(df, TEXT_COL, n_topics=K, random_state=42)

    # perplexity (lower is better)
    perp = ldaK.perplexity(XK)
    print(f"[metric] Perplexity(K={K}): {perp:,.2f}")

    # top words
    print(f"[topics] Top words per topic (K={K}):")
    for k, words in top_words_per_topic(ldaK, vecK, topn=10):
        print(f"  Topic {k}: {', '.join(words)}")

    # save plot
    png_path = img_dir / f"videos_topic_distribution_K{K}.png"
    save_distribution_plot(dfK, colK, n_topics=K, out_png_path=png_path, title_prefix="Videos")
    print(f"[save] {png_path}")

    results.append({"K": K, "perplexity": perp, "png": str(png_path), "topic_col": colK})

print("\n[summary]")
for r in results:
    print(f"  K={r['K']}: perplexity={r['perplexity']:.2f} | plot={r['png']}")