### LDA with multiple topic counts (6, 10, 15) for a chosen modality (courses, books, or videos)

This script tries multiple topic counts (6, 10, 15). For each count it saves a bar-chart PNG (to outputs/reports/images/), exports the top words per topic (so we can label bars in the report), and reports LDA perplexity for quick comparison. Empty or missing descriptions are handled safely.

In [2]:
from pathlib import Path
import re, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Configuration

In [6]:

# Which dataset to model. The value selects the default input CSV and is used
# as a prefix in every output file name and topic column name.
MODALITY = "books"  # "courses" | "books" | "videos"

# Explicit input path. Leave as None to resolve the file from MODALITY under
# <repo>/backend/data/interim. Example of an explicit override:
# INPUT_CSV = r"C:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\videos_metadata_clean.csv"
INPUT_CSV = None

# Column holding the raw text that is cleaned and then fed to LDA.
TEXT_COL = "description"

# Topic counts to try; one model (and one set of exports) is produced per value.
K_LIST = [6, 10, 15]

# Name of the column that receives the URL-stripped version of TEXT_COL.
CLEAN_COL = "description_clean"

# How many top-weighted words to collect per topic.
TOP_WORDS = 15

### Path

In [8]:

def resolve_base() -> Path:
    """Locate the TrioLearn repository root.

    When executed as a script, the root is taken to be two directories above
    the folder containing this file. When ``__file__`` is not defined
    (notebook / interactive session), the root is derived from the current
    working directory.

    Returns:
        The directory to use as the repository root.
    """
    try:
        script_path = Path(__file__)
    except NameError:
        script_path = None

    if script_path is not None:
        return script_path.resolve().parents[2]

    # Interactive session: infer the root from where the kernel was started.
    here = Path.cwd()

    # Already at the repo root: both marker folders sit directly below.
    looks_like_root = (here / "backend").exists() and (here / "outputs").exists()
    if looks_like_root:
        return here

    # Started inside <repo>/outputs: the root is one level up.
    if here.name == "outputs":
        return here.parent

    # Started inside <repo>/outputs/reports: the root is two levels up.
    if (here.name, here.parent.name) == ("reports", "outputs"):
        return here.parent.parent

    # Nothing recognisable: treat the working directory itself as the root.
    return here

BASE = resolve_base()

# No explicit path configured: pick the per-modality file under backend/data/interim.
if INPUT_CSV is None:
    _interim_dir = BASE / "backend" / "data" / "interim"
    default_map = {
        "courses": _interim_dir / "courses_metadata_clean.csv",
        "books": _interim_dir / "books_metadata.csv",
        "videos": _interim_dir / "videos_metadata_clean.csv",
    }
    # An unknown MODALITY leaves INPUT_CSV as None and is reported just below.
    INPUT_CSV = default_map.get(MODALITY)

if INPUT_CSV is None:
    print("[error] Could not resolve INPUT_CSV path. Please set INPUT_CSV explicitly.")
    sys.exit(1)

INPUT_CSV = Path(INPUT_CSV)

# Both kinds of report artefacts live under <repo>/outputs/reports.
_reports_dir = BASE / "outputs" / "reports"
OUT_IMG_DIR = _reports_dir / "images"
OUT_CSV_DIR = _reports_dir / "csv"

# Echo the resolved locations before touching the filesystem.
print(
    "\n".join(
        [
            f"[check] Repo base: {BASE}",
            f"[check] Reading: {INPUT_CSV}",
            f"[check] Images -> {OUT_IMG_DIR}",
            f"[check] CSVs    -> {OUT_CSV_DIR}",
        ]
    )
)

if not INPUT_CSV.exists():
    print(f"[error] CSV not found: {INPUT_CSV}")
    sys.exit(1)

# Create the output folders (and any missing parents) if they are not there yet.
for _out_dir in (OUT_IMG_DIR, OUT_CSV_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

[check] Repo base: c:\Users\jvlas\source\repos\TrioLearn
[check] Reading: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\books_metadata.csv
[check] Images -> c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images
[check] CSVs    -> c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv


### Load input CSV


In [9]:
# Read the resolved metadata CSV and echo its size plus the first 12 column names.
df = pd.read_csv(INPUT_CSV)
print(f"[check] Loaded shape: {df.shape}")
print(f"[check] Columns: {df.columns[:12].tolist()} ...")

[check] Loaded shape: (3507, 15)
[check] Columns: ['keyword', 'volume_id', 'title', 'authors', 'description', 'categories', 'publishedDate', 'pageCount', 'language', 'averageRating', 'ratingsCount', 'previewLink'] ...


### Pick an ID column

In [17]:
def pick_id_col(dframe: pd.DataFrame) -> "str | None":
    """Return the name of the first known identifier column present in ``dframe``.

    Candidates are checked in a fixed priority order, so ``global_id`` wins
    over the modality-specific ids when several are present.

    Args:
        dframe: Frame whose columns are inspected.

    Returns:
        The matching column name, or ``None`` when no candidate is present
        (the caller is expected to supply its own fallback id).
    """
    # NOTE(review): the books metadata shown in this notebook exposes a
    # `volume_id` column, which is not among the candidates - confirm whether
    # it should be.
    candidates = ("global_id", "course_id", "id", "external_id", "uuid", "book_id", "video_id")
    return next((name for name in candidates if name in dframe.columns), None)

# Use an existing identifier column when there is one; otherwise expose the
# row index as a synthetic "item_id" column.
ID_COL = pick_id_col(df)
if ID_COL is None:
    ID_COL = "item_id"
    df = df.reset_index().rename(columns={"index": ID_COL})
print(f"[check] Using ID column: {ID_COL}")

[check] Using ID column: item_id


### Cleaning: remove URLs & URL-like tokens

In [10]:

# Matches either a full http(s) URL or a stand-alone URL fragment word.
# `youtu\.be` must come before `youtu`: alternatives are tried left to right
# and `youtu` alone already satisfies the trailing \b in "youtu.be", which
# would leave a stray ".be" behind.
# NOTE(review): the word list also removes ordinary English words such as
# "bit", "watch" and "net" - confirm this is acceptable for the corpus.
URL_TOKEN_PATTERN = re.compile(
    r"""(?ix)
    (https?://\S+)|
    \b(youtu\.be|www|youtu|youtube|bit|ly|com|http|https|net|io|amzn|gle|linkedin|twitter|playlist|watch)\b
    """
)

def clean_text_url_tokens(s: str) -> str:
    """Strip URLs, URL-like tokens and punctuation from a piece of text.

    Args:
        s: Text to clean. Non-string values are accepted: missing values
            (``None`` / NaN) become an empty string, anything else is
            converted with ``str()``.

    Returns:
        The cleaned text containing only letters, digits, whitespace and the
        characters ``-``, ``_`` and ``.``, with runs of whitespace collapsed
        to a single space and no leading/trailing whitespace.
    """
    if not isinstance(s, str):
        s = "" if pd.isna(s) else str(s)
    # Remove complete URLs first, regardless of letter case, so that their
    # path segments do not survive as separate words.
    s = re.sub(r"https?://\S+|www\.\S+", " ", s, flags=re.IGNORECASE)
    s = URL_TOKEN_PATTERN.sub(" ", s)                 # remove url-like tokens
    s = re.sub(r"[^A-Za-z0-9\s\-\_\.]", " ", s)       # keep simple chars only
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Fail fast when the configured text column is absent from the loaded frame.
if TEXT_COL not in df.columns:
    raise KeyError(f"[error] TEXT_COL='{TEXT_COL}' not in columns of {INPUT_CSV}")

# Store the cleaned text next to the raw column, then preview both side by side.
df[CLEAN_COL] = df[TEXT_COL].apply(clean_text_url_tokens)
_preview = df[[TEXT_COL, CLEAN_COL]].head(3)
print(f"[check] Cleaning done. Sample:\n{_preview}")

[check] Cleaning done. Sample:
                                         description  \
0  Just like electricity, Machine Learning will r...   
1  This comprehensive encyclopedia, in A-Z format...   
2  FUNDAMENTALS AND METHODS OF MACHINE AND DEEP L...   

                                   description_clean  
0  Just like electricity Machine Learning will re...  
1  This comprehensive encyclopedia in A-Z format ...  
2  FUNDAMENTALS AND METHODS OF MACHINE AND DEEP L...  


In [11]:
def fit_lda_on_col(dframe: pd.DataFrame, text_col: str, n_topics: int, modality: str, random_state: int = 42):
    """Fit an LDA model on one text column and tag each row with its dominant topic.

    Rows whose text is missing or blank are excluded from fitting and receive
    NaN in the topic column.

    Args:
        dframe: Input frame; it is not modified (a copy is returned).
        text_col: Name of the column holding the documents.
        n_topics: Number of LDA topics to fit.
        modality: Prefix used to build the topic column name.
        random_state: Seed passed to ``LatentDirichletAllocation``.

    Returns:
        A tuple ``(out, lda, vec, X, topic_col)``: the copied frame with the
        new topic column, the fitted LDA model, the fitted vectorizer, the
        document-term matrix of the non-empty rows, and the topic column name
        (``"{modality}_dom_topic_K{n_topics}"``).

    Raises:
        ValueError: If no row has non-empty text, or if even the relaxed
            vectorizer cannot build a vocabulary.
    """
    # Fill missing values *before* casting: astype(str) would otherwise turn
    # NaN into the literal text "nan", which then counts as a real document.
    txt = dframe[text_col].fillna("").astype(str).str.strip()
    mask = txt.str.len() > 0
    texts = txt[mask]
    if texts.empty:
        raise ValueError(f"No non-empty docs in '{text_col}'")

    vec = CountVectorizer(stop_words="english", max_df=0.95, min_df=2)
    try:
        X = vec.fit_transform(texts)
    except ValueError:
        # CountVectorizer raises (rather than returning an empty matrix) when
        # pruning leaves no terms or max_df conflicts with min_df, which is
        # what happens on very small corpora.
        X = None
    if X is None or X.shape[1] == 0:
        # Retry without document-frequency pruning.
        vec = CountVectorizer(stop_words="english", max_df=1.0, min_df=1)
        X = vec.fit_transform(texts)

    lda = LatentDirichletAllocation(n_components=n_topics, learning_method="batch", random_state=random_state)
    theta = lda.fit_transform(X)
    # Dominant topic = index of the largest entry in each document's topic mix.
    dom = pd.Series(theta.argmax(axis=1), index=texts.index, name="dom_topic")

    topic_col = f"{modality}_dom_topic_K{n_topics}"
    out = dframe.copy()
    out[topic_col] = np.nan
    # Positional assignment through the boolean mask keeps this correct even
    # when the frame's index contains duplicates.
    out.loc[mask, topic_col] = dom.values
    return out, lda, vec, X, topic_col

In [12]:
def top_words_table(lda, vectorizer, topn=TOP_WORDS) -> pd.DataFrame:
    """List the highest-weighted vocabulary terms of every topic.

    Args:
        lda: Fitted model exposing ``components_`` (one weight row per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        topn: Maximum number of terms to keep per topic.

    Returns:
        A long-format frame with columns ``topic``, ``rank`` (1 = heaviest),
        ``word`` and ``weight``.
    """
    vocab = np.array(vectorizer.get_feature_names_out())
    records = [
        {"topic": topic_id, "rank": position, "word": vocab[term_idx], "weight": weights[term_idx]}
        for topic_id, weights in enumerate(lda.components_)
        # argsort is ascending, so reverse it to walk from the heaviest term down.
        for position, term_idx in enumerate(np.argsort(weights)[::-1][:topn], start=1)
    ]
    return pd.DataFrame(records)

In [15]:
def auto_label(words: list, max_words=3) -> str:
    """Build a short topic label from the first ``max_words`` words, separated by " / "."""
    leading = words[:max_words]
    return " / ".join(leading)

def join_words(ws, n=10):
    """Return the first ``n`` words of ``ws`` as one comma-separated string."""
    selected = ws[:n]
    return ", ".join(selected)

### Run the script for the chosen modality with several values of K (6, 10, 15)

In [18]:
# For every topic count in K_LIST: fit LDA, report perplexity, export the
# per-item dominant topics, the per-topic top words and the topic
# distribution (CSV + bar chart), and remember where everything was written.
summary = []
for K in K_LIST:
    print(f"\n=== Fitting LDA (MODALITY={MODALITY}, K={K}) ===")
    dfK, ldaK, vecK, XK, topic_col = fit_lda_on_col(df, CLEAN_COL, n_topics=K, modality=MODALITY, random_state=42)

    # Perplexity is evaluated on the same matrix the model was fitted on, so
    # it is only a rough in-sample comparison between K values.
    perp = ldaK.perplexity(XK)
    print(f"[metric] Perplexity(K={K}): {perp:,.2f}")

    # ---- Exports: per-item dominant topic
    # Nullable Int64 keeps rows without usable text as <NA> instead of floats.
    out_items = pd.DataFrame({
        "item_id": dfK[ID_COL],
        "dom_topic": dfK[topic_col].astype("Int64")
    })
    csv_items = OUT_CSV_DIR / f"{MODALITY}_dom_topics_K{K}_CLEAN.csv"
    out_items.to_csv(csv_items, index=False)
    print(f"[save] {csv_items} ({len(out_items.dropna())} labeled rows)")

    # ---- Exports: top-words per topic
    tw = top_words_table(ldaK, vecK, topn=TOP_WORDS)
    # Collapse the long table to one row per topic; words stay in rank order.
    label_df = (tw.groupby("topic")["word"].apply(list)
                  .reset_index()
                  .rename(columns={"word": "top_words_list"}))
    label_df["top_words"] = label_df["top_words_list"].apply(lambda ws: join_words(ws, n=min(10, len(ws))))
    label_df["suggested_label"] = label_df["top_words_list"].apply(lambda ws: auto_label(ws, max_words=3))
    label_df = label_df.drop(columns=["top_words_list"])
    csv_topics = OUT_CSV_DIR / f"{MODALITY}_topics_topwords_K{K}_CLEAN.csv"
    label_df.to_csv(csv_topics, index=False)
    print(f"[save] {csv_topics}")

    # ---- Exports: distribution counts
    # Reindex over 0..K-1 so topics that are never dominant appear with a
    # count of 0; the CSV and the chart are built from this same series and
    # therefore always agree.
    counts = (
        out_items["dom_topic"]
        .dropna()
        .astype(int)
        .value_counts()
        .reindex(range(K), fill_value=0)
    )
    dist_csv = OUT_CSV_DIR / f"{MODALITY}_topic_distribution_K{K}_CLEAN.csv"
    counts.rename_axis("topic").reset_index(name="count").to_csv(dist_csv, index=False)
    print(f"[save] {dist_csv}")

    # ---- Plot: distribution
    fig, ax = plt.subplots(figsize=(10, 5))
    counts.plot(kind="bar", ax=ax)
    ax.set_title(f"{MODALITY.capitalize()} — Distribution of Dominant Topics (K={K}, CLEAN)")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Items")
    fig.tight_layout()
    png_path = OUT_IMG_DIR / f"{MODALITY}_topic_distribution_K{K}_CLEAN.png"
    fig.savefig(png_path, dpi=300)
    # Close explicitly so figures do not pile up across iterations.
    plt.close(fig)
    print(f"[save] {png_path}")

    summary.append({"K": K, "perplexity": perp, "items_csv": str(csv_items), "topics_csv": str(csv_topics), "dist_csv": str(dist_csv), "plot": str(png_path)})



=== Fitting LDA (MODALITY=books, K=6) ===
[metric] Perplexity(K=6): 1,518.14
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_dom_topics_K6_CLEAN.csv (3063 labeled rows)
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topics_topwords_K6_CLEAN.csv
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topic_distribution_K6_CLEAN.csv
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\books_topic_distribution_K6_CLEAN.png

=== Fitting LDA (MODALITY=books, K=10) ===
[metric] Perplexity(K=10): 1,506.61
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_dom_topics_K10_CLEAN.csv (3063 labeled rows)
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topics_topwords_K10_CLEAN.csv
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topic_distribution_K10_CLEAN.csv
[save] c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\books_topic_distribution_K10_

### Summary

In [19]:

# Recap: one line per fitted K with its perplexity and the files it produced.
print("\n[summary]")
for row in summary:
    print(
        f"  K={row['K']}: perp={row['perplexity']:.2f}"
        f" | items={row['items_csv']}"
        f" | topics={row['topics_csv']}"
        f" | dist={row['dist_csv']}"
        f" | plot={row['plot']}"
    )


[summary]
  K=6: perp=1518.14 | items=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_dom_topics_K6_CLEAN.csv | topics=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topics_topwords_K6_CLEAN.csv | dist=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topic_distribution_K6_CLEAN.csv | plot=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\books_topic_distribution_K6_CLEAN.png
  K=10: perp=1506.61 | items=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_dom_topics_K10_CLEAN.csv | topics=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topics_topwords_K10_CLEAN.csv | dist=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_topic_distribution_K10_CLEAN.csv | plot=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\books_topic_distribution_K10_CLEAN.png
  K=15: perp=1506.94 | items=c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\csv\books_dom_topics_K15_CLEAN.csv | topics=c:\