### 1. Prepare datasets: 

courses_combined_cleaned.csv (courses)
books_metadata_cleaned.csv (books)
videos_metadata_cleaned.csv (videos)

Each should have a column text_for_embedding that will be  feedede into LDA.

In [11]:
from pathlib import Path
import pandas as pd

# Detect base (works in Jupyter & scripts)
try:
    base = Path(__file__).resolve().parents[2]   # when running as .py
except NameError:
    base = Path.cwd().parents[1]                 # when running from Jupyter in outputs/reports/

print("[check] Project base:", base)

# Create individual paths
courses_path = base / "backend" / "data" / "interim" / "courses_metadata_clean.csv"
books_path   = base / "backend" / "data" / "interim" / "books_metadata.csv"
videos_path  = base / "backend" / "data" / "interim" / "videos_metadata_clean.csv"

# Put into dictionary for verification loop
data_files = {
    "courses": courses_path,
    "books": books_path,
    "videos": videos_path,
}

# Verify each path 
for name, path in data_files.items():
    print(f"\n[check] {name.upper()} file path: {path}")
    if not path.exists():
        print(f"[error] {name} CSV not found!")
        continue

    try:
        df = pd.read_csv(path, nrows=3)  # only read first few rows for speed
        print(f"[ok] {name} loaded, shape (preview): {df.shape}")
        print(f"[ok] {name} columns:", list(df.columns)[:10])
    except Exception as e:
        print(f"[error] Failed to read {name}: {e}")


[check] Project base: c:\Users\jvlas\source\repos\TrioLearn

[check] COURSES file path: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\courses_metadata_clean.csv
[ok] courses loaded, shape (preview): (3, 8)
[ok] courses columns: ['course_id', 'title', 'description', 'platform', 'provider', 'level', 'skills', 'text_for_embedding']

[check] BOOKS file path: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\books_metadata.csv
[ok] books loaded, shape (preview): (3, 15)
[ok] books columns: ['keyword', 'volume_id', 'title', 'authors', 'description', 'categories', 'publishedDate', 'pageCount', 'language', 'averageRating']

[check] VIDEOS file path: c:\Users\jvlas\source\repos\TrioLearn\backend\data\interim\videos_metadata_clean.csv
[ok] videos loaded, shape (preview): (3, 17)
[ok] videos columns: ['video_id', 'title', 'description', 'channel_id', 'channel_title', 'published_at', 'view_count', 'like_count', 'comment_count', 'tags']


### 2. Fit separate LDA models ( run Latent Dirichlet Allocation (LDA) separately per modality). 

In [13]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def fit_lda(df, text_col, n_topics=6, modality="course"):
    # 1) Build a boolean mask of valid (non-empty) docs, keep original indices
    txt = df[text_col].astype(str).fillna("").str.strip()
    mask = txt.str.len() > 0
    texts = txt[mask]

    if texts.empty:
        raise ValueError(f"No non-empty documents found in column '{text_col}'.")

    # 2) Vectorize only valid rows (aligned indices preserved)
    vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=2)
    dtm = vectorizer.fit_transform(texts)

    if dtm.shape[1] == 0:
        # too aggressive filtering; relax thresholds
        vectorizer = CountVectorizer(stop_words="english", max_df=1.0, min_df=1)
        dtm = vectorizer.fit_transform(texts)
        if dtm.shape[1] == 0:
            raise ValueError("No terms left after vectorization. Check your text preprocessing.")

    # 3) Fit LDA
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    theta = lda.fit_transform(dtm)  # rows align with 'texts' (i.e., df[mask])

    # 4) Create a Series of dominant topics with the SAME index as 'texts'
    dom = pd.Series(theta.argmax(axis=1), index=texts.index)

    # 5) Initialize column with NaN, then assign only for valid rows
    colname = f"{modality}_dominant_topic"
    df[colname] = np.nan
    df.loc[mask, colname] = dom

    return df, lda, vectorizer


### 3. Fit LDA for courses with fixed number of topics = 5

In [14]:

import pandas as pd
import matplotlib.pyplot as plt

# output path

img_dir = base / "outputs" / "reports" / "images"

# Load courses
courses_df = pd.read_csv(courses_path)

# If your text column is not "description", change it here (e.g., "text_for_embedding")
TEXT_COL = "description"

# Fit LDA for courses only (for now)
courses_df, lda_courses, vec_courses = fit_lda(
    courses_df, TEXT_COL, modality="course"
)

# Save the plot as PNG
plt.figure(figsize=(7,4))
topic_counts = courses_df["course_dominant_topic"].value_counts().sort_index()
topic_counts.plot(kind="bar")
plt.title("Distribution of Dominant Topics (Courses)")
plt.xlabel("Topic")
plt.ylabel("Number of Items")
plt.tight_layout()

out_path = img_dir / "courses_topic_distribution.png"
plt.savefig(out_path, dpi=300)
plt.close()

print(f"[info] Saved courses topic distribution chart to: {out_path}")


[info] Saved courses topic distribution chart to: c:\Users\jvlas\source\repos\TrioLearn\outputs\reports\images\courses_topic_distribution.png


### 3.2 Fit LDA for videos with number of topics = 5

In [15]:
# output path

img_dir = base / "outputs" / "reports" / "images"

# Load courses
videos_df = pd.read_csv(videos_path)

# If your text column is not "description", change it here (e.g., "text_for_embedding")
TEXT_COL = "description"

# Fit LDA for courses only (for now)
videos_df, lda_courses, vec_courses = fit_lda(
    videos_df, TEXT_COL, modality="course"
)

# Save the plot as PNG
plt.figure(figsize=(7,4))
topic_videos = videos_df["video_dominant_topic"].value_counts().sort_index()
topic_counts.plot(kind="bar")
plt.title("Distribution of Dominant Topics (Courses)")
plt.xlabel("Topic")
plt.ylabel("Number of Items")
plt.tight_layout()

out_path = img_dir / "video_topic_distribution.png"
plt.savefig(out_path, dpi=300)
plt.close()

print(f"[info] Saved courses topic distribution chart to: {out_path}")

KeyError: 'video_dominant_topic'

<Figure size 700x400 with 0 Axes>

In [None]:


# (Optional) Later, once you have books/videos CSVs, just replicate:

books_df  = pd.read_csv(r"C:\path\to\books.csv")
videos_df = pd.read_csv(r"C:\path\to\videos.csv")
books_df, lda_books, vec_books    = fit_lda(books_df, TEXT_COL, modality="book")
videos_df, lda_videos, vec_videos = fit_lda(videos_df, TEXT_COL, modality="video")