# Topic Modeling with LDA (CountVectorizer + LatentDirichletAllocation)

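Notebook version of the resume-to-job recommender: job descriptions are vectorized with CountVectorizer, projected into an LDA topic space, and ranked against a selected resume by cosine similarity of their topic distributions.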

In [2]:
import time
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load and prep data
import re

# Matches "Experience required: X to Y Years", the single-number form
# ("Experience required: 3 Years") and the singular form
# ("Experience required: 1 Year"), case-insensitively.
# `Year(?:s|\b)` accepts "Years" exactly as before and additionally "Year"
# when it ends at a word boundary.
_EXPERIENCE_RE = re.compile(
    r"Experience required:\s*\d+\s*(?:to\s*\d+)?\s*Year(?:s|\b)",
    flags=re.IGNORECASE,
)


def strip_experience(text: str) -> str:
    """Remove the "Experience required: ..." clause from a job description.

    Args:
        text: Raw job description. Non-string values (e.g. ``None`` or a float
            NaN coming from pandas) are treated as missing.

    Returns:
        The description with every matching experience clause removed and
        leading/trailing whitespace stripped; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    return _EXPERIENCE_RE.sub("", text).strip()

job_df = pd.read_csv("cleaned_job_data_dedup.csv", usecols=["job_text_cleaned"])

# Clean and de-dup.
# Missing values are filled BEFORE the string cast: casting first can turn NaN
# into the literal text "nan", which would then be kept as a job description
# and make a later fillna a no-op.
job_texts = (
    job_df["job_text_cleaned"]
    .fillna("")
    .astype(str)
    .map(strip_experience)   # drop the "Experience required: ..." clause
    .drop_duplicates()       # descriptions may collide once the clause is gone
    .tolist()
)
print(f"Job descriptions after de-dup (experience stripped): {len(job_texts):,}")

# Load resume data (same missing-value handling as the job texts so the
# vectorizer only ever receives strings).
resume_texts = (
    pd.read_csv("cleaned_resume.csv", usecols=["cleaned_text"])["cleaned_text"]
    .fillna("")
    .astype(str)
    .tolist()
)
print(f"Resumes loaded: {len(resume_texts):,}")
print(f"There are {len(job_texts):,} jobs and {len(resume_texts):,} resumes.")

Job descriptions after de-dup (experience stripped): 14,760
Resumes loaded: 1
There are 14,760 jobs and 1 resumes.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# Build similarity scores and store as a DataFrame
resume_idx = 0  # change to pick a different resume

# Fail early with a readable message: an out-of-range or negative index would
# otherwise yield an empty slice further down, which scikit-learn's input
# validation rejects with a far less obvious error.
if not 0 <= resume_idx < len(resume_texts):
    raise IndexError(
        f"resume_idx={resume_idx} is out of range for {len(resume_texts)} resume(s)"
    )

# Vectorizer configuration
VECTORIZER_CONFIG = dict(
    strip_accents="unicode",
    stop_words="english",
    lowercase=True,
    max_features=5000,                  # cap on vocabulary size
    token_pattern=r"\b[a-zA-Z]{3,}\b",  # alphabetic tokens of 3+ letters only
    max_df=0.75,                        # ignore terms in more than 75% of documents
    min_df=5,                           # ignore terms in fewer than 5 documents
    ngram_range=(1, 3),                 # unigrams through trigrams
)

# LDA configuration
N_TOPICS = 10
LDA_CONFIG = dict(
    n_components=N_TOPICS,
    max_iter=100,
    learning_method="batch",
    random_state=44,  # fixed seed so the fitted topics are reproducible
)

# Fit on job texts only; resumes are projected into the same vocabulary and
# topic space below, so they never influence the learned topics.
vectorizer = CountVectorizer(**VECTORIZER_CONFIG)
job_dtm = vectorizer.fit_transform(job_texts)

lda = LatentDirichletAllocation(**LDA_CONFIG)
job_topics = lda.fit_transform(job_dtm)

feature_names = vectorizer.get_feature_names_out()
print(f"Vocab size: {len(feature_names)}")

# Transform resumes into topic space
resume_dtm = vectorizer.transform(resume_texts)
resume_topics = lda.transform(resume_dtm)

# Cosine similarity between the selected resume and every job. The one-row
# slice keeps the input 2-D, as cosine_similarity requires.
scores = cosine_similarity(resume_topics[resume_idx:resume_idx + 1], job_topics).ravel()
assert len(scores) == len(job_texts), "expected exactly one score per job description"

resume_job_result = pd.DataFrame(
    {"job_description": job_texts, "similarity_score": scores}
)

Vocab size: 5000


In [5]:
# Show the ten job descriptions most similar to the selected resume,
# best match first.
print(
    resume_job_result
    .sort_values(by="similarity_score", ascending=False)
    .iloc[:10]
)

                                         job_description  similarity_score
9322   Delivery & Practice Head with verification\nAb...          0.997312
10299  Database Operations Analyst\nAbout the job Job...          0.995117
14652  Application Engineer\nAbout the job Applicatio...          0.994346
10195  Director Metals & Mining Consulting - Big4 / B...          0.993420
4801   Research Analyst - Energy & Resources\nAbout t...          0.993081
9001   HRMS Implementation Consultant\nAbout the job ...          0.992779
2199   Excel & SQL Data Analyst Intern\nAbout the job...          0.991960
14258  Information Technology Business Analyst with v...          0.990426
7087   Project Operations Manager - Fully Remote\nAbo...          0.989182
8243   Operations Data Analyst - W2 Only with verific...          0.988871


In [6]:
# Min-max rescale the similarity scores to [0, 1] so the best match is 1.0
# and the worst is 0.0.
similarity = resume_job_result["similarity_score"]
score_min = similarity.min()
score_range = similarity.max() - score_min

# Guard the degenerate case: if every score is identical (or the frame is
# empty, making the range NaN) the plain formula would divide by zero/NaN and
# fill the column with NaN. Fall back to 0.0 instead.
resume_job_result["normalized_score"] = (
    (similarity - score_min) / score_range if score_range > 0 else 0.0
)

In [7]:
# Display the scored table (job_description, similarity_score, normalized_score);
# the notebook renders an abbreviated view of the frame.
resume_job_result

Unnamed: 0,job_description,similarity_score,normalized_score
0,Digital Marketing Specialist\nManage and grow ...,0.143700,0.143477
1,Web Developer\nDesign and code user interfaces...,0.078154,0.077707
2,Operations Manager\nEstablish and enforce qual...,0.918696,0.921116
3,"Network Engineer\nDesign, configure, and optim...",0.596365,0.597686
4,Event Manager\nSpecialize in conference and co...,0.836056,0.838194
...,...,...,...
14755,UX Designer\nAbout the job Role Overview This ...,0.783551,0.785510
14756,Full Stack Engineer with verification\nAbout t...,0.167898,0.167757
14757,ServiceNow Developer\nAbout the job Dice is th...,0.555139,0.556319
14758,Senior Frontend Developer\nAbout the job Senio...,0.176569,0.176458
