# Topic Modeling with LSA (TF-IDF + TruncatedSVD)

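Notebook version of the resume-to-job recommender using Latent Semantic Analysis (LSA): job descriptions are vectorized with TF-IDF, reduced to a small number of latent topics with TruncatedSVD, and ranked by cosine similarity to a selected resume.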

In [1]:
import time
from typing import List, Optional

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


In [2]:
# Load job data
import re
def strip_experience(text):
    """Remove the "Experience required: ..." phrase from a job description.

    Handles both the range form ("Experience required: X to Y Years") and the
    single-number form ("Experience required: X Years" / "X Year"), matched
    case-insensitively.

    Parameters
    ----------
    text : Any
        Raw job description. Anything that is not a ``str`` (e.g. ``None`` or
        NaN) is treated as missing.

    Returns
    -------
    str
        ``text`` with every matching phrase removed and leading/trailing
        whitespace stripped; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # `Year(?:s|\b)` keeps matching "Years" exactly as before and additionally
    # accepts the singular "Year" when it ends a word, so "1 Year" is stripped
    # while unrelated words such as "Yearly" are left alone.
    return re.sub(
        r"Experience required:\s*\d+\s*(?:to\s*\d+)?\s*Year(?:s|\b)",
        "",
        text,
        flags=re.IGNORECASE,
    ).strip()

job_df = pd.read_csv("cleaned_job_data_dedup.csv", usecols=["job_text_cleaned"])

# Clean and de-dup.
# Missing values are filled *before* the string cast: `astype(str)` converts NaN
# into the literal text "nan", which would otherwise survive as a fake job
# description (and would make a later `fillna` a no-op).
job_texts = (
    job_df["job_text_cleaned"]
    .fillna("")
    .astype(str)
    .map(strip_experience)
    .drop_duplicates()  # stripping the experience phrase can make rows identical
    .tolist()
)
print(f"Job descriptions after de-dup (experience stripped): {len(job_texts):,}")

# Load resume data
resume_texts = pd.read_csv("cleaned_resume.csv", usecols=["cleaned_text"])["cleaned_text"].fillna("").tolist()
print(f"Resumes loaded: {len(resume_texts):,}")
print(f"There are {len(job_texts):,} jobs and {len(resume_texts):,} resumes.")

Job descriptions after de-dup (experience stripped): 14,760
Resumes loaded: 1
There are 14,760 jobs and 1 resumes.


In [3]:
# Build similarity scores and store as a DataFrame
resume_idx = 0  # change to pick a different resume

# Vectorizer configuration
VECTORIZER_CONFIG = dict(
    stop_words="english",
    lowercase=True,
    strip_accents="unicode",
    max_features=None,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=5,            # drop terms seen in fewer than 5 job descriptions
    max_df=0.6,          # drop terms seen in more than 60% of job descriptions
    dtype=np.float32,
)

# LSA (SVD) configuration
N_TOPICS = 120
SVD_CONFIG = dict(
    n_components=N_TOPICS,
    random_state=42,  # fixed seed so the topic space is reproducible across runs
)

vectorizer = TfidfVectorizer(**VECTORIZER_CONFIG)
svd = TruncatedSVD(**SVD_CONFIG)
# The Normalizer rescales every topic vector to unit length, so the plain dot
# product computed below is the cosine similarity.
lsa = make_pipeline(svd, Normalizer(copy=False))

# The vocabulary and the topic space are fitted on the job descriptions only;
# the resume is projected into that same space afterwards.
job_tfidf = vectorizer.fit_transform(job_texts)
print(f"TF-IDF shape: {job_tfidf.shape}")

job_topics = lsa.fit_transform(job_tfidf)
# `svd` is the same estimator instance that sits inside `lsa`, so its fitted
# attributes are available here.
print(
    f"LSA topic matrix: {job_topics.shape} "
    f"(explained variance: {svd.explained_variance_ratio_.sum():.3f})"
)

resume_tfidf = vectorizer.transform([resume_texts[resume_idx]])
resume_topics = lsa.transform(resume_tfidf)

scores = (resume_topics @ job_topics.T).ravel()  # Cosine Similarity

resume_job_result = (
    pd.DataFrame({"job_description": job_texts, "similarity_score": scores})
)

# Inspect the first rows (original job order, not ranked by score)
resume_job_result.head(10)

TF-IDF shape: (14760, 296720)
LSA topic matrix: (14760, 120) (explained variance: 0.159, 


Unnamed: 0,job_description,similarity_score
0,Digital Marketing Specialist\nManage and grow ...,0.036917
1,Web Developer\nDesign and code user interfaces...,0.005602
2,Operations Manager\nEstablish and enforce qual...,0.296057
3,"Network Engineer\nDesign, configure, and optim...",0.094214
4,Event Manager\nSpecialize in conference and co...,0.244755
5,Software Tester\nTest software applications an...,0.166322
6,"Teacher\nPlan and deliver engaging lessons, ad...",0.072192
7,UX/UI Designer\nCreate visually appealing user...,0.011965
8,"UX/UI Designer\nWork on interaction design, de...",0.024416
9,Wedding Planner\nOffer expert advice and guida...,0.331743


In [4]:
# Ten best-matching jobs for the selected resume, highest similarity first.
# Left as the cell's last expression (rather than wrapped in print) so the
# notebook renders it as a table, like the other result cells.
resume_job_result.sort_values("similarity_score", ascending=False).head(10)

                                         job_description  similarity_score
8243   Operations Data Analyst - W2 Only with verific...          0.885380
426    Business Analyst\nAbout the job Data / Busines...          0.866316
14648  Professional, Business Intelligence Analyst - ...          0.866043
10334  Business Data Analyst\nAbout the job Job Summa...          0.865299
10070  Spatial Data Scientist / Geospatial Analyst - ...          0.858949
12941  FP&A Associate\nAbout the job Responsibilities...          0.852798
11814  Senior Technical Analyst, Care Management\nAbo...          0.851774
1528   Data Analyst\nAbout the job About Us Sports Re...          0.849482
10299  Database Operations Analyst\nAbout the job Job...          0.841293
9219   Business Analyst with verification\nAbout the ...          0.839393


In [5]:
# Min-max scale the similarity scores to [0, 1] so the best match gets 1.0 and
# the worst gets 0.0.
_similarity = resume_job_result["similarity_score"]
_similarity_min = _similarity.min()
_similarity_range = _similarity.max() - _similarity_min

# When every score is identical (e.g. a resume sharing no vocabulary with the
# jobs, so all scores are 0) the range is zero and the plain formula would yield
# 0/0 = NaN for every row; fall back to 0.0 in that case.
resume_job_result["normalized_score"] = (
    (_similarity - _similarity_min) / _similarity_range
    if _similarity_range > 0
    else 0.0
)

In [6]:
# Show the result table, which now also carries the min-max `normalized_score` column.
resume_job_result

Unnamed: 0,job_description,similarity_score,normalized_score
0,Digital Marketing Specialist\nManage and grow ...,0.036917,0.075139
1,Web Developer\nDesign and code user interfaces...,0.005602,0.041005
2,Operations Manager\nEstablish and enforce qual...,0.296057,0.357613
3,"Network Engineer\nDesign, configure, and optim...",0.094214,0.137596
4,Event Manager\nSpecialize in conference and co...,0.244755,0.301691
...,...,...,...
14755,UX Designer\nAbout the job Role Overview This ...,0.234675,0.290704
14756,Full Stack Engineer with verification\nAbout t...,0.194448,0.246855
14757,ServiceNow Developer\nAbout the job Dice is th...,0.164507,0.214218
14758,Senior Frontend Developer\nAbout the job Senio...,0.326767,0.391087
