# Job Recommendation System: Content-Based Filtering

This notebook builds two job recommendation models and compares their performance:

1. **Baseline Model**: Cosine similarity on job titles (TF-IDF)
2. **Enhanced Model**: Weighted multi-feature similarity (description TF-IDF + title TF-IDF + structured features)

**Data Source:** [Workable XML Job Feed](https://www.workable.com/boards/workable.xml) (~177K jobs)

**Output:** Pre-computed recommendation JSON files for a Next.js web app demo.

## 1. Setup & Dependencies

In [None]:
# Install dependencies (uncomment for Google Colab)
# !pip install lxml beautifulsoup4 scikit-learn pandas numpy matplotlib seaborn tqdm requests

In [None]:
import os
import re
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lxml import etree
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from collections import Counter

# Settings: display/plot defaults shared by every cell below.
pd.set_option('display.max_colwidth', 80)
sns.set_theme(style='whitegrid')
plt.rcParams['figure.figsize'] = (12, 5)

# Data source and local paths.
FEED_URL = 'https://www.workable.com/boards/workable.xml'
XML_PATH = 'data/workable_feed.xml'
OUTPUT_DIR = 'output'

# Experiment size.
SAMPLE_SIZE = 1000
TOP_N = 3  # recommendations per job

# Create the cache directory from XML_PATH itself, so that changing the path
# can never leave the download cell writing into a directory that was not
# created. `or '.'` covers an XML_PATH with no directory component.
os.makedirs(os.path.dirname(XML_PATH) or '.', exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print('Setup complete.')

## 2. Download XML Feed

In [None]:
# Download the feed once and cache it at XML_PATH.
#
# The body is streamed into a temporary '.part' file that is renamed to
# XML_PATH only after the transfer has finished. Writing directly to XML_PATH
# would leave a truncated file behind on a failed download, and the
# `os.path.exists` check below would then treat it as a valid cache forever.
if os.path.exists(XML_PATH):
    file_size_mb = os.path.getsize(XML_PATH) / (1024 * 1024)
    print(f'XML feed already cached at {XML_PATH} ({file_size_mb:.1f} MB). Skipping download.')
else:
    print(f'Downloading XML feed from {FEED_URL}...')
    print('This may take a few minutes (the file is large).')
    partial_path = XML_PATH + '.part'
    try:
        # The context manager releases the connection even if we fail mid-stream.
        with requests.get(FEED_URL, stream=True, timeout=300) as response:
            response.raise_for_status()
            # 0 means the server did not announce a size; progress is then skipped.
            total_size = int(response.headers.get('content-length', 0))
            with open(partial_path, 'wb') as f:
                downloaded = 0
                # 1 MiB chunks keep the progress output to one line per megabyte
                # instead of one per 8 KiB.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size > 0:
                        pct = downloaded / total_size * 100
                        print(f'\rDownloaded {downloaded / 1024 / 1024:.1f} MB ({pct:.0f}%)', end='')
        # Publish the finished file in one step.
        os.replace(partial_path, XML_PATH)
        print(f'\nDone! Saved to {XML_PATH}')
    except Exception as e:
        # Best effort by design: report the problem and explain the manual
        # fallback, but never leave a half-written file around.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f'Download failed: {e}')
        print('\nAlternative: Download the file manually from your browser:')
        print(f'  {FEED_URL}')
        print(f'  Save it to: {os.path.abspath(XML_PATH)}')

## 3. Parse XML Feed

In [None]:
def parse_job_feed(xml_path, max_jobs=None):
    """Stream-parse the Workable XML feed into a list of job dicts.

    Each <job> element is read as soon as its end tag is seen and is released
    again right after, so the XML tree itself does not accumulate in memory
    (the returned list still grows with the number of jobs kept).

    Args:
        xml_path: Path of the XML feed on disk.
        max_jobs: Stop after this many valid jobs have been collected;
            None (or 0) reads the whole feed.

    Returns:
        A list of dicts, one per job that has a non-empty id, title and
        description. All values are stripped strings except 'remote', which
        is a bool.
    """
    # (output key, XML child tag), listed in the key order of the output dict.
    field_tags = (
        ('id', 'referencenumber'),
        ('title', 'title'),
        ('company', 'company'),
        ('city', 'city'),
        ('state', 'state'),
        ('country', 'country'),
        ('remote', 'remote'),
        ('description', 'description'),
        ('education', 'education'),
        ('job_type', 'jobtype'),
        ('category', 'category'),
        ('experience', 'experience'),
        ('url', 'url'),
        ('date', 'date'),
    )
    required_keys = ('id', 'title', 'description')

    records = []
    job_events = etree.iterparse(xml_path, events=('end',), tag='job', recover=True)

    for _event, node in tqdm(job_events, desc='Parsing jobs'):
        record = {}
        for key, tag in field_tags:
            # A missing child element is treated as an empty string.
            value = (node.findtext(tag) or '').strip()
            if key == 'remote':
                value = value.lower() == 'true'
            record[key] = value

        # Keep the job only when id, title and description are all present.
        if all(record[key] for key in required_keys):
            records.append(record)

        # Release the element just processed and every sibling before it.
        node.clear()
        while node.getprevious() is not None:
            del node.getparent()[0]

        if max_jobs and len(records) >= max_jobs:
            break

    return records

# Parse all jobs (or set max_jobs to limit for testing).
# max_jobs is left at its default of None here, so the whole feed is read.
all_jobs = parse_job_feed(XML_PATH)
print(f'\nParsed {len(all_jobs):,} valid jobs from the feed.')

## 4. Load into DataFrame

In [None]:
# Full-feed DataFrame: one row per parsed job, one column per key of the job dicts.
df_all = pd.DataFrame(all_jobs)
print(f'DataFrame shape: {df_all.shape}')
print(f'\nColumns: {list(df_all.columns)}')
# Distribution of the main categorical fields before any sampling is done.
print(f'\nCategory distribution (top 15):')
print(df_all['category'].value_counts().head(15))
print(f'\nJob type distribution:')
print(df_all['job_type'].value_counts())
print(f'\nCountry distribution (top 10):')
print(df_all['country'].value_counts().head(10))
# Last expression of the cell: displayed as the cell output.
df_all.head(3)

## 5. Sample 1,000 Diverse Jobs

In [None]:
def stratified_sample(df, n=1000, stratify_col='category', min_per_group=5, random_state=42):
    """Sample up to n de-duplicated jobs, spread across the values of stratify_col.

    Rows are first de-duplicated on (title, company, city) and empty strata
    labels are renamed to 'Other'. Each stratum then gets a share of n
    proportional to its size, raised to at least min_per_group, and the
    allocation is adjusted so it sums to the target.

    Every allocation is capped at the number of rows the stratum actually has,
    and any shortfall is handed to strata that still have spare rows. Without
    the cap, a stratum smaller than min_per_group is promised more rows than it
    can deliver and the result silently comes back shorter than n.

    Args:
        df: Jobs DataFrame; must contain 'title', 'company', 'city' and stratify_col.
        n: Desired number of rows. The result has min(n, rows available) rows.
        stratify_col: Column whose values define the strata.
        min_per_group: Minimum rows requested per stratum (as far as it has them).
        random_state: Seed passed to DataFrame.sample for every stratum.

    Returns:
        A new DataFrame with a fresh 0..k-1 index, rows grouped by stratum.
    """
    # Remove duplicates by title+company+city
    df_dedup = df.drop_duplicates(subset=['title', 'company', 'city']).copy()
    print(f'After deduplication: {len(df_dedup):,} jobs (removed {len(df) - len(df_dedup):,} duplicates)')

    # Fill empty categories
    df_dedup[stratify_col] = df_dedup[stratify_col].replace('', 'Other')

    cat_counts = df_dedup[stratify_col].value_counts()
    if cat_counts.empty:
        # Nothing to sample from; pd.concat([]) below would raise.
        print('Sampled 0 jobs across 0 categories')
        return df_dedup.iloc[0:0].reset_index(drop=True)

    # Cannot return more rows than exist (value_counts ignores NaN labels).
    target = max(0, min(n, int(cat_counts.sum())))

    # Proportional allocation with a per-stratum floor, capped at stratum size.
    cat_proportions = cat_counts / cat_counts.sum()
    cat_samples = (cat_proportions * n).apply(lambda x: max(int(x), min_per_group))
    cat_samples = cat_samples.clip(upper=cat_counts)

    # Adjust to hit the target: shrink the largest allocation while over ...
    while cat_samples.sum() > target:
        cat_samples.loc[cat_samples.idxmax()] -= 1
    # ... and grow the largest allocation that still has unused rows while under.
    while cat_samples.sum() < target:
        has_room = cat_samples < cat_counts
        cat_samples.loc[cat_samples[has_room].idxmax()] += 1

    # Sample from each category
    sampled = []
    for cat, count in cat_samples.items():
        cat_df = df_dedup[df_dedup[stratify_col] == cat]
        sampled.append(cat_df.sample(n=int(count), random_state=random_state))

    result = pd.concat(sampled).reset_index(drop=True)
    print(f'Sampled {len(result)} jobs across {result[stratify_col].nunique()} categories')
    return result

# Draw the working sample; `df` is the frame that the following cells operate on.
df = stratified_sample(df_all, n=SAMPLE_SIZE)
print(f'\nSample category distribution:')
print(df['category'].value_counts().head(10))

## 6. Clean HTML Descriptions

In [None]:
def clean_html(raw_html):
    """Convert an HTML job description to plain text.

    Drops <script>/<style> content, strips all remaining tags, removes URLs
    and e-mail addresses, blanks out character references that are still
    present as literal text after parsing, and collapses whitespace.

    Args:
        raw_html: The HTML string; any falsy or non-string value is accepted.

    Returns:
        The cleaned text, or '' when raw_html is empty or not a string.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ''
    soup = BeautifulSoup(raw_html, 'html.parser')
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    text = soup.get_text(separator=' ', strip=True)
    # URLs run up to the next whitespace, so removing them cannot fuse words.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+\.\S+', '', text)
    # Character references that survived parsing as text (presumably from
    # double-escaped feed content). `#?` also covers numeric references such
    # as `&#39;` and `&#x27;`, which a plain `&\w+;` leaves in the text.
    text = re.sub(r'&#?\w+;', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a plain-text version of every description; the raw HTML column is kept as is.
df['description_clean'] = df['description'].apply(clean_html)

# Show before/after for one example
idx = 0
print('=== RAW HTML (first 300 chars) ===')
print(df.iloc[idx]['description'][:300])
print('\n=== CLEANED TEXT (first 300 chars) ===')
print(df.iloc[idx]['description_clean'][:300])
# Character-length summary of the cleaned descriptions across the sample.
print(f'\nDescription lengths (cleaned): min={df["description_clean"].str.len().min()}, '
      f'median={df["description_clean"].str.len().median():.0f}, '
      f'max={df["description_clean"].str.len().max()}')

## 7. Text Preprocessing

In [None]:
def preprocess_text(text):
    """Lowercase and normalise text for TF-IDF input.

    Punctuation and symbols become spaces and whitespace is collapsed.
    Letters and digits of any script are kept, so accented words such as
    'développeur' stay in one piece; an ASCII-only filter would split them at
    every accented character.

    Args:
        text: The input string; any falsy or non-string value (e.g. NaN) is accepted.

    Returns:
        The normalised text, or '' when text is empty or not a string.
    """
    if not text or not isinstance(text, str):
        return ''
    text = text.lower()
    # Remove special characters but keep spaces and alphanumerics. `\w` is
    # Unicode-aware for str patterns; it also matches '_', which is removed
    # explicitly to keep the previous behaviour for underscores.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalised text columns used as TF-IDF input by the two models.
df['title_clean'] = df['title'].apply(preprocess_text)
df['desc_processed'] = df['description_clean'].apply(preprocess_text)

print('Sample preprocessed titles:')
# head(5) instead of range(5): a sample with fewer than five rows must not
# raise an IndexError here.
for raw_title, clean_title in zip(df['title'].head(5), df['title_clean'].head(5)):
    print(f'  {raw_title}  ->  {clean_title}')

## 8. Handle Missing Fields

In [None]:
# Placeholder that replaces the empty string in each structured column.
fill_values = {
    'category': 'Other',
    'experience': 'Not Specified',
    'education': 'Not Specified',
    'job_type': 'Not Specified',
    'city': 'Unknown',
    'state': 'Unknown',
    'country': 'Unknown',
}
for column, placeholder in fill_values.items():
    df[column] = df[column].replace('', placeholder)

# Every count printed below should be zero once the placeholders are in.
print('Missing values after filling:')
print(df[list(fill_values)].eq('').sum())

## 9. Data Statistics

In [None]:
# 2x2 overview of the sampled jobs: categories, job types, countries, description length.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Category distribution
cat_counts = df['category'].value_counts().head(15)
cat_counts.plot(kind='barh', ax=axes[0, 0], color='steelblue')
axes[0, 0].set_title('Top 15 Categories')
axes[0, 0].set_xlabel('Count')

# Job type distribution
type_counts = df['job_type'].value_counts()
type_counts.plot(kind='bar', ax=axes[0, 1], color='coral')
axes[0, 1].set_title('Job Type Distribution')
axes[0, 1].tick_params(axis='x', rotation=45)

# Country distribution (top 10)
country_counts = df['country'].value_counts().head(10)
country_counts.plot(kind='bar', ax=axes[1, 0], color='seagreen')
axes[1, 0].set_title('Top 10 Countries')
axes[1, 0].tick_params(axis='x', rotation=45)

# Description length distribution
# Note: this adds a 'desc_len' column (characters of cleaned text) to df.
df['desc_len'] = df['description_clean'].str.len()
axes[1, 1].hist(df['desc_len'], bins=50, color='mediumpurple', edgecolor='white')
axes[1, 1].set_title('Description Length Distribution')
axes[1, 1].set_xlabel('Characters')
axes[1, 1].set_ylabel('Count')

plt.tight_layout()
plt.show()

# One-line summary of the sample's size and variety.
print(f'\nDataset summary: {len(df)} jobs, {df["category"].nunique()} categories, '
      f'{df["country"].nunique()} countries, {df["company"].nunique()} companies')

---
## 10. Baseline Model: Title-Only TF-IDF Cosine Similarity

In [None]:
# Build TF-IDF matrix on job titles
# Unigrams and bigrams, English stop words removed, at most 5000 terms,
# sublinear (log-scaled) term frequency.
title_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
    sublinear_tf=True,
)
title_tfidf = title_vectorizer.fit_transform(df['title_clean'])

print(f'Title TF-IDF matrix shape: {title_tfidf.shape}')
print(f'Vocabulary size: {len(title_vectorizer.vocabulary_)}')
print(f'Top 20 terms by document frequency:')
feature_names = title_vectorizer.get_feature_names_out()
# Per term: number of titles in which the term has a non-zero weight.
doc_freq = (title_tfidf > 0).sum(axis=0).A1
# Indices of the 20 most frequent terms, most frequent first.
top_terms_idx = doc_freq.argsort()[::-1][:20]
for idx in top_terms_idx:
    print(f'  {feature_names[idx]}: {doc_freq[idx]} docs')

## 11. Compute Title Cosine Similarity Matrix

In [None]:
# Dense job-by-job cosine similarity of the title TF-IDF vectors.
title_sim_matrix = cosine_similarity(title_tfidf)
print(f'Title similarity matrix shape: {title_sim_matrix.shape}')
print(f'Memory: {title_sim_matrix.nbytes / 1024 / 1024:.1f} MB')

# Quick sanity check - similarity of first job with itself should be 1.0
# NOTE(review): a title made up only of stop words would presumably give an
# all-zero vector and a self-similarity of 0 - confirm if job 0 is such a case.
print(f'\nSelf-similarity (should be ~1.0): {title_sim_matrix[0, 0]:.4f}')
# Mean over the strict upper triangle, i.e. every unordered pair of distinct jobs once.
print(f'Mean pairwise similarity: {title_sim_matrix[np.triu_indices_from(title_sim_matrix, k=1)].mean():.4f}')

## 12. Extract Baseline Recommendations (Top-3 per Job)

In [None]:
def get_top_n_recs(sim_matrix, df, n=3):
    """Extract the top-N most similar jobs for every job.

    Args:
        sim_matrix: Square array; row i holds the similarity of job i (the
            i-th row of df, by position) to every job.
        df: DataFrame with an 'id' column, in the same row order as sim_matrix.
        n: Maximum number of recommendations per job.

    Returns:
        Dict mapping each job id to a list of {'id', 'score'} dicts, best
        match first, scores rounded to 4 decimals. A list holds
        min(n, number of other jobs) entries, so a job is never recommended
        to itself and a small catalogue does not raise an IndexError.
    """
    # Read the id column once instead of building a row Series per lookup.
    job_ids = df['id'].tolist()
    n_recs = max(0, min(n, len(job_ids) - 1))

    recs = {}
    for i, job_id in enumerate(job_ids):
        scores = sim_matrix[i].copy()
        scores[i] = -1  # exclude self
        top_n = np.argsort(scores)[::-1][:n_recs]
        recs[job_id] = [
            {'id': job_ids[j], 'score': round(float(scores[j]), 4)}
            for j in top_n
        ]
    return recs

# Baseline recommendations: nearest neighbours by title similarity only.
baseline_recs = get_top_n_recs(title_sim_matrix, df, n=TOP_N)
print(f'Generated baseline recommendations for {len(baseline_recs)} jobs.')

# Show example
example_id = df.iloc[0]['id']
print(f'\nExample: "{df.iloc[0]["title"]}" ({example_id})')
print(f'Baseline recommendations:')
for rec in baseline_recs[example_id]:
    # Look the recommended job up by id; the first matching row is used.
    rec_row = df[df['id'] == rec['id']].iloc[0]
    print(f'  - {rec_row["title"]} at {rec_row["company"]} (score: {rec["score"]:.4f})')

## 13. Spot-Check Baseline Recommendations

In [None]:
def display_recommendations(job_idx, recs_dict, df, model_name='Model'):
    """Print one source job followed by the jobs recommended for it.

    Args:
        job_idx: Positional row index of the source job in df.
        recs_dict: Mapping of job id -> list of {'id', 'score'} dicts.
        df: Jobs DataFrame the ids refer to.
        model_name: Label used in the heading of the recommendation list.
    """
    def lookup(job_id):
        # First row whose id matches.
        return df[df['id'] == job_id].iloc[0]

    source = df.iloc[job_idx]
    rule = '=' * 80

    print(f'\n{rule}')
    print(f'SOURCE JOB: {source["title"]}')
    print(f'  Company: {source["company"]} | Location: {source["city"]}, {source["state"]}, {source["country"]}')
    print(f'  Category: {source["category"]} | Type: {source["job_type"]} | Experience: {source["experience"]}')
    print(f'\n{model_name} Recommendations:')

    rank = 0
    for rec in recs_dict[source['id']]:
        rank += 1
        match = lookup(rec['id'])
        print(f'  {rank}. {match["title"]}')
        print(f'     Company: {match["company"]} | Location: {match["city"]}, {match["state"]}')
        print(f'     Category: {match["category"]} | Score: {rec["score"]:.4f}')

# Spot-check 5 diverse jobs
# Positional rows at 0, 1/5, 1/3, 1/2 and 3/4 of the sample; reused by the
# side-by-side comparison further down.
spot_check_indices = [0, len(df)//5, len(df)//3, len(df)//2, len(df)*3//4]
for idx in spot_check_indices:
    display_recommendations(idx, baseline_recs, df, model_name='Baseline')

---
## 14. Enhanced Model: Description TF-IDF

In [None]:
# Build TF-IDF matrix on cleaned descriptions
# Unigrams and bigrams, at most 10000 terms; terms in more than 85% of the
# descriptions or in fewer than 2 are dropped. float32 halves the memory of
# the default float64.
desc_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    max_df=0.85,
    min_df=2,
    stop_words='english',
    sublinear_tf=True,
    dtype=np.float32,
)
desc_tfidf = desc_vectorizer.fit_transform(df['desc_processed'])

print(f'Description TF-IDF matrix shape: {desc_tfidf.shape}')
print(f'Vocabulary size: {len(desc_vectorizer.vocabulary_)}')
print(f'\nTop 20 description terms by document frequency:')
desc_feature_names = desc_vectorizer.get_feature_names_out()
# Per term: number of descriptions in which the term has a non-zero weight.
desc_doc_freq = (desc_tfidf > 0).sum(axis=0).A1
desc_top_idx = desc_doc_freq.argsort()[::-1][:20]
for idx in desc_top_idx:
    print(f'  {desc_feature_names[idx]}: {desc_doc_freq[idx]} docs')

## 15. Compute Individual Feature Similarity Matrices

In [None]:
# Description similarity (TF-IDF cosine)
print('Computing description similarity...')
desc_sim = cosine_similarity(desc_tfidf)

# Category match (binary: same category = 1.0, different = 0.0)
# NOTE(review): the 'Other' placeholder for a missing category matches itself
# here, so two jobs without a category count as the same category - confirm
# that this is intended.
print('Computing category similarity...')
categories = df['category'].values
cat_sim = (categories[:, None] == categories[None, :]).astype(np.float32)

# Location similarity (tiered: same city=1.0, same state=0.5, same country=0.2, else=0.0)
#
# Two rules keep the tiers honest:
#   * the 'Unknown' placeholder written by the missing-value step never counts
#     as a match - otherwise any two jobs without a city would score 1.0;
#   * a city or state match additionally requires the same country, so equal
#     names in different countries do not score as the same place.
print('Computing location similarity...')
UNKNOWN_LOCATION = 'Unknown'
cities = df['city'].values
states = df['state'].values
countries = df['country'].values


def _same_known(values):
    """Pairwise equality that is False whenever the value is the placeholder."""
    known = values != UNKNOWN_LOCATION
    # Equal values share their known-ness, so checking one side is enough.
    return (values[:, None] == values[None, :]) & known[:, None]


country_equal = countries[:, None] == countries[None, :]
same_city = _same_known(cities) & country_equal
same_state = _same_known(states) & country_equal
same_country = _same_known(countries)
# np.select takes the first condition that holds, i.e. the most specific tier.
loc_sim = np.select(
    [same_city, same_state, same_country],
    [1.0, 0.5, 0.2],
    default=0.0,
).astype(np.float32)

# Job type match (binary)
# NOTE(review): as for category, 'Not Specified' matches 'Not Specified'.
print('Computing job type similarity...')
jobtypes = df['job_type'].values
type_sim = (jobtypes[:, None] == jobtypes[None, :]).astype(np.float32)

# Experience level similarity (ordinal distance)
# Levels are placed on a 0-4 scale; unknown or unmapped levels sit in the
# middle (2). Similarity is 1 minus the distance divided by the largest
# distance present in the sample.
print('Computing experience similarity...')
exp_map = {
    'Entry level': 0, 'Internship': 0,
    'Associate': 1,
    'Mid-Senior level': 2, 'Not Specified': 2,
    'Director': 3,
    'Executive': 4,
}
exp_vals = np.array([exp_map.get(e, 2) for e in df['experience'].values], dtype=np.float32)
exp_dist = np.abs(exp_vals[:, None] - exp_vals[None, :])
max_dist = exp_dist.max()
# All jobs on the same level: every pair is a perfect match (avoids 0/0).
exp_sim = (1.0 - exp_dist / max_dist).astype(np.float32) if max_dist > 0 else np.ones_like(exp_dist)

print(f'\nAll feature matrices computed. Shape: {desc_sim.shape}')
print(f'Memory per matrix: {desc_sim.nbytes / 1024 / 1024:.1f} MB')
print(f'Total memory: {(desc_sim.nbytes + cat_sim.nbytes + loc_sim.nbytes + type_sim.nbytes + exp_sim.nbytes) / 1024 / 1024:.1f} MB')

## 16. Combine with Weights

In [None]:
# Relative weight of each feature similarity in the enhanced model.
# The printed total is expected to be 100%.
WEIGHTS = {
    'description': 0.35,
    'title': 0.25,
    'category': 0.15,
    'location': 0.10,
    'job_type': 0.08,
    'experience': 0.07,
}

print('Feature weights:')
for feature, weight in WEIGHTS.items():
    print(f'  {feature}: {weight:.0%}')
print(f'  Total: {sum(WEIGHTS.values()):.0%}')

# Compute weighted similarity matrix
# title_sim_matrix is cast to float32 before it is combined with the other
# feature matrices.
weighted_sim = (
    WEIGHTS['description'] * desc_sim +
    WEIGHTS['title'] * title_sim_matrix.astype(np.float32) +
    WEIGHTS['category'] * cat_sim +
    WEIGHTS['location'] * loc_sim +
    WEIGHTS['job_type'] * type_sim +
    WEIGHTS['experience'] * exp_sim
)

print(f'\nWeighted similarity matrix shape: {weighted_sim.shape}')
# Mean over the strict upper triangle, i.e. every unordered pair of distinct jobs once.
print(f'Mean weighted similarity: {weighted_sim[np.triu_indices_from(weighted_sim, k=1)].mean():.4f}')

## 17. Extract Weighted Recommendations (Top-3 per Job)

In [None]:
# Enhanced recommendations: nearest neighbours by the weighted multi-feature similarity.
weighted_recs = get_top_n_recs(weighted_sim, df, n=TOP_N)
print(f'Generated weighted recommendations for {len(weighted_recs)} jobs.')

# Show example
example_id = df.iloc[0]['id']
print(f'\nExample: "{df.iloc[0]["title"]}" ({example_id})')
print(f'Weighted recommendations:')
for rec in weighted_recs[example_id]:
    # Look the recommended job up by id; the first matching row is used.
    rec_row = df[df['id'] == rec['id']].iloc[0]
    print(f'  - {rec_row["title"]} at {rec_row["company"]} (score: {rec["score"]:.4f})')

## 18. Side-by-Side Spot Check: Baseline vs Weighted

In [None]:
def compare_recommendations(job_idx, baseline, weighted, df):
    """Print a side-by-side comparison of both models for a single job.

    The number of rows shown is taken from the recommendation lists that are
    passed in, not from a notebook-level constant, so the function stays
    correct for lists of any length.

    Args:
        job_idx: Positional row index of the source job in df.
        baseline: Mapping of job id -> list of {'id', 'score'} dicts (baseline model).
        weighted: Same structure for the enhanced model.
        df: Jobs DataFrame the ids refer to.
    """
    job = df.iloc[job_idx]
    job_id = job['id']

    print(f'\n{"="*100}')
    print(f'SOURCE: {job["title"]} | {job["company"]} | {job["city"]}, {job["country"]}')
    print(f'        Category: {job["category"]} | Type: {job["job_type"]} | Exp: {job["experience"]}')
    print(f'{"="*100}')

    base_recs = baseline[job_id]
    wtd_recs = weighted[job_id]
    # Rows that can be shown for both models.
    n_rows = min(len(base_recs), len(wtd_recs))

    # Check overlap
    base_ids = {r['id'] for r in base_recs}
    wtd_ids = {r['id'] for r in wtd_recs}
    overlap = base_ids & wtd_ids

    print(f'\n  {"BASELINE (Title Only)":<48} | {"ENHANCED (Weighted Multi-Feature)":<48}')
    print(f'  {"-"*48} | {"-"*48}')

    # zip stops at the shorter list, i.e. after n_rows rows.
    for b_rec, w_rec in zip(base_recs, wtd_recs):
        br = df[df['id'] == b_rec['id']].iloc[0]
        wr = df[df['id'] == w_rec['id']].iloc[0]

        # '*' marks a job that both models recommend.
        b_marker = '*' if b_rec['id'] in overlap else ' '
        w_marker = '*' if w_rec['id'] in overlap else ' '

        b_str = f'{b_marker}{br["title"][:35]:<36} ({b_rec["score"]:.3f})'
        w_str = f'{w_marker}{wr["title"][:35]:<36} ({w_rec["score"]:.3f})'
        print(f'  {b_str:<48} | {w_str:<48}')

        b_loc = f'   {br["company"][:20]} | {br["city"]}, {br["country"]}'
        w_loc = f'   {wr["company"][:20]} | {wr["city"]}, {wr["country"]}'
        print(f'  {b_loc:<48} | {w_loc:<48}')

    print(f'\n  Overlap: {len(overlap)}/{n_rows} recommendations in common (* = shared)')

# Compare same 5 jobs as the spot check
# (spot_check_indices holds positional row indices into df.)
for idx in spot_check_indices:
    compare_recommendations(idx, baseline_recs, weighted_recs, df)

---
## 19. Model Evaluation Metrics

In [None]:
def compute_metrics(recs_dict, sim_matrix, df):
    """Compute offline evaluation metrics for a recommendation model.

    Args:
        recs_dict: Mapping of job id -> list of {'id', 'score'} dicts.
        sim_matrix: Square job-by-job similarity matrix used to measure how
            alike the recommendations within one list are. Row/column i
            belongs to the i-th row of df by position.
        df: Jobs DataFrame with 'id' and 'category' columns.

    Returns:
        Dict with four entries:
          'Mean Rec Score'       - mean of all recommendation scores;
          'Intra-List Diversity' - mean over jobs of (1 - mean pairwise
                                   similarity among that job's recommendations);
          'Catalog Coverage'     - share of jobs that are recommended at least once;
          'Category Coherence'   - mean share of recommendations that have the
                                   same category as their source job.
        A metric with nothing to average over is reported as NaN.
    """
    def _mean(values):
        # np.mean([]) warns and returns NaN; return NaN quietly instead.
        return np.mean(values) if len(values) else float('nan')

    ids = df['id'].tolist()
    # Positions, not index labels: sim_matrix is addressed by row position, so
    # this stays correct for a DataFrame whose index is not 0..n-1.
    id_to_idx = {job_id: pos for pos, job_id in enumerate(ids)}
    job_cats = dict(zip(ids, df['category'].tolist()))

    # 1. Mean Recommendation Score
    all_scores = [rec['score'] for recs in recs_dict.values() for rec in recs]
    mean_score = _mean(all_scores)

    # 2. Intra-List Diversity (ILD)
    ilds = []
    for job_id, recs in recs_dict.items():
        rec_indices = [id_to_idx[r['id']] for r in recs if r['id'] in id_to_idx]
        if len(rec_indices) < 2:
            continue
        # Pairwise similarity among recommendations
        pairs = [
            sim_matrix[rec_indices[a], rec_indices[b]]
            for a in range(len(rec_indices))
            for b in range(a + 1, len(rec_indices))
        ]
        ilds.append(1.0 - np.mean(pairs))  # diversity = 1 - similarity
    mean_ild = _mean(ilds)

    # 3. Catalog Coverage
    recommended_ids = {rec['id'] for recs in recs_dict.values() for rec in recs}
    coverage = len(recommended_ids) / len(df) if len(df) else float('nan')

    # 4. Category Coherence
    coherence_scores = []
    for job_id, recs in recs_dict.items():
        if not recs:
            # No recommendations: nothing to score, and no division by zero.
            continue
        source_cat = job_cats.get(job_id, '')
        same_cat = sum(1 for r in recs if job_cats.get(r['id'], '') == source_cat)
        coherence_scores.append(same_cat / len(recs))
    mean_coherence = _mean(coherence_scores)

    return {
        'Mean Rec Score': mean_score,
        'Intra-List Diversity': mean_ild,
        'Catalog Coverage': coverage,
        'Category Coherence': mean_coherence,
    }

# Use description TF-IDF similarity for ILD (content-based diversity measure)
# Both models are measured against the same matrix (desc_sim).
baseline_metrics = compute_metrics(baseline_recs, desc_sim, df)
weighted_metrics = compute_metrics(weighted_recs, desc_sim, df)

# Rows = metrics, columns = models.
metrics_df = pd.DataFrame({
    'Baseline (Title Only)': baseline_metrics,
    'Enhanced (Weighted)': weighted_metrics,
})
# 'Better' names the model with the higher value; a tie goes to 'Enhanced'.
# NOTE(review): 'Mean Rec Score' averages each model's own similarity scores
# (title cosine vs. weighted mix), which are on different scales - a higher
# value there does not by itself mean better recommendations.
# NOTE(review): the label is also assigned for Category Coherence, although the
# note printed below says higher is not necessarily better for it.
metrics_df['Better'] = metrics_df.apply(
    lambda row: 'Enhanced' if row['Enhanced (Weighted)'] >= row['Baseline (Title Only)'] else 'Baseline',
    axis=1
)

print('\nModel Comparison Metrics:')
print('=' * 70)
print(metrics_df.to_string())
print('\nNote: Higher is better for all metrics except Category Coherence (depends on goal).')

## 20. Visualize Model Comparison

In [None]:
# Figure 1: one bar chart per metric, baseline next to enhanced.
fig, axes = plt.subplots(1, 4, figsize=(18, 5))

metrics_names = list(baseline_metrics.keys())
baseline_vals = list(baseline_metrics.values())
weighted_vals = list(weighted_metrics.values())
colors = ['#3b82f6', '#10b981']  # blue for baseline, green for enhanced

for i, (name, bv, wv) in enumerate(zip(metrics_names, baseline_vals, weighted_vals)):
    bars = axes[i].bar(['Baseline', 'Enhanced'], [bv, wv], color=colors, edgecolor='white', width=0.6)
    axes[i].set_title(name, fontsize=11, fontweight='bold')
    # 30% headroom above the taller bar leaves space for the value labels.
    axes[i].set_ylim(0, max(bv, wv) * 1.3)
    
    # Add value labels
    for bar, val in zip(bars, [bv, wv]):
        axes[i].text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
                     f'{val:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.suptitle('Model Comparison: Baseline vs Enhanced', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Score distribution comparison
# Figure 2: score histograms (left) and per-job overlap between the models (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

base_scores = [rec['score'] for recs in baseline_recs.values() for rec in recs]
wtd_scores = [rec['score'] for recs in weighted_recs.values() for rec in recs]

# Each histogram computes its own 50 bins, so bin widths differ between the models.
axes[0].hist(base_scores, bins=50, color=colors[0], alpha=0.7, label='Baseline', edgecolor='white')
axes[0].hist(wtd_scores, bins=50, color=colors[1], alpha=0.7, label='Enhanced', edgecolor='white')
axes[0].set_title('Recommendation Score Distribution', fontweight='bold')
axes[0].set_xlabel('Similarity Score')
axes[0].set_ylabel('Count')
axes[0].legend()

# Overlap analysis
# For every job: how many recommended ids the two models have in common.
overlaps = []
for job_id in baseline_recs:
    base_ids = {r['id'] for r in baseline_recs[job_id]}
    wtd_ids = {r['id'] for r in weighted_recs[job_id]}
    overlaps.append(len(base_ids & wtd_ids))

overlap_counts = Counter(overlaps)
axes[1].bar(overlap_counts.keys(), overlap_counts.values(), color='mediumpurple', edgecolor='white')
axes[1].set_title('Recommendation Overlap Between Models', fontweight='bold')
axes[1].set_xlabel(f'Number of shared recommendations (out of {TOP_N})')
axes[1].set_ylabel('Number of jobs')
axes[1].set_xticks(range(TOP_N + 1))

plt.tight_layout()
plt.show()

print(f'\nOverlap summary: {np.mean(overlaps):.1f} average shared recommendations per job')
# NOTE(review): percentages use len(df) as the denominator while the counts come
# from the keys of baseline_recs; the two agree only if job ids are unique.
for k, v in sorted(overlap_counts.items()):
    print(f'  {k}/{TOP_N} overlap: {v} jobs ({v/len(df)*100:.1f}%)')

## 21. Qualitative Comparison Panel (10 Diverse Jobs)

In [None]:
# Select 10 diverse jobs across different categories
PANEL_SIZE = 10

# First row *position* of each category, in order of first appearance.
# Positions (not index labels) are collected because every consumer below
# addresses rows with df.iloc.
first_pos_by_cat = {}
for pos, cat in enumerate(df['category']):
    first_pos_by_cat.setdefault(cat, pos)
panel_indices = list(first_pos_by_cat.values())[:PANEL_SIZE]

# If we don't have 10 categories, fill from remaining jobs: evenly spaced rows
# first, then any row not used yet. The candidate list is finite, so this
# always terminates. (Retrying the same computed position in a `while` loop
# would spin forever as soon as that position is already in the panel.)
panel_target = min(PANEL_SIZE, len(df))
if len(panel_indices) < panel_target:
    evenly_spaced = [len(df) * k // PANEL_SIZE for k in range(PANEL_SIZE)]
    for pos in [*evenly_spaced, *range(len(df))]:
        if len(panel_indices) >= panel_target:
            break
        if pos not in panel_indices:
            panel_indices.append(pos)

print(f'Qualitative Comparison Panel: {len(panel_indices)} diverse jobs')
print(f'Categories represented: {[df.iloc[i]["category"] for i in panel_indices]}')

for idx in panel_indices:
    compare_recommendations(idx, baseline_recs, weighted_recs, df)

## 22. Summary & Findings

### Key Findings

**Baseline Model (Title-Only Cosine Similarity):**
- Uses only job titles for matching
- Fast to compute but misses semantic similarity when titles differ
- Two jobs with different titles but similar responsibilities (e.g., "Software Developer" vs "Full Stack Engineer") may not match well

**Enhanced Model (Weighted Multi-Feature):**
- Combines 6 features: description (35%), title (25%), category (15%), location (10%), job type (8%), experience (7%)
- TF-IDF on descriptions captures deeper skill/requirement overlap
- Structured features (location, category, experience) provide additional context
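- Intended to produce more relevant and diverse recommendations than the baseline; the metrics in Section 19 and the side-by-side panels show how far this holds for a given sample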

### When the Models Diverge
The most interesting cases are when the two models produce **different** recommendations. This typically happens when:
1. Jobs have different titles but similar descriptions (enhanced model catches this)
2. Jobs are in the same location/category (enhanced model boosts these)
3. Jobs have very generic titles (baseline struggles with differentiation)

---
## 23. Export JSON for Web App

In [None]:
# Export jobs.json
# One record per sampled job; keys follow the web app's camelCase naming
# where it differs from the DataFrame columns ('job_type' -> 'jobType').
jobs_export = []
for _, row in df.iterrows():
    jobs_export.append({
        'id': row['id'],
        'title': row['title'],
        'company': row['company'],
        'city': row['city'],
        'state': row['state'],
        'country': row['country'],
        'remote': bool(row['remote']),
        # NOTE(review): this is unsanitised HTML from an external feed; the
        # web app presumably needs to sanitise it before rendering - confirm.
        'description': row['description'],  # Keep raw HTML for web rendering
        'category': row['category'],
        'jobType': row['job_type'],
        'experience': row['experience'],
        'education': row['education'],
        'url': row['url'],
    })

# Write files
jobs_path = os.path.join(OUTPUT_DIR, 'jobs.json')
baseline_path = os.path.join(OUTPUT_DIR, 'recs_baseline.json')
weighted_path = os.path.join(OUTPUT_DIR, 'recs_weighted.json')

# separators=(',', ':') writes compact JSON without spaces after separators.
with open(jobs_path, 'w') as f:
    json.dump(jobs_export, f, separators=(',', ':'))

with open(baseline_path, 'w') as f:
    json.dump(baseline_recs, f, separators=(',', ':'))

with open(weighted_path, 'w') as f:
    json.dump(weighted_recs, f, separators=(',', ':'))

print('Exported files:')
for path in [jobs_path, baseline_path, weighted_path]:
    size_kb = os.path.getsize(path) / 1024
    # Report in MB once a file is larger than 1024 KB.
    if size_kb > 1024:
        print(f'  {path}: {size_kb/1024:.1f} MB')
    else:
        print(f'  {path}: {size_kb:.0f} KB')

## 24. Sanity Check Exports

In [None]:
# Reload and verify
def _read_json(path):
    """Load one exported JSON file back from disk."""
    with open(path) as f:
        return json.load(f)


def _count_unknown_rec_ids(recs_by_job, known_ids, label):
    """Check one recommendations file against the exported jobs.

    Asserts that every source job exists in known_ids and has exactly TOP_N
    recommendations; returns how many recommended ids are not in known_ids.
    `label` is the model name used in the assertion messages.
    """
    unknown = 0
    for job_id, recs in recs_by_job.items():
        assert job_id in known_ids, f'Job {job_id} in {label} recs but not in jobs'
        assert len(recs) == TOP_N, f'Job {job_id} has {len(recs)} recs, expected {TOP_N}'
        unknown += sum(1 for r in recs if r['id'] not in known_ids)
    return unknown


verify_jobs = _read_json(jobs_path)
verify_baseline = _read_json(baseline_path)
verify_weighted = _read_json(weighted_path)

job_ids = {j['id'] for j in verify_jobs}

print(f'Jobs exported: {len(verify_jobs)}')
print(f'Baseline recs: {len(verify_baseline)} jobs with recommendations')
print(f'Weighted recs: {len(verify_weighted)} jobs with recommendations')

# Verify all rec IDs exist in jobs (baseline first, then weighted)
missing_baseline = _count_unknown_rec_ids(verify_baseline, job_ids, 'baseline')
missing_weighted = _count_unknown_rec_ids(verify_weighted, job_ids, 'weighted')

print(f'\nValidation:')
print(f'  All job IDs in baseline recs exist in jobs.json: {missing_baseline == 0}')
print(f'  All job IDs in weighted recs exist in jobs.json: {missing_weighted == 0}')
# Reaching this line means the length assertions above all passed.
print(f'  Every job has exactly {TOP_N} recommendations: True')
print(f'\nReady for web app!')