<a href="https://colab.research.google.com/github/allurkarsneha/SkillMatch-Resume-Job-matching-system/blob/main/SkillMatch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from typing import List, Tuple
import html
import inspect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Inspecting the Categories Resume.csv has**

In [2]:
import pandas as pd
df = pd.read_csv("Resume.csv")
cats = sorted(df["Category"].dropna().unique())
print("Num categories:", len(cats))
print(cats)

Num categories: 24
['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


**Categorizing the Jobs**

In [3]:
def categorize_job_title(job_title: str) -> str:
    """
    Categorize job titles based on keyword matching and patterns.

    Args:
        job_title: The job title string to categorize

    Returns:
        The category name as a string
    """
    if pd.isna(job_title) or not isinstance(job_title, str):
        return "OTHER"

    # Convert to lowercase for case-insensitive matching
    title_lower = job_title.lower().strip()

    def _normalize(text: str) -> str:
        # Normalize punctuation/separators to spaces so titles like "hr-manager" still match "hr manager"
        text = re.sub(r"[\\/\\|_\"'.,()\[\]{}]+", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _matches_any(text: str, keywords: List[str]) -> bool:
        # Use word-boundary matching for short/abbreviation keywords to avoid false positives.
        for kw in keywords:
            kw_norm = kw.lower().strip()
            if not kw_norm:
                continue
            if len(kw_norm) <= 3 and kw_norm.isalnum():
                if re.search(rf"\\b{re.escape(kw_norm)}\\b", text):
                    return True
            else:
                if kw_norm in text:
                    return True
        return False

    normalized_title = _normalize(title_lower)

    # Ordered rules: most specific -> most general (to avoid false matches)
    category_rules: List[Tuple[str, List[str]]] = [
        # INFORMATION-TECHNOLOGY - IT-specific roles/terms (kept above ENGINEERING and SALES)
        (
            "INFORMATION-TECHNOLOGY",
            [
                "information technology",
                "it support", "helpdesk", "service desk", "desktop support",
                "system administrator", "sysadmin",
                "network engineer", "network administrator",
                "cybersecurity", "security analyst", "soc analyst", "penetration tester",
                "cloud engineer", "aws", "azure", "gcp",
                "kubernetes", "docker",
                "database administrator", "dba",
                "systems engineer", "it engineer"
            ],
          ),

        # PUBLIC-RELATIONS - PR/communications roles (keep above SALES/HR)
        (
            "PUBLIC-RELATIONS",
            [
                "public relations",
                "pr",
                "communications manager",
                "communications specialist",
                "media relations",
                "press",
                "spokesperson",
                "community manager",
                "brand communications",
            ],
        ),
        # DIGITAL-MEDIA - content/social/SEO roles (keep above SALES)
        (
            "DIGITAL-MEDIA",
            [
                "digital marketing",
                "social media",
                "content writer",
                "content strategist",
                "seo",
                "sem",
                "growth marketing",
                "email marketing",
                "performance marketing",
                "copywriter",
                "video editor",
                "videographer",
            ],
        ),
        # HR - human resources/recruiting roles
        (
            "HR",
            [
                "human resources",
                "hr",
                "recruiter",
                "recruitment",
                "talent acquisition",
                "people operations",
                "hr manager",
                "hr specialist",
                "hr business partner",
            ],
        ),
        # ACCOUNTANT - accounting/bookkeeping roles (keep above FINANCE)
        (
            "ACCOUNTANT",
            [
                "accountant",
                "accounts payable",
                "accounts receivable",
                "bookkeeper",
                "bookkeeping",
                "audit",
                "auditor",
                "tax",
                "payroll",
                "cpa",
            ],
        ),
        # FINANCE - finance/banking/credit roles
        (
            "FINANCE",
            [
                "finance",
                "financial analyst",
                "investment",
                "equity research",
                "risk analyst",
                "treasury",
                "credit analyst",
                "controller",
            ],
        ),
        # BANKING - explicit banking roles/terms
        (
            "BANKING",
            [
                "bank",
                "banking",
                "teller",
                "branch manager",
                "loan officer",
                "mortgage",
                "relationship manager",
            ],
        ),
        # BUSINESS-DEVELOPMENT - bizdev/account management partnerships
        (
            "BUSINESS-DEVELOPMENT",
            [
                "business development",
                "bizdev",
                "partnerships",
                "strategic partnerships",
                "account executive",
                "account manager",
                "client success",
                "customer success",
            ],
        ),
        # SALES - direct sales roles/terms
        (
            "SALES",
            [
                "sales",
                "sales executive",
                "sales manager",
                "inside sales",
                "outside sales",
                "sales representative",
                "business sales",
                "retail sales",
                "store associate",
            ],
        ),
        # CONSULTANT - consulting/advisory roles (keep separate from BUSINESS-DEVELOPMENT)
        (
            "CONSULTANT",
            [
                "consultant",
                "consulting",
                "advisor",
                "advisory",
                "management consultant",
                "strategy consultant",
            ],
        ),
        # ADVOCATE - legal/policy advocacy and related roles
        (
            "ADVOCATE",
            [
                "advocate",
                "advocacy",
                "case worker",
                "caseworker",
                "legal aid",
                "social worker",
                "lawyer",
                "attorney",
                "legal counsel",
                "legal associate",
                "paralegal",
                "litigation",
            ],
        ),
        # TEACHER - education roles
        (
            "TEACHER",
            [
                "teacher",
                "tutor",
                "instructor",
                "lecturer",
                "professor",
                "educator",
            ],
        ),
        # HEALTHCARE - clinical roles
        (
            "HEALTHCARE",
            [
                "doctor",
                "physician",
                "nurse",
                "rn",
                "pharmacist",
                "medical assistant",
                "clinical",
                "hospital",
                "therapist",
                "dentist",
                "paramedic",
            ],
        ),
        # FITNESS - gym/training roles
        (
            "FITNESS",
            [
                "fitness",
                "personal trainer",
                "trainer",
                "gym",
                "strength coach",
                "yoga",
                "pilates",
                "nutrition coach",
            ],
        ),
        # CHEF - culinary roles
        (
            "CHEF",
            [
                "chef",
                "cook",
                "line cook",
                "sous chef",
                "head chef",
                "kitchen",
                "pastry",
            ],
        ),
        # AVIATION - aviation-specific roles
        (
            "AVIATION",
            [
                "pilot",
                "aviation",
                "aircraft",
                "aerospace",
                "flight attendant",
                "airline",
                "air traffic",
                "cabin crew",
            ],
        ),
        # AUTOMOBILE - auto industry roles
        (
            "AUTOMOBILE",
            [
                "automotive",
                "automobile",
                "car mechanic",
                "auto mechanic",
                "technician",
                "vehicle",
                "service advisor",
            ],
        ),
        # ENGINEERING - general engineering roles (kept after IT and aviation)
        (
            "ENGINEERING",
            [
                "engineer", "engineering",
                "software engineer", "software developer", "developer", "programmer",
                "full stack", "backend", "frontend", "web developer", "mobile developer",
                "flutter", "ios", "android", "react", "angular", "node", "django", "flask",
                "data scientist", "machine learning", "ml", "ai", "deep learning", "nlp",
                "data engineer", "devops", "sre"
            ],
        ),

        # CONSTRUCTION - construction/trades
        (
            "CONSTRUCTION",
            [
                "construction",
                "contractor",
                "site supervisor",
                "foreman",
                "carpenter",
                "electrician",
                "plumber",
                "mason",
                "welder",
            ],
        ),
        # AGRICULTURE - farming/agri roles
        (
            "AGRICULTURE",
            [
                "agriculture",
                "farmer",
                "farming",
                "agronomist",
                "horticulture",
                "livestock",
                "crop",
            ],
        ),
        # APPAREL - fashion/garment roles
        (
            "APPAREL",
            [
                "apparel",
                "fashion",
                "garment",
                "merchandiser",
                "textile",
                "seamstress",
                "tailor",
            ],
        ),
        # DESIGNER - design roles (kept separate from ARTS)
        (
            "DESIGNER",
            [
                "designer",
                "graphic designer",
                "ux designer",
                "ui designer",
                "product designer",
                "visual designer",
                "interaction designer",
            ],
        ),
        # ARTS - arts/creative roles
        (
            "ARTS",
            [
                "artist",
                "illustrator",
                "painter",
                "musician",
                "actor",
                "actress",
                "photographer",
            ],
        ),
        # BPO - call center / outsourcing roles
        (
            "BPO",
            [
                "bpo",
                "call center",
                "customer service",
                "customer support",
                "technical support",
                "voice process",
                "non voice",
                "back office",
            ],
        ),
    ]

    for category, keywords in category_rules:
        # Each rule matches common job-title keywords for that category.
        if _matches_any(normalized_title, keywords):
            return category

    # Fallback category when no keyword-based rule matches
    return "OTHER"

def process_job_data(input_file: str = "job_title_des.csv",
                     output_file: str = "jobs_with_category.csv") -> pd.DataFrame:
    """
    Process job data by loading CSV, categorizing job titles, and saving results.

    Args:
        input_file: Path to input CSV file
        output_file: Path to output CSV file

    Returns:
        DataFrame with categories added
    """
    # Load the CSV file using pandas
    try:
        df = pd.read_csv(input_file)

        if "Unnamed: 0" in df.columns:
          df = df.drop(columns=["Unnamed: 0"])

        # Normalize column names to expected format
        df = df.rename(columns={
            "Job Title": "job_title",
            "Job Description": "job_description"
        })

        # Create job_id if it does not exist
        if "job_id" not in df.columns:
            df["job_id"] = range(1, len(df) + 1)
            print(f"Successfully loaded {len(df)} rows from {input_file}")

    except FileNotFoundError:
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return pd.DataFrame()

    # Check if required columns exist
    required_columns = ['job_id', 'job_title', 'job_description']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing required columns: {missing_columns}")
        return df

    # Create new category column using the categorize_job_title function
    print("Categorizing job titles...")
    df['category'] = df['job_title'].apply(categorize_job_title)

    # Display category distribution
    print("\nCategory Distribution:")
    category_counts = df['category'].value_counts()
    for category, count in category_counts.items():
        print(f"  {category}: {count} jobs")

    # Save the resulting DataFrame to CSV
    try:
        df.to_csv(output_file, index=False)
        print(f"\nSuccessfully saved categorized data to {output_file}")
    except Exception as e:
        print(f"Error saving CSV: {e}")

    return df

jobs_df = process_job_data(
    input_file="job_title_des.csv",
    output_file="jobs_with_category.csv"
)

jobs_df.head()

Successfully loaded 2277 rows from job_title_des.csv
Categorizing job titles...

Category Distribution:
  ENGINEERING: 1861 jobs
  INFORMATION-TECHNOLOGY: 284 jobs
  PUBLIC-RELATIONS: 132 jobs

Successfully saved categorized data to jobs_with_category.csv


Unnamed: 0,job_title,job_description,job_id,category
0,Flutter Developer,We are looking for hire experts flutter develo...,1,ENGINEERING
1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...,2,ENGINEERING
2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n...",3,ENGINEERING
3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...,4,ENGINEERING
4,Full Stack Developer,job responsibility full stack engineer – react...,5,ENGINEERING


In [4]:
def clean_text(text: str) -> str:
    """
    Cleaning steps:
    - Lowercase
    - Remove HTML tags
    - Unescape HTML entities (e.g., &​amp;)
    - Normalize whitespace
    - Remove repeated separator noise (e.g., =====, ----, ____ , ****, ####)
    - Preserve important tech characters for terms like: C++, C#, node.js
      (i.e., we do NOT strip '+', '#', '.' globally)
    """
    if pd.isna(text):
        return ""

    # Ensure string
    text = str(text)

    # Decode HTML entities early (e.g., &​nbsp; -> space)
    text = html.unescape(text)

    # Remove HTML tags like <br>, <div>...</div>
    text = re.sub(r"<[^>]+>", " ", text)

    # Lowercase for normalization
    text = text.lower()

    # Remove repeated "separator noise" (keep single meaningful punctuation)
    # Examples removed: "------", "=====", "_____", "*****", "#####"
    text = re.sub(r"[-=_*~`]{3,}", " ", text)

    # Remove repeated pipes or dot leaders that often appear as layout noise
    # Examples: "|||||", ".....", "• • •" (bullet-ish noise)
    text = re.sub(r"[|]{3,}", " ", text)
    text = re.sub(r"[.]{5,}", " ", text)

    # Remove repeated slashes used as separators (but keep normal "c/c++" patterns)
    text = re.sub(r"[/]{3,}", " ", text)

    # Normalize whitespace (tabs/newlines/multiple spaces -> single space)
    text = re.sub(r"\s+", " ", text).strip()

    return text


def normalize_category(cat: str) -> str:
    """
    Normalize category labels:
    - Convert to string
    - Strip surrounding whitespace
    - Uppercase
    """
    if pd.isna(cat):
        return ""
    return str(cat).strip().upper()


def drop_short_text_rows(df, text_col, min_chars=30, min_words=5):
    """
    Drop rows where the cleaned text is too short to be useful.

    Defaults are conservative:
    - min_chars: minimum number of characters
    - min_words: minimum number of whitespace-separated tokens
    """
    text_series = df[text_col].fillna("").astype(str)

    char_ok = text_series.str.len() >= min_chars
    word_ok = text_series.str.split().str.len() >= min_words

    return df[char_ok & word_ok].copy()

def preprocess_resumes(
    input_csv="Resume.csv",
    output_csv="resumes_clean.csv",
    min_chars=30,
    min_words=5,
):
    """
    Load Resume.csv, select needed columns, clean text, normalize categories, drop short samples, save.
    Expected columns: ID, Resume_str, Category
    """

    # Select only needed columns (saves memory + prevents accidental leakage of extra fields)
    df = pd.read_csv(input_csv, usecols=["ID", "Resume_str", "Category"])

    df = df.rename(columns={
        "ID": "resume_id",
        "Resume_str": "resume_text",
        "Category": "category"
    })

    # Normalize category labels
    df["category"] = df["category"].apply(normalize_category)

    # Clean resume text
    df["resume_text"] = df["resume_text"].apply(clean_text)

    # Drop very short resumes
    df = drop_short_text_rows(df, text_col="resume_text", min_chars=min_chars, min_words=min_words)

    df = df[df["category"] != ""]

    # Save cleaned output
    df.to_csv(output_csv, index=False)

    return df

resumes_clean = preprocess_resumes()
# Basic stats
print("[resumes] rows:", resumes_clean.shape[0])
print("[resumes] category distribution:")
print(resumes_clean["category"].value_counts())

[resumes] rows: 2483
[resumes] category distribution:
category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      119
ADVOCATE                  118
CHEF                      118
ENGINEERING               118
ACCOUNTANT                118
FINANCE                   118
FITNESS                   117
AVIATION                  117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64


In [5]:
def preprocess_jobs(
    input_csv="jobs_with_category.csv",
    output_csv="jobs_clean.csv",
    min_chars=30,
    min_words=5,
):
    """
    Load jobs_with_category.csv, select needed columns, clean text fields, normalize categories,
    drop short samples (based on combined title+description), save.
    Expected columns: job_id, job_title, job_description, category
    """
    df = pd.read_csv(input_csv, usecols=["job_id", "job_title", "job_description", "category"])

    # Normalize category labels to uppercase + strip whitespace
    df["category"] = df["category"].apply(normalize_category)

    # Clean job title and description separately
    df["job_title"] = df["job_title"].apply(clean_text)
    df["job_description"] = df["job_description"].apply(clean_text)

    # Drop very short job postings using combined text length (title + description)
    combined = (df["job_title"].fillna("") + " " + df["job_description"].fillna("")).str.strip()
    df["_combined"] = combined

    df = drop_short_text_rows(df, text_col="_combined", min_chars=min_chars, min_words=min_words)

    df = df[df["category"] != ""]
    df = df.drop(columns=["_combined"])

    # Save cleaned output
    df.to_csv(output_csv, index=False)

    # Basic stats
    print(f"[jobs] saved: {output_csv}")
    print(f"[jobs] rows: {len(df)}")
    print("[jobs] category distribution:")
    print(df["category"].value_counts())

    return df


# Run preprocessing
resumes_clean = preprocess_resumes()
jobs_clean = preprocess_jobs()


[jobs] saved: jobs_clean.csv
[jobs] rows: 2277
[jobs] category distribution:
category
ENGINEERING               1861
INFORMATION-TECHNOLOGY     284
PUBLIC-RELATIONS           132
Name: count, dtype: int64


In [6]:
import inspect
inspect.getsource(preprocess_resumes)[:800]

'def preprocess_resumes(\n    input_csv="Resume.csv",\n    output_csv="resumes_clean.csv",\n    min_chars=30,\n    min_words=5,\n):\n    """\n    Load Resume.csv, select needed columns, clean text, normalize categories, drop short samples, save.\n    Expected columns: ID, Resume_str, Category\n    """\n\n    # Select only needed columns (saves memory + prevents accidental leakage of extra fields)\n    df = pd.read_csv(input_csv, usecols=["ID", "Resume_str", "Category"])\n\n    df = df.rename(columns={\n        "ID": "resume_id",\n        "Resume_str": "resume_text",\n        "Category": "category"\n    })\n\n    # Normalize category labels\n    df["category"] = df["category"].apply(normalize_category)\n\n    # Clean resume text\n    df["resume_text"] = df["resume_text"].apply(clean_text)\n\n    # Drop very short res'

In [7]:
import os
print("resumes_clean.csv exists?", os.path.exists("resumes_clean.csv"))
print("jobs_clean.csv exists?", os.path.exists("jobs_clean.csv"))

resumes_clean.csv exists? True
jobs_clean.csv exists? True


**Pair Generation + Weak Labels**

In [8]:
# STEP 4 — Pair Generation + Better HIGH + More Balanced Labels
# Inputs (in Colab working dir):
# - resumes_clean.csv: resume_id, resume_text, category
# - jobs_clean.csv: job_id, job_title, job_description, category
#
# Outputs:
# - train_pairs.csv (balanced-ish across labels)
# - train_split.csv, val_split.csv (80/20 stratified by label)

TECH_PAIR = {"ENGINEERING", "INFORMATION-TECHNOLOGY"}
PR_CAT = "PUBLIC-RELATIONS"

# Stronger keyword cues used to prefer higher-quality HIGH pairs (title/description still used for fallback scoring)
CATEGORY_KEYWORDS = {
    "ENGINEERING": [
        "engineer", "engineering", "mechanical", "civil", "electrical", "manufacturing",
        "cad", "solidworks", "autocad", "hvac", "controls", "quality", "validation",
    ],
    "INFORMATION-TECHNOLOGY": [
        "software", "developer", "programmer", "full stack", "backend", "frontend",
        "devops", "cloud", "aws", "azure", "gcp", "kubernetes", "docker",
        "data", "machine learning", "ml", "ai", "security", "network", "sysadmin",
        "python", "java", "javascript", "sql",
    ],
    "PUBLIC-RELATIONS": [
        "public relations", "pr", "communications", "media relations", "press",
        "community", "brand", "publicist", "stakeholder", "content", "social media",
    ],
}

def load_data(
    resumes_csv: str = "resumes_clean.csv",
    jobs_csv: str = "jobs_clean.csv",
):
    """
    Load resumes/jobs, select needed columns, normalize categories, filter to overlap categories.
    """
    resumes = pd.read_csv(resumes_csv, usecols=["resume_id", "resume_text", "category"])
    jobs = pd.read_csv(jobs_csv, usecols=["job_id", "job_title", "job_description", "category"])

    resumes["category"] = resumes["category"].astype(str).str.strip().str.upper()
    jobs["category"] = jobs["category"].astype(str).str.strip().str.upper()

    resumes["resume_text"] = resumes["resume_text"].fillna("").astype(str)
    jobs["job_title"] = jobs["job_title"].fillna("").astype(str)
    jobs["job_description"] = jobs["job_description"].fillna("").astype(str)

    # Build job_text for downstream use
    jobs["job_text"] = (jobs["job_title"] + " " + jobs["job_description"]).str.strip()

    overlap = sorted(set(resumes["category"]).intersection(set(jobs["category"])))
    print("Overlap categories:", overlap)

    resumes = resumes[resumes["category"].isin(overlap)].reset_index(drop=True)
    jobs = jobs[jobs["category"].isin(overlap)].reset_index(drop=True)

    return resumes, jobs, overlap

def assign_label(resume_category: str, job_category: str) -> int:
    """
    3-class weak supervision labels:
    - 2 HIGH: same category
    - 1 MEDIUM: ENGINEERING <-> INFORMATION-TECHNOLOGY
    - 0 LOW: otherwise
    """
    rc = str(resume_category).strip().upper()
    jc = str(job_category).strip().upper()

    if rc == jc:
        return 2
    if {rc, jc} == TECH_PAIR:
        return 1
    return 0

def _normalize_text(s: str) -> str:
    s = str(s).lower()
    s = s.replace("_", " ").replace("-", " ")
    s = " ".join(s.split())
    return s

def _keyword_score(text: str, keywords: list[str]) -> int:
    """
    Score = number of keyword hits (simple contains match).
    Used to prefer more 'on-category' jobs for HIGH pairing.
    """
    t = _normalize_text(text)
    score = 0
    for kw in keywords:
        if _normalize_text(kw) in t:
            score += 1
    return score

def _prepare_job_pools(jobs: pd.DataFrame, overlap_categories: list[str]):
    """
    Precompute per-category pools of job_ids and sampling weights
    to make HIGH selection 'better' (keyword-weighted).
    """
    pools = {}
    for cat in overlap_categories:
        subset = jobs[jobs["category"] == cat].copy()
        if subset.empty:
            pools[cat] = {"job_ids": np.array([], dtype=int), "weights": np.array([], dtype=float)}
            continue

        kws = CATEGORY_KEYWORDS.get(cat, [])
        # Prefer jobs with strong keyword evidence in title, then also in full job_text
        scores = []
        for _, row in subset.iterrows():
            title_score = _keyword_score(row["job_title"], kws) if kws else 0
            text_score = _keyword_score(row["job_text"], kws) if kws else 0
            score = (2 * title_score) + text_score
            scores.append(score)

        scores = np.array(scores, dtype=float)
        weights = 1.0 + scores  # ensure non-zero
        pools[cat] = {
            "job_ids": subset["job_id"].astype(int).to_numpy(),
            "weights": weights,
        }
    return pools

def _sample_job_ids(
    rng: np.random.Generator,
    job_ids: np.ndarray,
    weights: np.ndarray | None,
    n: int,
    forbid: set[int],
):
    """
    Sample n job_ids.
    - Prefer no duplicates within a resume when possible.
    - If pool is small, sample with replacement.
    """
    if len(job_ids) == 0 or n <= 0:
        return []

    chosen = []
    # If enough unique jobs exist, sample without replacement while avoiding forbid
    available_mask = np.array([jid not in forbid for jid in job_ids], dtype=bool)
    available_ids = job_ids[available_mask]
    available_w = weights[available_mask] if weights is not None and len(weights) == len(job_ids) else None

    if len(available_ids) >= n:
        probs = None
        if available_w is not None:
            probs = available_w / available_w.sum()
        picked = rng.choice(available_ids, size=n, replace=False, p=probs)
        return [int(x) for x in picked]

    # Otherwise sample with replacement and do best-effort forbid avoidance
    probs = None
    if weights is not None and len(weights) == len(job_ids):
        probs = weights / weights.sum()

    attempts = 0
    while len(chosen) < n and attempts < n * 20:
        jid = int(rng.choice(job_ids, size=1, replace=True, p=probs)[0])
        # allow duplicates only if we must (tiny pools)
        if jid not in forbid or len(job_ids) < (len(forbid) + 1):
            chosen.append(jid)
            forbid.add(jid)
        attempts += 1

    # If still short (extremely constrained), fill arbitrarily with replacement
    while len(chosen) < n:
        chosen.append(int(rng.choice(job_ids, size=1, replace=True, p=probs)[0]))

    return chosen

def _low_category_for(resume_cat: str, overlap_categories: list[str]) -> str | None:
    """
    LOW sampling strategy (per spec):
    - if resume is tech: pick PUBLIC-RELATIONS
    - if resume is PUBLIC-RELATIONS: pick ENGINEERING
    Fallback: any different category.
    """
    if resume_cat in TECH_PAIR and PR_CAT in overlap_categories:
        return PR_CAT
    if resume_cat == PR_CAT and "ENGINEERING" in overlap_categories:
        return "ENGINEERING"
    others = [c for c in overlap_categories if c != resume_cat]
    return others[0] if others else None

def build_pairs_improved(
    resumes: pd.DataFrame,
    jobs: pd.DataFrame,
    overlap_categories: list[str],
    seed: int = 42,
    n_high: int = 2,
    n_medium: int = 2,   # only for tech resumes
    n_low: int = 2,
    topk_same: int = 150,
    topk_diff: int = 200,
):
    rng = np.random.default_rng(seed)

    # TF-IDF on combined corpus (shared vocabulary space)
    corpus = pd.concat([resumes["resume_text"], jobs["job_text"]], ignore_index=True)
    vec = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words="english")
    X = vec.fit_transform(corpus)

    R = X[:len(resumes)]
    J = X[len(resumes):]

    # job row indices by category for fast slicing
    jobs_by_cat = {c: jobs.index[jobs["category"] == c].to_numpy() for c in overlap_categories}

    rows = []
    for i, r in resumes.iterrows():
        resume_id = int(r["resume_id"])
        resume_text = r["resume_text"]
        rc = r["category"]

        used_job_rows = set()

        def pick_best(pool_rows, k, max_pool):
            if len(pool_rows) == 0 or k <= 0:
                return []

            pool = pool_rows
            if len(pool) > max_pool:
                pool = rng.choice(pool, size=max_pool, replace=False)

            sims = cosine_similarity(R[i], J[pool]).ravel()
            order = np.argsort(-sims)

            picks = []
            for idx in order:
                row_id = int(pool[idx])
                if row_id not in used_job_rows:
                    picks.append(row_id)
                    used_job_rows.add(row_id)
                if len(picks) == k:
                    break

            # if still short, fill randomly (tiny pools)
            while len(picks) < k:
                picks.append(int(rng.choice(pool_rows)))

            return picks

        # HIGH: best matches within same category (job-specific positives)
        same_rows = jobs_by_cat.get(rc, np.array([], dtype=int))
        high_rows = pick_best(same_rows, n_high, topk_same)
        for jr in high_rows:
            job = jobs.loc[jr]
            rows.append({
                "resume_id": resume_id,
                "job_id": int(job["job_id"]),
                "label": 2,
                "resume_text": resume_text,
                "job_title": job["job_title"],
                "job_description": job["job_description"],
                "resume_category": rc,
                "job_category": job["category"],
                "job_text": job["job_text"],
            })

        # MEDIUM: tech cross-pair (best match within other tech category)
        if rc in TECH_PAIR:
            other_cat = (TECH_PAIR - {rc}).pop()
            other_rows = jobs_by_cat.get(other_cat, np.array([], dtype=int))
            med_rows = pick_best(other_rows, n_medium, topk_same)
            for jr in med_rows:
                job = jobs.loc[jr]
                rows.append({
                    "resume_id": resume_id,
                    "job_id": int(job["job_id"]),
                    "label": 1,
                    "resume_text": resume_text,
                    "job_title": job["job_title"],
                    "job_description": job["job_description"],
                    "resume_category": rc,
                    "job_category": job["category"],
                    "job_text": job["job_text"],
                })

        # LOW: hard negatives (different category but still similar)
        diff_cats = [c for c in overlap_categories if c != rc]
        diff_rows = jobs.index[jobs["category"].isin(diff_cats)].to_numpy()
        low_rows = pick_best(diff_rows, n_low, topk_diff)
        for jr in low_rows:
            job = jobs.loc[jr]
            rows.append({
                "resume_id": resume_id,
                "job_id": int(job["job_id"]),
                "label": 0,
                "resume_text": resume_text,
                "job_title": job["job_title"],
                "job_description": job["job_description"],
                "resume_category": rc,
                "job_category": job["category"],
                "job_text": job["job_text"],
            })

    pairs = pd.DataFrame(rows)
    pairs["label"] = pairs["label"].astype(int)

    # Safety: ensure labels match the rule
    pairs["label"] = pairs.apply(lambda x: assign_label(x["resume_category"], x["job_category"]), axis=1).astype(int)

    return pairs

def balance_by_label(pairs: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    """
    Make labels more balanced by sampling each label to the same count.
    - Downsample majority labels (no replacement)
    - Oversample minority labels if needed (with replacement)
    """
    rng = np.random.default_rng(seed)
    counts = pairs["label"].value_counts().to_dict()
    labels = sorted(counts.keys())
    target = min(counts.values())

    parts = []
    for lab in labels:
        subset = pairs[pairs["label"] == lab]
        if len(subset) == target:
            parts.append(subset)
        elif len(subset) > target:
            parts.append(subset.sample(n=target, replace=False, random_state=seed))
        else:
            # oversample with replacement if a label is scarce
            idx = rng.choice(subset.index.to_numpy(), size=target, replace=True)
            parts.append(pairs.loc[idx])

    balanced = pd.concat(parts, ignore_index=True).sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return balanced

def make_splits(pairs: pd.DataFrame, seed: int = 42):
    """
    80/20 split stratified by label.
    """
    train_df, val_df = train_test_split(
        pairs,
        test_size=0.2,
        random_state=seed,
        stratify=pairs["label"],
    )
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

In [9]:
# ---------------- RUN STEP 4 ----------------
resumes_df, jobs_df, overlap_cats = load_data("resumes_clean.csv", "jobs_clean.csv")

pairs_df = build_pairs_improved(
    resumes_df,
    jobs_df,
    overlap_cats,
    seed=42,
    n_high=2,
    n_medium=2,
    n_low=2,
)

print("\nLabel distribution BEFORE balancing:")
print(pairs_df["label"].value_counts().sort_index())

pairs_bal = balance_by_label(pairs_df, seed=42)

print("\nLabel distribution AFTER balancing:")
print(pairs_bal["label"].value_counts().sort_index())

# Save full pairs (balanced-ish)
pairs_bal.to_csv("train_pairs.csv", index=False)
print("\nSaved: train_pairs.csv | rows:", len(pairs_bal))

train_df, val_df = make_splits(pairs_bal, seed=42)

print("\nTrain label distribution:")
print(train_df["label"].value_counts().sort_index())

print("\nVal label distribution:")
print(val_df["label"].value_counts().sort_index())

train_df.to_csv("train_split.csv", index=False)
val_df.to_csv("val_split.csv", index=False)
print("\nSaved: train_split.csv, val_split.csv")

Overlap categories: ['ENGINEERING', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS']

Label distribution BEFORE balancing:
label
0    236
1    938
2    698
Name: count, dtype: int64

Label distribution AFTER balancing:
label
0    236
1    236
2    236
Name: count, dtype: int64

Saved: train_pairs.csv | rows: 708

Train label distribution:
label
0    189
1    189
2    188
Name: count, dtype: int64

Val label distribution:
label
0    47
1    47
2    48
Name: count, dtype: int64

Saved: train_split.csv, val_split.csv


In [10]:
#Random check
train_df = pd.read_csv("train_split.csv")
val_df = pd.read_csv("val_split.csv")
print(train_df.columns)
print(train_df["label"].value_counts(), "\n")
print(val_df["label"].value_counts())


Index(['resume_id', 'job_id', 'label', 'resume_text', 'job_title',
       'job_description', 'resume_category', 'job_category', 'job_text'],
      dtype='object')
label
1    189
0    189
2    188
Name: count, dtype: int64 

label
2    48
0    47
1    47
Name: count, dtype: int64


In [11]:
import pandas as pd
df = pd.read_csv("train_split.csv")
print(df.shape)
print(df["label"].value_counts())

(566, 9)
label
1    189
0    189
2    188
Name: count, dtype: int64


**Training**

In [12]:
!pip -q install -U transformers datasets accelerate scikit-learn

In [13]:
!pip -q install -U "huggingface_hub>=0.33.5,<2.0" "transformers>=4.45.0" "tokenizers>=0.20.0" "accelerate>=0.33.0" "datasets>=2.21.0" scikit-learn



In [14]:
import os
import inspect
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_callback import EarlyStoppingCallback
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
)

MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 384
OUTPUT_DIR = "fit_classifier_distilbert"


def load_data(train_csv: str = "train_split.csv", val_csv: str = "val_split.csv"):
    """
    Load train/val CSVs and return HF Datasets with columns:
    - resume_text, job_text, labels
    """
    train_df = pd.read_csv(train_csv, usecols=["resume_text", "job_text", "label"])
    val_df = pd.read_csv(val_csv, usecols=["resume_text", "job_text", "label"])

    for df in (train_df, val_df):
        df["resume_text"] = df["resume_text"].fillna("").astype(str)
        df["job_text"] = df["job_text"].fillna("").astype(str)
        df["label"] = df["label"].astype(int)

    train_ds = Dataset.from_pandas(train_df, preserve_index=False).rename_column("label", "labels")
    val_ds = Dataset.from_pandas(val_df, preserve_index=False).rename_column("label", "labels")
    return train_ds, val_ds


def tokenize_pairs(tokenizer, batch):
    """
    Paired tokenization: (resume_text, job_text)
    """
    return tokenizer(
        batch["resume_text"],
        batch["job_text"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )


def compute_metrics(eval_pred):
    """
    Return weighted precision/recall/F1 (main metric) + accuracy.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    p_w, r_w, f1_w, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision_weighted": p_w,
        "recall_weighted": r_w,
        "f1_weighted": f1_w,
    }


class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")

        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)  # <<< FIX
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()

        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


# ----- Load + tokenize -----
train_ds, val_ds = load_data("train_split.csv", "val_split.csv")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_tok = train_ds.map(lambda b: tokenize_pairs(tokenizer, b), batched=True)
val_tok = val_ds.map(lambda b: tokenize_pairs(tokenizer, b), batched=True)

# Keep only model inputs + labels
keep_cols = ["input_ids", "attention_mask", "labels"]
train_tok = train_tok.remove_columns([c for c in train_tok.column_names if c not in keep_cols])
val_tok = val_tok.remove_columns([c for c in val_tok.column_names if c not in keep_cols])

train_tok.set_format("torch")
val_tok.set_format("torch")

# ----- Class weights from training distribution -----
train_labels = np.array(train_ds["labels"], dtype=int)
counts = np.bincount(train_labels, minlength=3).astype(float)

# Inverse-frequency style weights (stable and common)
# weight_c = N / (C * count_c)
N = counts.sum()
C = 3.0
weights = N / (C * np.maximum(counts, 1.0))
class_weights = torch.tensor(weights, dtype=torch.float)

print("Train label counts:", counts.astype(int).tolist())
print("Class weights:", class_weights.tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/566 [00:00<?, ? examples/s]

Map:   0%|          | 0/142 [00:00<?, ? examples/s]

Train label counts: [189, 189, 188]
Class weights: [0.998236358165741, 0.998236358165741, 1.003546118736267]


In [15]:
# ----- Model -----
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ----- TrainingArguments with version-safe eval strategy -----
common_args = dict(
    output_dir="distilbert_fit_runs",
    num_train_epochs=5,                 # slightly longer training improves stability
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
)

sig_args = inspect.signature(TrainingArguments.__init__).parameters
if "eval_strategy" in sig_args:
    training_args = TrainingArguments(**common_args, eval_strategy="epoch")
else:
    training_args = TrainingArguments(**common_args, evaluation_strategy="epoch")

# ----- Trainer with version-safe tokenizer/processing_class -----
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

sig_tr = inspect.signature(Trainer.__init__).parameters
if "tokenizer" in sig_tr:
    trainer_kwargs["tokenizer"] = tokenizer
elif "processing_class" in sig_tr:
    trainer_kwargs["processing_class"] = tokenizer

trainer = WeightedTrainer(**trainer_kwargs, class_weights=class_weights)

# ----- Train -----
trainer.train()

# ----- Save best model + tokenizer -----
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\nSaved model + tokenizer to: {OUTPUT_DIR}/")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,No log,0.822961,0.619718,0.415814,0.619718,0.496671
2,0.937895,0.769441,0.591549,0.413513,0.591549,0.484185
3,0.708210,0.693092,0.605634,0.495805,0.605634,0.514873
4,0.708210,0.582984,0.739437,0.759194,0.739437,0.726989
5,0.601274,0.557406,0.746479,0.769712,0.746479,0.736858


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Saved model + tokenizer to: fit_classifier_distilbert/


In [16]:
# ----- Final evaluation (val) with detailed metrics -----
pred = trainer.predict(val_tok)
val_logits = pred.predictions
val_labels = pred.label_ids
val_preds = np.argmax(val_logits, axis=-1)

print("\nFinal evaluation (val):")
print(
    classification_report(
        val_labels,
        val_preds,
        labels=[0, 1, 2],
        target_names=["0=LOW", "1=MEDIUM", "2=HIGH"],
        digits=4,
        zero_division=0,
    )
)

acc = accuracy_score(val_labels, val_preds)
print(f"Accuracy: {acc:.4f}")

cm = confusion_matrix(val_labels, val_preds, labels=[0, 1, 2])
print("\nConfusion matrix (rows=true, cols=pred) for labels [0,1,2]:")
print(cm)


Final evaluation (val):
              precision    recall  f1-score   support

       0=LOW     0.9773    0.9149    0.9451        47
    1=MEDIUM     0.6087    0.8936    0.7241        47
      2=HIGH     0.7241    0.4375    0.5455        48

    accuracy                         0.7465       142
   macro avg     0.7700    0.7487    0.7382       142
weighted avg     0.7697    0.7465    0.7369       142

Accuracy: 0.7465

Confusion matrix (rows=true, cols=pred) for labels [0,1,2]:
[[43  0  4]
 [ 1 42  4]
 [ 0 27 21]]


**Two-stage pipeline : SBERT Retrieval + DistilBERT Reranking**

In [17]:
# STEP 6 — Inference: SBERT Retrieval + DistilBERT Reranking
# - jobs_clean.csv
# - fit_classifier_distilbert/  (saved model + tokenizer from Step 5)

!pip -q install -U sentence-transformers transformers accelerate

In [18]:
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification


SBERT_MODEL_NAME = "all-MiniLM-L6-v2"
CLASSIFIER_DIR = "fit_classifier_distilbert"
MAX_LENGTH = 384  # must match your Step 5 upgraded setting (or keep at 256 if you trained that way)


def load_models(
    sbert_model_name: str = SBERT_MODEL_NAME,
    classifier_dir: str = CLASSIFIER_DIR,
):
    """
    Load retrieval model (Sentence-BERT) and reranker (trained DistilBERT classifier).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    sbert = SentenceTransformer(sbert_model_name, device=str(device))

    clf_tokenizer = AutoTokenizer.from_pretrained(classifier_dir)
    clf_model = AutoModelForSequenceClassification.from_pretrained(classifier_dir)
    clf_model.to(device)
    clf_model.eval()

    return sbert, clf_tokenizer, clf_model, device


def embed_jobs(
    sbert: SentenceTransformer,
    jobs_csv: str = "jobs_clean.csv",
    cache_dir: str = "emb_cache",
    force_recompute: bool = False,
):
    """
    Build (and cache) job embeddings for retrieval.
    Caches embeddings to avoid recomputation.

    Cache files:
    - emb_cache/jobs_job_ids.npy
    - emb_cache/jobs_embeddings.npy
    - emb_cache/cache_meta.json
    """
    cache_path = Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)

    emb_file = cache_path / "jobs_embeddings.npy"
    ids_file = cache_path / "jobs_job_ids.npy"
    meta_file = cache_path / "cache_meta.json"

    jobs = pd.read_csv(jobs_csv, usecols=["job_id", "job_title", "job_description", "category"])
    jobs["job_title"] = jobs["job_title"].fillna("").astype(str)
    jobs["job_description"] = jobs["job_description"].fillna("").astype(str)
    jobs["category"] = jobs["category"].fillna("").astype(str)
    jobs["job_text"] = (jobs["job_title"] + " " + jobs["job_description"]).str.strip()

    job_ids = jobs["job_id"].astype(int).to_numpy()

    can_load_cache = (
        (not force_recompute)
        and emb_file.exists()
        and ids_file.exists()
        and meta_file.exists()
    )

    if can_load_cache:
        try:
            cached_ids = np.load(ids_file)
            with open(meta_file, "r") as f:
                meta = json.load(f)

            # Basic cache validation: same model + same job_id sequence length
            if (
                  meta.get("sbert_model_name") == SBERT_MODEL_NAME
                  and len(cached_ids) == len(job_ids)
                  and np.array_equal(cached_ids, job_ids)
              ):
                job_embeddings = np.load(emb_file)
                print(f"Loaded cached job embeddings: {emb_file} | shape={job_embeddings.shape}")
                return jobs, job_embeddings
        except Exception:
            pass  # fall through to recompute

    print("Computing job embeddings (this runs once; then cached)...")
    # normalize_embeddings=True makes cosine similarity a dot product
    job_embeddings = sbert.encode(
        jobs["job_text"].tolist(),
        batch_size=128,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    ).astype(np.float32)

    np.save(emb_file, job_embeddings)
    np.save(ids_file, job_ids)

    meta = {
        "sbert_model_name": SBERT_MODEL_NAME,
        "jobs_csv": jobs_csv,
        "num_jobs": int(len(jobs)),
    }
    with open(meta_file, "w") as f:
        json.dump(meta, f)

    print(f"Saved cached embeddings to: {cache_path}")
    return jobs, job_embeddings


def retrieve_top_k(
    resume_text: str,
    sbert: SentenceTransformer,
    jobs_df: pd.DataFrame,
    job_embeddings: np.ndarray,
    k: int = 20,
):
    """
    Stage 1 retrieval using SBERT cosine similarity.
    Returns a DataFrame of top-k candidates with similarity_score.
    """
    resume_text = "" if resume_text is None else str(resume_text).strip()
    if not resume_text:
        raise ValueError("resume_text is empty")

    resume_emb = sbert.encode(
        [resume_text],
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype(np.float32)

    # Cosine similarity for normalized vectors = dot product
    sims = (job_embeddings @ resume_emb[0]).astype(np.float32)

    k = min(k, len(jobs_df))
    top_idx = np.argpartition(-sims, kth=k - 1)[:k]
    top_idx = top_idx[np.argsort(-sims[top_idx])]

    candidates = jobs_df.iloc[top_idx].copy()
    candidates["similarity_score"] = sims[top_idx]
    return candidates.reset_index(drop=True)


@torch.no_grad()
def rerank_with_classifier(
    resume_text: str,
    candidates_df: pd.DataFrame,
    clf_tokenizer,
    clf_model,
    device,
    max_length: int = MAX_LENGTH,
    batch_size: int = 16,
):
    """
    Stage 2 reranking using your fine-tuned DistilBERT classifier.
    Ranks primarily by P(HIGH). Adds predicted_fit, confidence, and per-class probs.
    """
    label_map = {0: "LOW", 1: "MEDIUM", 2: "HIGH"}

    resume_text = "" if resume_text is None else str(resume_text).strip()
    if not resume_text:
        raise ValueError("resume_text is empty")

    job_texts = candidates_df["job_text"].fillna("").astype(str).tolist()

    probs_all = []
    for start in range(0, len(job_texts), batch_size):
        batch_job_texts = job_texts[start : start + batch_size]
        batch_resume_texts = [resume_text] * len(batch_job_texts)

        enc = clf_tokenizer(
            batch_resume_texts,
            batch_job_texts,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        logits = clf_model(**enc).logits
        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
        probs_all.append(probs)

    probs_all = np.vstack(probs_all)
    pred_ids = probs_all.argmax(axis=1)

    out = candidates_df.copy()
    out["prob_low"] = probs_all[:, 0]
    out["prob_medium"] = probs_all[:, 1]
    out["prob_high"] = probs_all[:, 2]

    # Base prediction (argmax)
    out["predicted_fit"] = [label_map[int(i)] for i in pred_ids]

    # Score-based ranking (expected fit score)
    out["fit_score"] = (0*out["prob_low"] + 1*out["prob_medium"] + 2*out["prob_high"])

    # Keep prob_high as confidence (useful for printing/UI)
    out["confidence"] = out["prob_high"]

    # Add a separate HIGH flag
    out["high_flag"] = (out["predicted_fit"] == "HIGH")
    out["high_likelihood"] = (out["prob_high"] >= 0.45)

    # Sort by fit_score first, then prob_high, then similarity
    out = out.sort_values(
        ["fit_score", "prob_high", "similarity_score"],
        ascending=[False, False, False],
    ).reset_index(drop=True)

    # Deduplicate by job_id (not title)
    out = out.drop_duplicates(subset=["job_title", "job_description"], keep="first").reset_index(drop=True)

    return out


# --------- Example run ---------
sbert, clf_tokenizer, clf_model, device = load_models()
jobs_df, job_embeddings = embed_jobs(
    sbert,
    jobs_csv="jobs_clean.csv",
    cache_dir="emb_cache",
    force_recompute=False
)

resume_text_input = """
Senior software engineer with experience in Python, APIs, AWS, Docker, Kubernetes,
CI/CD pipelines, and building scalable backend services. Worked with SQL, data pipelines,
and monitoring/observability.
""".strip()

K = 200
candidates = retrieve_top_k(
    resume_text_input,
    sbert,
    jobs_df,
    job_embeddings,
    k=K
)

reranked = rerank_with_classifier(
    resume_text_input,
    candidates,
    clf_tokenizer,
    clf_model,
    device
)
reranked_unique = reranked.drop_duplicates(subset=["job_title"], keep="first").reset_index(drop=True)
print("\nTop jobs (retrieval + reranking):\n")
print("job_title | predicted_fit | high_flag | high_likelihood | fit_score | prob_high | similarity")

for _, row in reranked_unique.head(20).iterrows():
    print(
        f"{row['job_title'][:60]:60} | "
        f"{row['predicted_fit']:11} | "
        f"{str(bool(row['high_flag'])):9} | "
        f"{str(bool(row['high_likelihood'])):14} | "
        f"{row['fit_score']:.3f}   | "
        f"{row['confidence']:.4f}  | "
        f"{row['similarity_score']:.4f}"
    )

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Loaded cached job embeddings: emb_cache/jobs_embeddings.npy | shape=(2277, 384)

Top jobs (retrieval + reranking):

job_title | predicted_fit | high_flag | high_likelihood | fit_score | prob_high | similarity
backend developer                                            | HIGH        | True      | True           | 1.473   | 0.5394  | 0.6401
php developer                                                | HIGH        | True      | True           | 1.472   | 0.5440  | 0.6056
django developer                                             | HIGH        | True      | True           | 1.466   | 0.5312  | 0.6161
software engineer                                            | HIGH        | True      | True           | 1.452   | 0.4850  | 0.6266
devops engineer                                              | HIGH        | True      | True           | 1.448   | 0.4896  | 0.6087
java developer                                               | HIGH        | True      | True           | 1.442   | 0.5084  | 

**GRADIO Interface**

In [19]:
!pip -q uninstall -y gradio gradio-client huggingface_hub
!pip -q install -U --no-cache-dir gradio==6.5.1 gradio-client==2.0.3 "huggingface_hub>=0.33.5,<2.0" pypdf==4.3.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 kB[0m [31m241.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m308.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import gradio as gr
import huggingface_hub
import gradio_client
from pypdf import PdfReader

print("gradio:", gr.__version__)
print("huggingface_hub:", huggingface_hub.__version__)
print("gradio_client:", gradio_client.__version__)

gradio: 6.5.1
huggingface_hub: 1.3.5
gradio_client: 2.0.3


In [24]:
# STEP 7 — Gradio UI (Resume + JD upload/text) + Heuristic Match + Recommendations + Top-K Jobs

import os
import re
import time
import numpy as np
import pandas as pd
import gradio as gr
from pypdf import PdfReader
from functools import lru_cache

os.environ["HF_HUB_ETAG_TIMEOUT"] = "120"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "600"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

OUT_COLS = [
    "job_title",
    "category",
    "predicted_fit",
    "fit_score",
    "prob_high",
    "similarity_score",
]

@lru_cache(maxsize=1)
def get_assets():
    t0 = time.perf_counter()
    print("[init] Loading models...")
    sbert, clf_tokenizer, clf_model, device = load_models()
    t1 = time.perf_counter()
    print(f"[init] load_models() took {t1 - t0:.1f}s")

    print("[init] Loading/creating job embeddings (cached if available)...")
    jobs_df, job_embeddings = embed_jobs(
        sbert,
        jobs_csv="jobs_clean.csv",
        cache_dir="emb_cache",
        force_recompute=False,
    )
    t2 = time.perf_counter()
    print(f"[init] embed_jobs() took {t2 - t1:.1f}s")
    print(f"[init] Total init took {t2 - t0:.1f}s")
    return sbert, clf_tokenizer, clf_model, device, jobs_df, job_embeddings

def _extract_pdf_text(pdf_path: str) -> str:
    if not pdf_path:
        return ""
    reader = PdfReader(pdf_path)
    text = "\n".join((p.extract_text() or "") for p in reader.pages)
    return re.sub(r"\s+", " ", text).strip()

def _resolve_text(pdf_path: str, manual_text: str, label: str):
    text = ""
    source = "manual text"
    note = ""

    pdf_path = pdf_path or ""
    manual_text = (manual_text or "").strip()

    if pdf_path:
        try:
            text = _extract_pdf_text(pdf_path)
            source = "PDF"
            if not text:
                note = f"{label} PDF has no extractable text (likely scanned/image-only). Paste text below."
        except Exception as e:
            source = "PDF (error)"
            note = f"{label} PDF error: {type(e).__name__}: {e}"
            text = ""

    if (not text) and manual_text:
        text = manual_text
        source = "manual text"

    n = len(text)
    info = f"**{label} Source:** {source}\n\n**{label} Length:** {n:,} characters"
    if note:
        info += f"\n\n**{label} Note:** {note}"

    preview = (text[:300] + ("…" if n > 300 else "")) if text else ""
    return text, info, preview, note

_SKILLS = [
    "python","java","c++","c#","javascript","typescript","go","rust","scala","kotlin","swift",
    "sql","postgresql","mysql","sqlite","mongodb","redis","elasticsearch",
    "pandas","numpy","scikit-learn","sklearn","pytorch","tensorflow","keras","xgboost","lightgbm",
    "transformers","huggingface","nlp","bert","distilbert","sentence-transformers",
    "spark","hadoop","airflow","dbt","kafka","flink",
    "docker","kubernetes","helm","terraform","ansible","ci/cd","jenkins","github actions","gitlab ci",
    "aws","gcp","azure","lambda","sagemaker",
    "flask","django","fastapi","node","express","react","next.js","nextjs","vue","angular",
    "linux","bash","git",
    "tableau","power bi","excel","snowflake","bigquery","redshift",
    "mlops","data engineering","data analysis","data science",
    "agile","scrum","jira",
]

_skill_patterns = []
for s in _SKILLS:
    if s == "c++":
        _skill_patterns.append((s, re.compile(r"(?i)\bc\+\+\b")))
    elif s == "c#":
        _skill_patterns.append((s, re.compile(r"(?i)\bc#\b")))
    elif s == "ci/cd":
        _skill_patterns.append((s, re.compile(r"(?i)\bci\s*/\s*cd\b|\bci-cd\b")))
    elif s in {"sklearn", "scikit-learn"}:
        _skill_patterns.append(("scikit-learn", re.compile(r"(?i)\bscikit[- ]learn\b|\bsklearn\b")))
    elif s in {"next.js", "nextjs"}:
        _skill_patterns.append(("next.js", re.compile(r"(?i)\bnext\.js\b|\bnextjs\b")))
    elif s == "github actions":
        _skill_patterns.append((s, re.compile(r"(?i)\bgithub actions\b")))
    elif s == "gitlab ci":
        _skill_patterns.append((s, re.compile(r"(?i)\bgitlab ci\b")))
    elif s == "power bi":
        _skill_patterns.append((s, re.compile(r"(?i)\bpower\s*bi\b")))
    else:
        _skill_patterns.append((s, re.compile(r"(?i)\b" + re.escape(s) + r"\b")))

def extract_skills(text: str):
    if not text:
        return []
    found = []
    for s, pat in _skill_patterns:
        if pat.search(text):
            found.append(s)
    out = []
    seen = set()
    for s in found:
        if s not in seen:
            seen.add(s)
            out.append(s)
    return out

def extract_education(text: str):
    if not text:
        return []
    pats = [
        (r"(?i)\bph\.?d\b|\bdoctorate\b", "PhD/Doctorate"),
        (r"(?i)\bmaster('?s)?\b|\bm\.?s\.?\b|\bmsc\b|\bmeng\b", "Master's"),
        (r"(?i)\bbachelor('?s)?\b|\bb\.?s\.?\b|\bbsc\b|\bbe\b|\bbtech\b", "Bachelor's"),
        (r"(?i)\bassociate\b", "Associate"),
        (r"(?i)\bhigh school\b|\bsecondary school\b", "High School"),
    ]
    hits = []
    for rgx, label in pats:
        if re.search(rgx, text):
            hits.append(label)
    out = []
    seen = set()
    for h in hits:
        if h not in seen:
            seen.add(h)
            out.append(h)
    return out

_EDU_RANK = {
    "High School": 1,
    "Associate": 2,
    "Bachelor's": 3,
    "Master's": 4,
    "PhD/Doctorate": 5,
}

def highest_edu_level(hints):
    if not hints:
        return None, None
    best = None
    best_rank = -1
    for h in hints:
        r = _EDU_RANK.get(h, -1)
        if r > best_rank:
            best_rank = r
            best = h
    return best, best_rank if best is not None else None

def estimate_years_experience(text: str):
    if not text:
        return None
    vals = []
    for m in re.finditer(r"(?i)\b(\d{1,2})\s*\+?\s*(?:years|yrs)\b", text):
        vals.append(int(m.group(1)))
    for m in re.finditer(r"(?i)\b(\d{1,2})\s*-\s*(\d{1,2})\s*(?:years|yrs)\b", text):
        vals.append(int(m.group(2)))
    return max(vals) if vals else None

def parse_jd_min_years(text: str):
    if not text:
        return None
    vals = []
    for m in re.finditer(r"(?i)\b(\d{1,2})\s*\+?\s*(?:years|yrs)\b", text):
        vals.append(int(m.group(1)))
    for m in re.finditer(r"(?i)\b(\d{1,2})\s*-\s*(\d{1,2})\s*(?:years|yrs)\b", text):
        vals.append(int(m.group(1)))
    return min(vals) if vals else None

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    na = float(np.linalg.norm(a))
    nb = float(np.linalg.norm(b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    return float(np.dot(a, b) / (na * nb))

def empty_df():
    return pd.DataFrame(columns=OUT_COLS)

def run_matching(
    resume_pdf_path,
    resume_manual_text,
    jd_pdf_path,
    jd_manual_text,
    retrieve_pool_size,
    top_k_to_show,
    prob_high_threshold,
    dedup_by_job_title,
):
    t0 = time.perf_counter()

    resume_text, resume_info, resume_preview, resume_note = _resolve_text(
        resume_pdf_path, resume_manual_text, label="Resume"
    )
    jd_text, jd_info, jd_preview, jd_note = _resolve_text(
        jd_pdf_path, jd_manual_text, label="Job Description"
    )

    if not resume_text:
        match_md = "## Resume ↔ Job Description Match\n\n**Error:** Provide a resume PDF or paste resume text."
        if resume_note:
            match_md += f"\n\n**Resume Note:** {resume_note}"
        if jd_note:
            match_md += f"\n\n**JD Note:** {jd_note}"
        recs_md = "## Recommendations\n\nN/A"
        return resume_preview, jd_preview, match_md, recs_md, empty_df(), "Provide a resume first."

    # Load pipeline assets only when user clicks Run (cached after first run)
    sbert, clf_tokenizer, clf_model, device, jobs_df, job_embeddings = get_assets()

    # Heuristic analysis
    r_skills = extract_skills(resume_text)
    r_edu_hints = extract_education(resume_text)
    r_edu_name, r_edu_rank = highest_edu_level(r_edu_hints)
    r_years = estimate_years_experience(resume_text)

    match_lines = []
    match_lines.append("## Resume ↔ Job Description Match")
    match_lines.append(resume_info)
    match_lines.append(jd_info)

    rec_lines = []
    rec_lines.append("## Recommendations")

    if jd_text:
        jd_skills = extract_skills(jd_text)
        jd_edu_hints = extract_education(jd_text)
        jd_edu_name, jd_edu_rank = highest_edu_level(jd_edu_hints)
        jd_min_years = parse_jd_min_years(jd_text)

        r_skill_set = set(r_skills)
        jd_skill_set = set(jd_skills)
        matched_skills = [s for s in jd_skills if s in r_skill_set]
        missing_skills = [s for s in jd_skills if s not in r_skill_set]

        skill_coverage = (len(matched_skills) / max(1, len(jd_skill_set))) if len(jd_skill_set) > 0 else None

        edu_ok = None
        if (jd_edu_rank is not None) and (r_edu_rank is not None):
            edu_ok = r_edu_rank >= jd_edu_rank

        exp_ok = None
        if (jd_min_years is not None) and (r_years is not None):
            exp_ok = r_years >= jd_min_years

        # SBERT similarity (additional signal)
        try:
            rv = sbert.encode([resume_text], convert_to_numpy=True)[0]
            jv = sbert.encode([jd_text], convert_to_numpy=True)[0]
            sim = cosine_similarity(rv, jv)
            sim_score = max(0.0, min(1.0, float(sim)))
        except Exception:
            sim = float("nan")
            sim_score = 0.0

        # Weighted score with dynamic availability
        components = []
        weights = []

        if skill_coverage is not None:
            components.append(float(skill_coverage))
            weights.append(0.55)

        components.append(float(sim_score))
        weights.append(0.25)

        if exp_ok is not None:
            components.append(1.0 if exp_ok else 0.0)
            weights.append(0.15)

        if edu_ok is not None:
            components.append(1.0 if edu_ok else 0.0)
            weights.append(0.05)

        total_w = sum(weights) if weights else 1.0
        overall = sum(c * w for c, w in zip(components, weights)) / total_w if total_w > 0 else 0.0

        if overall >= 0.70:
            label = "HIGH"
        elif overall >= 0.45:
            label = "MEDIUM"
        else:
            label = "LOW"

        match_lines.append("")
        match_lines.append(f"### Overall Match: **{label}**")
        match_lines.append(f"**Overall score (0–1):** {overall:.3f}")

        match_lines.append("")
        match_lines.append("### Skills (keyword-based)")
        match_lines.append(f"**Resume skills detected:** {len(r_skill_set)}")
        match_lines.append(f"**JD skills detected:** {len(jd_skill_set)}")
        match_lines.append(f"**Matched skills:** {len(set(matched_skills))}")
        if skill_coverage is not None:
            match_lines.append(f"**JD skill coverage:** {skill_coverage:.3f}")

        match_lines.append("")
        match_lines.append("### Education (heuristics)")
        match_lines.append(f"**Resume education hint:** {r_edu_name if r_edu_name else 'Unknown'}")
        match_lines.append(f"**JD education hint:** {jd_edu_name if jd_edu_name else 'Not mentioned'}")
        if edu_ok is not None:
            match_lines.append(f"**Education match:** {'YES' if edu_ok else 'NO'}")

        match_lines.append("")
        match_lines.append("### Experience (heuristics)")
        match_lines.append(f"**Estimated resume years:** {r_years if r_years is not None else 'Unknown'}")
        match_lines.append(f"**JD minimum years (if detected):** {jd_min_years if jd_min_years is not None else 'Not mentioned'}")
        if exp_ok is not None:
            match_lines.append(f"**Experience match:** {'YES' if exp_ok else 'NO'}")

        match_lines.append("")
        match_lines.append("### Semantic similarity (SBERT)")
        match_lines.append(f"**Resume↔JD cosine similarity:** {sim:.4f}" if not np.isnan(sim) else "**Resume↔JD cosine similarity:** N/A")

        rec_lines.append("### Missing skills vs Job Description")
        rec_lines.append(", ".join(missing_skills[:40]) if missing_skills else "None detected from the curated list.")

        if edu_ok is False:
            rec_lines.append("\n### Education suggestion")
            rec_lines.append("- JD appears to mention a higher degree than your resume shows. If applicable, add the degree/certification clearly.")

        if exp_ok is False:
            rec_lines.append("\n### Experience suggestion")
            rec_lines.append("- JD appears to ask for more years than detected in your resume. Emphasize relevant experience and dates clearly.")

    else:
        match_lines.append("")
        match_lines.append("**JD not provided.** Upload/paste a Job Description to compute a resume↔JD match. Showing Top-K job matches for the resume below.")
        rec_lines.append("### Missing skills vs Job Description")
        rec_lines.append("JD not provided.")

    rec_lines.append("\n### Generic resume improvement tips")
    rec_lines.append("- Add measurable impact (%, $, time saved, latency reduced, revenue increased).")
    rec_lines.append("- Mirror key keywords from the JD in your bullets (tools/methods) only if true.")
    rec_lines.append("- Put the most relevant projects/experience first; keep bullets concise and action-oriented.")
    rec_lines.append("- Ensure your skills section matches what you actually used in projects/roles.")

    match_md = "\n\n".join(match_lines)
    recs_md = "\n".join(rec_lines)

    # Top-K jobs for the resume (existing Step 6 pipeline)
    candidates = retrieve_top_k(
        resume_text,
        sbert,
        jobs_df,
        job_embeddings,
        k=int(retrieve_pool_size),
    )

    reranked = rerank_with_classifier(
        resume_text,
        candidates,
        clf_tokenizer,
        clf_model,
        device,
    ).copy()

    if dedup_by_job_title and ("job_title" in reranked.columns):
        reranked = reranked.drop_duplicates(subset=["job_title"], keep="first").reset_index(drop=True)

    out = reranked.head(int(top_k_to_show)).copy()
    for c in ["fit_score", "prob_high", "similarity_score"]:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce").round(4)

    out = out.reindex(columns=OUT_COLS)
    status = f"Done. Retrieved {int(retrieve_pool_size)} candidates; showing {len(out)} results. (took {time.perf_counter() - t0:.1f}s)"
    return resume_preview, jd_preview, match_md, recs_md, out, status

theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="slate")

with gr.Blocks(title="Resume ↔ JD Matching + Job Recommendations") as demo:
    gr.Markdown("# Resume ↔ Job Description Matching\nHeuristic JD match + SBERT retrieval + DistilBERT reranker (Top-K jobs)")

    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("## Inputs")

            gr.Markdown("### Resume input")
            resume_pdf = gr.File(label="Upload Resume (PDF)", file_types=[".pdf"], type="filepath")
            resume_text = gr.Textbox(
                label="Or paste resume text (fallback)",
                lines=8,
                placeholder="Paste resume text here if your PDF is scanned/image-only…",
            )

            gr.Markdown("### Job Description input (optional)")
            jd_pdf = gr.File(label="Upload Job Description (PDF)", file_types=[".pdf"], type="filepath")
            jd_text = gr.Textbox(
                label="Or paste job description text (fallback)",
                lines=8,
                placeholder="Paste JD text here if your PDF is scanned/image-only…",
            )

            with gr.Accordion("Matching settings", open=True):
                retrieve_pool = gr.Slider(50, 500, value=200, step=10, label="Retrieve pool size")
                top_k = gr.Slider(5, 30, value=15, step=1, label="Top-K to show")
                prob_thr = gr.Slider(0.30, 0.70, value=0.45, step=0.01, label="Prob(HIGH) threshold (Top-K jobs)")
                dedup = gr.Checkbox(value=True, label="Deduplicate by job title")

            with gr.Row():
                run_btn = gr.Button("Run", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=6):
            gr.Markdown("## Outputs")
            resume_preview = gr.Textbox(label="Resume preview (first ~300 chars)", lines=6, interactive=False)
            jd_preview = gr.Textbox(label="JD preview (first ~300 chars)", lines=6, interactive=False)

            match_md = gr.Markdown("")
            recs_md = gr.Markdown("")

            status_md = gr.Markdown("")
            results = gr.Dataframe(label="Top-K Jobs (from jobs_clean.csv)")

    run_btn.click(
        run_matching,
        inputs=[resume_pdf, resume_text, jd_pdf, jd_text, retrieve_pool, top_k, prob_thr, dedup],
        outputs=[resume_preview, jd_preview, match_md, recs_md, results, status_md],
    )

    clear_btn.click(
        lambda: (None, "", None, "", "", "", "", "", pd.DataFrame(columns=OUT_COLS), ""),
        outputs=[resume_pdf, resume_text, jd_pdf, jd_text, resume_preview, jd_preview, match_md, recs_md, results, status_md],
    )

demo.queue()
demo.launch(share=True, theme=theme, debug=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://369a841fe855301042.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


