In [None]:
import pandas as pd

df = pd.read_csv('/data/job_title_des.csv')
dff = pd.read_csv('/data/UpdatedResumeDataSet.csv')

df.head()
dff.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [2]:
import pandas as pd

print("Resume Dataset Overview")
print(df.info())
print("\nShape of Resume Data:", df.shape)
print("\nMissing Values in Resume Data:")
print(df.isnull().sum())

print("\nJob Description Dataset Overview")
print(dff.info())
print("\nShape of Job Description Data:", dff.shape)
print("\nMissing Values in Job Description Data:")
print(dff.isnull().sum())

Resume Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277 entries, 0 to 2276
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       2277 non-null   int64 
 1   Job Title        2277 non-null   object
 2   Job Description  2277 non-null   object
dtypes: int64(1), object(2)
memory usage: 53.5+ KB
None

Shape of Resume Data: (2277, 3)

Missing Values in Resume Data:
Unnamed: 0         0
Job Title          0
Job Description    0
dtype: int64

Job Description Dataset Overview
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 962 entries, 0 to 961
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  962 non-null    object
 1   Resume    962 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB
None

Shape of Job Description Data: (962, 2)

Missing Values in Job Description Data:
Category    0
Resume      0
dt

**Setup and Required Dependencies Installations**

In [3]:
!pip install unidecode


Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m235.5/235.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [None]:
!pip install spacy
!pip install sentence-transformers unidecode streamlit pdfplumber python-docx
!python -m spacy download en_core_web_sm


In [5]:
import spacy
import re
from unidecode import unidecode
from sentence_transformers import SentenceTransformer, util
import torch


nlp = spacy.load("en_core_web_sm")

**Data Understanding & Preprocessing**

In [None]:
import pandas as pd
import spacy
import re

resumes = pd.read_csv('/data/UpdatedResumeDataSet.csv', dtype=str)
jobs = pd.read_csv('/data/job_title_des.csv', dtype=str)

resumes.columns = [c.strip() for c in resumes.columns]
jobs.columns = [c.strip() for c in jobs.columns]

print("Resumes rows:", len(resumes))
print("Jobs rows:", len(jobs))

nlp = spacy.load("en_core_web_sm")

# a clean text function
def clean_text(text):
    if pd.isna(text):
        return ""

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()

    # Tokenize + Lemmatize + Remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and len(token) > 2]

    return " ".join(tokens)

# Applying preprocessing to important text columns
text_cols_resumes = ['Resume']
text_cols_jobs = ['Job Description']

for col in text_cols_resumes:
    if col in resumes.columns:
        print(f"Cleaning resume column: {col}")
        resumes[f"{col}_cleaned"] = resumes[col].apply(clean_text)

for col in text_cols_jobs:
    if col in jobs.columns:
        print(f"Cleaning job column: {col}")
        jobs[f"{col}_cleaned"] = jobs[col].apply(clean_text)

resumes.to_csv("cleaned_resumes_updated.csv", index=False)
jobs.to_csv("cleaned_jobs_titles.csv", index=False)


Resumes rows: 962
Jobs rows: 2277
Cleaning resume column: Resume
Cleaning job column: Job Description


In [None]:
import pandas as pd
import re

resumes = pd.read_csv("/data/cleaned_resumes_updated.csv")
jobs = pd.read_csv("/data/cleaned_jobs_titles.csv")

# skill extractor
def extract_skills_from_text(text):
    text = str(text).lower()

    # trying to grab all words after "skills" or similar phrases
    possible_sections = re.findall(r"(skills\s*[:\-\*]*.*?)(education|experience|company|project|$)", text, re.S)

    if possible_sections:
        combined = " ".join([sec[0] for sec in possible_sections])
    else:
        combined = text

    # Extracting words that look like skill tokens
    skills = re.findall(r"[a-zA-Z\+\#\.]{2,}", combined)
    skills = list(set([s.strip() for s in skills if len(s.strip()) > 1]))

    return skills

# Applying on both datasets
jobs["skills_list"] = jobs["Job Description_cleaned"].apply(extract_skills_from_text)
resumes["skills_list"] = resumes["Resume_cleaned"].apply(extract_skills_from_text)

resumes.to_csv("cleaned_resumes_with_skills.csv", index=False)
jobs.to_csv("cleaned_jobs_with_skills.csv", index=False)


**Resume Parsing & Information Extraction (NER)**

In [None]:
import pandas as pd
import re
import json
from sklearn.model_selection import train_test_split

# Load cleaned datasets
resumes = pd.read_csv("/data/cleaned_resumes_with_skills.csv")
jobs = pd.read_csv("/data/cleaned_jobs_with_skills.csv")


# normalization mappings
skill_map = {
    "py": "python",
    "python 3": "python",
    "microsoft excel": "excel",
    "ms excel": "excel",
    "power bi": "powerbi",
    "tableau desktop": "tableau",
    "nlp": "natural language processing",
    "ml": "machine learning",
    "ai": "artificial intelligence",
    "db": "database",
    "sql server": "sql",
    "mysql database": "mysql",
    "react js": "react",
    "reactjs": "react",
    "nodejs": "node",
    "node js": "node"
}

def normalize_experience(text):
    text = re.sub(r'\b(\d+)\s*(yrs?|years?)\b', r'\1 years', text)
    text = re.sub(
        r'\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+years?\b',
        lambda m: str({
            "one":1,"two":2,"three":3,"four":4,"five":5,
            "six":6,"seven":7,"eight":8,"nine":9,"ten":10
        }[m.group(1)]) + " years",
        text
    )
    return text

def canonicalize_skills(text):
    text = text.lower()
    for variant, canonical in skill_map.items():
        pattern = r'\b' + re.escape(variant) + r'\b'
        text = re.sub(pattern, canonical, text)
    return text

def normalize_text(text):
    if pd.isna(text):
        return ""
    text = normalize_experience(text)
    text = canonicalize_skills(text)
    return text

#  Applying normalization
resumes['Resume_cleaned'] = resumes['Resume_cleaned'].apply(normalize_text)
jobs['Job Description_cleaned'] = jobs['Job Description_cleaned'].apply(normalize_text)

resumes.to_csv("/data/normalized_resumes.csv", index=False)
jobs.to_csv("/data/normalized_jobs.csv", index=False)




In [1]:
# helper function for canonicalization
import re
import pandas as pd
import json

def canonicalize_skills(skill):
    """Normalize skills to lowercase and remove special characters."""
    return re.sub(r'[^a-z0-9 ]+', '', str(skill).lower()).strip()


In [None]:
# convert into lower case avoid memory overload

resumes = pd.read_csv("/data/cleaned_resumes_with_skills.csv")
jobs = pd.read_csv("/data/cleaned_jobs_with_skills.csv")

print("Jobs columns:", list(jobs.columns))
print("Resumes columns:", list(resumes.columns))


Jobs columns: ['Unnamed: 0', 'Job Title', 'Job Description', 'Job Description_cleaned', 'skills_list']
Resumes columns: ['Category', 'Resume', 'Resume_cleaned', 'skills_list']


In [None]:
# writes examples in batches instead of holding everything in RAM, so the system won't crashes
import os
from tqdm import tqdm

output_path = "/model/auto_train_from_jobs.json"
os.makedirs("/model", exist_ok=True)

# Starting file and open stream
with open(output_path, "w", encoding="utf-8") as f:
    f.write('{"examples": [\n')

    first = True
    for _, job_row in tqdm(jobs.iterrows(), total=len(jobs), desc="Processing jobs"):
        job_name = job_row.get("Job Title", "")
        skill_list = job_row.get("skills_list", [])

        # checking skill_list is a list
        if isinstance(skill_list, str):
            skill_list = [s.strip() for s in skill_list.split(",") if s.strip()]

        matched_resumes = resumes[resumes["Resume_cleaned"].str.contains(job_name.lower(), case=False, na=False)]

        for _, res_row in matched_resumes.iterrows():
            text = str(res_row["Resume_cleaned"])
            entities = []
            for skill in skill_list:
                skill_norm = canonicalize_skills(skill)
                for match in re.finditer(r'\b' + re.escape(skill_norm) + r'\b', text, flags=re.IGNORECASE):
                    start, end = match.span()
                    entities.append([start, end, "SKILL"])

            if entities:
                example = json.dumps([text, {"entities": entities}], ensure_ascii=False)
                if not first:
                    f.write(",\n")
                f.write(example)
                first = False

    f.write("\n]}")


Processing jobs: 100%|██████████| 2277/2277 [07:01<00:00,  5.40it/s]


In [None]:
from sklearn.model_selection import train_test_split

train_examples, test_examples = train_test_split(data["examples"], test_size=0.2, random_state=42)

with open("/split_train_data/train_data.json", "w") as f:
    json.dump({"examples": train_examples}, f, indent=2)

with open("/split_train_data/test_data.json", "w") as f:
    json.dump({"examples": test_examples}, f, indent=2)


Model Evaluation and Performance extraction using **Precision, Recall and F1-score.**

In [None]:
import spacy
from spacy.training import Example
import json

# auto-labeled data
with open("/model/auto_train_from_jobs.json", "r") as f:
    data = json.load(f)["examples"]

# blank model for English
nlp = spacy.blank("en")

# NER pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# label(s)
ner.add_label("SKILL")

# Data preparation
TRAIN_DATA = []
for text, annot in data:
    TRAIN_DATA.append(Example.from_dict(nlp.make_doc(text), annot))

# Training
optimizer = nlp.initialize()
for i in range(5):
    losses = {}
    for batch in TRAIN_DATA[:500]:
        nlp.update([batch], sgd=optimizer, losses=losses)

nlp.to_disk("/model/custom_ner_model")



In [None]:
# Evaluating the model precison, F1, recall

import spacy
import json
from sklearn.metrics import precision_score, recall_score, f1_score

nlp_resume = spacy.load("/model/custom_ner_model")

with open("/split_train_data/test_data.json", "r", encoding="utf8") as f:
    TEST_DATA = json.load(f)["examples"]

def evaluate_ner(model, test_data):
    y_true, y_pred = [], []
    for text, annot in test_data:
        doc = model(text)
        gold_entities = set([(start, end, label) for start, end, label in annot["entities"]])
        pred_entities = set([(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])

        for ent in gold_entities.union(pred_entities):
            y_true.append(1 if ent in gold_entities else 0)
            y_pred.append(1 if ent in pred_entities else 0)

    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return {"precision": p, "recall": r, "f1": f1}

metrics = evaluate_ner(nlp_resume, TEST_DATA)
print("Evaluation Metrics:")
print(metrics)


Evaluation Metrics:
{'precision': 0.49146381743417566, 'recall': 0.5823029088846283, 'f1': 0.5330409361530106}


**Job Description Analysis**

In [None]:
import pandas as pd
import re
import numpy as np

resumes = pd.read_csv("/normalized_resumes.csv")
jobs = pd.read_csv("/normalized_jobs.csv")

# Experience extraction (3+ years, 5 years of experience)
def extract_experience(text):
    matches = re.findall(r'(\d+)\s*\+?\s*(?:year|yr)', str(text).lower())
    return int(matches[0]) if matches else 0

# Education normalization
def normalize_education(text):
    text = str(text).lower()
    if "phd" in text or "doctor" in text:
        return 4
    elif "master" in text or "msc" in text:
        return 3
    elif "bachelor" in text or "bs" in text or "bsc" in text:
        return 2
    elif "associate" in text or "diploma" in text:
        return 1
    else:
        return 0

# Jaccard similarity for skills
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    if not set1 or not set2:
        return 0
    return len(set1 & set2) / len(set1 | set2)

# Apply normalization
resumes["years_experience"] = resumes["Resume_cleaned"].apply(extract_experience)
jobs["required_experience"] = jobs["Job Description"].apply(extract_experience)

resumes["education_level"] = resumes["Resume_cleaned"].apply(normalize_education)
jobs["required_education"] = jobs["Job Description"].apply(normalize_education)

# Scoring formula
def compute_match_score(resume_row, job_row):
    # Skill score
    skills_resume = str(resume_row["skills_list"]).split(", ")
    skills_job = str(job_row["skills_list"]).split(", ")
    skill_score = jaccard_similarity(skills_resume, skills_job)

    # Experience alignment
    exp_res = resume_row["years_experience"]
    exp_req = job_row["required_experience"]
    if exp_req == 0:
        exp_score = 1.0
    else:
        exp_score = min(exp_res / exp_req, 1.0)

    # Education alignment
    edu_res = resume_row["education_level"]
    edu_req = job_row["required_education"]
    edu_score = 1.0 if edu_res >= edu_req else edu_res / (edu_req + 0.01)

    # Weighted final score
    final_score = (0.5 * skill_score) + (0.3 * exp_score) + (0.2 * edu_score)
    return round(final_score, 3)

# all resume–job pairs
results = []
for _, job in jobs.iterrows():
    for _, resume in resumes.iterrows():
        score = compute_match_score(resume, job)
        results.append({
            "Job Title": job["Job Title"],
            "Resume ID": resume["Resume ID"] if "Resume ID" in resumes.columns else _,
            "Match Score": score
        })

df_results = pd.DataFrame(results)
ranked_matches = df_results.sort_values(by="Match Score", ascending=False)

ranked_matches.to_csv("/final_resume_job_scores.csv", index=False)
print("✅ Final matching scores saved to: /final_resume_job_scores.csv")

ranked_matches.head(10)


✅ Final matching scores saved to: /content/final_resume_job_scores.csv


Unnamed: 0,Job Title,Resume ID,Match Score
1547032,DevOps Engineer,136,1.0
1547014,DevOps Engineer,118,1.0
2022230,Java Developer,106,1.0
2022248,Java Developer,124,1.0
1547026,DevOps Engineer,130,1.0
1547020,DevOps Engineer,124,1.0
1547002,DevOps Engineer,106,1.0
1547008,DevOps Engineer,112,1.0
2022254,Java Developer,130,1.0
2022236,Java Developer,112,1.0


**Semantic Matching & Score Calculation**

In [None]:
#  SEMANTIC MATCHING + SCORE CALCULATION
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

resumes = pd.read_csv("/data/normalized_resumes.csv")
jobs = pd.read_csv("/data/normalized_jobs.csv")

# Ensure skill columns exist as Python sets
def to_skillset(x):
    if isinstance(x, str):
        return set([s.strip().lower() for s in x.split(",") if s.strip()])
    elif isinstance(x, list):
        return set([s.strip().lower() for s in x])
    else:
        return set()

resumes["skills_set"] = resumes.get("skills_list", resumes.get("skills", "")).apply(to_skillset)
jobs["skills_set"] = jobs.get("skills_list", jobs.get("skills", "")).apply(to_skillset)

# Create combined text fields for semantic encoding
def make_combined_text(df, text_col1, text_col2):
    t1 = df[text_col1].astype(str)
    t2 = df[text_col2].astype(str)
    return (t1 + " " + t2).str.lower()

resumes["combined_text"] = make_combined_text(resumes, "Resume_cleaned", "skills_list" if "skills_list" in resumes.columns else "skills")
jobs["combined_text"] = make_combined_text(jobs, "Job Description_cleaned", "skills_list" if "skills_list" in jobs.columns else "skills")

print("✅ Data prepared with combined text and skill sets")

# Load sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all job and candidate texts
print("Encoding resumes and job descriptions...")
job_embeddings = model.encode(jobs['combined_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)
candidate_embeddings = model.encode(resumes['combined_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)

# Compute semantic similarity matrix (candidates × jobs)
cosine_matrix = util.cos_sim(candidate_embeddings, job_embeddings)

# Define Jaccard similarity for skill overlap
def jaccard(a, b):
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# Combine semantic + skill scores ---
alpha = 0.75  # weight for semantic similarity
beta = 1 - alpha  # weight for skill overlap

results = []
top_k = 5  # top K job matches per candidate

print("Calculating final match scores...")
for i in range(cosine_matrix.shape[0]):
    row = cosine_matrix[i]
    vals, idxs = torch.topk(row, k=min(top_k, row.shape[0]))
    for v, j in zip(vals.tolist(), idxs.tolist()):
        sem_score = float(v)
        skill_overlap = jaccard(resumes.at[resumes.index[i], 'skills_set'], jobs.at[jobs.index[j], 'skills_set'])
        final = alpha * sem_score + beta * skill_overlap
        results.append({
            "Resume ID": resumes.index[i],
            "Candidate Name": resumes.get("candidate_name", pd.Series(resumes.index)).loc[resumes.index[i]],
            "Job ID": jobs.index[j],
            "Job Title": jobs.get("Job Title", pd.Series(jobs.index)).loc[jobs.index[j]],
            "Semantic Score": round(sem_score, 4),
            "Skill Overlap": round(skill_overlap, 4),
            "Final Score": round(final, 4)
        })

# Convert to DataFrame and save
results_df = pd.DataFrame(results)
results_df["Final %"] = (results_df["Final Score"] * 100).round(2)
results_df.sort_values(["Candidate Name", "Final %"], ascending=[True, False], inplace=True)

# Save final results
results_df.to_csv("/data/final_semantic_scores.csv", index=False)
print("✅ Final semantic match scores saved to: /data/final_semantic_scores.csv")

print(results_df.head(10))


✅ Data prepared with combined text and skill sets


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔄 Encoding resumes and job descriptions...


Batches:   0%|          | 0/72 [00:00<?, ?it/s]

Batches:   0%|          | 0/31 [00:00<?, ?it/s]

⚙️ Calculating final match scores...
✅ Final semantic match scores saved to: /content/final_semantic_scores.csv
   Resume ID  Candidate Name  Job ID         Job Title  Semantic Score  \
1          0               0     605  Machine Learning          0.8251   
2          0               0     775  Machine Learning          0.8185   
0          0               0     329  Machine Learning          0.8501   
3          0               0      90  Machine Learning          0.8017   
4          0               0    1419  Machine Learning          0.7927   
5          1               1     681  Machine Learning          0.7783   
7          1               1    1831  Machine Learning          0.7278   
6          1               1    1962  Machine Learning          0.7296   
8          1               1     464  Machine Learning          0.7219   
9          1               1    1217  Machine Learning          0.7200   

   Skill Overlap  Final Score  Final %  
1         0.1353       0.6527   

Setup and App link

In [None]:
from google.colab import files
files.upload()


In [None]:
!pip install streamlit sentence-transformers pdfplumber python-docx unidecode torch


In [21]:
!pip install pyngrok


Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.0


In [32]:
from pyngrok import ngrok
ngrok.set_auth_token("NJI573LG5JAQYTZN54T6WZF3R7UYWDIS")


In [33]:
!streamlit run app.py &>/content/logs.txt &

from pyngrok import ngrok
public_url = ngrok.connect(8501)
print("✅ App is running at:", public_url)


✅ App is running at: NgrokTunnel: "https://marlin-nonatmospherical-longsomely.ngrok-free.dev" -> "http://localhost:8501"


In [None]:
!jupyter nbconvert --clear-output --clear-metadata --inplace your_notebook_name.ipynb


In [None]:
!zip -r /content/resume_matcher_backup.zip /content
from google.colab import files
files.download('/resume_matcher_backup.zip')
