### Resume (PDF/Text)
        ↓
### resume_parser.py  → clean text
        ↓
### skill_matcher.py
   - extract skills from resume
   - load role skills from job datasets
   - compare
   - compute:
       - matched skills
       - missing skills
       - match %
       - suggestions
        ↓
### Streamlit UI (Module 2)
   - upload resume
   - choose target role
   - show analysis


In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load the three raw CSV datasets (labelled resumes, job postings with skill
# sets, and job title/description pairs) from the local project folder.
resume_df, job_df, role_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\yshel\Desktop\DivyaPath-Ai\data\module2_resume\UpdatedResumeDataSet.csv",
        r"C:\Users\yshel\Desktop\DivyaPath-Ai\data\module2_resume\all_job_post.csv",
        r"C:\Users\yshel\Desktop\DivyaPath-Ai\data\module2_resume\job_title_des.csv",
    )
)



In [2]:
# Print the column index of each dataset (one per line), then preview the
# first rows of the resume data as the cell's displayed value.
print(resume_df.columns, job_df.columns, role_df.columns, sep="\n")

resume_df.head()

Index(['Category', 'Resume'], dtype='object')
Index(['job_id', 'category', 'job_title', 'job_description', 'job_skill_set'], dtype='object')
Index(['Unnamed: 0', 'Job Title', 'Job Description'], dtype='object')


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [3]:
#Basic Cleaning Function (NLP Preprocessing)
def clean_text(text):
    """Normalize raw text: lowercase, ASCII letters only, single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The lowercased text in which every character that is not an ASCII
        letter or whitespace has been replaced by a space, runs of whitespace
        are collapsed to one space, and leading/trailing spaces are removed.
        Digits and symbols are dropped, so e.g. ``"C++"`` becomes ``"c"``.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    # After lowercasing, anything outside a-z / whitespace is noise here.
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [4]:
# Add cleaned-text columns. Missing cells are replaced with "" in BOTH frames
# before cleaning, so clean_text always receives a string and an empty cell
# does not turn into the literal word "nan".
resume_df["clean_resume"] = resume_df["Resume"].fillna("").astype(str).apply(clean_text)
job_df["clean_desc"] = job_df["job_description"].fillna("").astype(str).apply(clean_text)


In [5]:
# Build role -> skill map (from job data).
# Keys are lowercased job titles; values are the de-duplicated, lowercased
# skills of that posting, in the order they appear in the data. A later
# posting with the same title replaces the earlier entry.
import ast

role_skill_map = {}

for title, raw_skills in zip(job_df["job_title"], job_df["job_skill_set"]):
    role = str(title).lower()

    if isinstance(raw_skills, str):
        # The column is expected to hold the repr of a Python list; parse it
        # properly so quotes/apostrophes inside a skill survive intact.
        try:
            parsed = ast.literal_eval(raw_skills)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            skills = [str(s) for s in parsed]
        else:
            # Fallback for free-form "a, b, c" text: split on commas and trim
            # surrounding brackets/quotes.
            skills = [s.strip("'\" ") for s in raw_skills.strip("[]").split(",")]
    elif isinstance(raw_skills, (list, tuple, set)):
        skills = [str(s) for s in raw_skills]
    else:
        # Missing value (e.g. NaN): this posting lists no skills.
        skills = []

    normalized = (s.strip().lower() for s in skills)
    # Drop empty / single-character entries; dict.fromkeys de-duplicates while
    # keeping a deterministic order.
    role_skill_map[role] = list(dict.fromkeys(s for s in normalized if len(s) > 1))

list(role_skill_map.items())[:3]


[('sr human resource generalist',
  ['recruitment strategies',
   'communication',
   'analytical skills',
   'erisa',
   'compensation and benefits',
   'payroll processing',
   'influence',
   'confidentiality',
   'talent acquisition',
   'fmla',
   'employment laws and regulations',
   'hr policies and procedures',
   'performance management',
   'flsa',
   'salary administration',
   'employee relations',
   'hr audits and assessments',
   'collaboration',
   'professional development',
   'intellectual curiosity',
   'job design',
   'interpersonal skills',
   'employee benefits administration',
   'change management',
   'ada',
   'eeoc guidelines',
   'human capital management system']),
 ('human resources manager',
  ['employee relations',
   'problem-solving',
   'talent acquisition',
   'ms powerpoint',
   'communication',
   'analytical skills',
   'ms word',
   'attention to detail',
   'organizational skills',
   'organizational development',
   'adaptability',
   'perfor

In [6]:
#Skill Extraction from Resume
def extract_skills(text, skill_list):
    """Return the skills from ``skill_list`` that occur in ``text``.

    A skill counts as present only when it appears as a whole word or phrase,
    i.e. it is not directly preceded or followed by another word character.
    This avoids false hits such as "ada" inside "adaptability".

    Args:
        text: The text to search. Matching is case-sensitive, so text and
            skills should already share the same casing.
        skill_list: Iterable of skill strings to look for.

    Returns:
        A list of the matching skills without duplicates, in the order they
        first appear in ``skill_list``. Empty skill strings are ignored.
    """
    found = []
    seen = set()
    for skill in skill_list:
        if not skill or skill in seen:
            continue
        seen.add(skill)
        # Lookarounds instead of \b so skills that start or end with a
        # non-word character (e.g. "c++") can still match.
        pattern = r"(?<!\w)" + re.escape(skill) + r"(?!\w)"
        if re.search(pattern, text):
            found.append(skill)
    return found


In [7]:
# Try the matcher on one example: the first resume against the first role
# in the role -> skill map.
sample_role = list(role_skill_map)[0]
required = role_skill_map[sample_role]
sample_resume = clean_text(resume_df["Resume"].iloc[0])

matched = extract_skills(sample_resume, required)
# Required skills that were not found in the resume.
missing = list(set(required).difference(matched))

In [8]:
# Report the sample role together with its matched and missing skills.
print(f"Role: {sample_role}")
print(f"Matched: {matched}")
print(f"Missing: {missing}")

Role: sr human resource generalist
Matched: []
Missing: ['employee relations', 'communication', 'recruitment strategies', 'hr audits and assessments', 'analytical skills', 'erisa', 'collaboration', 'professional development', 'intellectual curiosity', 'compensation and benefits', 'payroll processing', 'influence', 'job design', 'confidentiality', 'interpersonal skills', 'employee benefits administration', 'talent acquisition', 'fmla', 'change management', 'ada', 'human capital management system', 'employment laws and regulations', 'hr policies and procedures', 'performance management', 'eeoc guidelines', 'flsa', 'salary administration']


In [9]:
# Similarity scoring using NLP (TF-IDF):
# vectorize the sample resume and the first job description together, then
# take the cosine similarity between the two resulting rows.
texts = [sample_resume, clean_text(job_df["job_description"].iloc[0])]

vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(texts)

# Row 0 is the resume, row 1 the job description.
sim = cosine_similarity(X[:1], X[1:2])[0, 0]

# Displayed as a percentage.
sim * 100


43.0623937325332

In [10]:
def analyze_resume_nlp(resume_text, role_text, role_skills):
    """Score a resume against a role description and its required skills.

    Both texts are cleaned with ``clean_text``, vectorized together with a
    fresh TF-IDF vectorizer, and compared by cosine similarity. The cleaned
    resume is then searched for each required skill with ``extract_skills``.

    Args:
        resume_text: Raw resume text.
        role_text: Raw text describing the target role.
        role_skills: Iterable of skill strings required for the role.

    Returns:
        A dict with:
            "match_percent": cosine similarity of the two texts as a
                percentage, rounded to 2 decimals (plain ``float``); 0.0 when
                the texts contain no usable tokens.
            "matched_skills": skills reported by ``extract_skills``.
            "missing_skills": the remaining required skills, without
                duplicates, in the order given by ``role_skills``.
    """
    resume_clean = clean_text(resume_text)
    role_clean = clean_text(role_text)

    vect = TfidfVectorizer(max_features=3000)
    try:
        X = vect.fit_transform([resume_clean, role_clean])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # text yields a token, e.g. both are empty after cleaning.
        sim = 0.0
    else:
        # float() turns the numpy scalar into a plain Python float.
        sim = float(cosine_similarity(X[0:1], X[1:2])[0][0]) * 100

    matched = extract_skills(resume_clean, role_skills)
    matched_lookup = set(matched)
    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # result is deterministic (a plain set difference is not).
    missing = [s for s in dict.fromkeys(role_skills) if s not in matched_lookup]

    return {
        "match_percent": round(sim, 2),
        "matched_skills": matched,
        "missing_skills": missing
    }


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from pathlib import Path

# Locations of the input datasets and of the saved model artifacts.
DATA_DIR = Path("C:/Users/yshel/Desktop/DivyaPath-Ai/data/module2_resume")
MODEL_DIR = Path("C:/Users/yshel/Desktop/DivyaPath-Ai/models")

# Reload the raw frames from disk (this rebinds resume_df / job_df).
resume_df = pd.read_csv(DATA_DIR / "UpdatedResumeDataSet.csv")
job_df = pd.read_csv(DATA_DIR / "all_job_post.csv")

# Training corpus: every non-null resume followed by every non-null job
# description, as plain strings.
corpus = [
    *resume_df["Resume"].dropna().astype(str),
    *job_df["job_description"].dropna().astype(str),
]

# Fit a TF-IDF vectorizer on the combined corpus.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english").fit(corpus)

# Persist the fitted vectorizer, creating the model folder if needed.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(vectorizer, MODEL_DIR / "resume_vectorizer.pkl")

print("resume_vectorizer.pkl created successfully")


resume_vectorizer.pkl created successfully
