In [19]:
import pandas as pd
import re
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [21]:
df = pd.read_csv("../data/UpdatedResumeDataSet.csv")
print("Shape:", df.shape)
df.head()


Shape: (962, 2)


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [22]:
job_descriptions = {
    "Data Science": """
    Python Machine Learning Deep Learning Statistics SQL
    Data Analysis Pandas NumPy Scikit-learn Model Deployment
    """,

    "HR": """
    Recruitment Talent Acquisition Payroll Employee Engagement
    HR Operations Communication Performance Management
    """,

    "DevOps Engineer": """
    Docker Kubernetes AWS CI CD Linux Automation
    Cloud Infrastructure Monitoring DevOps Tools
    """,

    "Web Designing": """
    HTML CSS JavaScript UI UX Responsive Design
    Figma Bootstrap Web Layouts
    """,

    "Python Developer": """
    Python OOP Flask Django APIs Databases
    Backend Development REST Services
    """
}


In [23]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [24]:
def skill_match(resume, jd):
    resume_words = set(clean_text(resume).split())
    jd_words = set(clean_text(jd).split())

    if len(jd_words) == 0:
        return 0.0

    return len(resume_words & jd_words) / len(jd_words)


In [26]:
def keyword_overlap(resume, jd):
    resume_words = set(clean_text(resume).split())
    jd_words = set(clean_text(jd).split())
    return len(resume_words & jd_words)



In [27]:
def resume_length_score(resume):
    length = len(resume.split())
    return 1 if 100 <= length <= 800 else 0


In [28]:
experience_keywords = [
    "experience", "years", "worked", "intern",
    "project", "projects", "company", "role"
]

def experience_score(resume):
    text = clean_text(resume)
    return sum(text.count(word) for word in experience_keywords)


In [17]:
pip install sentence-transformers flask torch


Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.2.0-py3-none-any.whl (493 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [29]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2")


In [30]:
def bert_similarity(resume, jd):
    embeddings = bert_model.encode([resume, jd])
    return cosine_similarity(
        [embeddings[0]],
        [embeddings[1]]
    )[0][0]


In [31]:
features = []

for _, row in df.iterrows():
    resume = row["Resume"]
    category = row["Category"]
    jd = job_descriptions.get(category, "")

    features.append({
        "skill_match": skill_match(resume, jd),
        "keyword_overlap": keyword_overlap(resume, jd),
        "resume_length": resume_length_score(resume),
        "experience_score": experience_score(resume),
        "bert_similarity": bert_similarity(resume, jd)
    })

features_df = pd.DataFrame(features)
features_df.head()


Unnamed: 0,skill_match,keyword_overlap,resume_length,experience_score,bert_similarity
0,0.785714,11,1,5,0.500591
1,0.357143,5,1,4,0.38312
2,0.5,7,1,4,0.387008
3,0.5,7,0,24,0.344844
4,0.214286,3,0,3,0.338331


In [32]:
final_df = pd.concat([df, features_df], axis=1)
final_df.head()



Unnamed: 0,Category,Resume,skill_match,keyword_overlap,resume_length,experience_score,bert_similarity
0,Data Science,Skills * Programming Languages: Python (pandas...,0.785714,11,1,5,0.500591
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,0.357143,5,1,4,0.38312
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",0.5,7,1,4,0.387008
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,0.5,7,0,24,0.344844
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",0.214286,3,0,3,0.338331


In [34]:
final_df.to_csv("../data/resume_features_bert.csv", index=False)
print("✅ Feature engineered dataset saved successfully")


✅ Feature engineered dataset saved successfully


In [35]:
final_df.describe()


Unnamed: 0,skill_match,keyword_overlap,resume_length,experience_score,bert_similarity
count,962.0,962.0,962.0,962.0,962.0
mean,0.095691,1.122661,0.72869,12.597713,0.103116
std,0.199205,2.381756,0.444867,12.332934,0.119364
min,0.0,0.0,0.0,2.0,-0.06467
25%,0.0,0.0,0.0,4.0,0.022647
50%,0.0,0.0,1.0,8.0,0.070292
75%,0.0,0.0,1.0,17.0,0.123006
max,0.785714,11.0,1.0,63.0,0.517819
