In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# Load pre-trained BERT model for embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anneezurike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load dataset (update the filename as needed)
df = pd.read_csv("synthetic_resumes_5000.csv")

In [6]:
def clean_text(text):
    # Check if the text is valid and non-empty
    if pd.isna(text) or text.strip() == "":
        return ""  # or return some default text
    """Cleans the extracted text by removing special characters, numbers, and stopwords."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [8]:
def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF resume (given a PDF file object) and cleans it."""
    if isinstance(pdf_file, str):  # If the input is a file path, open the file
        with open(pdf_file, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
    else:  # If the input is already a file object (PdfFileReader accepts file-like objects)
        reader = PyPDF2.PdfReader(pdf_file)
        text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])

    return clean_text(text)

In [10]:
# Extract relevant skills and qualifications from resume
def extract_resume_info(text):
    doc = nlp(cleaned_text)
    skills = [ent.text for ent in doc.ents if ent.label_ in ["SKILLS", "EDUCATION", "DEGREE"]]
    # skills = {ent.label_: ent.text for ent in doc.ents}
    return ", ".join(set(skills))  # Remove duplicates
    # return skills

In [12]:
def rank_resumes_with_bert(resumes, job_description):
    """Ranks resumes using BERT embeddings and cosine similarity."""
    # Convert job description to embedding
    job_embedding = bert_model.encode(job_description).reshape(1, -1)

    # Convert resumes to embeddings
    # if isinstance(resumes, str):  # If a single string is passed, convert to list
    #     resumes = [resumes]
    
    # Convert resumes to embeddings
    resume_embeddings = np.array([bert_model.encode(resume) for resume in resumes])
    # resume_embeddings = np.stack([bert_model.encode(resume) for resume in resumes])
    # resume_embeddings = np.vstack([bert_model.encode(resume) for resume in resumes])
    
    # Compute similarity scores
    similarity_scores = cosine_similarity(resume_embeddings, job_embedding).flatten()
    
    return similarity_scores

In [14]:
# Combine relevant text fields for resume representation
resume_texts = (df["Skills"] + " " + df["Education"] + " " + df["Certifications"]).tolist()
job_descriptions = df["Job Description"].tolist()

In [16]:
# Choose a sample job description for ranking
sample_job_description = "Looking for a Software Engineer with Python, Java, and SQL experience."

# Rank resumes using BERT embeddings
df["BERT Score"] = rank_resumes_with_bert(resume_texts, sample_job_description)


TypeError: 'float' object is not subscriptable

In [None]:
# Sort resumes based on similarity scores
df_sorted = df.sort_values(by="BERT Score", ascending=False)

# Save ranked results to a new CSV file
# df_sorted.to_csv("ranked_resumes_bert.csv", index=False)

print("Resume ranking complete using BERT. Check 'ranked_resumes_bert.csv' for results.")

In [17]:
# Example usage for real resume
real_resume_text = extract_text_from_pdf("sample_resume.pdf")
real_job_description = clean_text(sample_job_description)
real_resume_score = rank_resumes_with_bert([real_resume_text], real_job_description)[0]

print(f"Similarity Score for Real Resume (BERT): {real_resume_score:.4f}")