In [1]:
from flask import Flask, request, jsonify, json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import PyPDF2
import spacy  # NLP library for text processing
import re
from nltk.corpus import stopwords
import nltk
from difflib import get_close_matches


# Make sure the NLTK stopword corpus is available locally, then keep the
# English stopwords in a set so membership checks are cheap.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

# spaCy English pipeline used by extract_resume_info for entity extraction
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anneezurike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load dataset (update the filename as needed)
df = pd.read_csv("synthetic_resumes_5000.csv")

# Combine relevant text fields for resume representation.
# Missing fields are replaced with "" first: string concatenation with `+`
# propagates NaN, so a single empty field would otherwise turn the whole
# resume text into NaN and discard the fields that are present.
# NOTE(review): pandas' default NA markers may turn literal cell values such
# as "None" into NaN on read — confirm against the CSV if that matters.
resume_fields = df[["Skills", "Education", "Certifications"]].fillna("")
resume_texts = (
    resume_fields["Skills"]
    + " "
    + resume_fields["Education"]
    + " "
    + resume_fields["Certifications"]
)
job_descriptions = df["Job Description"]

In [5]:
def clean_text(text):
    """Cleans the extracted text by removing special characters, numbers, and stopwords.

    Args:
        text: Raw text to normalise, or a missing value (None / NaN).

    Returns:
        The lowercased text reduced to ASCII letters, with every word found in
        the module-level ``stop_words`` set removed and the remaining words
        joined by single spaces. Returns "" for missing or blank input.
    """
    # Missing values and blank strings have nothing to clean.
    if pd.isna(text) or text.strip() == "":
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    # Remove stopwords; split() also collapses runs of whitespace
    return " ".join(word for word in text.split() if word not in stop_words)

In [7]:
def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF resume and cleans it.

    Args:
        pdf_file: Either a filesystem path given as ``str`` (opened here in
            binary mode and closed afterwards) or any other object accepted
            by ``PyPDF2.PdfReader``, such as an already-open file object.

    Returns:
        The text of all pages that yielded any text, joined with single
        spaces and passed through ``clean_text``.
    """
    def _read_pages(stream):
        # Extract each page once; the text is reused for both the emptiness
        # filter and the join instead of being extracted a second time.
        reader = PyPDF2.PdfReader(stream)
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(page_text for page_text in page_texts if page_text)

    if isinstance(pdf_file, str):  # A path: open it ourselves so it is closed again
        with open(pdf_file, "rb") as file:
            text = _read_pages(file)
    else:  # Anything else is handed to PdfReader as-is
        text = _read_pages(pdf_file)

    return clean_text(text)

In [9]:
# Extract relevant skills and qualifications from resume
def extract_resume_info(text):
    """Returns the distinct entities of selected labels found in ``text``.

    Runs the module-level spaCy pipeline ``nlp`` over ``text`` and collects
    the text of every entity whose label is SKILLS, EDUCATION or DEGREE.

    Args:
        text: Resume text to analyse.

    Returns:
        A ", "-separated string of the unique matching entity texts in order
        of first appearance; "" when no entity matches.
    """
    # NOTE(review): the stock en_core_web_sm NER labels (PERSON, ORG, GPE, ...)
    # do not appear to include SKILLS / EDUCATION / DEGREE, so with the
    # pipeline loaded in this notebook this may always return "" — confirm
    # whether a custom-trained model or an EntityRuler was intended.
    wanted_labels = {"SKILLS", "EDUCATION", "DEGREE"}

    # Analyse the argument itself (previously an unrelated global name,
    # `cleaned_text`, was used here, ignoring the parameter).
    doc = nlp(text)
    matches = [ent.text for ent in doc.ents if ent.label_ in wanted_labels]

    # dict.fromkeys drops duplicates while keeping a deterministic order
    return ", ".join(dict.fromkeys(matches))

In [11]:
def rank_real_resume(resume_text, job_description, vectorizer, tfidf_resumes):
    """Scores one resume against one job description using TF-IDF.

    Args:
        resume_text: Resume text to score.
        job_description: Job description text to compare against.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_resumes: Not used by this function; retained in the signature.

    Returns:
        The cosine similarity between the two TF-IDF vectors.
    """
    # Vectorise the resume first, then the job description, with the same vocabulary
    resume_vec, job_vec = (
        vectorizer.transform([document])
        for document in (resume_text, job_description)
    )
    # cosine_similarity yields a 1x1 matrix for a single pair; unwrap the score
    score_matrix = cosine_similarity(resume_vec, job_vec)
    return score_matrix[0][0]

In [13]:
def extract_relevant_skills(resume_text, predefined_skills):
    """Extracts predefined relevant skills from the resume text.

    Three passes are run and their results are unioned:
      1. exact   - whitespace-separated resume tokens that equal a skill;
      2. fuzzy   - for each token, the single closest skill according to
                   ``difflib.get_close_matches`` (cutoff 0.8);
      3. partial - a skill matches when its first word occurs in the resume
                   text as a whole token.

    Matching is case-sensitive, so the resume text and the skills should be
    normalised the same way by the caller.

    Args:
        resume_text: Resume text to search.
        predefined_skills: Iterable of skill strings. Empty or
            whitespace-only entries are ignored.

    Returns:
        A sorted list of the matched skills, without duplicates.
    """
    # Materialise once (the iterable is walked several times) and drop blank
    # entries, which have no first word and can never match anything.
    skills = [skill for skill in predefined_skills if skill and skill.strip()]

    words = set(resume_text.split())  # Convert resume text into a set of words
    matched_skills = words.intersection(skills)  # Find matching skills

    # Find similar skills using fuzzy matching
    for word in words:
        matched_skills.update(get_close_matches(word, skills, n=1, cutoff=0.8))

    # Find partial matches using regex on the first word of each skill.
    # The word is escaped so skills such as "c++" or "c#" are matched
    # literally instead of being parsed as regex syntax, and lookarounds are
    # used in place of \b so the boundary check also works when the word
    # starts or ends with a non-alphanumeric character.
    for skill in skills:
        first_word = re.escape(skill.split()[0])
        if re.search(rf"(?<!\w){first_word}(?!\w)", resume_text):
            matched_skills.add(skill)

    # Sorted so the result is deterministic rather than in set order
    return sorted(matched_skills)

In [15]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words="english")

# Fit on the cleaned resumes only, then map the cleaned job descriptions
# onto that same fitted vectorizer.
tfidf_resumes = vectorizer.fit_transform(resume_texts.apply(clean_text))
tfidf_jobs = vectorizer.transform(job_descriptions.apply(clean_text))

In [17]:
# Compute Cosine Similarity between resumes and job descriptions.
# NOTE(review): this compares every resume with every job description, yet
# only the diagonal of the result is read below. For large datasets the full
# pairwise matrix is costly; a row-wise computation would suffice — confirm
# nothing else needs the off-diagonal entries before changing it.
similarity_scores = cosine_similarity(tfidf_resumes, tfidf_jobs)

# Extract diagonal values where resumes are compared to their respective job descriptions
# (entry i pairs resume row i with job-description row i).
df["TF-IDF Score"] = np.diag(similarity_scores)

In [19]:
# Order the resumes from highest to lowest TF-IDF similarity score
df_sorted = df.sort_values("TF-IDF Score", ascending=False)

# Optional export of the ranked table to CSV (currently disabled)
# df_sorted.to_csv("ranked_resumes_tfidf.csv", index=False)

In [21]:
import joblib

# Persist the fitted TF-IDF vectorizer to disk with joblib.
# The model export stays disabled: `model` is not defined in the visible cells.
# joblib.dump(model, "resume_screening_model.pkl")  # Save model
joblib.dump(value=vectorizer, filename="rank_tfidf_vectorizer.pkl")


['rank_tfidf_vectorizer.pkl']