In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from flask import Flask, request, jsonify, json
from skill_edu import education_keywords, skill_dataset
from datetime import datetime

# Download stopwords if not already downloaded
# (this call runs every time the cell runs; the recorded output below shows
# NLTK reporting the package as already up-to-date).
nltk.download('stopwords')
# English stopword set; read as a module-level global by clean_text().
stop_words = set(stopwords.words("english"))

# Load the pre-trained sentence-embedding model used by compute_similarity().
# NOTE(review): the checkpoint name suggests a MiniLM model rather than BERT
# itself — confirm the variable name/comment wording.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anneezurike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [438]:
def extract_text_from_pdf(pdf_file):
    """Return the lower-cased text of every page of a PDF resume.

    Args:
        pdf_file: Either a filesystem path (``str``) or an already opened
            binary file-like object that ``PyPDF2.PdfReader`` can read.

    Returns:
        str: The text of all pages that yielded any text, joined with single
        spaces and lower-cased. No other cleaning is applied here; callers
        use ``clean_text`` for that. Empty string when no page has text.
    """
    def _read_pages(stream):
        reader = PyPDF2.PdfReader(stream)
        # Extract each page once: text extraction is the expensive step, and
        # the previous version ran it twice per page (filter + value).
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(chunk for chunk in page_texts if chunk).lower()

    if isinstance(pdf_file, str):
        # A path was given: keep the file open only while the pages are read.
        with open(pdf_file, "rb") as handle:
            return _read_pages(handle)
    return _read_pages(pdf_file)

In [6]:
def clean_text(text):
    """Normalise text for matching: lower-case, strip punctuation, drop stopwords.

    Letters, digits and whitespace are kept; every other character is removed
    (so numbers survive, but "c++" becomes "c"). Words found in the
    module-level ``stop_words`` set are dropped and the remaining words are
    joined with single spaces.

    Args:
        text: The text to clean. Missing values (``None``/NaN) are accepted.

    Returns:
        str: The cleaned text, or ``""`` for missing, empty or
        whitespace-only input.
    """
    # Missing or blank input short-circuits before any string processing.
    if pd.isna(text) or text.strip() == "":
        return ""

    lowered = text.lower()
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    kept_words = (word for word in alphanumeric_only.split() if word not in stop_words)
    return " ".join(kept_words)

In [8]:
def remove_sensitive_info(text):
    """Blank out potential bias-related words in resume text.

    Only whole words are replaced. Plain substring replacement would turn
    "female" into "fe " (because it contains "male") and would corrupt
    unrelated words such as "caucasian", "blackbox", "whiteboard" or
    "singleton".

    Matching is case-sensitive; the visible caller passes text that has
    already been lower-cased by ``clean_text``.

    Args:
        text (str): Resume text.

    Returns:
        str: ``text`` with each listed keyword replaced by a single space.
    """
    bias_keywords = ["male", "female", "black", "white", "asian", "hispanic", "married", "single"]
    keyword_pattern = r"\b(?:" + "|".join(re.escape(word) for word in bias_keywords) + r")\b"
    return re.sub(keyword_pattern, " ", text)

In [10]:
def filter_skills(skills_a, skills_b):
    """Return the skills from ``skills_a`` that also appear in ``skills_b``.

    Args:
        skills_a: Iterable of skill names (e.g. skills found in a resume).
        skills_b: Either an iterable of skill names or a comma-separated
            string such as ``"php, symfony,mysql"``. Whitespace around each
            item is ignored and empty items are dropped.

    Returns:
        set: Skills present in both inputs (exact string equality).
    """
    if isinstance(skills_b, str):
        # Split on the bare comma so "a,b" and "a, b" are treated alike.
        wanted = {item.strip() for item in skills_b.split(",")}
        wanted.discard("")
    else:
        wanted = set(skills_b)
    return {skill for skill in skills_a if skill in wanted}

In [12]:
def extract_entities(text):
    """Extract skills, education keywords and experience mentions from resume text.

    Skills and education keywords come from the module-level ``skill_dataset``
    and ``education_keywords`` collections and are matched, lower-cased,
    against the output of ``clean_text``.
    NOTE(review): that match is a plain substring test, so short entries can
    match inside longer words — confirm this is acceptable for the datasets.

    Args:
        text (str): Raw resume text.

    Returns:
        dict: ``{"skills": [...], "education": [...], "experience": [...]}``.
        Each value is a sorted list of unique strings. Experience entries
        look like ``"5 years"``, ``"5+ years"`` or ``"3-5 years"``.
    """
    # Lower-case once so every match below is case-insensitive.
    text = text.lower()
    skill_data = [skill.lower() for skill in skill_dataset]
    education_data = [keyword.lower() for keyword in education_keywords]

    experience = set()

    # Single figures such as "5 years", "5+ yrs". The unit is re-added in a
    # normalised form; the previous version appended " years" to a match that
    # already contained the unit, yielding "5 years years".
    for number, sign in re.findall(r'(\d+)\s*([+-]?)\s*(?:years?|yrs?)', text):
        suffix = "+" if sign == "+" else ""
        experience.add(f"{number}{suffix} years")

    # Ranges such as "3 to 5 years" and "3-5 years".
    range_patterns = [
        r'(\d+)\s*(?:to|-) (\d+)\s*years',
        r'(\d+)-(\d+)\s*years',
    ]
    for pattern in range_patterns:
        for low, high in re.findall(pattern, text):
            experience.add(f"{low}-{high} years")

    # Keyword lookups run on the cleaned text (punctuation and stopwords removed).
    cleaned = clean_text(text)
    skills = {skill for skill in skill_data if skill in cleaned}
    education = {keyword for keyword in education_data if keyword in cleaned}

    # Sorted so the output order is stable between runs.
    return {
        "skills": sorted(skills),
        "education": sorted(education),
        "experience": sorted(experience)
    }

In [14]:
def compute_similarity(text1, text2):
    """Compute the cosine similarity between the embeddings of two texts.

    Args:
        text1, text2: Either a string or a collection of strings (for
            example the skill set or education list built elsewhere in this
            notebook). A collection is joined into one comma-separated string
            first, so each side is always embedded as a single vector.
            Passing a list straight to ``encode`` produces one row per item,
            which the reshape would flatten to a mismatched width.

    Returns:
        float: Cosine similarity of the two embeddings.
    """
    def _as_text(value):
        if isinstance(value, str):
            return value
        # Sorted so sets (unordered) always produce the same input string.
        return ", ".join(sorted(str(item) for item in (value or [])))

    embedding1 = bert_model.encode(_as_text(text1)).reshape(1, -1)
    embedding2 = bert_model.encode(_as_text(text2)).reshape(1, -1)
    # Plain float rather than a NumPy scalar so the value serialises to JSON.
    return float(cosine_similarity(embedding1, embedding2)[0][0])

def get_resume_ranking_score(ranking_data, job_data):
    """Score a resume against a job posting and return the per-section breakdown.

    Args:
        ranking_data (dict): Reads the keys ``"resume_text"``, ``"r_skills"``,
            ``"experience"`` and ``"education"``.
        job_data (dict): Reads the keys ``"description"``, ``"skills"``,
            ``"experience"`` and ``"education"``.

    Returns:
        dict: Rounded scores keyed ``"ts"`` (total), ``"ss"`` (skills),
        ``"ex"`` (experience), ``"ed"`` (education) and ``"ge"`` (general).
        ``"ts"`` is the sum of the four weighted section scores.
    """
    # Section weights, in points.
    general_weight, skills_weight, experience_weight, education_weight = 15, 40, 30, 15

    general_score = compute_similarity(ranking_data["resume_text"], job_data["description"]) * general_weight
    skills_score = compute_similarity(ranking_data["r_skills"], job_data["skills"]) * skills_weight
    education_score = compute_similarity(ranking_data["education"], job_data["education"]) * education_weight

    # Experience blends the text similarity with the years-based ratio.
    experience_similarity = compute_similarity(ranking_data["resume_text"], job_data["experience"])
    experience_score = compute_experience_score(
        ranking_data["experience"], job_data["experience"], experience_similarity
    ) * experience_weight

    total_score = skills_score + experience_score + education_score + general_score

    return {
        "ts": round(total_score, 2),
        "ss": round(skills_score, 2),
        # Report the weighted experience score that went into the total; the
        # raw similarity was reported before, so the breakdown did not add up.
        "ex": round(experience_score, 2),
        "ed": round(education_score, 2),
        "ge": round(general_score, 2)
    }

In [18]:
def rank_exp(text):
    """Return the first experience mention found in ``text`` as ``"<match> years"``.

    Patterns are taken, in order, from the module-level ``rank_exp_pattern``.
    NOTE(review): ``rank_exp_pattern`` is not defined in the visible part of
    this notebook — confirm where it comes from.

    Args:
        text (str): Text to search.

    Returns:
        str: ``"<match> years"`` for the first match of the first pattern
        that matches, or ``""`` when nothing matches. Previously a no-match
        raised ``UnboundLocalError`` because ``result`` was never assigned.
    """
    for pattern in rank_exp_pattern:
        matches = re.findall(pattern, text)
        if matches:
            return f"{matches[0]} years"
    return ""

In [16]:
def extract_experience(text):
    """Estimate the number of years of experience described in ``text``.

    Explicit mentions ("5 years", "7+ years") win: the largest figure is
    returned. Otherwise date ranges ("2015 - 2020", "05/2020 – 09/2024",
    "March 2016 to June 2018") are collected, overlapping ranges are merged,
    and the lengths of the merged ranges are summed.

    Args:
        text: A string, or an iterable of strings (joined with spaces).

    Returns:
        int: Years of experience, ``0`` when nothing could be extracted.
        The result is always a single number so callers can compare and
        divide it; earlier the date-range path returned a 3-tuple while the
        explicit path returned an int.
    """
    if not isinstance(text, str):
        # e.g. the list of experience strings produced by extract_entities().
        text = " ".join(str(item) for item in (text or []))

    explicit_years = re.findall(r'(\d+)\s*(?:\+|-)?\s*years?', text, flags=re.IGNORECASE)
    if explicit_years:
        return max(map(int, explicit_years))

    # Hyphen, en dash, em dash or the word "to". (A character class such as
    # [-to]+ matches any mix of the characters '-', 't', 'o' instead.)
    separator = r'\s*(?:[-–—]+|to)\s*'
    month = (r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
             r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)')
    date_patterns = [
        r'(\b\d{4}\b)' + separator + r'(\b\d{4}\b)',                              # "2015 - 2020"
        r'(\d{2}/\d{4})' + separator + r'(\d{2}/\d{2}/\d{4}|\d{2}/\d{4})',        # "05/2020 - 09/2024"
        r'(\b' + month + r'\s+\d{4})' + separator + r'(\b' + month + r'\s+\d{4})',  # "March 2016 - June 2018"
    ]

    ranges = []
    for pattern in date_patterns:
        for start, end in re.findall(pattern, text, flags=re.IGNORECASE):
            # Every captured date ends with its four-digit year.
            start_year, end_year = int(start[-4:]), int(end[-4:])
            if start_year <= end_year:
                ranges.append((start_year, end_year))

    if not ranges:
        return 0

    # Merge overlapping or touching ranges so shared years are counted once.
    ranges.sort()
    merged_ranges = []
    current_start, current_end = ranges[0]
    for start, end in ranges[1:]:
        if start <= current_end:
            current_end = max(current_end, end)
        else:
            merged_ranges.append((current_start, current_end))
            current_start, current_end = start, end
    merged_ranges.append((current_start, current_end))

    return sum(end - start for start, end in merged_ranges)


def compute_experience_score(resume_exp, job_exp, similarity_score):
    """Combine a years-of-experience ratio with a text-similarity score.

    Args:
        resume_exp: Experience text from the resume; a string or an iterable
            of strings (joined with spaces before parsing).
        job_exp: Experience requirement text from the job posting, same forms.
        similarity_score (float): Text similarity between resume and the
            job's experience requirement.

    Returns:
        float: ``ratio * similarity_score`` where ``ratio`` is
        ``resume_years / job_years`` capped at 1.5, or 1.0 when the job
        states no experience requirement.
    """
    def _years(value):
        if not isinstance(value, str):
            value = " ".join(str(item) for item in (value or []))
        found = extract_experience(value)
        # extract_experience can hand back (total, min_year, max_year) on its
        # date-range path; only the total is needed here.
        return found[0] if isinstance(found, tuple) else found

    resume_years = _years(resume_exp)
    job_years = _years(job_exp)

    if job_years == 0:
        # No required experience specified: full structured score.
        num_experience_score = 1.0
    else:
        # Cap at 1.5 so extra years are not over-rewarded.
        num_experience_score = min(resume_years / job_years, 1.5)

    return num_experience_score * similarity_score

In [160]:
import re

def extract_experience_years(text):
    """Compute total non-overlapping years of work experience, excluding education.

    Date ranges are collected from the whole text ("2015 - 2020",
    "05/2020 – 09/2024", "March 2016 to June 2018"). Ranges that also occur
    in the education section are discarded, the rest are merged where they
    overlap or touch, and the merged lengths are summed.

    The education section runs from the first "education" to the first
    "work" when "work" comes later; otherwise it runs to the last four-digit
    number in the text (or to the end of the text when there is none).

    Args:
        text (str): Resume text.

    Returns:
        tuple: ``(total_years, first_year, last_year)``, or
        ``(0, None, None)`` when no work ranges remain.
    """
    text_lower = text.lower()
    edu_index = text_lower.find("education")
    work_index = text_lower.find("work")  # 'work experience', 'work history', ...

    if edu_index == -1:
        education_text = ""
    elif work_index > edu_index:
        education_text = text[edu_index:work_index]
    else:
        four_digit_numbers = re.findall(r'\b\d{4}\b', text)
        if four_digit_numbers:
            last_year_index = text.rfind(four_digit_numbers[-1])
            education_text = text[edu_index:last_year_index + 4]  # +4 keeps the year itself
        else:
            education_text = text[edu_index:]

    # Hyphen, en dash, em dash or the word "to" between the two dates.
    separator = r'\s*(?:[-–—]+|to)\s*'
    month = (r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
             r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)')
    patterns = [
        r'(\b\d{4}\b)' + separator + r'(\b\d{4}\b)',
        r'(\d{2}/\d{4})' + separator + r'(\d{2}/\d{2}/\d{4}|\d{2}/\d{4})',
        r'(\b' + month + r'\s+\d{4})' + separator + r'(\b' + month + r'\s+\d{4})',
    ]

    def _year_ranges(segment):
        """Return every (start_year, end_year) pair found in ``segment``."""
        spans = []
        for pattern in patterns:
            for start, end in re.findall(pattern, segment, flags=re.IGNORECASE):
                # Every captured date ends with its four-digit year, so this
                # also handles "05/2020" and "March 2016", which int() on the
                # whole string rejected (silently dropping those ranges).
                start_year, end_year = int(start[-4:]), int(end[-4:])
                if start_year <= end_year:
                    spans.append((start_year, end_year))
        return spans

    # Drop ranges that are education ranges. Comparing whole ranges (not
    # individual years) keeps a job that starts in the graduation year.
    education_ranges = set(_year_ranges(education_text))
    years = [span for span in _year_ranges(text) if span not in education_ranges]

    if not years:
        return 0, None, None

    # Merge overlapping or touching ranges so shared years are counted once.
    years.sort()
    merged_ranges = []
    current_start, current_end = years[0]
    for start, end in years[1:]:
        if start <= current_end:
            current_end = max(current_end, end)
        else:
            merged_ranges.append((current_start, current_end))
            current_start, current_end = start, end
    merged_ranges.append((current_start, current_end))

    total_experience = sum(end - start for start, end in merged_ranges)
    min_year = min(start for start, _ in merged_ranges)
    max_year = max(end for _, end in merged_ranges)

    return total_experience, min_year, max_year

# Example test case
# The work section comes before the education section, and the two jobs touch
# at 2019, so they merge into one 2016-2024 span.
resume_text = """
Work Experience:
Software Engineer at DEF Inc, 2016 - 2019.
Senior Developer at GHI Tech, 2019 - 2024.

Education:
Bachelor of Science in Computer Science, XYZ University, 2008 - 2012.
Master's in Data Science, ABC Institute, 2012 - 2015.
"""

# NOTE(review): 2024 - 2016 is 8 years, but the recorded output below shows 7;
# the output looks stale — re-run this cell to confirm.
print(extract_experience_years(resume_text))


(7, 2016, 2024)


In [434]:
def get_education(text):
    """Return the education section of a resume, or ``""`` if there is none.

    The text is first passed through ``fix_broken_words``. The section starts
    at the first "education" and ends just before the next "work" or
    "experience". When neither follows, everything from the last
    "education" to the end of the text is returned. Matching is
    case-sensitive, so the text is expected to be lower-case.

    Args:
        text (str): Resume text.

    Returns:
        str: The education section, or ``""`` when "education" is absent.
    """
    text = fix_broken_words(text)

    heading = "education"
    edu_index = text.find(heading)
    if edu_index == -1:
        return ""  # No education section found

    # re.search returns None when nothing matches; the old comparison against
    # '' was always true and then crashed on ``None.start()``.
    next_section = re.search(r'work|experience', text[edu_index:])
    if next_section is not None:
        return text[edu_index:edu_index + next_section.start()]

    # No later section heading: take the bottom-most "education" block.
    return text[text.rfind(heading):]

In [452]:
def get_years(text):
    """Extract ``(start_year, end_year)`` pairs from date ranges in ``text``.

    Recognised forms (case-insensitive): "2015 - 2020", "05/2020 – 09/2024",
    "March 2016 to June 2018". The end may also be "current" or "present",
    which resolves to the current calendar year. The separator is a hyphen,
    en dash, em dash or the word "to".

    Args:
        text (str): Text to scan.

    Returns:
        list[tuple[int, int]]: Ranges in pattern order, then text order.
        A span matched by more than one pattern appears more than once.
        Ranges whose start is after their end are skipped.
    """
    current_year = datetime.now().year

    # An explicit alternation: a character class like [-to]+ matches any mix
    # of '-', 't' and 'o' and cannot express the dash variants consistently.
    separator = r'\s*(?:[-–—]+|to)\s*'
    ongoing = r'\bcurrent\b|\bpresent\b'
    month = (r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
             r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)')
    patterns = [
        r'(\b\d{4}\b)' + separator + r'(\b\d{4}\b|' + ongoing + r')',
        r'(\d{2}/\d{4})' + separator + r'(\d{2}/\d{2}/\d{4}|\d{2}/\d{4}|' + ongoing + r')',
        r'(\b' + month + r'\s+\d{4})' + separator + r'(\b' + month + r'\s+\d{4}|' + ongoing + r')',
    ]

    years = []
    for pattern in patterns:
        for start, end in re.findall(pattern, text, flags=re.IGNORECASE):
            # Every captured date ends with its four-digit year.
            start_year = int(start[-4:])
            if end.lower() in ("current", "present"):
                end_year = current_year
            else:
                end_year = int(end[-4:])

            if start_year <= end_year:
                years.append((start_year, end_year))

    return years

In [446]:
def fix_broken_words(text):
    """
    Heuristically repairs spacing in text extracted from a PDF.

    Steps, in order: collapse whitespace runs to single spaces and trim,
    join adjacent short words, put a space between letters and digits, and
    put a space before month names.

    Returns the rewritten string.
    """
    # Intended to fix cases like "w ork" -> "work", "wo rk" -> "work".
    # NOTE(review): the pattern consumes only the single word character (the
    # whitespace is inside a lookahead) and replaces it with itself, so this
    # call leaves the text unchanged — confirm the intended pattern.
    text = re.sub(r'(\w)(?=\s{1,2}\w)', r'\1', text)   # currently a no-op, see note above

    # Remove extra whitespace everywhere
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 2: Fix broken words (merge improperly split words)
    # Removes the single whitespace character between two neighbouring words
    # when each is at most 10 characters long.
    # NOTE(review): this also joins ordinary short words
    # (e.g. "work experience" -> "workexperience") — confirm this is intended.
    text = re.sub(r'(\b\w{1,10})\s(?=\w{1,10}\b)', r'\1', text)

    # Add space between a letter and a number (e.g., "september2011" -> "september 2011")
    text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', text)

    # Add space between a number and a letter (e.g., "2011newbrunswick" -> "2011 newbrunswick")
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)

    # Add space before any month name (e.g., "newbrunswickseptember" -> "newbrunswick september")
    # Matching is case-insensitive and not anchored to word boundaries.
    # NOTE(review): abbreviations therefore also match inside other words
    # (e.g. "summary" contains "mar" and becomes "sum mary") — confirm.
    months = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|" \
             r"January|February|March|April|May|June|July|August|September|" \
             r"October|November|December)"
    text = re.sub(r'(\w)(' + months + r')', r'\1 \2', text, flags=re.IGNORECASE)

    # Add space between a lowercase letter and an uppercase letter
    # text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    
    return text

In [458]:
# Example Usage
file = "./../../applicants_resumes/resume 3.pdf"
# job_data['description'] = "Our client is looking for a PHP Symfony Developer who impresses with 5 years experience in the backend development of websites and applications as well as with APIs. Good knowledge of PHP (Symfony) and MySQL as well as object-oriented programming and an understanding of database and caching systems such as MySQL, Redis and ElasticSearch are an advantage. experience in software design techniques, test-driven development and distributed architecture. excellent communication skills. Fluent written and spoken German or English"
                                        
job_data = {
    "description": "Our client is looking for a PHP Symfony Developer who impresses with Practical  experience in the backend development of websites and applications as well as with APIs. Good knowledge of PHP (Symfony) and MySQL as well as object-oriented programming and an understanding of database and caching systems such as MySQL, Redis and ElasticSearch are an advantage. experience in software design techniques, test-driven development and distributed architecture. excellent communication skills. Fluent written and spoken German or English",
    "skills": "php, symfony, mysql, design patterns, clean code, html, git",
    "experience": "Software Developer with 5 years experience in web development",
    "education": "Computer Science bsc required"
}
# Apply clean_text to each value in job_example
# job_data = {key: clean_text(value) for key, value in job_data.items()}

# Read the resume at `file` as lower-cased text.
resume_text = extract_text_from_pdf(file)

# similarity_score = get_resume_ranking_score(resume_text, job_data['description'])
# Skills / education / experience found in the resume.
ranking_data = extract_entities(resume_text)
# Keep only the resume skills that the job posting also lists.
ranking_data['r_skills'] = filter_skills(ranking_data['skills'], job_data['skills'])
ranking_data["resume_text"] = remove_sensitive_info(clean_text(resume_text)) # remove bias 
# ranking_data["score"] = round(similarity_score, 2)

# scores = get_resume_ranking_score(ranking_data, job_data)
# Drop the key 'resume_text'
# NOTE(review): with the scoring call above commented out, the cleaned text is
# stored and deleted again without being used.
del ranking_data['resume_text']
# result = ranking_data | scores

# print(f"🔹 **Real Resume Ranking Score:** {resume_score:.4f}")
# Remove the single space between two adjacent one-letter words
resume_text = re.sub(r'(?<=\b\w) (?=\w\b)', '', resume_text)

# Remove extra whitespace everywhere
resume_text = re.sub(r'\s+', ' ', resume_text).strip()

# Year ranges in the education section vs. in the whole resume.
edu_text = get_education(resume_text)
education_years = get_years(edu_text)
years_range = get_years(resume_text)
# Remove education years from work experience years
# (ranges are compared as whole (start, end) tuples).
filtered_years = [years for years in years_range if years not in education_years]

# The recorded output shows (2015, 2025) twice in years_range, so the same
# range can be reported more than once.
print(years_range, education_years, filtered_years)
# print(education_years)
# print(education_years)

[(2015, 2025), (2008, 2012), (2015, 2025), (2012, 2015), (2011, 2012)] [(2008, 2012)] [(2015, 2025), (2015, 2025), (2012, 2015), (2011, 2012)]


In [36]:
# Split the job posting's comma-separated skills into a list for inspection.
# The list gets its own name on purpose: rebinding `skill_dataset` here would
# replace the dataset imported from skill_edu that extract_entities() reads,
# so results would depend on whether this cell had been run.
skill = "php, symfony, mysql, design patterns, clean code, html, git"
job_skill_list = skill.split(", ")

# print(f"🔹 **Real Resume Ranking Score:** {resume_score:.4f}")
print(job_skill_list)

['php', 'symfony', 'mysql', 'design patterns', 'clean code', 'html', 'git']


In [None]:
# Resume ranking based on job description
def rank_resumes(job_description, top_n=5):
    """Return the rows of ``df`` whose resumes are closest to a job description.

    Relies on module-level ``vectorizer``, ``X`` and ``df``.
    NOTE(review): none of these is defined in the visible part of this
    notebook — presumably a fitted text vectorizer, its resume matrix and
    the matching resume DataFrame; confirm before running.

    Args:
        job_description (str): Free-text job description.
        top_n (int): Number of resumes to return; capped at the number of
            rows in ``X`` because asking for more neighbours than fitted
            samples raises.

    Returns:
        The ``df`` rows selected by ``df.iloc`` for the nearest neighbours
        (cosine metric), in the order ``kneighbors`` returns them.
    """
    # Imported here because the notebook's import cell does not bring it in.
    from sklearn.neighbors import NearestNeighbors

    job_description_cleaned = clean_text(job_description)
    job_vector = vectorizer.transform([job_description_cleaned])

    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    neighbor_count = min(top_n, n_samples)

    # Use Nearest Neighbors to find similar resumes
    nn = NearestNeighbors(n_neighbors=neighbor_count, metric='cosine')
    nn.fit(X)
    _, indices = nn.kneighbors(job_vector)

    return df.iloc[indices[0]]

# Example usage
# NOTE(review): this cell has no execution count and depends on `vectorizer`,
# `X` and `df`, which are not defined in the visible notebook — confirm the
# setup cell exists. Assumes `df` has 'category' and 'resume_text' columns.
job_desc = "Looking for a data scientist with expertise in Python and machine learning."
top_resumes = rank_resumes(job_desc, top_n=5)
print(top_resumes[['category', 'resume_text']])

In [122]:
#final step above 