In [36]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from flask import Flask, request, jsonify, json
from skill_edu import education_keywords, skill_dataset
from datetime import datetime

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# Load pre-trained BERT model for embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anneezurike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF resume (given a PDF file object) and cleans it."""
    if isinstance(pdf_file, str):  # If the input is a file path, open the file
        with open(pdf_file, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
            text = text.lower()
    else:  # If the input is already a file object (PdfFileReader accepts file-like objects)
        reader = PyPDF2.PdfReader(pdf_file)
        text = " ".join([page.extract_text() for page in reader.pages if page.extract_text()])
        text = text.lower()
    return text 

In [8]:
def clean_text(text):
    # Check if the text is valid and non-empty
    if pd.isna(text) or text.strip() == "":
        return ""  # or return some default text
    """Cleans the extracted text by removing special characters, numbers, and stopwords."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [10]:
def remove_sensitive_info(text):
    """Removes potential bias-related words from resume text."""
    bias_keywords = ["male", "female", "black", "white", "asian", "hispanic", "married", "single"]
    for word in bias_keywords:
        text = text.replace(word, " ")
    return text

In [12]:
def filter_skills(skills_a, skills_b):
    if isinstance(skills_b, str):
        skills_b = [item.strip() for item in skills_b.split(", ")]
    skills = {skill for skill in skills_a if skill in skills_b}
    return skills

In [152]:
def extract_entities(text):
    """Efficiently extracts skills, education, and experience from resume text."""
    extracted_info = {"Skills": set(), "Education": set(), "Experience": set()}
    # Regex patterns to capture different ways experience is written
    experience_patterns = [
        # r'(\d+)\s*(?:\+|-)?\s*(?:years?|yrs?)\s*(?:of|in|working in|as)?\s*experience',
        r'(\d+\s*[+-]?\s*(?:years?|yrs?))',
        r'(\d+)\s*(?:to|-) (\d+)\s*years',  # e.g., "3 to 5 years"
        r'(\d+)-(\d+)\s*years'  # e.g., "3-5 years"
    ]

    # Convert text to lowercase for case-insensitive matching
    text = text.lower()
    skill_data = list(map(str.lower, skill_dataset))
    education_data = list(map(str.lower, education_keywords))

    # Regex-based experience extraction
    # Extract matches using regex
    for pattern in experience_patterns:
        matches = re.findall(pattern, text)
        if matches:
            for match in matches:
                if isinstance(match, tuple):  # Handles cases like "3-5 years"
                    extracted_info["Experience"].add(f"{match[0]}-{match[1]} years")
                else:
                    extracted_info["Experience"].add(f"{match} years")

    text = clean_text(text)

    # Fast skill matching using set intersection
    extracted_info["Skills"] = {skill for skill in skill_data if skill in text}

    # Fast education matching
    extracted_info["Education"] = {edu for edu in education_data if edu in text}

    return {
        "skills": list(extracted_info["Skills"]),
        "education": list(extracted_info["Education"]),
        "experience": list(extracted_info["Experience"])
    }

In [16]:
def compute_similarity(text1, text2):
    """Compute cosine similarity between two text embeddings."""
    embedding1 = bert_model.encode(text1).reshape(1, -1)
    embedding2 = bert_model.encode(text2).reshape(1, -1)
    return cosine_similarity(embedding1, embedding2)[0][0]

def get_resume_ranking_score(ranking_data, job_data):
    """Provides a detailed breakdown of resume scoring."""
    
    # Compute similarity scores for different sections
    general_score = compute_similarity(ranking_data["resume_text"], job_data["description"]) * 15  # 15% weight
    skills_score = compute_similarity(ranking_data["r_skills"], job_data["skills"]) * 40  # 40% weight
    experience_similarity = compute_similarity(ranking_data["resume_text"], job_data["experience"])  
    experience_score = compute_experience_score(ranking_data["experience"], job_data["experience"], experience_similarity) * 30  # 30% weight
    education_score = compute_similarity(ranking_data["education"], job_data["education"]) * 15  # 15% weight
    

    # Calculate total score
    total_score = skills_score + experience_score + education_score + general_score

    # Return breakdown
    return {
        "ts": round(total_score, 2),
        "ss": round(skills_score, 2),
        "ex": round(experience_similarity, 2),
        "ed": round(education_score, 2),
        "ge": round(general_score, 2)
    }

In [18]:
def rank_exp(text):
    # Find the first match
    for pattern in rank_exp_pattern:
        matches = re.findall(pattern, text)
        if matches:
            for match in matches:
                result = f"{match} years"
    return result or ""

In [130]:
def extract_experience(text):
    """Extracts numerical years of experience from explicit mentions or date ranges, including month-name formats."""
    
    # Explicit experience extraction (e.g., "5 years of experience")
    explicit_match = re.findall(r'(\d+)\s*(?:\+|-)?\s*years?', text)
    
    if explicit_match:
        return max(map(int, explicit_match))  # Take the highest number found
    
    # Patterns for different date formats
    date_patterns = [
        r'(\b\d{4}\b)\s*[-to]+\s*(\b\d{4}\b)',  # "2015 - 2020" / "2015 to 2020"
        r'(\d{2}/\d{4})\s*[-–to]+\s*(\d{2}/\d{2}/\d{4}|\d{2}/\d{4})',  # "05/2020 - 09/2024"
        r'(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})\s*[-to]+\s*(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})'
        # Handles: "March 2016 - June 2018"
    ]
    
    years = []

    for pattern in date_patterns:
        matches = re.findall(pattern, text, flags=re.IGNORECASE)
        for start, end in matches:
            try:
                # Extract year from different formats
                start_year = int(re.search(r'\d{4}', start).group())
                end_year = int(re.search(r'\d{4}', end).group())
                
                if start_year <= end_year:
                    years.append((start_year, end_year))
            except (ValueError, AttributeError):
                continue

    if not years:
        return 0, None, None  # No valid years found

    # Step 1: Sort year ranges by start year
    years.sort()
    
    # Step 2: Merge overlapping or consecutive time periods
    merged_ranges = []
    current_start, current_end = years[0]

    for start, end in years[1:]:
        if start <= current_end:  # Overlapping or consecutive
            current_end = max(current_end, end)  # Extend the range
        else:
            merged_ranges.append((current_start, current_end))  # Store the merged period
            current_start, current_end = start, end

    merged_ranges.append((current_start, current_end))  # Add last range

    # Step 3: Calculate total experience
    total_experience = sum(end - start for start, end in merged_ranges)
    min_year = min(start for start, _ in merged_ranges)
    max_year = max(end for _, end in merged_ranges)

    return total_experience, min_year, max_year
    # return years if years else 0  # Return inferred experience or 0 if nothing found


def compute_experience_score(resume_exp, job_exp, similarity_score):
    """Computes the final experience score combining numerical experience and text similarity."""
    resume_years = extract_experience(resume_exp)
    job_years = extract_experience(job_exp)

    if job_years == 0:  # No required experience specified
        num_experience_score = 1.0  # Full score if no experience requirement
    else:
        num_experience_score = min(resume_years / job_years, 1.5)  # Cap scaling at 1.5 to avoid over-rewarding

    # Multiply structured experience score with text similarity score to balance both
    final_experience_score = num_experience_score * similarity_score  

    return final_experience_score

In [251]:
def extract_experience(text):
    """Extracts numerical years of experience from explicit mentions or date ranges, including month-name formats."""
    
    # Explicit experience extraction (e.g., "5 years of experience")
    explicit_match = re.findall(r'(\d+)\s*(?:\+|-)?\s*years?', text)
    
    if explicit_match:
        return max(map(int, explicit_match))  # Take the highest number found
    
    # Patterns for different date formats
    date_patterns = [
        r'(\b\d{4}\b)\s*[-to]+\s*(\b\d{4}\b)',  # "2015 - 2020" / "2015 to 2020"
        r'(\d{2}/\d{4})\s*[-–to]+\s*(\d{2}/\d{2}/\d{4}|\d{2}/\d{4})',  # "05/2020 - 09/2024"
        r'(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})\s*[-to]+\s*(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4})'
        # Handles: "March 2016 - June 2018"
    ]
    
    years = []

    for pattern in date_patterns:
        matches = re.findall(pattern, text, flags=re.IGNORECASE)
        for start, end in matches:
            try:
                # Extract year from different formats
                start_year = int(re.search(r'\d{4}', start).group())
                end_year = int(re.search(r'\d{4}', end).group())
                
                if start_year <= end_year:
                    years.append((start_year, end_year))
            except (ValueError, AttributeError):
                continue

    if not years:
        return 0  # No valid years found

    # Step 1: Sort year ranges by start year
    years.sort()
    
    # Merge overlapping or consecutive time periods
    merged_ranges = []
    current_start, current_end = years[0]

    for start, end in years[1:]:
        if start <= current_end:  # Overlapping or consecutive
            current_end = max(current_end, end)  # Extend the range
        else:
            merged_ranges.append((current_start, current_end))  # Store the merged period
            current_start, current_end = start, end

    merged_ranges.append((current_start, current_end))  # Add last range

    # Step 3: Calculate total experience
    total_experience = sum(end - start for start, end in merged_ranges)
    min_year = min(start for start, _ in merged_ranges)
    max_year = max(end for _, end in merged_ranges)

    return total_experience, min_year, max_year

In [321]:
import re

def extract_years(text):
    """Extract all year patterns from text."""
    year_pattern = r'(\b\d{4}\b)'  # Matches four-digit years
    return [int(y) for y in re.findall(year_pattern, text)]

def section_based_year_filter(resume_text):
    """Extracts work experience years while filtering out education years using section-based splitting."""

    # Normalize text (remove extra spaces & lowercase for easier matching)
    text = resume_text.lower()

    # Define sections
    education_headers = ["education", "academic background", "certifications", "qualifications"]
    work_headers = ["work experience", "employment history", "professional experience", "career"]

    # Find positions of education and work sections
    education_pos = min((text.find(header) for header in education_headers if header in text), default=-1)
    work_pos = min((text.find(header) for header in work_headers if header in text), default=-1)

    education_years = set()

    if education_pos != -1:
        # If work comes after education, capture education section only
        if work_pos != -1 and work_pos > education_pos:
            education_text = text[education_pos:work_pos]
        else:
            # If work is not found, education might be at the end
            education_text = text[education_pos:]
        
        # Extract years in education section
        education_years = set(extract_years(education_text))

    # Extract all years in resume
    all_years = set(extract_years(text))

    # Work experience years = all years minus education years
    work_years = all_years - education_years

    # Calculate years of experience
    if work_years:
        min_year, max_year = min(work_years), max(work_years)
        years_of_experience = max_year - min_year
    else:
        years_of_experience = 0

    return {
        "work_years": work_years,
        "education_years": education_years,
        "years_of_experience": years_of_experience
    }

# Example Resume Text
resume_text = """
e d u c a t i o n
b.s. computer science
university of pittsburgh
september 2008 - april 2012
pittsburgh, pa

w o r k  e x p e r i e n c e
software engineer
embark
january 2015 - current/new york, ny
...
"""

# Run function
result = section_based_year_filter(resume_text)
print(result)  # Output: {'work_years': {2015, 2024}, 'education_years': {2008, 2012}, 'years_of_experience': 9}


{'work_years': {2008, 2012, 2015}, 'education_years': set(), 'years_of_experience': 7}


In [323]:
# Example Usage
file = "./applicants_resumes/resume 3.pdf"
# job_data['description'] = "Our client is looking for a PHP Symfony Developer who impresses with 5 years experience in the backend development of websites and applications as well as with APIs. Good knowledge of PHP (Symfony) and MySQL as well as object-oriented programming and an understanding of database and caching systems such as MySQL, Redis and ElasticSearch are an advantage. experience in software design techniques, test-driven development and distributed architecture. excellent communication skills. Fluent written and spoken German or English"
                                        
job_data = {
    "description": "Our client is looking for a PHP Symfony Developer who impresses with Practical  experience in the backend development of websites and applications as well as with APIs. Good knowledge of PHP (Symfony) and MySQL as well as object-oriented programming and an understanding of database and caching systems such as MySQL, Redis and ElasticSearch are an advantage. experience in software design techniques, test-driven development and distributed architecture. excellent communication skills. Fluent written and spoken German or English",
    "skills": "php, symfony, mysql, design patterns, clean code, html, git",
    "experience": "Software Engineer with 8 years experience in web development",
    "education": "Computer Science bsc required"
}
# Apply clean_text to each value in job_example
# job_data = {key: clean_text(value) for key, value in job_data.items()}

resume_text = extract_text_from_pdf(file)
        
# similarity_score = get_resume_ranking_score(resume_text, job_data['description'])
ranking_data = extract_entities(resume_text)
ranking_data['r_skills'] = filter_skills(ranking_data['skills'], job_data['skills'])
ranking_data["resume_text"] = remove_sensitive_info(clean_text(resume_text)) # remove bias 
# ranking_data["score"] = round(similarity_score, 2)

# scores = get_resume_ranking_score(ranking_data, job_data)
# Drop the key 'resume_text'
del ranking_data['resume_text']
result = ranking_data | scores

# print(f"🔹 **Real Resume Ranking Score:** {resume_score:.4f}")
print(extract_years_of_experience(resume_text))

0


In [36]:
skill = "php, symfony, mysql, design patterns, clean code, html, git"
skill_dataset = skill.split(", ")

# print(f"🔹 **Real Resume Ranking Score:** {resume_score:.4f}")
print(skill_dataset)

['php', 'symfony', 'mysql', 'design patterns', 'clean code', 'html', 'git']


In [None]:
# Resume ranking based on job description
def rank_resumes(job_description, top_n=5):
    job_description_cleaned = clean_text(job_description)
    job_vector = vectorizer.transform([job_description_cleaned])
    
    # Use Nearest Neighbors to find similar resumes
    nn = NearestNeighbors(n_neighbors=top_n, metric='cosine')
    nn.fit(X)
    distances, indices = nn.kneighbors(job_vector)
    
    return df.iloc[indices[0]]

# Example usage
job_desc = "Looking for a data scientist with expertise in Python and machine learning."
top_resumes = rank_resumes(job_desc, top_n=5)
print(top_resumes[['category', 'resume_text']])

In [122]:
#final step above 