In [2]:

# We can use TF-IDF to analyze a bunch of resumes in a folder and compare the similarity of the text in each resume
# to the text in a job description, thereby ranking the resumes by closeness of fit to the JD.
# What is TD-IDF?
# TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic that reflects the importance of a 
# word in a document relative to a collection of documents (corpus). It is commonly used in natural language 
# processing and information retrieval.

# TF-IDF assigns a weight to each term in a document based on how frequently it appears in that document (TF) and 
# how unique it is across the entire collection of documents (IDF). Terms that are common in a document but rare 
# in the corpus receive higher TF-IDF scores, indicating their importance in describing the content of that document.


In [None]:

import os
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

# Path to the job description PDF
job_description_path = 'C:\\Users\\job_description.pdf'

# Path to the folder containing resumes in PDF format
resumes_folder = 'C:\\Users\\all_docs\\'

# Extract text from the job description
with open(job_description_path, 'rb') as file:
    job_description_text = extract_text_from_pdf(job_description_path)

# Collect resumes and their text
resumes = []
resume_texts = []
for resume_file in os.listdir(resumes_folder):
    if resume_file.endswith('.pdf'):
        resume_path = os.path.join(resumes_folder, resume_file)
        resumes.append(resume_file)
        resume_texts.append(extract_text_from_pdf(resume_path))

# Use TF-IDF vectorization to convert text to numerical representation
vectorizer = TfidfVectorizer(stop_words='english')
job_description_vector = vectorizer.fit_transform([job_description_text])
resume_vectors = vectorizer.transform(resume_texts)

# Calculate cosine similarity between job description and each resume
similarities = cosine_similarity(job_description_vector, resume_vectors).flatten()

# Get the indices of the top 3 most relevant resumes
top_indices = similarities.argsort()[-3:][::-1]

# Print the most relevant resumes
for index in top_indices:
    print(f"Resume: {resumes[index]} - Similarity: {similarities[index]}")
    
    

In [1]:

# Final Result:
Resume: tim.pdf - Similarity: 0.44980810055770165
Resume: dennis.pdf - Similarity: 0.4449728550719042
Resume: ryan.pdf - Similarity: 0.3947510657389466

# So, if we have dozens of resumes in a folder, or even hundreds, we would contact these three individuals first 
# because the experience of these three individuals is the most relevant to what's outlined in the JD. 
# This process takes just a few seconds to run and it's accurate to more than 15 decimal places of precision.
