In [60]:
# pip install nltk --upgrade

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.8.1
Note: you may need to restart the kernel to use updated packages.




In [132]:
import docx2txt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances



In [133]:
# Load the resume drafts that will be scored.
# Each .docx filename is kept in its own variable, relative to the notebook's
# working directory.
resumedocx_1 = "Caleb Picker Resume draft 16 July 09 2023.docx"
resumedocx_2 = "Caleb Picker Resume draft 18 July 09 2023.docx"
resumedocx_3 = "Caleb Picker Resume draft 17 July 09 2023 - full.docx"
resumedocx_4 = "Caleb Picker Resume draft 19 July 10 2023.docx"

# Run docx2txt over every file; `resumes` holds one result per resume, in the
# order the filenames are listed here.
resumes = list(
    map(
        docx2txt.process,
        (resumedocx_1, resumedocx_2, resumedocx_3, resumedocx_4),
    )
)

In [134]:
# Import the job descriptions: one docx2txt result per .docx file, in the
# order the filenames are listed.
job_descriptions = [
    docx2txt.process(jd_file)
    for jd_file in (
        "Data Engineer.docx",
        "Senior Data Scientist.docx",
        "Sr Data Engineer.docx",
        "Sr Data Engineer Quality.docx",
        "Data Analyst.docx",
    )
]


In [135]:
# Download the NLTK resources this notebook relies on
for resource_name in ("stopwords", "punkt", "wordnet"):
    nltk.download(resource_name)

# Stemmer and lemmatizer instances reused by preprocess_text
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# English stopwords, held in a set for membership tests
stopwords_set = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\caleb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\caleb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\caleb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [136]:
# Text normalisation helper shared by the similarity functions below
def preprocess_text(text):
    """Tokenise `text`, drop stopwords, then lemmatise and stem what remains.

    Args:
        text: Raw document text.

    Returns:
        A single string containing the processed tokens separated by spaces.
    """
    kept_stems = []
    for word in word_tokenize(text):
        # The stopword check is done on the lower-cased token; the token
        # itself is handed to the lemmatizer unchanged.
        if word.lower() in stopwords_set:
            continue
        kept_stems.append(stemmer.stem(lemmatizer.lemmatize(word)))

    return ' '.join(kept_stems)

In [137]:
# Count-vector cosine score between one resume and one job description
def count_matrix_fun(resume,jd):
    """Return a rounded percent-style match score for a resume / job description pair.

    Both texts are run through preprocess_text, turned into term-count vectors
    with CountVectorizer, and compared by cosine similarity.

    Args:
        resume: Raw resume text.
        jd: Raw job description text.

    Returns:
        (cosine_similarity + 1) / 2 * 100, rounded to two decimals.
    """
    documents = [preprocess_text(resume), preprocess_text(jd)]
    term_counts = CountVectorizer().fit_transform(documents)

    # The 2x2 similarity matrix has the resume-vs-jd value off the diagonal.
    similarity = cosine_similarity(term_counts)[0][1]

    # NOTE(review): term counts are non-negative, so the cosine is already in
    # [0, 1]; the (x + 1) / 2 rescale therefore maps scores into 50-100 rather
    # than 0-100. Confirm this is the intended scale.
    return round((similarity + 1)/2*100, 2)


In [138]:
def compare_resumes_to_job_descriptions(resumes, job_descriptions):
    """Score every resume against every job description with three metrics.

    For each (resume, job description) pair the following are computed:

    * Jaccard: |intersection| / |union| of the two preprocessed keyword sets,
      reported as a raw fraction (not a percentage).
    * TF-IDF: cosine similarity of TfidfVectorizer vectors fitted on the two
      raw (un-preprocessed) texts, rescaled as (cos + 1) / 2 * 100 and rounded
      to two decimals.
    * Cosine: the value returned by count_matrix_fun for the pair.

    Args:
        resumes: Sequence of resume texts.
        job_descriptions: Sequence of job description texts.

    Returns:
        pandas.DataFrame with one row per resume ("Resume 1", ...) and three
        columns per job description: "Job Description N (Jaccard)",
        "Job Description N (TF-IDF)" and "Job Description N (Cosine)".

    Raises:
        ZeroDivisionError: if a pair yields no keywords at all after
            preprocessing (empty union).
    """
    # Preprocessing depends only on the individual text, so do it once per
    # document instead of once per (resume, job description) pair.
    resume_keyword_sets = [set(preprocess_text(text).split()) for text in resumes]
    jd_keyword_sets = [set(preprocess_text(text).split()) for text in job_descriptions]

    matrix = []
    for resume_text, resume_keywords in zip(resumes, resume_keyword_sets):
        row = []
        for jd_text, jd_keywords in zip(job_descriptions, jd_keyword_sets):
            # Jaccard Similarity
            jaccard_sim = len(resume_keywords & jd_keywords) / len(resume_keywords | jd_keywords)

            # TF-IDF Cosine Similarity
            # NOTE(review): this metric is fitted on the raw texts, whereas the
            # other two use preprocess_text output -- confirm that is intended.
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform([resume_text, jd_text])
            tfidf_cos_sim = 1 - pairwise_distances(tfidf_matrix[0], tfidf_matrix[1], metric='cosine')
            tfidf_percent_match = round((tfidf_cos_sim[0][0]+1)/2 * 100, 2)

            # Count-vector Cosine Similarity. count_matrix_fun already returns
            # the rescaled, rounded percentage, so it is used as-is.
            cos_sim = count_matrix_fun(resume_text, jd_text)

            row.extend([jaccard_sim, tfidf_percent_match, cos_sim])
        matrix.append(row)

    # Column labels follow the order in which each row was filled above.
    columns = [
        "Job Description " + str(j+1) + " (" + metric + ")"
        for j in range(len(job_descriptions))
        for metric in ("Jaccard", "TF-IDF", "Cosine")
    ]

    df = pd.DataFrame(matrix, index=[f"Resume {i+1}" for i in range(len(resumes))], columns=columns)
    return df


In [139]:
# resumes = [resume_1,resume_2,resume_3,resume_4]

# for i, resume_text in enumerate(resumes):
#    count_matrix_fun(resume_text,jd)

In [140]:
# Build the resume x job-description score table and show it as the cell output
similarity_matrix = compare_resumes_to_job_descriptions(resumes, job_descriptions)
similarity_matrix


Unnamed: 0,Job Description 1 (Jaccard),Job Description 1 (TF-IDF),Job Description 1 (Cosine),Job Description 2 (Jaccard),Job Description 2 (TF-IDF),Job Description 2 (Cosine),Job Description 3 (Jaccard),Job Description 3 (TF-IDF),Job Description 3 (Cosine),Job Description 4 (Jaccard),Job Description 4 (TF-IDF),Job Description 4 (Cosine),Job Description 5 (Jaccard),Job Description 5 (TF-IDF),Job Description 5 (Cosine)
Resume 1,0.104987,75.06,64.61,0.141791,77.8,67.21,0.105128,74.96,64.61,0.131783,77.94,67.19,0.13615,77.45,67.29
Resume 2,0.122905,76.32,67.33,0.151832,77.0,67.72,0.119565,76.19,67.25,0.138587,77.1,67.79,0.134146,76.86,67.7
Resume 3,0.096654,77.13,64.03,0.127469,78.54,65.69,0.096892,76.99,64.0,0.117864,78.45,65.82,0.135652,78.45,65.67
Resume 4,0.121387,76.24,68.19,0.189944,78.57,70.96,0.117978,76.09,68.09,0.13764,77.64,69.24,0.141772,77.12,69.63
