In [32]:
# Import necessary libraries
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import PyPDF2

In [33]:
# Load the small English spaCy pipeline once. en_core_web_sm.load() returns
# the same pipeline that spacy.load("en_core_web_sm") would, so a single call
# is enough and avoids paying the model start-up cost twice.
import en_core_web_sm

nlp = en_core_web_sm.load()

# Define functions for preprocessing and analysis
def preprocess(text):
    """Return ``text`` as a lower-cased, space-joined string of kept tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    flagged as stop words, punctuation or number-like are dropped; the text
    of every remaining token is lower-cased and joined with single spaces.
    """
    kept = []
    for tok in nlp(text):
        # Skip anything spaCy marks as a stop word, punctuation or a number.
        if tok.is_stop or tok.is_punct or tok.like_num:
            continue
        kept.append(tok.text.lower())
    return " ".join(kept)
# Define a function to extract the keyword vocabulary from the documents
def extract_keywords(text):
    """Return the vocabulary that CountVectorizer learns from ``text``.

    Args:
        text: An iterable of document strings, or a single document string.
            A single string is wrapped in a one-element list because
            CountVectorizer rejects a bare string.

    Returns:
        The feature names reported by ``get_feature_names_out()``.
    """
    documents = [text] if isinstance(text, str) else text
    cv = CountVectorizer()
    # Only the learned vocabulary is returned; the count matrix is not needed.
    cv.fit(documents)
    return cv.get_feature_names_out()

# Define a function to extract named entities from a piece of text
def extract_entities(text):
    """Return the text of each named entity spaCy finds in ``text``, in order."""
    return [entity.text for entity in nlp(text).ents]

def compare_resumes(resume1, resume2):
    """Return the cosine similarity between the word counts of two texts.

    Both texts are vectorised together by a single CountVectorizer, so their
    count vectors share one vocabulary.
    """
    count_matrix = CountVectorizer().fit_transform([resume1, resume2])
    first_counts, second_counts = count_matrix.toarray()
    # cosine_similarity is given one row per side; unwrap the single score.
    similarity = cosine_similarity([first_counts], [second_counts])
    return similarity[0][0]

def cluster_resumes(data, num_clusters):
    """Cluster documents with K-means on their word-count vectors.

    Args:
        data: Iterable of document strings to cluster.
        num_clusters: Number of clusters passed to KMeans.

    Returns:
        ``kmeans.labels_`` of the fitted model (one cluster label per
        document).
    """
    # KMeans is not among the notebook's visible imports; import it here so
    # the function does not fail with NameError when called.
    from sklearn.cluster import KMeans

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)
    return kmeans.labels_

def visualize_data(data):
    """Draw a seaborn histogram of ``data`` and show the resulting figure."""
    sns.histplot(data=data)
    plt.show()
# Define a function to read in a PDF file and extract the text
def read_pdf(filepath):
    """Read a PDF file and return the text of all its pages.

    Args:
        filepath: Path to the PDF file.

    Returns:
        The extracted page texts joined with newlines, so that the last word
        of one page is not fused with the first word of the next page.
    """
    with open(filepath, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # Extract while the file is still open, since the reader is bound to
        # this file handle. A falsy result (a page with no extractable text)
        # is treated as an empty string.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

In [34]:
# Read in the resume PDF files and preprocess the text data
resume1 = read_pdf("resume1.pdf")
resume2 = read_pdf("resume2.pdf")
resume1_processed = preprocess(resume1)
resume2_processed = preprocess(resume2)
# Extract keywords (the CountVectorizer vocabulary) from the preprocessed text
keywords1 = extract_keywords([resume1_processed])
keywords2 = extract_keywords([resume2_processed])
# Extract named entities from the raw resume text. spaCy's NER relies on
# sentence context and capitalisation, both of which are lost in the
# lower-cased keyword vocabulary, so the keywords are not a useful NER input.
entities1 = extract_entities(resume1)
entities2 = extract_entities(resume2)

In [35]:
# Compare the contents of the two resumes: cosine similarity between the
# word-count vectors of the two preprocessed texts
similarity_score = compare_resumes(resume1_processed, resume2_processed)
print(similarity_score)

0.38878731308710335


In [36]:
# Load the resume dataset from CSV and preview the first rows
df = pd.read_csv("resume_dataset.csv")
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [37]:
import gensim
from gensim import corpora

In [38]:
# Preprocess the resume text data: one list of lower-cased lemmas per resume,
# keeping only alphabetic tokens that are not stop words.
resume_text = df['Resume_str']
# nlp.pipe processes the texts as a batched stream, which is more efficient
# than calling nlp() once per row. Missing values become empty strings so a
# NaN cell yields an empty token list instead of making spaCy raise.
resume_text_processed = [
    [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    for doc in nlp.pipe(resume_text.fillna("").astype(str))
]

In [39]:
# Build the gensim dictionary from the processed resume token lists
dictionary = corpora.Dictionary(resume_text_processed)

# Convert each resume's token list into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, resume_text_processed))

In [42]:
# Perform LDA topic modeling on the corpus.
# random_state fixes the model's random number generator so that a
# Restart & Run All reproduces the same topics.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=10,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True,
                                            random_state=42)

In [43]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [44]:
# Visualize the LDA model: prepare the pyLDAvis data from the model, the
# corpus and the model's own id2word mapping, then display it
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
pyLDAvis.display(vis)