In [7]:
! pip install pdfminer.six
! pip install nltk



In [55]:
#For converting pdf to text
from pdfminer.high_level import extract_text

#For NLP preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

#For calculating keywords
from sklearn.feature_extraction.text import TfidfVectorizer

#For final score assigning
import numpy as np
from scipy.stats import percentileofscore


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Function converts the pdf files to text and stores it in a string
def extract_text_from_pdf(pdf_path):
  return extract_text(pdf_path)



In [63]:
# This function uses the NLP module nltk to remove stopwords
# and lemmatize words in the text from above
def preprocess(extracted_text):

  nltk.download('stopwords')
  nltk.download('punkt')

  lemmatizer = WordNetLemmatizer()

  tokens = word_tokenize(extracted_text)

  stop_words = set(stopwords.words('english'))
  filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
  lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

  filtered_text = ' '.join(lemmatized_tokens)

  return filtered_text


In [64]:
# This function uses the TFIDF algorithm to get the keywords from the
# preprocessed text
def get_keywords(preprocessed_text):
  tfidf_vectorizer = TfidfVectorizer()

  tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_text])

  feature_names = tfidf_vectorizer.get_feature_names_out()

  top_keywords = [feature_names[id] for id in tfidf_matrix[0].indices]

  return top_keywords


In [59]:
# This function uses a keyword matching algorithm to track how well
# the keywords from my resume match the keywords from the job description
def keyword_matching(description_keywords, resume_keywords):
  match_count = sum(1 for keyword in resume_keywords if keyword in description_keywords)
  match_percent = match_count/len(description_keywords)

  return match_percent * 100

In [68]:
# This function assigns a final score from 0-5 based on the percentile score
# of my resume compared to other resumes' match percents from my algorithm
def assign_score(percent_array):
  my_score = percent_array[0]

  percentile_rank = percentileofscore(percent_array, my_score)
  percentile_thresholds = [10, 25, 50, 75, 90]
  score_scale = [0, 1, 2, 3, 4, 5]

  for i, threshold in enumerate(percentile_thresholds):
    if percentile_rank <= threshold:
        my_final_score = score_scale[i+1]
        break
  else:
    my_final_score = score_scale[-1]

  print("My Final Score:", my_final_score)

In [65]:
#These lines take my resume pdf and run it through 3 functions defined above
path_for_resume = '/content/Aatyanth_Resume_VF.pdf'
extracted_text_from_resume = extract_text_from_pdf(path_for_resume)
prepreoccessed_resume = preprocess(extracted_text_from_resume)
keywords_resume = get_keywords(prepreoccessed_resume)


# These lines take the job description pdf and run it through 3 functions
# defined above
path_for_description = '/content/Find_the_Right_Job_description.pdf'
extracted_text_from_description = extract_text_from_pdf(path_for_description)
preprocessed_description = preprocess(extracted_text_from_description)
keywords_description = get_keywords(preprocessed_description)


# This line prints the keyword match percentage between my resume and the job
# description
print ('THIS IS MY MATCH PERCENTAGE: ')
print(keyword_matching(keywords_description, keywords_resume))

THIS IS MY MATCH PERCENTAGE: 
15.2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [69]:
# This is for creating a standardized scoring system based on my code
# The sample resumes are all aquired online

# This line preprocesses and gets the keywords for sample resume 1
s1_path = '/content/Sample1.pdf'
s1_extracted_text_from_resume = extract_text_from_pdf(s1_path)
s1_prepreoccessed_resume = preprocess(s1_extracted_text_from_resume)
s1_keywords_resume = get_keywords(s1_prepreoccessed_resume)

# This line preprocesses and gets the keywords for sample resume 2
s2_path = '/content/sample2.pdf'
s2_extracted_text_from_resume = extract_text_from_pdf(s2_path)
s2_prepreoccessed_resume = preprocess(s2_extracted_text_from_resume)
s2_keywords_resume = get_keywords(s2_prepreoccessed_resume)

# This line preprocesses and gets the keywords for sample resume 3
s3_path = '/content/sample3.pdf'
s3_extracted_text_from_resume = extract_text_from_pdf(s3_path)
s3_prepreoccessed_resume = preprocess(s3_extracted_text_from_resume)
s3_keywords_resume = get_keywords(s3_prepreoccessed_resume)

# This line preprocesses and gets the keywords for sample resume 4
s4_path = '/content/sample4.pdf'
s4_extracted_text_from_resume = extract_text_from_pdf(s4_path)
s4_prepreoccessed_resume = preprocess(s4_extracted_text_from_resume)
s4_keywords_resume = get_keywords(s4_prepreoccessed_resume)

# This line creates an array containing the keyword match percentage for each
# sample resume 1-4
s_perctange_match= [keyword_matching(keywords_description, s1_keywords_resume), keyword_matching(keywords_description, s2_keywords_resume),
                    keyword_matching(keywords_description, s3_keywords_resume), keyword_matching(keywords_description, s4_keywords_resume)]

# This line creates an array containg the keyword match percentage for my
# resume and all the sample resumes 1-4
assigning_score = [keyword_matching(keywords_description, keywords_resume),keyword_matching(keywords_description, s1_keywords_resume),
                   keyword_matching(keywords_description, s2_keywords_resume), keyword_matching(keywords_description, s3_keywords_resume),
                   keyword_matching(keywords_description, s4_keywords_resume)]

# This line sends the assigning_score array as an input to the statistical
# function used to calculate the final score for how well my resume matches
# the job description
assign_score(assigning_score)

### MY FINAL SCORE IS A 4 ###

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


My Final Score: 4


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
