# Neptune Technologies Job Application form Screening

##### Importing neccessary libraries

In [33]:
import os  # to perform tasks like reading file names from directories and managing file paths.
import re # to enable pattern matching and manipulation of text strings using regular expressions
import nltk # for natural language processing tasks
from nltk.corpus import stopwords # Stopwords consist of articles, prepositions, and conjunctions

### Importing resume, cover letter and form data files

In [34]:
# Importing applicant's resume file
resume_filepath = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_3\\project_directory\\GC_Downloads_txt\\john_doe_resume.txt"
#change for each applicant

In [35]:
# Importing applicant's cover letter file
cover_letter_filepath = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_3\\project_directory\\GC_Downloads_txt\\john_doe_cover_letter.txt"
#change for each applicant

In [36]:
# Importing applicant's form data file
form_data_filepath = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_3\\project_directory\\GC_Downloads_txt\\john_doe_form.txt"
#change for each applicant

### Resume Analysis and Grading Section (40%)

##### Removing StopWords, Numerical Digits, Punctuations, Symbols, Extra Spaces, Making Text lowercase and Normalizing and Tokenizing the Text from Resume

In [37]:
# Loading the stop words
stop_words = set(stopwords.words('english'))

# Adding auxiliary verbs to the stopwords list
auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                   'have', 'has', 'had', 'do', 'does', 'did', 'can', 'could',
                   'shall', 'should', 'will', 'would', 'may', 'might', 'must']

stop_words.update(auxiliary_verbs)

# Initializing a list to store the processed resume
processed_resume = []

# Opening and reading the original file
with open(resume_filepath, 'r', encoding='utf-8') as file:
    for line in file:
        review_text = line.strip()
        
        # Text cleaning and normalization
        for punctuations_and_more in ['●','–', ',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']', "'", '*', '$', '/', '+','’', 
                             '_','|','@']:
            review_text = review_text.replace(punctuations_and_more, ' ')  # Replacing punctuations with spaces
            review_text = re.sub('\s+', ' ', review_text)  # Removing extra spaces
            review_text = review_text.lower().strip()  # Making text lowercase

        # Removing numerical digits
        review_text = re.sub(r'\d', '', review_text)
        
        # Tokenizing the cleaned review text
        words = nltk.word_tokenize(review_text)
        
        # Removing stopwords, including auxiliary verbs
        filtered_words = [word for word in words if word not in stop_words]
        
        # Joining the filtered words back into a sentence
        processed_resume_x = ' '.join(filtered_words)
        
        processed_resume.append(processed_resume_x)

In [38]:
# Displaying processed_resume
print(processed_resume)

['john doe', 'software engineer', 'www johndoe com johndoe gmail com', 'linkedin', 'github', '', 'technical skills', 'technologies ruby rails activerecord postgresql rspec capybara html heroku travisci graphql', 'practices object oriented programming test driven development agile development restful design git version control continuous', 'integration mvc architecture json construction consumption', '', 'experience', 'backend software engineering student', '', 'turing school software design', '', 'investment hours creating projects range basic ruby database backend rails applications well rest api built', 'using test driven development following object oriented design principles', '', 'student success mentor help current students understand concepts surrounding ruby rails postgresql oop ttd many', 'technical topics', '', 'senior surgical neurophysiologist cnim', '', 'assure neuromonitoring national technologist', '', 'part two member team tasked establishing advanced neuromonitoring pr

##### Combining all lines in processed_resume into a single line

In [39]:
concatenated_resume = ' '.join(processed_resume)
print(concatenated_resume)

john doe software engineer www johndoe com johndoe gmail com linkedin github  technical skills technologies ruby rails activerecord postgresql rspec capybara html heroku travisci graphql practices object oriented programming test driven development agile development restful design git version control continuous integration mvc architecture json construction consumption  experience backend software engineering student  turing school software design  investment hours creating projects range basic ruby database backend rails applications well rest api built using test driven development following object oriented design principles  student success mentor help current students understand concepts surrounding ruby rails postgresql oop ttd many technical topics  senior surgical neurophysiologist cnim  assure neuromonitoring national technologist  part two member team tasked establishing advanced neuromonitoring procedures brain tumor removal surgeries culicchia neurological clinic new orleans

##### Word Pairs Calculation for Resume

In [40]:
def get_ordered_word_pair_frequency(concatenated_resume, window_size):
    pair_freq = {}  # For storing the frequency of ordered word pairs
    word_list = concatenated_resume.split()  # Splitting the concatenated text into individual words
    
    # Comparing adjacent words to find ordered word pairs within the window_size
    for i in range(len(word_list) - 1):
        word1, word2 = word_list[i], word_list[i + 1]
        
        # Checking if word1 and word2 are not the same before adding to pair_freq
        if word1 != word2:
            order_word_pair = (word1, word2)
            if order_word_pair not in pair_freq:
                pair_freq[order_word_pair] = 1  # If the pair is not in the dictionary, it adds it to the dictionary with a frequency of 1, indicating that we've seen it once
            else:
                pair_freq[order_word_pair] += 1  # It increments the frequency of the pair in the dictionary by 1, indicating that we've seen it again
    return pair_freq

window_size = 1

pair_freq = get_ordered_word_pair_frequency(concatenated_resume, window_size)

print("\033[4mAll the Word Pairs:\033[0m")

# Sorting all word pairs by frequency in descending order
for pair, freq in sorted(pair_freq.items(), key=lambda x: x[1], reverse=True):
    print("These word pairs:", pair, "appeared", freq, "times")

[4mAll the Word Pairs:[0m
These word pairs: ('ruby', 'rails') appeared 5 times
These word pairs: ('rspec', 'capybara') appeared 4 times
These word pairs: ('rails', 'postgresql') appeared 3 times
These word pairs: ('surgical', 'neurophysiologist') appeared 3 times
These word pairs: ('neuromonitoring', 'procedures') appeared 3 times
These word pairs: ('brain', 'tumor') appeared 3 times
These word pairs: ('tumor', 'removal') appeared 3 times
These word pairs: ('removal', 'surgeries') appeared 3 times
These word pairs: ('github', 'production') appeared 3 times
These word pairs: ('tech', 'ruby') appeared 3 times
These word pairs: ('postgresql', 'heroku') appeared 3 times
These word pairs: ('object', 'oriented') appeared 2 times
These word pairs: ('test', 'driven') appeared 2 times
These word pairs: ('driven', 'development') appeared 2 times
These word pairs: ('backend', 'software') appeared 2 times
These word pairs: ('software', 'engineering') appeared 2 times
These word pairs: ('turing',

##### Word Frequency Calculation for Resume

In [41]:
# Counting the frequency of individual words, storing results in word_freq
def get_single_word_frequency(processed_resume):
    word_freq = {}  # initializing an empty dictionary called word_freq to store word frequencies
    for review_text in processed_resume:
        for word in review_text.split():
            if word not in word_freq:
                word_freq[word] = 1
            else:
                word_freq[word] += 1
    return word_freq

word_freq = get_single_word_frequency(processed_resume)  # Calling the function to get the word frequencies

total_num_words = sum(word_freq.values())
print('number of unique words:', len(word_freq))
print('total number of word occurrences:', total_num_words)
print("")

# Calculating and displaying the top 10 most frequently occurring words
print("\033[4mAll the words and their frequency:\033[0m")

for word, freq in sorted(word_freq.items(), key=lambda x: x[1], reverse=True):
    print(word, "appeared", freq, "times")

number of unique words: 202
total number of word occurrences: 321

[4mAll the words and their frequency:[0m
surgeries appeared 7 times
ruby appeared 6 times
rails appeared 6 times
neuromonitoring appeared 6 times
software appeared 5 times
postgresql appeared 5 times
api appeared 5 times
surgical appeared 5 times
brain appeared 5 times
github appeared 4 times
rspec appeared 4 times
capybara appeared 4 times
heroku appeared 4 times
design appeared 4 times
backend appeared 4 times
team appeared 4 times
procedures appeared 4 times
graphql appeared 3 times
development appeared 3 times
neurophysiologist appeared 3 times
advanced appeared 3 times
tumor appeared 3 times
removal appeared 3 times
various appeared 3 times
patient appeared 3 times
implemented appeared 3 times
production appeared 3 times
tech appeared 3 times
johndoe appeared 2 times
com appeared 2 times
technical appeared 2 times
object appeared 2 times
oriented appeared 2 times
test appeared 2 times
driven appeared 2 times
engi

##### Resume score grading (addition) Part 1

In [42]:
# part 1 (14 points max)
# Purpose: grading by word pairs

# Logic for word pairs and their associated points
word_pair_points = {
    ('express', 'js'): 2,
    ('node', 'js'): 2,
    ('vue', 'js'): 2,
    ('c', '#'): 2,
    ('data', 'science'): 1,
    ('front', 'end'): 1,
    ('back', 'end'): 1,
    ('web', 'scraping'): 1,
    ('data', 'mining'): 1,
    ('google', 'cloud'): 1,
}

# Initializing total points
resume_points_1 = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points dictionary
    if pair in word_pair_points:
        # Adding the associated points to the total
        resume_points_1 += word_pair_points[pair]

# Printing the total points awarded
print("Resume points earned (part 1):", resume_points_1, "points out of 14")

Resume points earned (part 1): 0 points out of 14


##### Resume score grading (addition) Part 2

In [43]:
# part 2 (12 points max)
# Purpose: grading by certains keywords

# Initializing a dictionary to store the points for specific words
word_points = {
    'portfolio': 1,
    'tensorflow': 1,
    'github': 1,
    'css': 1,
    'linkedin': 1,
    'html': 1,
    'mongodb': 1,
    'agile': 1,
    'aws': 1,
    'flask': 1,
    'django': 1,
    'atom': 1
}

# Initializing the total points
resume_points_2 = 0

# Iterating through the words in the processed resume
for review_text in processed_resume:
    for word in review_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points:
            # Adding the corresponding points to resume_points_2
            resume_points_2 += word_points[word.lower()]

# Printing the total points
print("Resume points earned (part 2):", resume_points_2, "points out of 12")

Resume points earned (part 2): 7 points out of 12


##### Resume score grading (addition) Part 3

In [44]:
# part 3 (14 points max)
# Purpose: grading by education and school attended or worked in using certain keywords and wordpairs

# Part 3a
# having a doctoral degree --> 2 points
# keyword: phd or doctoral or docorate

# Initializing a dictionary to store the points for specific words
word_points_3a = {
    'phd': 2,
    'doctorate': 2,
    'doctoral': 2
}

# Initializing the total points
resume_points_3a = 0

# Iterating through the words in the processed resume
for review_text in processed_resume:
    for word in review_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points_3a:
            # Adding the corresponding points to resume_points_3a
            resume_points_3a += word_points_3a[word.lower()]

# Printing the total points
print("Resume points earned (part 3a):", resume_points_3a, "points out of 2", "--> phd check ")



# Part 3b
# having a masters degree in the appropriate field --> 2 points
# keyword: master

# Initializing a dictionary to store the points for specific words
word_points_3b = {
    'master': 2
}

# Initializing the total points
resume_points_3b = 0

# Iterating through the words in the processed resume
for review_text in processed_resume:
    for word in review_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points_3b:
            # Adding the corresponding points to resume_points_2
            resume_points_3b += word_points_3b[word.lower()]

# Printing the total points
print("Resume points earned (part 3b):", resume_points_3b, "points out of 2", "--> Master's degree check")

    
    
# Part 3c
# 1 working at/ attended an <10% school --> 4 points max
# use word pairs

# Logic for word pairs and their associated points
word_pair_points_3c = {
    ('harvard', 'university'): 4/15,
    ('stanford', 'university'): 4/15,
    ('columbia', 'university'): 4/15,
    ('california', 'institute'): 4/15,
    ('massachusetts', 'institute'): 4/15,
    ('university', 'chicago'):4/15,
    ('yale', 'university'): 4/15,
    ('princeton', 'university'): 4/15,
    ('brown', 'university'): 4/15,
    ('northwestern', 'university'): 4/15,
    ('dartmouth', 'college'): 4/15,
    ('cornell', 'university'): 4/15,
    ('university', 'pennsylvania'): 4/15,
    ('duke', 'university'): 4/15,
    ('john', 'hopkins'): 4/15
}

# the more universities, the higher the points
# 4/15 * 15 = 4 points max

# Initializing total points
resume_points_3c = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points_major dictionary
    if pair in word_pair_points_3c:
        # Adding the associated points to the total
        resume_points_3c += word_pair_points_3c[pair]

# Printing the total points awarded for this section
print("Resume points earned (part 3c):", resume_points_3c, "points out of 4 points max", "--> top US Universities check i.e mostly Ivys")



# Part 3d
# working at/ attended an <20% school --> 2 points max
# use word pairs and keywords

# Logic for word pairs and their associated points
word_pair_points_3d = {
    ('southern', 'california'): 0.2,
    ('vanderbilt', 'university'): 0.2,
    ('washington', 'university'): 0.2,
    ('rice', 'university'): 0.2,
    ('georgetown', 'university'): 0.2,
    ('emory', 'university'): 0.2,
    ('carnegie', 'mellon'): 0.2,
    ('notre', 'dame'): 0.2,
    ('boston', 'university'): 0.2,
    ('tufts', 'university'): 0.2
}

# the more universities, the higher the points
# 0.2 * 10 = 2 points max

# Initializing total points
resume_points_3d = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points_major dictionary
    if pair in word_pair_points_3d:
        # Adding the associated points to the total
        resume_points_3d += word_pair_points_3d[pair]

# Printing the total points awarded for this section
print("Resume points earned (part 3d):", resume_points_3d, "points out of 2 points max", "--> other highly rated US Universities check")  


    
# Part 3e
# having a bachelor's degree --> 2 points
# keyword: bachelor
# Initializing a dictionary to store the points for specific words
word_points_3e = {
    'bachelor': 2
}

# Initializing the total points
resume_points_3e = 0

# Iterating through the words in the processed resume
for review_text in processed_resume:
    for word in review_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points_3e:
            # Adding the corresponding points to resume_points_2
            resume_points_3e += word_points_3e[word.lower()]

# Printing the total points
print("Resume points earned (part 3e):", resume_points_3e, "points out of 2", "--> Bachelors's degree check")

    
    
# Part 3f
# major type --> 2 points
# looks for word pairs for: Computer Science, Information Science, Software Engineering, Computer Engineering, 
# Electrical Engineering, Information Technology, Information Systems,Systems Engineering
    
# Logic for word pairs and their associated points
word_pair_points_3f = {
    ('computer', 'science'): 0.25,
    ('information', 'science'): 0.25,
    ('software', 'engineering'): 0.25,
    ('computer', 'engineering'): 0.25,
    ('electrical', 'engineering'): 0.25,
    ('information', 'technology'): 0.25,
    ('information', 'systems'): 0.25,
    ('systems', 'engineering'): 0.25
}

# the more majors, the higher the points
# 0.25 * 8 = 2 points

# Initializing total points
resume_points_3f = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points_major dictionary
    if pair in word_pair_points_3f:
        # Adding the associated points to the total
        resume_points_3f += word_pair_points_3f[pair]

# Printing the total points awarded for this section
print("Resume points earned (part 3f):", resume_points_3f, "points out of 2 points max", "--> Degree major check")

Resume points earned (part 3a): 0 points out of 2 --> phd check 
Resume points earned (part 3b): 0 points out of 2 --> Master's degree check
Resume points earned (part 3c): 0 points out of 4 points max --> top US Universities check i.e mostly Ivys
Resume points earned (part 3d): 0 points out of 2 points max --> other highly rated US Universities check
Resume points earned (part 3e): 0 points out of 2 --> Bachelors's degree check
Resume points earned (part 3f): 0.25 points out of 2 points max --> Degree major check


##### Resume score grading (addition) Total Points

In [45]:
# Taking the total points earned for the resume
resume_points_earned = (resume_points_1 + resume_points_2 + resume_points_3a + resume_points_3b + resume_points_3c + 
                       resume_points_3d + resume_points_3e + resume_points_3f)
# Break down:
# resume_points_1: 14 max
# resume_points_2: 12 max
# resume_points_3: 14 max
# Total: 40 max points

print("Resume points earned:", resume_points_earned, "points out of 40")

Resume points earned: 7.25 points out of 40


##### Resume score grading (subtraction) part 4 and 5

In [46]:
# For Word Pairs Point Substraction

# Logic for word pairs and their associated points
word_pair_points_4 = {
    ('detail', 'oriented'): 0.5,
    ('team', 'player'): 0.5,
    ('go', 'getter'): 0.5,
    ('multiple', 'tasks'): 0.5,
    ('highly', 'motivated'): 0.5,
    ('problem', 'solver'): 0.5,
    ('worked', 'on'): 0.5,
    ('took', 'orders'): 0.5,
    ('time', 'management'): 0.5,
    ('innovative', 'thinker'): 0.5
}

# the more words, the higher the points deduction
# 0.5 * 10 = 5 points max deduction

# Initializing total points
resume_points_4 = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points_major dictionary
    if pair in word_pair_points_4:
        # Adding the associated points to the total
        resume_points_4 += word_pair_points_4[pair]

# Printing the total points awarded for this section
print("Resume points deducted (part 4):", resume_points_4, "points out of 5 points max deduction")  

Resume points deducted (part 4): 0 points out of 5 points max deduction


In [47]:
# For Key words Point Substraction

# Initializing a dictionary to store the points for specific words
word_points_5 = {
    'passionate': 0.5,
    'challenging': 0.5,
    'references': 0.5,
    'innovative': 0.5,
    'creative': 0.5,
    'helped': 0.5,
    'sold': 0.5,
    'hard': 0.5,
    'great': 0.5,
    'difficult': 0.5,
    'bad': 0.5,
    'good': 0.5,
    'amazing': 0.5,
    'strong': 0.5,
    'knack': 0.5,
    'excellent': 0.5,
    'communicator': 0.5
}

# the more words, the higher the points deduction
# 0.5 * 17 = 8.5 points max deduction

# Initializing the total points
resume_points_5 = 0

# Iterating through the words in the processed resume
for review_text in processed_resume:
    for word in review_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points_5:
            # Adding the corresponding points to resume_points_2
            resume_points_5 += word_points_5[word.lower()]

# Printing the total points
print("Resume points deducted (part 5):", resume_points_5, "points out of 8.5 points max deduction")

Resume points deducted (part 5): 0 points out of 8.5 points max deduction


##### Total Resume Calculation (Points - deduction)

In [48]:
resume_percent = "{:.2f}".format(resume_points_earned - (resume_points_4 + resume_points_5))
print("Resume total (out of 40):", resume_percent)

Resume total (out of 40): 7.25


### Cover Letter Analysis and Grading Section (40%)

##### Removing StopWords, Numerical Digits, Punctuations, Symbols, Extra Spaces, Making Text lowercase and Normalizing and Tokenizing the Text from Cover Letter

In [49]:
# Loading the stop words
stop_words = set(stopwords.words('english'))

# Adding auxiliary verbs to the stopwords list
auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                   'have', 'has', 'had', 'do', 'does', 'did', 'can', 'could',
                   'shall', 'should', 'will', 'would', 'may', 'might', 'must']

stop_words.update(auxiliary_verbs)

# Initializing a list to store the processed cover letter
processed_cover_letter = []

# Opening and reading the original file
with open(cover_letter_filepath, 'r', encoding='utf-8') as file:
    for line in file:
        cover_letter_text = line.strip()
        
        # Text cleaning and normalization
        for punctuations_and_more in ['●','–', ',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']', "'", '*', '$', '/', '+','’', 
                             '_','|','@']:
            cover_letter_text = cover_letter_text.replace(punctuations_and_more, ' ')  # Replacing punctuations with spaces
            cover_letter_text = re.sub('\s+', ' ', cover_letter_text)  # Removing extra spaces
            cover_letter_text = cover_letter_text.lower().strip()  # Making text lowercase

        # Removing numerical digits
        cover_letter_text = re.sub(r'\d', '', cover_letter_text)
        
        # Tokenizing the cleaned cover letter text
        words = nltk.word_tokenize(cover_letter_text)
        
        # Removing stopwords, including auxiliary verbs
        filtered_words = [word for word in words if word not in stop_words]
        
        # Joining the filtered words back into a sentence
        processed_cover_letter_x = ' '.join(filtered_words)
        
        processed_cover_letter.append(processed_cover_letter_x)

In [50]:
# Displaying processed_cover_letter
print(processed_cover_letter)

['dear hiring manager', 'writing express interest full stack software engineer summer intern position', 'neptune technologies advertised proactive results driven software engineer bring', 'unique blend skills experiences align core preferred qualifications', 'outlined role', 'throughout academic journey extensive hands experience actively pursued', 'bachelor degree backend software engineering turing school software design', 'demonstrating strong record academic achievement currently student success mentor', 'assisting peers grasping technical concepts surrounding ruby rails postgresql oop ttd', 'projects including development full stack application called saturn', 'earth showcase proficiency technologies ruby rails postgresql heroku', 'graphql', 'addition development skills possess robust understanding computer science', 'mathematics system design background senior surgical neurophysiologist equipped', 'unique perspective system management collaborative teamwork believe', 'valuable dy

##### Combining all lines in processed_cover_letter into a single line

In [51]:
concatenated_cover_letter = ' '.join(processed_cover_letter)
print(concatenated_cover_letter)

dear hiring manager writing express interest full stack software engineer summer intern position neptune technologies advertised proactive results driven software engineer bring unique blend skills experiences align core preferred qualifications outlined role throughout academic journey extensive hands experience actively pursued bachelor degree backend software engineering turing school software design demonstrating strong record academic achievement currently student success mentor assisting peers grasping technical concepts surrounding ruby rails postgresql oop ttd projects including development full stack application called saturn earth showcase proficiency technologies ruby rails postgresql heroku graphql addition development skills possess robust understanding computer science mathematics system design background senior surgical neurophysiologist equipped unique perspective system management collaborative teamwork believe valuable dynamic environment like neptune technologies imp

##### Word Pairs Calculation for Cover Letter

In [52]:
def get_ordered_word_pair_frequency(concatenated_cover_letter, window_size):
    pair_freq = {}  # For storing the frequency of ordered word pairs
    word_list = concatenated_cover_letter.split()  # Splitting the concatenated text into individual words

    # Comparing adjacent words to find ordered word pairs within the window_size
    for i in range(len(word_list) - 1):
        word1, word2 = word_list[i], word_list[i + 1]

        # Checking if word1 and word2 are not the same before adding to pair_freq
        if word1 != word2:
            order_word_pair = (word1, word2)
            if order_word_pair not in pair_freq:
                pair_freq[order_word_pair] = 1  # If the pair is not in the dictionary, it adds it to the dictionary with a frequency of 1, indicating that we've seen it once
            else:
                pair_freq[order_word_pair] += 1  # It increments the frequency of the pair in the dictionary by 1, indicating that we've seen it again
    return pair_freq

window_size = 1

pair_freq = get_ordered_word_pair_frequency(concatenated_cover_letter, window_size)

print("\033[4mAll the Word Pairs:\033[0m")

# Sorting all word pairs by frequency in descending order
for pair, freq in sorted(pair_freq.items(), key=lambda x: x[1], reverse=True):
    print("These word pairs:", pair, "appeared", freq, "times")

[4mAll the Word Pairs:[0m
These word pairs: ('neptune', 'technologies') appeared 5 times
These word pairs: ('full', 'stack') appeared 2 times
These word pairs: ('software', 'engineer') appeared 2 times
These word pairs: ('skills', 'experiences') appeared 2 times
These word pairs: ('experiences', 'align') appeared 2 times
These word pairs: ('preferred', 'qualifications') appeared 2 times
These word pairs: ('ruby', 'rails') appeared 2 times
These word pairs: ('rails', 'postgresql') appeared 2 times
These word pairs: ('projects', 'including') appeared 2 times
These word pairs: ('dear', 'hiring') appeared 1 times
These word pairs: ('hiring', 'manager') appeared 1 times
These word pairs: ('manager', 'writing') appeared 1 times
These word pairs: ('writing', 'express') appeared 1 times
These word pairs: ('express', 'interest') appeared 1 times
These word pairs: ('interest', 'full') appeared 1 times
These word pairs: ('stack', 'software') appeared 1 times
These word pairs: ('engineer', 'summ

##### Word Frequency Calculation for Cover Letter

In [53]:
# Counting the frequency of individual words, storing results in word_freq
def get_single_word_frequency(processed_cover_letter):
    word_freq = {}  # initializing an empty dictionary called word_freq to store word frequencies
    for cover_letter_text in processed_cover_letter:
        for word in cover_letter_text.split():
            if word not in word_freq:
                word_freq[word] = 1
            else:
                word_freq[word] += 1
    return word_freq

word_freq = get_single_word_frequency(processed_cover_letter)  # Calling the function to get the word frequencies

total_num_words = sum(word_freq.values())
print('number of unique words:', len(word_freq))
print('total number of word occurrences:', total_num_words)
print("")

# Calculating and displaying the top 10 most frequently occurring words
print("\033[4mAll the words and their frequency:\033[0m")

for word, freq in sorted(word_freq.items(), key=lambda x: x[1], reverse=True):
    print(word, "appeared", freq, "times")

number of unique words: 174
total number of word occurrences: 223

[4mAll the words and their frequency:[0m
technologies appeared 7 times
software appeared 5 times
neptune appeared 5 times
skills appeared 5 times
align appeared 3 times
experience appeared 3 times
design appeared 3 times
technical appeared 3 times
postgresql appeared 3 times
development appeared 3 times
full appeared 2 times
stack appeared 2 times
engineer appeared 2 times
unique appeared 2 times
experiences appeared 2 times
preferred appeared 2 times
qualifications appeared 2 times
academic appeared 2 times
success appeared 2 times
ruby appeared 2 times
rails appeared 2 times
projects appeared 2 times
including appeared 2 times
application appeared 2 times
proficiency appeared 2 times
system appeared 2 times
collaborative appeared 2 times
api appeared 2 times
knowledge appeared 2 times
dear appeared 1 times
hiring appeared 1 times
manager appeared 1 times
writing appeared 1 times
express appeared 1 times
interest app

##### Cover letter score grading (addition) Part 6

In [54]:
# Each question for part 6 and 7 are 40/58 points each totalling a combination of 40 points for both sections

# Part 6 (34 points max)
# Purpose: grading by word pairs

# Logic for word pairs and their associated points
word_pair_points = {
    ('pivotal', 'role'): 40/58,
    ('thank', 'you'): 40/58,
    ('machine', 'learning'): 40/58,
    ('neptune', 'technologies'): 40/58,
    ('strong', 'interest'): 40/58,
    ('deep', 'understanding'): 40/58,
    ('web', 'frameworks'): 40/58,
    ('profound', 'understanding'): 40/58,
    ('self', 'learner'): 40/58,
    ('user', 'friendly'): 40/58,
    ('dom', 'manipulation'): 40/58,
    ('web', 'applications'): 40/58,
    ('version', 'control'): 40/58,
    ('test', 'automation'): 40/58,
    ('continuous', 'integration'): 40/58,
    ('continuous', 'deployment'): 40/58,
    ('extensive', 'experience'): 40/58,
    ('various', 'domains'): 40/58,
    ('personal', 'ventures'): 40/58,
    ('software', 'engineering'): 40/58,
    ('ruby', 'rails'): 40/58,
    ('development', 'skills'): 40/58,
    ('database', 'expertise'): 40/58,
    ('sql', 'nosql'): 40/58,
    ('data', 'driven'): 40/58,
    ('enriched', 'experience'): 40/58,
    ('containerization', 'technologies'): 40/58,
    ('cloud', 'computing'): 40/58,
    ('collaboration', 'teamwork'): 40/58,
    ('firmly', 'believe'): 40/58,
    ('web', 'development'): 40/58,
    ('front', 'end'): 40/58,
    ('back', 'end'): 40/58,
    ('data', 'science'): 40/58
}

# Initializing total points
cover_letter_points_6 = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points dictionary
    if pair in word_pair_points:
        # Adding the associated points to the total
        cover_letter_points_6 += word_pair_points[pair]

# Printing the total points awarded
print("Cover Letter points earned (part 6):", cover_letter_points_6, "points out of 34 max points")

Cover Letter points earned (part 6): 2.7586206896551726 points out of 34 max points


##### Cover letter score grading (addition) Part 7

In [55]:
# part 7 (24 points max)
# Purpose: grading by certains keywords

# Initializing a dictionary to store the points for specific words
word_points = {
    'engineer': 40/58,
    'developer': 40/58,
    'postgresql': 40/58,
    'commitment': 40/58,
    'typescript': 40/58,
    'redux': 40/58,
    'react': 40/58,
    'jquery': 40/58,
    'git': 40/58,
    'restful': 40/58,
    'testament': 40/58,
    'apis': 40/58,
    'storybook': 40/58,
    'java': 40/58,
    'docker': 40/58,
    'azure': 40/58,
    'portfolio': 40/58,
    'professional': 40/58,
    'node': 40/58,
    'express': 40/58,
    'vue': 40/58,
    'cloud': 40/58,
    'agile': 40/58,
    'tensorflow': 40/58
}

# Initializing the total points
cover_letter_points_7 = 0

# Iterating through the words in the processed cover letter
for cover_letter_text in processed_cover_letter:
    for word in cover_letter_text.split():
        # Checking if the word is in the word_points dictionary
        if word.lower() in word_points:
            # Adding the corresponding points to cover_letter_points_7
            cover_letter_points_7 += word_points[word.lower()]

# Printing the total points
print("Cover Letter points earned (part 7):", cover_letter_points_7, "points out of 24 max points")

Cover Letter points earned (part 7): 6.896551724137932 points out of 24 max points


##### Cover Letter score grading (addition) Total Points

In [56]:
# Taking the total points earned for the resume
cover_letter_points_earned = (cover_letter_points_6 + cover_letter_points_7)
print("Cover letter points earned:", (cover_letter_points_earned/58)*40, "points out of 40")

Cover letter points earned: 6.658739595719383 points out of 40


##### Cover letter score grading (subtraction) part 8 and 9

In [57]:
# Part 8
# For Word Pairs Point Subtraction

# Logic for word pairs and their associated points
word_pair_points_8 = {
    ('dear', 'sir'): 0.5,
    ('dear', 'hiring'): 0.5,
    ('i', 'want'): 0.5,
    ('i', 'need'): 0.5,
    ('detail', 'oriented'): 0.5,
    ('team', 'player'): 0.5,
    ('go', 'getter'): 0.5,
    ('multiple', 'tasks'): 0.5,
    ('highly', 'motivated'): 0.5,
    ('problem', 'solver'): 0.5,
    ('worked', 'on'): 0.5,
    ('took', 'orders'): 0.5,
    ('pretty', 'good'): 0.5,
    ('time', 'management'): 0.5,
    ('extra', 'mile'): 0.5,
    ('feel', 'free'): 0.5,
    ('zero', 'experience'): 0.5,
    ('no', 'experience'): 0.5,
    ('much', 'experience'): 0.5,
    ('innovative', 'thinker'): 0.5,
    ('big', 'deal'): 0.5,
    ('good', 'deal'): 0.5,
    ('great', 'deal'): 0.5
}

# the more words, the higher the points deduction
# 0.5 * 23 = 11.5 points max deduction

# Initializing total points
cover_letter_points_8 = 0

# Iterating through the word pairs and their frequencies
for pair, freq in pair_freq.items():
    # Checking if the pair is in the word_pair_points_8 dictionary
    if pair in word_pair_points_8:
        # Adding the associated points to the total
        cover_letter_points_8 += word_pair_points_8[pair]

# Printing the total points deducted for this section
print("Cover Letter points deducted (part 8):", cover_letter_points_8, "points out of 11.5 points max deduction")

Cover Letter points deducted (part 8): 0.5 points out of 11.5 points max deduction


In [58]:
# Part 9
# For Cover Letter Key words Point Substraction

# Initializing a dictionary to store the points for specific words
word_points_9 = {
    'perfectionist': 0.5,
    'challenging': 0.5,
    'references': 0.5,
    'innovative': 0.5,
    'creative': 0.5,
    'helped': 0.5,
    'sold': 0.5,
    'hard': 0.5,
    'great': 0.5,
    'difficult': 0.5,
    'bad': 0.5,
    'good': 0.5,
    'amazing': 0.5,
    'knack': 0.5,
    'excellent': 0.5,
    'communicator': 0.5,
    'soon': 0.5,
    'feelings': 0.5,
    'very': 0.5,
    'bugs': 0.5,
    'errors': 0.5,
    'probably': 0.5,
    'maybe': 0.5,
    'overrated': 0.5,
    'bills': 0.5,
    'big': 0.5,
    'lazy': 0.5,
    'worries': 0.5,
    'worry': 0.5,
    'horrible': 0.5,
    'salary': 0.5,
    'pay': 0.5,
    'benefits': 0.5,
    'money': 0.5,
    'decent': 0.5,
    'minimal': 0.5,
    'if': 0.5
}

# the more words, the higher the points deduction
# 0.5 * 37 = 18.5 points max deduction

# Initializing the total points
cover_letter_points_9 = 0

# Iterating through the words in the processed cover letter
for review_text in processed_cover_letter:
    for word in review_text.split():
        # Checking if the word is in the word_points_5 dictionary
        if word.lower() in word_points_9:
            # Adding the corresponding points to cover_letter_points_9
            cover_letter_points_9 += word_points_9[word.lower()]

# Printing the total points deducted for this section
print("Cover Letter points deducted (part 9):", cover_letter_points_9, "points out of 18.5 points max deduction")

Cover Letter points deducted (part 9): 0.5 points out of 18.5 points max deduction


##### Total Cover Letter Calculation (Points - deduction)

In [59]:
cover_letter_percent = "{:.2f}".format(((cover_letter_points_earned/58)*40) - (cover_letter_points_8 + cover_letter_points_9))
print("Cover letter total (out of 40):", cover_letter_percent)

Cover letter total (out of 40): 5.66


### Form Data Analysis and Grading Section (20%)

##### Removing StopWords, Numerical Digits, Punctuations, Symbols, Extra Spaces, Making Text lowercase and Normalizing and Tokenizing the Text from the form data

In [60]:
# Loading the stop words
stop_words = set(stopwords.words('english'))

# Adding auxiliary verbs to the stopwords list
auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                   'have', 'has', 'had', 'do', 'does', 'did', 'can', 'could',
                   'shall', 'should', 'will', 'would', 'may', 'might', 'must']

stop_words.update(auxiliary_verbs)

# Initializing a list to store the processed form_data
processed_form_data = []

# Opening and reading the original file
with open(form_data_filepath, 'r', encoding='utf-8') as file:
    for line in file:
        form_data_text = line.strip()
        
        # Text cleaning and normalization
        for punctuations in ['–', ',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']', "'", '*', '$', '+','’', 
                             '_', '{', '}']:
            form_data_text = form_data_text.replace(punctuations, ' ')  # Replacing punctuations with spaces
            form_data_text = re.sub('\s+', ' ', form_data_text)  # Removing extra spaces
            form_data_text = form_data_text.lower().strip()  # Making text lowercase

        # Removing numerical digits
        form_data_text = re.sub(r'\d', '', form_data_text)
        
        # Tokenizing the cleaned form_data text
        words = nltk.word_tokenize(form_data_text)
        
        # Removing stopwords, including auxiliary verbs
        filtered_words = [word for word in words if word not in stop_words]
        
        # Joining the filtered words back into a sentence
        processed_form_data_x = ' '.join(filtered_words)
        
        processed_form_data.append(processed_form_data_x)

In [61]:
# Displaying processed_form_data
print(processed_form_data)

['', 'firstname john', 'middlename', 'lastname doe', 'dob', 'gender male', 'phonenumber', 'email johndoe @ gmail com', 'homenumber', 'unitnumber c', 'streetname banana drive', 'city baton blue', 'usstate maryland', 'zipcode', 'degreetype bachelor degree', 'major biological health science', 'uni name university south florida', 'graddate', 'major', 'current employer n/a', 'current job title n/a', 'current job duration n/a', 'previous employers n/a', 'linkinput https //linkedin com', 'linkinput akunnatechstudio com', 'skills n/a', 'certs n/a', 'refs n/a', '']


##### Combining all lines in processed_form_data into a single line

In [62]:
concatenated_form_data = ' '.join(processed_form_data)
print(concatenated_form_data)

 firstname john middlename lastname doe dob gender male phonenumber email johndoe @ gmail com homenumber unitnumber c streetname banana drive city baton blue usstate maryland zipcode degreetype bachelor degree major biological health science uni name university south florida graddate major current employer n/a current job title n/a current job duration n/a previous employers n/a linkinput https //linkedin com linkinput akunnatechstudio com skills n/a certs n/a refs n/a 


##### Form Data Grading

In [63]:
# starting with 20 points for form data grading
starting_score = 20;

# Counting the frequency of 'check' and 'N/A' in concatenated_form_data
check_count = concatenated_form_data.lower().count("check")
na_count = concatenated_form_data.lower().count("n/a")

# Subtracting 0.5 * the frequency from starting_score
starting_score -= 0.5 * check_count
starting_score -= 0.5 * na_count

print("Form data score:", starting_score)

Form data score: 16.5


### Applicant's Score

In [64]:
# Calculating the applicant's overall score
applicant_percent = "{:.2f}".format(float(resume_percent) + float(cover_letter_percent) + float(starting_score))
print("Applicant's Overall Score:", applicant_percent, "%")

# Determining the applicant's evaluation based on the score
if 90 <= float(applicant_percent) <= 100:
    print("This applicant is excellent!")
elif 85 <= float(applicant_percent) < 90:
    print("This applicant is very good!")
elif 80 <= float(applicant_percent) < 85:
    print("This applicant is good")
elif 75 <= float(applicant_percent) < 80:
    print("This applicant is fair")
elif 70 <= float(applicant_percent) < 75:
    print("This applicant is marginal")
else:
    print("This applicant cannot be considered for this job")


Applicant's Overall Score: 29.41 %
This applicant cannot be considered for this job
