##### Importing neccessary libraries

In [124]:
import os
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords, words, wordnet
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

##### Calling Directory

In [125]:
# Directory containing the 50 resumes (.txt files)
output_dir = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_5\\Resumes_50\\output_resumes"

##### Data Cleaning

In [126]:
# Loading the stop words
stop_words = set(stopwords.words('english'))

# Adding auxiliary verbs to the stopwords list
auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
                   'have', 'has', 'had', 'do', 'does', 'did', 'can', 'could',
                   'shall', 'should', 'will', 'would', 'may', 'might', 'must']

stop_words.update(auxiliary_verbs)

# Initializing a list to store the processed resumes
resume_50 = [] 

# Grouping all the files together
for file_name in os.listdir(output_dir):
    file_path = os.path.join(output_dir, file_name)
    
    # Check if the file is a text file
    if file_name.endswith('.txt'):
        # Opening and reading the original file
        with open(file_path, 'r', encoding='utf-8') as file:
            resume_text = file.read()

            # Text cleaning and normalization
            for punctuations_and_more in ['○','‘','·','➢','❖','◤','','>','<','”','“','—','®','&','◢','~','�','%','▪','@','•','\\','●','|','–', ',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']', "'", '*', '$', '/', '+', '’', '_']:
                resume_text = resume_text.replace(punctuations_and_more, ' ')  # Replacing punctuations with spaces
                resume_text = re.sub('\s+', ' ', resume_text)  # Removing extra spaces
                resume_text = resume_text.lower().strip()  # Making text lowercase

            # Removing specified words from resume_text inlcuding months, unicodes, 'software', 'engineer'
            words_to_remove = ['january', 'february', 'march', 'may', 'june', 'september', 'november', 'october', 'april', 'july', 'december', 'august', '\uf0b7', '\uf10b', '\uf0e0', '\uf08c', '\uf015', '\uf092', 'software', 'engineer']
            for word in words_to_remove:
                resume_text = resume_text.replace(word, '')
                
            # Removing applicant's names
            names_to_remove = ['alex', 'desjardins', 'naraghi', 'andrea', 'lopez', 'andrew', 'sawyer', 'anthony', 'dahanne', 'freay', 'brendan', 'ngo', 'brian', 'kasper', 'cass', 'outlaw', 'cate', 'miller', 'daniel', 'salib', 'ebel', 'rodriguez',
                              'ellek', 'linton', 'goce', 'anastasovski', 'hugh', 'brady', 'jake', 'gutierrez', 'jeff', 'farkas', 'jeremy', 'beker', 'joan', 'lindsay', 'john', 'powell', 'jonathan', 'gelie', 'jordan', 'harrington', 'justin', 'ehlert', 'kailyn', 'williams', 'karthik',
                              'kumar', 'kenneth', 'chan', 'kevin', 'beason', 'liina', 'laufer', 'chaffee', 'maggie', 'ebers', 'matthew', 'budiman', 'schnaider', 'scholta', 'michael', 'allain', 'palmer', 'wheatman', 'mike', 'rolish', 'neil', 'menon', 'nick', 'mccrea', 'prakhar', 'garg',
                              'razaq', 'jinad', 'reid', 'main', 'samuel', 'arminana', 'sangwoo', 'cho', 'steve', 'howe', 'tim', 'cieplowski', 'tyler', 'meara', 'vaibhabv', 'singh', 'yura', 'kim', 'zameer', 'manji']
            for word in names_to_remove:
                resume_text = resume_text.replace(word, '')
            
            # Removing numerical digits
            resume_text = re.sub(r'\d', '', resume_text)
            
            # Removing state names, city names, region names ,.etc
            states_and_more = ['francisco', 'seattle', 'toronto', 'canada', 'usa', 'angeles', 'san', 'houston', 'austin', 'sacramento', 'texas', 'arizona', 'alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida', 'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland', 'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'hampshire', 'jersey', 'mexico', 'york', 'carolina', 'dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania', 'rhode', 'tennessee', 'utah', 'vermont', 'virginia', 'washington', 'wisconsin', 'wyoming']
            for word in states_and_more:
                resume_text = resume_text.replace(word, '')
            
            # Tokenizing the cleaned resume text
            words = nltk.word_tokenize(resume_text)

            # Removing stopwords, including auxiliary verbs
            filtered_words = [word for word in words if word not in stop_words]

            # Joining the filtered words back into a sentence
            processed_resume = ' '.join(filtered_words)

            # Append the processed resume to the list
            resume_50.append(processed_resume)

In [127]:
# Concatenating resume_50 and displaying it
concatenated_resume_50 = ' '.join(resume_50)
print(concatenated_resume_50)

www com ar gmail com linkedin github technical skills technologies ruby rails activerecord postgresql rspec capybara html heroku travisci graphql practices object oriented programming test driven development agile development restful design git version control continuous integration mvc architecture json construction consumption experience backend ing student turing sol design investment hours creating projects range basic ruby database backend rails applications well rest api built using test driven development following object oriented design principles student success mentor help current students understand concepts surrounding ruby rails postgresql oop ttd many technical topics senior surgical neurophysiologist cnim assure neuromonitoring national technologist part two member team tasked establishing advanced neuromonitoring procedures brain tumor removal surgeries culicchia neurological clinic new orleans personally trained surgical neurophysiologists hospitals nationwide neuromon

##### Calculating Absolute and Relative Frequency for words

In [128]:
# Tokenizing the concatenated text
tokens = word_tokenize(concatenated_resume_50)

# Calculating absolute frequency
freq_dist = FreqDist(tokens)

# Calculating relative frequency as a percentage
total_words = len(tokens)
relative_freq = {word: (count / total_words) * 100 for word, count in freq_dist.items()}

# Creating a DataFrame for displaying the results
data = {'Word': list(freq_dist.keys()), 'Absolute Frequency': list(freq_dist.values()), 'Relative Frequency (%)': [relative_freq[word] for word in freq_dist.keys()]}
df = pd.DataFrame(data)

# Sorting the DataFrame in descending order by both absolute and relative frequency
df = df.sort_values(by=['Absolute Frequency', 'Relative Frequency (%)'], ascending=[False, False])

# Sorting the DataFrame by absolute frequency in descending order
df = df.sort_values(by='Absolute Frequency', ascending=False)

# Specifying the directory to save the .csv file
csv_dir = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_5\\csv_output"

# Checking if the directory exists, if not, create it
if not os.path.exists(csv_dir):
    os.makedirs(csv_dir)

# Saving the original DataFrame to a .csv file in the specified directory
csv_file_path = os.path.join(csv_dir, 'resume_word_count.csv')
df.to_csv(csv_file_path, index=False)

# Printing a completion message
print(f"\nWord Count DataFrame has been saved to: {csv_file_path}")

# Counting the number of word
df_num = df.shape[0]

# Displaying the number of values
print(f"\n\033[4mNumber of words:\033[0m {df_num}\n")

# Formatting the DataFrame for display
df_styled = df.style.set_properties(**{'text-align': 'center', 'border': '1px solid black'}).hide_index()
df_styled.format({'Relative Frequency (%)': '{:.5f}%'})

# Displaying the formatted DataFrame
display(df_styled)


Word Count DataFrame has been saved to: C:\Users\akunna1\Desktop\Projects\Project_5\csv_output\resume_word_count.csv

[4mNumber of words:[0m 5267



Word,Absolute Frequency,Relative Frequency (%)
university,154,0.72882%
development,148,0.70043%
data,135,0.63890%
math,126,0.59631%
com,122,0.57738%
systems,122,0.57738%
team,114,0.53952%
using,112,0.53005%
web,108,0.51112%
new,102,0.48273%


##### Calculating Absolute and Relative Frequency for word pairs

In [129]:
def get_ordered_word_pair_frequency(concatenated_resume_50, window_size):
    pair_freq = {}  # For storing the frequency of ordered word pairs
    word_list = concatenated_resume_50.split()  # Splitting the concatenated text into individual words
    
    # Comparing adjacent words to find ordered word pairs within the window_size
    for i in range(len(word_list) - 1):
        word1, word2 = word_list[i], word_list[i + 1]
        
        # Checking if word1 and word2 are not the same before adding to pair_freq
        if word1 != word2:
            order_word_pair = (word1, word2)
            if order_word_pair not in pair_freq:
                pair_freq[order_word_pair] = 1  # If the pair is not in the dictionary, it adds it to the dictionary with a frequency of 1, indicating that we've seen it once
            else:
                pair_freq[order_word_pair] += 1  # It increments the frequency of the pair in the dictionary by 1, indicating that we've seen it again
    return pair_freq

window_size = 4

pair_freq = get_ordered_word_pair_frequency(concatenated_resume_50, window_size)

# Creating a DataFrame for displaying the results
data = {'Word Pairs': list(pair_freq.keys()), 'Absolute Frequency': list(pair_freq.values())}
df = pd.DataFrame(data)

# Calculating relative frequency as a percentage
total_pairs = sum(pair_freq.values())
df['Relative Frequency (%)'] = (df['Absolute Frequency'] / total_pairs) * 100

# Sorting the DataFrame by absolute frequency in descending order
df = df.sort_values(by='Absolute Frequency', ascending=False)

# Specifying the directory to save the .csv file
csv_dir = "C:\\Users\\akunna1\\Desktop\\Projects\\Project_5\\csv_output"

# Checking if the directory exists, if not, create it
if not os.path.exists(csv_dir):
    os.makedirs(csv_dir)

# Saving the original DataFrame to a .csv file in the specified directory
csv_file_path = os.path.join(csv_dir, 'resume_word_pairs.csv')
df.to_csv(csv_file_path, index=False)

# Printing a completion message
print(f"\nWord Pair DataFrame has been saved to: {csv_file_path}")

# Counting the number of word pairs
df_num = df.shape[0]

# Displaying the number of values
print(f"\n\033[4mNumber of word pairs:\033[0m {df_num}\n")

# Formatting the DataFrame for display
df2_styled = df.style.set_properties(**{'text-align': 'center', 'border': '1px solid black'}).hide_index()
df2_styled.format({'Relative Frequency (%)': '{:.5f}%'})

# Display the formatted DataFrame
display(df2_styled)


Word Pair DataFrame has been saved to: C:\Users\akunna1\Desktop\Projects\Project_5\csv_output\resume_word_pairs.csv

[4mNumber of word pairs:[0m 17666



Word Pairs,Absolute Frequency,Relative Frequency (%)
"('computer', 'science')",53,0.25172%
"('gmail', 'com')",29,0.13773%
"('operator', 'theory')",29,0.13773%
"('state', 'university')",27,0.12824%
"('full', 'stack')",23,0.10924%
"('nest', 'algebras')",23,0.10924%
"('github', 'com')",21,0.09974%
"('machine', 'learning')",21,0.09974%
"('linkedin', 'com')",18,0.08549%
"('real', 'e')",17,0.08074%


In [130]:
# Word cloud to visualize word frequency --> created using online word cloud generator

In [131]:
# Funnel graph to visualize word pair frequency --> created using MS Excel

##### Grouping adverbs, verbs, adjectives, nouns used

In [132]:
import os
import re
import nltk
import pandas as pd

from nltk.corpus import stopwords, words, wordnet
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

# Downloading the words corpus and WordNet from NLTK
nltk.download('words')
nltk.download('wordnet')

# Tokenizing the concatenated text
tokens = word_tokenize(concatenated_resume_50)

# Performing part-of-speech tagging
pos_tags = pos_tag(tokens)

# Initializing lists to store words based on their parts of speech
adverbs = []
verbs = []
adjectives = []
nouns = []

# Creating a set of English words from both NLTK words corpus and WordNet for efficient lookup
english_words = set(words.words())
english_words.update(set(wordnet.words()))

# Grouping words based on their parts of speech and filtering out non-English words
for word, pos_tag in pos_tags:
    # Filtering out non-English words
    if word.lower() not in english_words:
        continue
    
    if pos_tag.startswith('RB'):  # Adverbs
        adverbs.append(word)
    elif pos_tag.startswith('VB'):  # Verbs
        verbs.append(word)
    elif pos_tag.startswith('JJ'):  # Adjectives
        adjectives.append(word)
    elif pos_tag.startswith('NN'):  # Nouns
        nouns.append(word)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\akunna1\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akunna1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#####  Printing out the top 10 Adverbs

In [133]:
# Function to get the top N words with the highest frequency
def get_top_words(word_list, n=10):
    # Counting the frequency of each word
    word_counts = Counter(word_list)

    # Getting the top N words with the highest frequency
    top_words = word_counts.most_common(n)

    return top_words

# Getting the top 10 adverbs with the highest frequency
top_adverbs = get_top_words(adverbs, n=10)

# Printing the result
print(f"Top 10 adverbs with the highest frequency: {top_adverbs}")

Top 10 adverbs with the highest frequency: [('well', 14), ('successfully', 11), ('directly', 11), ('first', 10), ('automatically', 9), ('back', 9), ('closely', 8), ('also', 6), ('fully', 5), ('seamlessly', 5)]


#####  Printing the top 10 Verbs

In [134]:
# Function to get the top N words with the highest frequency
def get_top_words(word_list, n=10):
    # Counting the frequency of each word
    word_counts = Counter(word_list)

    # Getting the top N words with the highest frequency
    top_words = word_counts.most_common(n)

    return top_words

# Getting the top 10 verbs with the highest frequency
top_verbs = get_top_words(verbs, n=10)

# Printing the result
print(f"Top 10 verbs with the highest frequency: {top_verbs}")

Top 10 verbs with the highest frequency: [('using', 112), ('developed', 91), ('ing', 69), ('based', 58), ('designed', 57), ('implemented', 55), ('built', 51), ('learning', 45), ('testing', 45), ('worked', 43)]


#####  Printing the top 10 Adjectives

In [135]:
# Function to get the top N words with the highest frequency
def get_top_words(word_list, n=10):
    # Counting the frequency of each word
    word_counts = Counter(word_list)

    # Getting the top N words with the highest frequency
    top_words = word_counts.most_common(n)

    return top_words

# Getting the top 10 adjectives with the highest frequency
top_adjectives = get_top_words(adjectives, n=10)

# Printing the result
print(f"Top 10 adjectives with the highest frequency: {top_adjectives}")

Top 10 adjectives with the highest frequency: [('new', 102), ('technical', 75), ('senior', 56), ('present', 40), ('mobile', 38), ('full', 37), ('c', 35), ('high', 32), ('intern', 30), ('user', 28)]


##### Printing out the top 10 Nouns

In [136]:
# Function to get the top N words with the highest frequency
def get_top_words(word_list, n=10):
    # Counting the frequency of each word
    word_counts = Counter(word_list)

    # Getting the top N words with the highest frequency
    top_words = word_counts.most_common(n)

    return top_words

# Getting the top 10 nouns with the highest frequency
top_nouns = get_top_words(nouns, n=10)

# Printing the result
print(f"Top 10 nouns with the highest frequency: {top_nouns}")

Top 10 nouns with the highest frequency: [('development', 148), ('university', 148), ('data', 130), ('math', 118), ('team', 104), ('system', 93), ('computer', 91), ('science', 89), ('experience', 84), ('design', 82)]
