<a href="https://colab.research.google.com/github/Zantorym/Aidi-capstone-I/blob/review/AIDI1003_Capstone_Dataset_Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Setup

Install `ipython-autotime` to report how long each cell takes to run

In [1]:
!pip install ipython-autotime
%load_ext autotime

time: 2.28 ms (started: 2021-12-11 03:12:47 +00:00)


Import libraries needed by code

In [2]:
import pickle
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

time: 732 ms (started: 2021-12-11 03:12:47 +00:00)


Define constants for easier configuration of runs across users

In [3]:
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Pickle input: datasets read by this notebook.
# NOTE(review): these are named *_OUTPATH although they are inputs here,
# presumably because they are the output paths of the previous stage.
JD_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/Resumes/resumes.pickle'

# Tokenization algorithm
#   0 - string split
#   1 - NLTK TreebankWordTokenizer
#   Any other value behaves like 0 (string split).
TOKENIZATION_ALGORITHM = 0

# N-grams: when greater than 1, n-grams of this size are appended to the
# single-word tokens in the manual tokenization cells.
NGRAM_COUNT = 2

# Stop words
#   FILTER_STOP_WORDS: 1 enables stop-word filtering.
#   STOP_WORDS_SOURCE:
#     1 - use NLTK stop words
#     2 - use scikit-learn stop words
#     any other value (0) - intersection of the NLTK and scikit-learn lists
FILTER_STOP_WORDS = 1
STOP_WORDS_SOURCE = 0

# Case folding
# Note: case folding is always performed, as job descriptions and resumes
#       should have minimal use of proper nouns that need to be told apart
#       from common words.

# Stemming
#   1 - use Porter stemmer
#   2 - use Snowball stemmer
#   any other value (0) - no stemming performed
STEMMER_ALGORITHM = 0

# Lemmatization
# Note: lemmatization cannot be performed because punctuation is removed from
#       the source text; lemmatization requires parts of speech to work
#       properly.

# Filtering of non-alphabetic tokens
#   0 - don't filter
#   1 - filter
FILTER_NON_ALPHABETIC_TOKENS = 1

# Pickle output
# NOTE(review): the resume output path points at a different Drive folder
# than every other path in this cell - confirm this is intentional.
JD_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds-tokenized.pickle'
RESUME_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/resumes-tokenized.pickle'

# Number of results to show in the ranked output
NUM_RESULTS_TO_SHOW = 20

time: 18.6 ms (started: 2021-12-11 03:12:48 +00:00)


Attach google drive to colab instance

In [4]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... pickle
# paths defined in the constants cell can be opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 844 ms (started: 2021-12-11 03:12:48 +00:00)


# Processing

## Pre-requisites

Load the job description and resume datasets from the pickle files produced by the previous stage

In [5]:
# Start from two independent empty dicts. A chained assignment
# (a = b = {}) would bind both names to the same dict object, so a mutation
# through one name would silently show up under the other.
jd_files_dict = {}
resume_files_dict = {}

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by a trusted stage of this pipeline.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 1.42 s (started: 2021-12-11 03:12:49 +00:00)


Print the counts of files loaded

In [6]:
# Report how many documents each loaded dataset contains
jd_count = len(jd_files_dict)
resume_count = len(resume_files_dict)
print('Count of JDs:', jd_count)
print('Count of Resumes:', resume_count)

Count of JDs: 151210
Count of Resumes: 50023
time: 1.87 ms (started: 2021-12-11 03:12:50 +00:00)


Prepare stop words

In [7]:
# Build the stop-word collection selected by STOP_WORDS_SOURCE.
#
# stop_words is always bound: it stays None when stop-word filtering is
# disabled, so later cells that read it (the vectorization cell passes it to
# TfidfVectorizer unconditionally) do not fail with a NameError.
stop_words = None

if FILTER_STOP_WORDS == 1:
  # NLTK list is needed for source 1 and for the intersection (default)
  if STOP_WORDS_SOURCE != 2:
    nltk.download('stopwords')
    nltk_stop_words = nltk.corpus.stopwords.words('english')

  # scikit-learn list is needed for source 2 and for the intersection (default)
  if STOP_WORDS_SOURCE != 1:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

  if STOP_WORDS_SOURCE == 1:
    stop_words = nltk_stop_words
  elif STOP_WORDS_SOURCE == 2:
    stop_words = sklearn_stop_words
  else:
    # Any other value: keep only the words present in both lists
    stop_words = sklearn_stop_words.intersection(nltk_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
time: 63.8 ms (started: 2021-12-11 03:12:50 +00:00)


Prepare stemmer algorithm

In [8]:
# Create the stemmer only for the two supported algorithm values; for any
# other value no stemmer is created and no stemming is performed.
if STEMMER_ALGORITHM in (1, 2):
  from nltk.stem.snowball import SnowballStemmer
  # 1 selects the 'porter' variant, 2 the 'english' (Snowball) variant
  snowball_language = 'porter' if STEMMER_ALGORITHM == 1 else 'english'
  stemmer = SnowballStemmer(language=snowball_language)

time: 4.03 ms (started: 2021-12-11 03:12:50 +00:00)


## Function Definitions

Function to filter out non-alphabetic strings from a list of strings

In [9]:
def filter_alpha_space(words):
  '''
  Keeps only the strings made up purely of letters and whitespace.

  A string is kept when it contains at least one alphabetic character and
  every one of its characters is either alphabetic or whitespace. Empty and
  whitespace-only strings are therefore dropped.

  args:
    * words - A list of strings

  Returns:
    * A new list with the strings that passed the filter, in original order
  '''
  def is_alpha_space(candidate):
    has_letter = any(char.isalpha() for char in candidate)
    only_letters_or_spaces = all(char.isalpha() or char.isspace() for char in candidate)
    return has_letter and only_letters_or_spaces

  return [candidate for candidate in words if is_alpha_space(candidate)]

time: 7.27 ms (started: 2021-12-11 03:12:50 +00:00)


Simplified tokenization function used in vectorization

In [10]:
def tokenize(text):
  '''
  Tokenization function used in vectorization.

  The text is lower-cased and split on whitespace. A token is kept only when
  it is longer than one character and consists solely of alphabetic or digit
  characters.

  args:
    * text - the input text to be tokenized

  returns:
    * the list of tokens that represent the text
  '''
  def is_valid(token):
    if len(token) <= 1:
      return False
    return all(char.isalpha() or char.isdigit() for char in token)

  return [token for token in text.lower().split() if is_valid(token)]

time: 5.77 ms (started: 2021-12-11 03:12:50 +00:00)


Function to request file types for processing

In [11]:
def get_file_type(field_name: str):
  '''
  Prompts for and validates the type of file to process.

  Accepted file types:
    * 1 - Job Description file (JD)
    * 2 - Resume file

  args:
    * field_name - the name of the type being requested; it is only used in
                   the prompt and confirmation messages

  returns:
    * the file type as an int (1 or 2), or None if the entered value is not
      one of the accepted types
  '''
  raw_value = input("Please enter the file type for the {} file(s).\n".format(field_name))

  # Guard clause: anything other than the exact strings '1' or '2' is rejected
  if raw_value not in ('1', '2'):
    print('You have entered an invalid value of "{}" as the {} file type.'.format(raw_value, field_name))
    return None

  file_type = int(raw_value)
  print('You have entered "{}" as the {} file type.'.format(file_type, field_name))
  print('Thank you for your input.')
  return file_type

time: 8.88 ms (started: 2021-12-11 03:12:50 +00:00)


## Input Handling

Get input file type

In [12]:
# Keep prompting until get_file_type returns a valid type (it returns None
# for invalid entries)
input_type = get_file_type("input")
while input_type is None:
  input_type = get_file_type("input")

# Type 1 selects the job descriptions; the only other valid type selects resumes
if input_type == 1:
  input_dictionary = jd_files_dict
else:
  input_dictionary = resume_files_dict

Please enter the file type for the input file(s).
1
You have entered "1" as the input file type.
Thank you for your input.
time: 3.75 s (started: 2021-12-11 03:12:50 +00:00)


Get the input filename

In [13]:
# Keep prompting until the entered name is a key of the selected dataset
input_filename = None
while input_filename is None:
  entered_name = input("Please enter the file name for the input file.\n")
  print('You have entered "{}" as the input file name.'.format(entered_name))
  if entered_name in input_dictionary:
    input_filename = entered_name
  else:
    print('The specified filename is not in the input dictionary, try again.')

Please enter the file name for the input file.
ABAP Consuultant_23157
You have entered "ABAP Consuultant_23157" as the input file name.
time: 8.54 s (started: 2021-12-11 03:12:54 +00:00)


Get output file type

In [14]:
# Keep prompting until get_file_type returns a valid type (it returns None
# for invalid entries)
output_type = get_file_type("output")
while output_type is None:
  output_type = get_file_type("output")

# Type 1 selects the job descriptions; the only other valid type selects resumes
if output_type == 1:
  output_dictionary = jd_files_dict
else:
  output_dictionary = resume_files_dict

Please enter the file type for the output file(s).
1
You have entered "1" as the output file type.
Thank you for your input.
time: 1.27 s (started: 2021-12-11 03:13:03 +00:00)


## Tokenization

Tokenize resume files

In [15]:
# Tokenize Resumes
# NOTE(review): this cell repeats the job-description tokenization logic
# line for line; consider extracting a shared function.
resumes_tokenized = {}

# Loop-invariant setup, done once instead of once per document:
#   * the Treebank tokenizer instance is reused across documents
#   * stop words are put in a frozenset so each membership test is O(1) even
#     when the configured source is a plain list
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None
stop_word_lookup = frozenset(stop_words) if FILTER_STOP_WORDS == 1 else None
apply_stemming = STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2

for key, text in resume_files_dict.items():
  lowered = text.lower()

  # Tokenize words
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  # Stop-word filtering also drops tokens of one or two characters
  if stop_word_lookup is not None:
    tokenized = [token for token in tokenized if (token not in stop_word_lookup and len(token) > 2)]

  if apply_stemming:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams: the n-grams are appended after the single-word tokens
  if NGRAM_COUNT > 1:
    # Handle files that are "empty", i.e. contains only spaces
    if len(tokenized) == 0:
      ngram_tokens = []
    else:
      ngram_tokens = [' '.join(t) for t in ngrams(tokenized, NGRAM_COUNT)]
    tokenized += ngram_tokens

  if FILTER_NON_ALPHABETIC_TOKENS == 1:
    tokenized = filter_alpha_space(tokenized)

  resumes_tokenized[key] = tokenized

time: 1min (started: 2021-12-11 03:13:04 +00:00)


Tokenize job description files

In [16]:
# Tokenize JDs
# NOTE(review): this cell repeats the resume tokenization logic line for
# line; consider extracting a shared function.
jds_tokenized = {}

# Loop-invariant setup, done once instead of once per document:
#   * the Treebank tokenizer instance is reused across documents
#   * stop words are put in a frozenset so each membership test is O(1) even
#     when the configured source is a plain list
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None
stop_word_lookup = frozenset(stop_words) if FILTER_STOP_WORDS == 1 else None
apply_stemming = STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2

for key, text in jd_files_dict.items():
  lowered = text.lower()

  # Tokenize words
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  # Stop-word filtering also drops tokens of one or two characters
  if stop_word_lookup is not None:
    tokenized = [token for token in tokenized if (token not in stop_word_lookup and len(token) > 2)]

  if apply_stemming:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams: the n-grams are appended after the single-word tokens
  if NGRAM_COUNT > 1:
    # Handle files that are "empty", i.e. contains only spaces
    if len(tokenized) == 0:
      ngram_tokens = []
    else:
      ngram_tokens = [' '.join(t) for t in ngrams(tokenized, NGRAM_COUNT)]
    tokenized += ngram_tokens

  if FILTER_NON_ALPHABETIC_TOKENS == 1:
    tokenized = filter_alpha_space(tokenized)

  jds_tokenized[key] = tokenized

time: 2min 24s (started: 2021-12-11 03:14:05 +00:00)


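Note: because of their long processing times, the manually tokenized dictionaries built above (`resumes_tokenized` and `jds_tokenized`) are not used in the vectorization below. Instead, `TfidfVectorizer` performs its own tokenization using the simplified `tokenize` function, as pre-existing libraries are optimized for processing speed.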

## Vectorization

Add an entry for the input text into the output dictionary before vectorization

In [17]:
input_key = 'input:'+input_filename

# Build the corpus dictionary as a fresh (shallow) copy of the selected
# dataset plus the input entry. Assigning into the dataset dictionary itself
# would permanently add the 'input:' entry to jd_files_dict/resume_files_dict,
# and re-running the notebook cells with a different input file would leave
# stale 'input:' entries in the corpus and in the ranked results.
output_source = jd_files_dict if output_type == 1 else resume_files_dict
output_dictionary = {**output_source, input_key: input_dictionary[input_filename]}

time: 2.05 ms (started: 2021-12-11 03:16:30 +00:00)


Prepare corpus

In [18]:
# One row per dictionary entry: the keys (file names, including the
# 'input:'-prefixed entry) become the index and the values go in the single
# 'text' column.
corpus_raw = pd.DataFrame.from_dict(output_dictionary, orient='index', columns=['text'])

time: 109 ms (started: 2021-12-11 03:16:30 +00:00)


Save filenames for numerical index retrieval later

In [19]:
# Index labels as an array, so a file name can be mapped to its row position
# in the corpus
corpus_filenames = corpus_raw.index.values

time: 1.91 ms (started: 2021-12-11 03:16:30 +00:00)


Vectorize

In [20]:
# TfidfVectorizer documents stop_words as 'english', a list, or None. The
# stop-word collection built earlier can be a frozenset, so it is converted to
# a (sorted, for determinism) list here. When stop-word filtering is disabled
# no stop words are passed at all, which also avoids reading a name that was
# never defined in that configuration.
vectorizer_stop_words = sorted(stop_words) if FILTER_STOP_WORDS == 1 else None

# NOTE(review): ngram_range is hard-coded to (1, 2) and is not tied to the
# NGRAM_COUNT constant - confirm whether it should follow the configuration.
vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words=vectorizer_stop_words, ngram_range=(1, 2))
corpus_vectors = vectorizer.fit_transform(corpus_raw['text'])

time: 2min 23s (started: 2021-12-11 03:16:30 +00:00)


Get vector representing input

In [21]:
# Locate the row whose label is the input key, then pull that row out of the
# vectorized corpus
is_input_row = corpus_filenames == input_key
matching_positions = np.where(is_input_row)[0]
input_index = matching_positions[0]
input_vector = corpus_vectors[input_index]

time: 8.52 ms (started: 2021-12-11 03:18:53 +00:00)


# Similarity

Perform Cosine Similarity

In [22]:
# Similarity of every corpus row against the single input row; the result is
# later loaded into a one-column ('similarity') DataFrame
cos_similarity_output = cosine_similarity(corpus_vectors, input_vector)

time: 1.06 s (started: 2021-12-11 03:18:53 +00:00)


Get the most similar results

In [23]:
# Label each similarity score with its file name
cos_similarity_df = pd.DataFrame(
  cos_similarity_output,
  index=corpus_filenames,
  columns=['similarity'],
)

# The input entry always matches itself, so it is excluded; when input and
# output come from the same dataset, the original copy of the input file is
# excluded as well.
if input_type == output_type:
  drop_indices = [input_key, input_filename]
else:
  drop_indices = [input_key]

# Last expression of the cell: rendered as the ranked result table
ranked_candidates = cos_similarity_df.drop(index=drop_indices)
ranked_candidates.nlargest(NUM_RESULTS_TO_SHOW, 'similarity')

Unnamed: 0,similarity
Contract Administr_23653,0.163727
C Developer_28577,0.11195
Sr Java Developer_23658,0.105989
SAP ABAP Consultan_58429,0.094648
HCL tech is Lookin_60402,0.091055
Opening for Positi_60744,0.083848
SAP ABAP Consultan_58075,0.074864
SAP SRM Consultant_67527,0.07474
Application Develo_33324,0.07189
Sap ABAP HANA Open_57494,0.06697


time: 97.1 ms (started: 2021-12-11 03:18:54 +00:00)
