<a href="https://colab.research.google.com/github/Zantorym/Aidi-capstone-I/blob/review/AIDI1003_Capstone_Dataset_Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 173 µs (started: 2021-11-04 18:07:13 +00:00)


In [None]:
import pickle
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams

time: 1.41 s (started: 2021-11-04 18:07:13 +00:00)


In [None]:
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Pickle Input
# These two files are opened for reading by the loading cell, despite the
# "OUTPATH" suffix in the constant names.
JD_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/resumes.pickle'

# Tokenization
#   0 - string split
#   1 - NLTK TreebankWordTokenizer
# Note: any value other than 1 is treated as 0 (string split).
TOKENIZATION_ALGORITHM = 0

# NGrams
# Size of the n-grams appended after the single-word tokens; n-grams are only
# generated when this value is greater than 1.
NGRAM_COUNT = 2

# Stop Words
#   FILTER_STOP_WORDS: 1 - remove stop words (tokens of two characters or
#   fewer are dropped in the same step); any other value - keep everything.
FILTER_STOP_WORDS = 1
#   STOP_WORDS_SOURCE:
#   1 - Use NLTK stop words
#   2 - Use Scikit-learn stop words
#   Note: any other value (default 0) uses the intersection of the NLTK and
#   Scikit-learn lists.
STOP_WORDS_SOURCE = 0

# Case Folding
# Note: case folding is always performed, as job descriptions and resumes
#       should make minimal use of proper nouns that need to be told apart
#       from common words.

# Stemming
#   1 - Use Porter stemmer
#   2 - Use Snowball stemmer
#   Note: any other value (default 0) means no stemming is performed.
STEMMER_ALGORITHM = 0

# Lemmatization
# Note: Cannot perform lemmatization as punctuation is removed from the source
#       text. Lemmatization requires parts of speech to work properly.

# Filtering non-alphabetic tokens
#   0 - Don't filter
#   1 - Filter
FILTER_NON_ALPHABETIC_TOKENS = 1

# Pickle Output
# NOTE(review): the JD output path points at a different Drive folder
# (MyDrive/AIDI1003/JDs) than every other path in this cell - confirm that
# this is intentional.
JD_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds-tokenized.pickle'
RESUME_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/resumes-tokenized.pickle'

time: 10.2 ms (started: 2021-11-04 18:11:12 +00:00)


In [None]:
# Mount Google Drive inside the Colab runtime; every pickle path configured in
# this notebook lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw job descriptions and resumes. Later cells iterate over the keys
# of each object and lower-case the values, so each is presumably a dict of
# document key -> text (verify against the notebook that wrote the pickles).
#
# The former `jd_files_dict = resume_files_dict = {}` initialisation was
# dropped: it bound both names to one shared dict and both names are
# unconditionally reassigned below anyway.
#
# NOTE: pickle.load can execute arbitrary code while deserialising - only load
# files that this project produced itself.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 9.8 s (started: 2021-11-04 18:07:21 +00:00)


In [None]:
# Build `stop_words` according to STOP_WORDS_SOURCE:
#   1 -> NLTK English list, 2 -> scikit-learn list, anything else -> the words
#   present in both lists.
# `stop_words` is only defined when FILTER_STOP_WORDS == 1; the tokenization
# cells guard their use of it with the same condition.
if FILTER_STOP_WORDS == 1:
  if STOP_WORDS_SOURCE != 2:
    # The NLTK corpus has to be downloaded before it can be read.
    nltk.download('stopwords')
    nltk_stop_words = nltk.corpus.stopwords.words('english')

  if STOP_WORDS_SOURCE != 1:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

  # Always expose a frozenset: the NLTK corpus reader returns a list, which
  # would turn every `token not in stop_words` test into a linear scan.
  if STOP_WORDS_SOURCE == 1:
    stop_words = frozenset(nltk_stop_words)
  elif STOP_WORDS_SOURCE == 2:
    stop_words = frozenset(sklearn_stop_words)
  else:
    stop_words = frozenset(sklearn_stop_words).intersection(nltk_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
time: 62.3 ms (started: 2021-11-04 18:07:31 +00:00)


In [None]:
# Create `stemmer` only when stemming is enabled (STEMMER_ALGORITHM 1 or 2).
# Both options go through NLTK's SnowballStemmer class: option 1 asks it for
# the 'porter' language, option 2 for 'english'.
if STEMMER_ALGORITHM in (1, 2):
  from nltk.stem.snowball import SnowballStemmer
  stemmer = SnowballStemmer(
      language='porter' if STEMMER_ALGORITHM == 1 else 'english'
  )

time: 13.3 ms (started: 2021-11-04 18:07:32 +00:00)


In [None]:
def filter_alpha_space(words):
  '''
  Filters out non-alphabetic strings from a list of strings.

  A string is kept only when it contains at least one alphabetic character
  and every one of its characters is either alphabetic or whitespace. This
  keeps plain words and space-joined n-grams while dropping empty strings,
  whitespace-only strings and anything containing digits or punctuation.

  args:
    * words - A list of strings

  Returns:
    * A filtered list of strings, in the original order
  '''
  # Iterate over the `words` argument; the previous version looped over an
  # undefined global named `df`.
  return [
      string for string in words
      if any(ch.isalpha() for ch in string)
      and all(ch.isalpha() or ch.isspace() for ch in string)
  ]

In [None]:
# Tokenize Resumes
# Pipeline per document: lower-case -> split into words -> optional stop-word
# and short-token removal -> optional stemming -> append n-grams -> optional
# removal of non-alphabetic tokens.
# NOTE(review): the "Tokenize JDs" cell repeats this pipeline step for step;
# consider extracting one shared function so the two cannot drift apart.

# Build the Treebank tokenizer once rather than once per document.
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None

resumes_tokenized = {}
for key, text in resume_files_dict.items():
  # Tokenize words (case folding is always applied first)
  lowered = text.lower()
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  if FILTER_STOP_WORDS == 1:
    # Tokens of two characters or fewer are dropped together with stop words.
    tokenized = [token for token in tokenized if (token not in stop_words and len(token) > 2)]

  if STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams
  # Documents with fewer tokens than NGRAM_COUNT (including "empty" files that
  # contain only spaces) cannot form an n-gram, so they are skipped outright.
  if NGRAM_COUNT > 1 and len(tokenized) >= NGRAM_COUNT:
    ngram_tokens = [' '.join(gram) for gram in ngrams(tokenized, NGRAM_COUNT)]
    tokenized = tokenized + ngram_tokens

  if FILTER_NON_ALPHABETIC_TOKENS == 1:
    tokenized = filter_alpha_space(tokenized)

  resumes_tokenized[key] = tokenized

time: 3min 24s (started: 2021-11-04 18:07:47 +00:00)


In [None]:
# Tokenize JDs
# Pipeline per document: lower-case -> split into words -> optional stop-word
# and short-token removal -> optional stemming -> append n-grams -> optional
# removal of non-alphabetic tokens.
# NOTE(review): the "Tokenize Resumes" cell repeats this pipeline step for
# step; consider extracting one shared function so the two cannot drift apart.

# Build the Treebank tokenizer once rather than once per document.
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None

jds_tokenized = {}
for key, text in jd_files_dict.items():
  # Tokenize words (case folding is always applied first)
  lowered = text.lower()
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  if FILTER_STOP_WORDS == 1:
    # Tokens of two characters or fewer are dropped together with stop words.
    tokenized = [token for token in tokenized if (token not in stop_words and len(token) > 2)]

  if STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams
  # Documents with fewer tokens than NGRAM_COUNT (including "empty" files that
  # contain only spaces) cannot form an n-gram, so they are skipped outright.
  if NGRAM_COUNT > 1 and len(tokenized) >= NGRAM_COUNT:
    ngram_tokens = [' '.join(gram) for gram in ngrams(tokenized, NGRAM_COUNT)]
    tokenized = tokenized + ngram_tokens

  if FILTER_NON_ALPHABETIC_TOKENS == 1:
    tokenized = filter_alpha_space(tokenized)

  jds_tokenized[key] = tokenized

time: 6min 43s (started: 2021-10-27 19:51:58 +00:00)


In [None]:
# Persist both token dictionaries (document key -> list of tokens) to the
# output paths configured in the constants cell.
with open(JD_TOKENS_PICKLE_OUTPATH, mode='wb') as jd_out:
  pickle.dump(jds_tokenized, jd_out)

with open(RESUME_TOKENS_PICKLE_OUTPATH, mode='wb') as resume_out:
  pickle.dump(resumes_tokenized, resume_out)

time: 15.5 s (started: 2021-11-04 18:11:12 +00:00)
