In [1]:
!pip install ipython-autotime
%load_ext autotime

time: 1.71 ms (started: 2021-10-27 19:48:50 +00:00)


In [2]:
import pickle
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams

time: 584 ms (started: 2021-10-27 19:48:50 +00:00)


In [3]:
# ---------------------------------------------------------------------------
# Constants: every tunable of the tokenization pipeline lives in this cell.
# ---------------------------------------------------------------------------

# Pickle input: files read at the start of the notebook.
# (Despite the *_OUTPATH suffix, these two paths are opened for reading.)
JD_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/Resumes/resumes.pickle'

# Tokenization
#   0 - string split
#   1 - NLTK TreebankWordTokenizer
#   Note: any value other than 1 behaves like 0 (string split).
TOKENIZATION_ALGORITHM = 0

# NGrams
#   When greater than 1, n-grams of this size are appended to the word tokens.
NGRAM_COUNT = 2

# Stop Words
#   FILTER_STOP_WORDS: 1 enables stop-word removal; any other value disables it.
FILTER_STOP_WORDS = 1
#   STOP_WORDS_SOURCE:
#     1 - Use NLTK stop words
#     2 - Use Scikit-learn stop words
#     Note: any other value (including 0) uses the intersection of the NLTK
#           and Scikit-learn stop words.
STOP_WORDS_SOURCE = 0

# Case Folding
#   Note: case folding is always performed, as job descriptions and resumes
#         should make minimal use of proper nouns that need to be told apart
#         from common words.

# Stemming
#   1 - Use Porter stemmer
#   2 - Use Snowball stemmer
#   Note: any other value (including 0) means no stemming is performed.
STEMMER_ALGORITHM = 2

# Lemmatization
#   Note: lemmatization cannot be performed because punctuation is removed
#         from the source text; lemmatization requires parts of speech to
#         work properly.

# Pickle output: files written at the end of the notebook.
JD_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds-tokenized.pickle'
RESUME_TOKENS_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/Resumes/resumes-tokenized.pickle'

time: 10.6 ms (started: 2021-10-27 19:48:51 +00:00)


In [4]:
# Mount Google Drive at /content/drive; the pickle paths configured in the
# constants cell all live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 2.44 ms (started: 2021-10-27 19:48:51 +00:00)


In [5]:
# Load the job-description and resume collections from their pickle files.
# Downstream cells iterate each collection by key and call .lower() on every
# value, so the values are presumably the raw document texts - verify against
# the notebook that produced these pickles.
#
# No placeholder initialisation is needed: each name is bound directly by its
# own load, so a failed load raises instead of leaving an empty dict behind.
#
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by a trusted source.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 10.3 s (started: 2021-10-27 19:48:51 +00:00)


In [6]:
# Build `stop_words`, the collection the tokenization cells test every token
# against. Nothing is defined when stop-word filtering is disabled.
if FILTER_STOP_WORDS == 1:
  # The NLTK list is needed for source 1 and for the default intersection.
  if STOP_WORDS_SOURCE != 2:
    nltk.download('stopwords')
    nltk_stop_words = nltk.corpus.stopwords.words('english')

  # The scikit-learn set is needed for source 2 and for the default intersection.
  if STOP_WORDS_SOURCE != 1:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

  if STOP_WORDS_SOURCE == 1:
    # Convert to a frozenset so `token not in stop_words` is a hash lookup
    # per token instead of a scan over the whole stop-word collection.
    stop_words = frozenset(nltk_stop_words)
  elif STOP_WORDS_SOURCE == 2:
    stop_words = sklearn_stop_words
  else:
    # Default: keep only the words both sources agree on.
    stop_words = sklearn_stop_words.intersection(nltk_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
time: 43.7 ms (started: 2021-10-27 19:49:01 +00:00)


In [7]:
# Create `stemmer` for the configured algorithm: 1 selects the 'porter'
# Snowball language, 2 selects 'english'. For any other value no stemmer is
# created, and the tokenization cells skip stemming.
if STEMMER_ALGORITHM in (1, 2):
  from nltk.stem.snowball import SnowballStemmer
  stemmer = SnowballStemmer(language='porter' if STEMMER_ALGORITHM == 1 else 'english')

time: 2.57 ms (started: 2021-10-27 19:49:01 +00:00)


In [8]:
# Tokenize Resumes
#
# Per document: case-fold, split into word tokens, optionally drop stop words
# and stem, then append the space-joined NGRAM_COUNT-grams after the word
# tokens. The result maps each resume key to its list of tokens.
# NOTE(review): the "Tokenize JDs" cell repeats this pipeline step for step;
# a shared helper function would keep the two from drifting apart.

# Loop-invariant: build the Treebank tokenizer once, not once per document.
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None

resumes_tokenized = {}
for key, text in resume_files_dict.items():
  lowered = text.lower()

  # Tokenize words
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  if FILTER_STOP_WORDS == 1:
    tokenized = [token for token in tokenized if token not in stop_words]

  if STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams
  # A document with fewer than NGRAM_COUNT tokens cannot form an n-gram, so it
  # is skipped. This covers "empty" files (only spaces) and any other document
  # too short for the configured size.
  # NOTE(review): older NLTK releases appear to raise from ngrams() when the
  # input runs out before the first n-gram is filled - confirm against the
  # installed version.
  if NGRAM_COUNT > 1 and len(tokenized) >= NGRAM_COUNT:
    # Build the full n-gram list first: ngrams() reads `tokenized` lazily, so
    # the list must not grow until the generator has been consumed.
    ngram_tokens = [' '.join(gram) for gram in ngrams(tokenized, NGRAM_COUNT)]
    tokenized += ngram_tokens

  resumes_tokenized[key] = tokenized

time: 2min 57s (started: 2021-10-27 19:49:01 +00:00)


In [9]:
# Tokenize JDs
#
# Per document: case-fold, split into word tokens, optionally drop stop words
# and stem, then append the space-joined NGRAM_COUNT-grams after the word
# tokens. The result maps each job-description key to its list of tokens.
# NOTE(review): the "Tokenize Resumes" cell repeats this pipeline step for
# step; a shared helper function would keep the two from drifting apart.

# Loop-invariant: build the Treebank tokenizer once, not once per document.
treebank_tokenizer = TreebankWordTokenizer() if TOKENIZATION_ALGORITHM == 1 else None

jds_tokenized = {}
for key, text in jd_files_dict.items():
  lowered = text.lower()

  # Tokenize words
  if treebank_tokenizer is not None:
    tokenized = treebank_tokenizer.tokenize(lowered)
  else:
    tokenized = lowered.split()

  if FILTER_STOP_WORDS == 1:
    tokenized = [token for token in tokenized if token not in stop_words]

  if STEMMER_ALGORITHM == 1 or STEMMER_ALGORITHM == 2:
    tokenized = [stemmer.stem(token) for token in tokenized]

  # Tokenize ngrams
  # A document with fewer than NGRAM_COUNT tokens cannot form an n-gram, so it
  # is skipped. This covers "empty" files (only spaces) and any other document
  # too short for the configured size.
  # NOTE(review): older NLTK releases appear to raise from ngrams() when the
  # input runs out before the first n-gram is filled - confirm against the
  # installed version.
  if NGRAM_COUNT > 1 and len(tokenized) >= NGRAM_COUNT:
    # Build the full n-gram list first: ngrams() reads `tokenized` lazily, so
    # the list must not grow until the generator has been consumed.
    ngram_tokens = [' '.join(gram) for gram in ngrams(tokenized, NGRAM_COUNT)]
    tokenized += ngram_tokens

  jds_tokenized[key] = tokenized

time: 6min 43s (started: 2021-10-27 19:51:58 +00:00)


In [10]:
# Persist both tokenized collections, job descriptions first and then
# resumes, each to its configured output path.
for out_path, tokens_by_key in (
    (JD_TOKENS_PICKLE_OUTPATH, jds_tokenized),
    (RESUME_TOKENS_PICKLE_OUTPATH, resumes_tokenized),
):
  with open(out_path, 'wb') as fh:
    pickle.dump(tokens_by_key, fh)

time: 49.8 s (started: 2021-10-27 19:58:41 +00:00)
