# Preparation

In [1]:
!pip install ipython-autotime
%load_ext autotime

time: 2.19 ms (started: 2021-12-02 02:28:09 +00:00)


In [2]:
import pickle
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

time: 573 ms (started: 2021-12-02 02:28:09 +00:00)


In [3]:
# Constants
# Locations of the pickled corpora that this notebook reads (Google Drive).
JD_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/Resumes/resumes.pickle'

# How many of the best matches the results table shows.
NUM_RESULTS_TO_SHOW = 20

time: 4.51 ms (started: 2021-12-02 02:28:10 +00:00)


In [4]:
# Colab-only step: mount Google Drive at /content/drive so that the pickle paths
# defined in the constants cell (which live under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 3.55 ms (started: 2021-12-02 02:28:10 +00:00)


In [5]:
# Load the two pickled corpora. Downstream cells use each object as a mapping
# of file name -> document text.
# No placeholder initialisation is needed: both names are bound by the loads
# below, and a chained `a = b = {}` would make them share one dict object.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only point these paths at files you created yourself or otherwise trust.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 780 ms (started: 2021-12-02 02:28:10 +00:00)


In [6]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Keep only the words present in BOTH the NLTK and the scikit-learn English
# stop-word lists; the result is a frozenset.
nltk_stop_words = nltk.corpus.stopwords.words('english')
stop_words = sklearn_stop_words.intersection(nltk_stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
time: 41.6 ms (started: 2021-12-02 02:28:10 +00:00)


In [7]:
def tokenize(text):
  """Split `text` into lower-cased word tokens.

  The text is lower-cased and split on whitespace. A token is kept only if it
  is at least two characters long and every character is a letter or a digit,
  so tokens carrying punctuation (e.g. "foo-bar", "c++") are dropped entirely.

  Returns:
    list of str: the surviving tokens, in their original order.
  """
  def _keep(token):
    # Single-character tokens are discarded.
    if len(token) <= 1:
      return False
    # Reject the token as soon as one character is neither letter nor digit.
    for char in token:
      if not (char.isalpha() or char.isdigit()):
        return False
    return True

  return list(filter(_keep, text.lower().split()))

time: 3.58 ms (started: 2021-12-02 02:28:10 +00:00)


In [8]:
# Quick sanity check on how many documents each loaded corpus contains.
for label, files_dict in (('Count of JDs:', jd_files_dict),
                          ('Count of Resumes:', resume_files_dict)):
  print(label, len(files_dict))

Count of JDs: 151210
Count of Resumes: 50023
time: 4.42 ms (started: 2021-12-02 02:28:10 +00:00)


# Processing

File Types:
* 1 - Job Description file (JD)
* 2 - Resume file

In [9]:
def get_file_type(field_name: str):
  """Ask the user for a file type code and validate the answer.

  Valid codes are '1' (job description) and '2' (resume). Whitespace around
  the typed answer is ignored, so "1 " is accepted as 1.

  Args:
    field_name: Label inserted into the prompt and the feedback messages
      (the notebook passes "input" or "output").

  Returns:
    The chosen code as an int (1 or 2), or None when the answer is invalid so
    that the caller can prompt again.
  """
  answer: str = \
    input("Please enter the file type for the {} file(s).\n".format(field_name))
  # Tolerate stray spaces/newlines around the code.
  choice = answer.strip()

  if choice in ('1', '2'):
    file_type = int(choice)
    print('You have entered "{}" as the {} file type.'.format(file_type,
                                                              field_name))
    print('Thank you for your input.')
    return file_type

  # Echo exactly what was typed so the user can see why it was rejected.
  print('You have entered an invalid value of "{}" as the {} file type.'\
        .format(answer, field_name))
  return None

time: 6.62 ms (started: 2021-12-02 02:28:10 +00:00)


## Inputs

Get input file type

In [10]:
# Keep asking until get_file_type() returns a valid code (it returns None for
# anything other than 1 or 2).
while True:
  input_type = get_file_type("input")
  if input_type is not None:
    break

# Code 1 selects the job-description corpus; otherwise use the resumes.
if input_type == 1:
  input_dictionary = jd_files_dict
else:
  input_dictionary = resume_files_dict

Please enter the file type for the input file(s).
1
You have entered "1" as the input file type.
Thank you for your input.
time: 2.16 s (started: 2021-12-02 02:28:10 +00:00)


Get the input filename

In [11]:
# Prompt repeatedly until the typed name is a key of the selected corpus.
input_filename = None
while True:
  input_filename = input("Please enter the file name for the input file.\n")
  print('You have entered "{}" as the input file name.'.format(input_filename))
  if input_filename in input_dictionary:
    break
  print('The specified filename is not in the input dictionary, try again.')
  # Reset so an interrupted prompt never leaves an invalid name behind.
  input_filename = None

Please enter the file name for the input file.
LEED Reporting Eng_12627
You have entered "LEED Reporting Eng_12627" as the input file name.
time: 16.5 s (started: 2021-12-02 02:28:13 +00:00)


Get output file type

In [12]:
# Keep asking until a valid output file type (1 or 2) is supplied.
output_type = None
while output_type is None:
  output_type = get_file_type("output")

# Choose the corpus to search from the OUTPUT type the user just entered.
# (Selecting on input_type here would silently ignore that choice and always
# search the same kind of document as the input.)
# NOTE: when the output type differs from the input type, the input file name
# is generally not a key of this dictionary; later steps must not assume it is.
output_dictionary = jd_files_dict if output_type == 1 else resume_files_dict

Please enter the file type for the output file(s).
1
You have entered "1" as the output file type.
Thank you for your input.
time: 3.79 s (started: 2021-12-02 02:28:29 +00:00)


## Similarity Check Logic

Add an entry for the input text into the output dictionary before vectorization

In [13]:
# Synthetic key under which the query document is added to the corpus.
input_key = 'input:'+input_filename
# Build a NEW dict instead of inserting into output_dictionary in place: that
# name is bound to one of the loaded corpus dicts, so an in-place insert would
# permanently add the synthetic 'input:*' entry to the corpus and it would show
# up as a candidate in every later search of the session.
# (Shallow copy only - the document texts themselves are not duplicated.)
# NOTE: re-run the output-type cell before re-running this cell with a
# different input, so the copy starts again from the pristine corpus.
output_dictionary = {**output_dictionary,
                     input_key: input_dictionary[input_filename]}

time: 1.28 ms (started: 2021-12-02 02:28:33 +00:00)


Prepare corpus

In [14]:
# One row per document: the index holds the dictionary keys (file names) and
# the single 'text' column holds the corresponding values.
corpus_raw = pd.DataFrame.from_dict(
  data=output_dictionary,
  orient='index',
  columns=['text'],
)

time: 72.6 ms (started: 2021-12-02 02:28:33 +00:00)


Save filenames for numerical index retrieval later

In [15]:
# Row labels as a NumPy array, in the same order as the rows of corpus_raw, so
# that a label can be mapped back to its row position later.
corpus_filenames = corpus_raw.index.to_numpy()

time: 1.03 ms (started: 2021-12-02 02:28:33 +00:00)


In [16]:
# Preview the first five rows of the corpus.
corpus_raw.head(n=5)

Unnamed: 0,text
_1158,Amazon Web Services AWS is looking for a passi...
15x Bricklayers_87905,Apply now Job Title Bricklayers Location New...
_19361,ORACLE PLSQL DEVELOPER One of our Fortune 100...
1C Developer_15174,Gsoft is looking for an 1C Developer who will ...
1C Developer_5043,Link Ltd is actively looking for energetic and...


time: 14.2 ms (started: 2021-12-02 02:28:33 +00:00)


Vectorize

In [17]:
# Fit TF-IDF on every document of the corpus (the appended input document
# included) and transform them in one pass; rows of corpus_vectors follow the
# row order of corpus_raw.
# NOTE(review): the vectorizer is built with default settings - the notebook's
# tokenize() helper and stop_words set are not passed to it. Confirm that the
# defaults are what is intended.
corpus_texts = corpus_raw['text']
vectorizer = TfidfVectorizer()
corpus_vectors = vectorizer.fit_transform(corpus_texts)

time: 33.5 s (started: 2021-12-02 02:28:33 +00:00)


Get vector representing input

In [18]:
# Find the row position whose label is the synthetic input key, then pull the
# matching row out of the TF-IDF matrix.
(matching_rows,) = np.where(corpus_filenames == input_key)
input_index = matching_rows[0]
input_vector = corpus_vectors[input_index]

time: 7.09 ms (started: 2021-12-02 02:29:07 +00:00)


Perform Cosine Similarity

In [19]:
# Similarity of every corpus row against the single input row; the result has
# one row per corpus document and one column.
cos_similarity_output = cosine_similarity(X=corpus_vectors, Y=input_vector)

time: 264 ms (started: 2021-12-02 02:29:07 +00:00)


## Results

Get the most similar results

In [20]:
# Label each similarity score with the file name of the corpus row it belongs to.
cos_similarity_df = pd.DataFrame(cos_similarity_output,
                                 index=corpus_filenames,
                                 columns=['similarity'])
# Remove the query itself before ranking: the synthetic 'input:*' row and, if it
# is part of the searched corpus, the original entry of the input file.
# errors='ignore' matters because the input file name is only a row label when
# the searched corpus is the one the input came from; without it drop() raises
# KeyError for a missing label.
(cos_similarity_df
 .drop(index=[input_key, input_filename], errors='ignore')
 .nlargest(NUM_RESULTS_TO_SHOW, 'similarity'))

Unnamed: 0,similarity
Construction Site_15274,0.48016
Designer Architect_3251,0.345975
Construction Healt_38071,0.336241
Civil Engineer_4812,0.32434
Project Coordinato_8670,0.314234
Construction Super_8353,0.311522
Construction Super_5913,0.309241
Mechanical Enginee_14234,0.306291
Construction Proje_14320,0.305974
Monster_45652,0.300677


time: 64 ms (started: 2021-12-02 02:29:07 +00:00)
