# Preparation

In [1]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 980 µs (started: 2021-12-03 16:28:15 +00:00)


In [2]:
import pickle
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

time: 1.24 s (started: 2021-12-03 16:28:15 +00:00)


In [3]:
# Settings
# Pickle files loaded by this notebook. Despite the OUTPATH suffix, they are
# opened for reading further down.
JD_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/JDs/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = '/content/drive/MyDrive/AIDI1003/Resumes/resumes.pickle'

# Number of rows requested from the ranked similarity table.
NUM_RESULTS_TO_SHOW = 20

time: 3.23 ms (started: 2021-12-03 16:28:17 +00:00)


In [4]:
# Mount Google Drive at /content/drive; the pickle paths configured for this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 3.54 ms (started: 2021-12-03 16:28:17 +00:00)


In [5]:
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# these files must come from a trusted source.
#
# The names are deliberately not pre-initialized: a chained `a = b = {}` would
# bind both to the same dict object, and a failed load should surface as an
# error instead of leaving an empty dictionary behind.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as jd_fh:
  jd_files_dict = pickle.load(jd_fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as resume_fh:
  resume_files_dict = pickle.load(resume_fh)

time: 7.91 s (started: 2021-12-03 16:28:17 +00:00)


In [6]:
nltk.download('stopwords')
nltk_stop_words = nltk.corpus.stopwords.words('english')
# Keep only the words that appear in BOTH the scikit-learn and the NLTK lists.
# The intersection is a frozenset; it is converted to a (sorted, hence
# deterministic) list because newer scikit-learn releases validate that the
# vectorizer's `stop_words` argument is 'english', a list, or None.
stop_words = sorted(sklearn_stop_words.intersection(nltk_stop_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
time: 126 ms (started: 2021-12-03 16:28:24 +00:00)


In [7]:
def tokenize(text):
  """Split text into lowercase, whitespace-separated tokens and filter them.

  A token is kept only when it is longer than one character and every
  character in it is alphabetic or a digit. Tokens containing anything else
  (e.g. attached punctuation) are discarded whole, not cleaned.

  Args:
    text: The string to tokenize.

  Returns:
    A list of the surviving tokens, in their original order.
  """
  def _is_clean(word):
    # Single characters (and empty strings) are never kept.
    if len(word) <= 1:
      return False
    for symbol in word:
      if not (symbol.isalpha() or symbol.isdigit()):
        return False
    return True

  return list(filter(_is_clean, text.lower().split()))

time: 2.1 ms (started: 2021-12-03 16:28:25 +00:00)


In [8]:
# Report how many documents each loaded dictionary holds.
for label, documents in (('Count of JDs:', jd_files_dict),
                         ('Count of Resumes:', resume_files_dict)):
  print(label, len(documents))

Count of JDs: 151210
Count of Resumes: 50023
time: 1.19 ms (started: 2021-12-03 16:28:25 +00:00)


# Processing

File Types:
* 1 - Job Description file (JD)
* 2 - Resume file

In [9]:
def get_file_type(field_name: str):
  """Prompt the user for a file type and validate the answer.

  Valid answers are '1' (Job Description file) and '2' (Resume file).
  Whitespace around the answer is ignored.

  Args:
    field_name: Label inserted into the prompt and the feedback messages
      (for example "input" or "output").

  Returns:
    The chosen file type as an int (1 or 2), or None when the answer is
    invalid so that the caller can prompt again.
  """
  raw_answer: str = \
    input("Please enter the file type for the {} file(s).\n".format(field_name))
  # Stray spaces or tabs are an easy typing slip; strip them instead of
  # rejecting an otherwise valid answer.
  choice = raw_answer.strip()

  if choice in ('1', '2'):
    file_type = int(choice)
    print('You have entered "{}" as the {} file type.'.format(file_type,
                                                              field_name))
    print('Thank you for your input.')
    return file_type

  # Echo the answer exactly as typed so the user can see what was rejected.
  print('You have entered an invalid value of "{}" as the {} file type.'
        .format(raw_answer, field_name))
  return None

time: 12.1 ms (started: 2021-12-03 16:28:25 +00:00)


## Inputs

Get input file type

In [10]:
# get_file_type returns None for an invalid answer, so keep prompting until a
# valid type comes back.
input_type = get_file_type("input")
while input_type is None:
  input_type = get_file_type("input")

# Type 1 selects the job descriptions; the only other valid type selects the
# resumes.
if input_type == 1:
  input_dictionary = jd_files_dict
else:
  input_dictionary = resume_files_dict

Please enter the file type for the input file(s).
1
You have entered "1" as the input file type.
Thank you for your input.
time: 2.29 s (started: 2021-12-03 16:28:25 +00:00)


Get the input filename

In [11]:
# Prompt until the entered name is a key of the selected input dictionary.
while True:
  input_filename = input("Please enter the file name for the input file.\n")
  print('You have entered "{}" as the input file name.'.format(input_filename))
  if input_filename in input_dictionary:
    break
  print('The specified filename is not in the input dictionary, try again.')

Please enter the file name for the input file.
ABAP Consuultant_23157
You have entered "ABAP Consuultant_23157" as the input file name.
time: 1.57 s (started: 2021-12-03 16:28:27 +00:00)


Get output file type

In [12]:
# get_file_type returns None for an invalid answer, so keep prompting until a
# valid type comes back.
output_type = get_file_type("output")
while output_type is None:
  output_type = get_file_type("output")

# Type 1 selects the job descriptions; the only other valid type selects the
# resumes.
if output_type == 1:
  output_dictionary = jd_files_dict
else:
  output_dictionary = resume_files_dict

Please enter the file type for the output file(s).
1
You have entered "1" as the output file type.
Thank you for your input.
time: 1.27 s (started: 2021-12-03 16:28:29 +00:00)


## Similarity Check Logic

Add an entry for the input text into the output dictionary before vectorization

In [13]:
input_key = 'input:'+input_filename
# Build the corpus dictionary as a fresh shallow copy of the unmodified source
# dictionary rather than inserting into the loaded JD/resume dictionary itself.
# Mutating the loaded dictionary in place would leave 'input:...' entries from
# earlier runs in the corpus (where they would be ranked as results), and would
# also alter input_dictionary whenever input and output types are the same.
output_dictionary = dict(jd_files_dict if output_type == 1 else resume_files_dict)
output_dictionary[input_key] = input_dictionary[input_filename]

time: 1.49 ms (started: 2021-12-03 16:28:30 +00:00)


Prepare corpus

In [14]:
# One row per dictionary entry: the keys become the index and the values fill
# the single 'text' column.
corpus_raw = pd.DataFrame.from_dict(
    output_dictionary,
    orient='index',
    columns=['text'],
)

time: 91.3 ms (started: 2021-12-03 16:28:30 +00:00)


Save the filenames so that a filename can be mapped back to its numerical row index later

In [15]:
# Array of the index labels (file names), in the same row order as corpus_raw.
corpus_filenames = corpus_raw.index.to_numpy()

time: 1.42 ms (started: 2021-12-03 16:28:30 +00:00)


In [16]:
# Preview the first five rows of the corpus.
corpus_raw.head(n=5)

Unnamed: 0,text
_1158,Amazon Web Services AWS is looking for a passi...
15x Bricklayers_87905,Apply now Job Title Bricklayers Location New...
_19361,ORACLE PLSQL DEVELOPER One of our Fortune 100...
1C Developer_15174,Gsoft is looking for an 1C Developer who will ...
1C Developer_5043,Link Ltd is actively looking for energetic and...


time: 18 ms (started: 2021-12-03 16:28:30 +00:00)


Vectorize

In [17]:
# TF-IDF over unigrams and bigrams, using the notebook's own tokenizer.
# token_pattern is ignored whenever a custom tokenizer is supplied; passing
# None states that explicitly and avoids scikit-learn's warning about the
# unused parameter.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,
    stop_words=stop_words,
    ngram_range=(1, 2),
)
corpus_vectors = vectorizer.fit_transform(corpus_raw['text'])

time: 2min 36s (started: 2021-12-03 16:28:30 +00:00)


Get vector representing input

In [18]:
# Row position of the injected input document: the first position whose file
# name equals input_key.
matching_positions = np.flatnonzero(corpus_filenames == input_key)
input_index = matching_positions[0]
input_vector = corpus_vectors[input_index]

time: 17.6 ms (started: 2021-12-03 16:31:06 +00:00)


Perform Cosine Similarity

In [19]:
# Similarity of every corpus row against the single input vector.
cos_similarity_output = cosine_similarity(X=corpus_vectors, Y=input_vector)

time: 1.08 s (started: 2021-12-03 16:31:07 +00:00)


## Results

Get the most similar results

In [20]:
# Rows excluded from the ranking: the injected copy of the input and, when the
# search runs within the same file type, the input's original entry as well.
drop_indices = [input_key, input_filename] if input_type == output_type else [input_key]

cos_similarity_df = pd.DataFrame(
    data=cos_similarity_output,
    index=corpus_filenames,
    columns=['similarity'],
)

# The last expression is the cell's displayed result: the top matches by
# similarity.
ranked_candidates = cos_similarity_df.drop(index=drop_indices)
ranked_candidates.nlargest(NUM_RESULTS_TO_SHOW, 'similarity')

Unnamed: 0,similarity
Contract Administr_23653,0.163727
C Developer_28577,0.11195
Sr Java Developer_23658,0.105989
SAP ABAP Consultan_58429,0.094648
HCL tech is Lookin_60402,0.091055
Opening for Positi_60744,0.083848
SAP ABAP Consultan_58075,0.074864
SAP SRM Consultant_67527,0.07474
Application Develo_33324,0.07189
Sap ABAP HANA Open_57494,0.06697


time: 111 ms (started: 2021-12-03 16:31:08 +00:00)
