<a href="https://colab.research.google.com/github/Zantorym/Aidi-capstone-I/blob/main/AIDI1003_Capstone_Similarity_MAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [1]:
# Timer to measure code execution time
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 882 µs (started: 2021-12-09 04:27:52 +00:00)


In [2]:
# Importing libraries
import pickle
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

time: 1.23 s (started: 2021-12-09 04:27:52 +00:00)


In [44]:
# Constants
# Pickle Input
JD_FILES_PICKLE_OUTPATH='/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH='/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets/resumes.pickle'

PREPROCESS_METHOD = 1 # 0 if we don't want to pre-process, 1 if we want to pre-process
VECTORIZATION_METHOD = 1 # 0 for TF-IDF, 1 for bag of words
SIMILARITY_METHOD = 0 # 0 for cosine similarity, 1 for euclidean distance

time: 2.76 ms (started: 2021-12-09 05:23:08 +00:00)


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
time: 31.5 s (started: 2021-12-09 04:27:55 +00:00)


In [5]:
jd_files_dict = resume_files_dict = {}
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 8.75 s (started: 2021-12-09 04:28:27 +00:00)


In [6]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words = sklearn_stop_words.intersection(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
time: 129 ms (started: 2021-12-09 04:28:36 +00:00)


In [7]:
def tokenize(text):
  tokenized = text.lower().split()
  tokenized = [token for token in tokenized if (token not in stop_words and len(token)>1 and all(char.isalpha() or char.isdigit() for char in token) and any(char.isalpha() for char in token))] # No stop words, Length > 1, only alphanumeric tokens, at least one alphabet
  string = ' '.join(tokenized) # Convert to string for TFIDF vectorizer
  return string

time: 5.3 ms (started: 2021-12-09 04:28:36 +00:00)


In [8]:
print('Count of JDs:', len(jd_files_dict))
print('Count of Resumes:', len(resume_files_dict))

Count of JDs: 151210
Count of Resumes: 50023
time: 4.56 ms (started: 2021-12-09 04:28:36 +00:00)


In [9]:
combined_files_dict = {}
for filename in jd_files_dict:
  modified_fn = 'jd:' + filename
  if len(jd_files_dict[filename].strip()) == 0:
    continue
  combined_files_dict[modified_fn] = jd_files_dict[filename]
for filename in resume_files_dict:
  modified_fn = 'rs:' + filename
  if len(resume_files_dict[filename].strip()) == 0:
    continue
  combined_files_dict[modified_fn] = resume_files_dict[filename]

time: 254 ms (started: 2021-12-09 04:28:36 +00:00)


In [10]:
jd_filenames = [key for key in combined_files_dict.keys() if key.startswith('jd:')]

time: 48.4 ms (started: 2021-12-09 04:28:37 +00:00)


In [11]:
resume_filenames = [key for key in combined_files_dict.keys() if key.startswith('rs:')]

time: 36.9 ms (started: 2021-12-09 04:28:37 +00:00)


In [12]:
print('Count of Combines:', len(combined_files_dict))

Count of Combines: 201189
time: 3.22 ms (started: 2021-12-09 04:28:37 +00:00)


In [13]:
# Converting corpus from dictionary to dataframe
corpus_raw = pd.DataFrame.from_dict(combined_files_dict, orient='index', columns=['text'])

time: 124 ms (started: 2021-12-09 04:28:37 +00:00)


In [36]:
# Applying some pre-processing to the dataset
if PREPROCESS_METHOD == 1:
  corpus_raw['text'] = corpus_raw['text'].apply(tokenize)

time: 1min 17s (started: 2021-12-09 05:14:06 +00:00)


In [37]:
if VECTORIZATION_METHOD == 0:
  vectorizer = TfidfVectorizer()
  corpus_vectors = vectorizer.fit_transform(corpus_raw['text'])
else:
  vectorizer = CountVectorizer()
  corpus_vectors = vectorizer.fit_transform(corpus_raw['text'])

time: 34.3 s (started: 2021-12-09 05:15:24 +00:00)


In [38]:
corpus_vectors.shape

(201189, 660008)

time: 4.05 ms (started: 2021-12-09 05:15:58 +00:00)


In [39]:
corpus_filenames = corpus_raw.index.values # List of file names in the corpus

time: 1.26 ms (started: 2021-12-09 05:15:58 +00:00)


### Test files for evaluating

In [19]:
EVAL_MATRIX_FILE_PATH = '/content/drive/MyDrive/Durham College/Capstone - I/Evaluation_Matrix.xlsx'

eval_matrix = pd.ExcelFile(EVAL_MATRIX_FILE_PATH)
jd2jd = pd.read_excel(eval_matrix, 'JD_2_JD') # JD_2_JD testing dataset
r2r = pd.read_excel(eval_matrix, 'Resume_2_Resume') # Resume_2_Resume testing dataset
jd2r = pd.read_excel(eval_matrix, 'JD_2_Resume') # JD_2_Resume testing dataset
r2jd = pd.read_excel(eval_matrix, 'Resume_2_JD') # Resume_2_JD testing dataset

jd2jd = jd2jd.drop('Contributer', axis=1) # Removing the contributer column
jd2jd.set_index('Query_File_ID', inplace=True) # Makining Query_File_ID the index

r2r = r2r.drop('Contributor', axis=1) # Removing the contributer column
r2r.set_index('Query_File_ID', inplace=True) # Makining Query_File_ID the index

jd2r = jd2r.drop('Contributor', axis=1) # Removing the contributer column
jd2r.set_index('Query_File_ID', inplace=True) # Makining Query_File_ID the index

r2jd = r2jd.drop('Contributor', axis=1) # Removing the contributer column
r2jd.set_index('Query_File_ID', inplace=True) # Makining Query_File_ID the index

time: 470 ms (started: 2021-12-09 04:32:12 +00:00)


In [20]:
r2jd

Unnamed: 0_level_0,Relevent_File_ID_1,Relevent_File_ID_2,Relevent_File_ID_3,Relevent_File_ID_4,Relevent_File_ID_5,Relevent_File_ID_6,Relevent_File_ID,Relevent_File_ID.1,Relevent_File_ID.2,Relevent_File_ID.3
Query_File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HadoopDeveloper_25461,Sr Technical Consu_73658,Hadoop Administrat_19699,Big Data Solutions_31787,Big Data Consultan_23750,ETL Architect Need_24275,Insurance AAE Big_23754,,,,
SeniorHadoopDeveloper_47290,Sr Technical Consu_73658,Hadoop Administrat_19699,Big Data Solutions_31787,Hadoop Developer_20482,ETL Architect_24376,Advanced Analytics_23873,,,,
MurthyMantha_41937,ETL Architect_24376,Hadoop Developer_20482,Hadoop Administrat_19699,Advanced Analytics_23873,Sr BigData Develop_18207,ETL Hadoop UI_20409,,,,
FullStackNetDeveloper_15695,JavaHadoop Develop_21652,Multiple Java Posi_22877,ETL Hadoop UI_20409,Advanced Analytics_23873,JavaAWS Developer_18799,Big Data Solutions_31908,,,,
TableauDeveloperBIConsultant_45792,Hadoop Developer_20900,Data Architect_32168,Big Data Solutions_31908,Sr Technical Consu_73658,Hadoop Administrat_19699,Big Data Solutions_31787,,,,
...,...,...,...,...,...,...,...,...,...,...
WilliamWhite_30728,Service Technician_101359,Service Technician_89585,Service Technician_79953,Service Technician_51292,Service Technician_46608,Service Technician_46243,Service Technician_46092,Service Technician_46040,Service Technician_45968,Service Technician_45589
JulieAdcock_42058,Project Management_74694,Project Management_63194,PROJECT MANAGEMENT_49412,Project Management_43218,Project Management_85981,Project Management_63194,,,,
MachineOperator_29318,Machine Operator_81613,Machine Operator_100793,Machine Operator_80619,Machine Operator_51402,Machine Operators_44249,,,,,
MagdalenaLuczak_15102,Administrative Ass_149915,Administrative Ass_150611,Administrative Ass_150374,Administrative Ass_150193,Administrative Ass_149104,,,,,


time: 39.4 ms (started: 2021-12-09 04:32:12 +00:00)


### Similarity score

In [21]:
def find_similarity(test_file, similarity_metric = 0, jd_or_resume = 0):
  """
  Given a test file, it returns the similarity score against that file for all JDs/Resumes

  params:
    test_file: One entry from the corpus representing a JD or Resume
    similarity_metric: Which similarity metric to use (default = 0)
                       0 - Cosine Similarity
                       1 - Euclidean distance
    jd_or_resume: Whether to compare against JDs or to compare against resumes
                  0 - JDs
                  1 - Resumes
  
  returns: A pandas dataframe containing the similarity scores of all the required files
  """

  sample_index = np.where(corpus_filenames == test_file)[0][0]
  sample = corpus_vectors[sample_index]
  
  test_output = []
  if similarity_metric == 0:
    test_output = cosine_similarity(corpus_vectors, sample)
  else:
    test_output = euclidean_distances(corpus_vectors, sample)

  test = pd.DataFrame(test_output, index = corpus_filenames, columns = ['similarity'])

  if jd_or_resume == 0:
    test = test.loc[ jd_filenames, : ]
  else:
    test = test.loc[ resume_filenames, : ]

  if similarity_metric == 0:
    test.sort_values(by=['similarity'], ascending = False, inplace = True) # Cosine
  else:
    test.sort_values(by=['similarity'], ascending = True, inplace = True) # Euclidean

  return test

time: 24.1 ms (started: 2021-12-09 04:32:12 +00:00)


### MAP Score stuff

In [22]:
def generate_MAP_input(test_file_type = 0):
  """
  Generates the two inputs required for calculating the MAP score

  params:
          test_file_type: which of the 4 test files it is (default = 0)
                          0 - JD_2_JD
                          1 - Resume_2_Resume
                          2 - JD_2_Resume
                          3 - Resume_2_JD

  returns:
          the two inputs for the MAP score function
  """
  actual = []
  predicted = []

  test_file = []
  ind_prefix = ''
  res_prefix = ''
  jd_or_resume = 0

  if test_file_type == 0:
    test_file = jd2jd
    ind_prefix = 'jd:'
    res_prefix = 'jd:'
  elif test_file_type == 1:
    test_file = r2r
    ind_prefix = 'rs:'
    res_prefix = 'rs:'
    jd_or_resume = 1
  elif test_file_type == 2:
    test_file = jd2r
    ind_prefix = 'jd:'
    res_prefix = 'rs:'
    jd_or_resume = 1
  else:
    test_file = r2jd
    ind_prefix = 'rs:'
    res_prefix = 'jd:'

  for index, row in test_file.iterrows():
    # List of files relevant to the query file in the testing document
    try: # Had to implement a try-except statement because sometimes there are multiple entries for one file (eg. line 141 and 142 of JD_2_JD are the same)
      relevant_files = test_file.loc[index].tolist()
    except:
      relevant_files = test_file.loc[index].iloc[0].tolist() # we select the first entry from the list of entries in the testing dataset
    relevant_files = [res_prefix + file for file in relevant_files if not(pd.isnull(file))]

    
    # Finding files relevant to the query file using our code
    test = find_similarity(ind_prefix + index, SIMILARITY_METHOD, jd_or_resume)

    # Removing top result if it is the same as the query file
    if test.index[0] == ind_prefix + index:
      test = test.iloc[1: , :]

    predicted_files = test.head(len(relevant_files)).index # Getting the top predicted files

    actual.append(relevant_files)
    predicted.append(predicted_files)

  return predicted, actual

time: 41 ms (started: 2021-12-09 04:32:12 +00:00)


In [23]:
"""
A function to calcualte the precision@k.

Input: Two lists and a number.
      - 'predicted' is the list of file names that our algorithm generates in response to a specific query
      - 'actual' is the list of file names that our AI algorithm is supposed to return
      - 'k' is the k-index for which we're supposed to calculate the precision@k

Output: A number denoting the precision@k
"""

def precision_at_k(predicted, actual, k):
    act_set = set(actual)
    pred_set = set(predicted[:k])
    result = len(act_set & pred_set) / float(k)
    return result

time: 6.87 ms (started: 2021-12-09 04:32:13 +00:00)


In [24]:
"""
A function to calculate the average precision for a specific query.

Input: Two lists.
      - 'predicted' is the list of file names that our algorithm generates in response to a specific query
      - 'actual' is the list of file names that our AI algorithm is supposed to return

Output: A number denoting the average precision for a query.

Things to check for: If the length of our predicted array is less than the length of our actual array, the code will fail (ideally this shouldn't happen, and should be checked for before calling the map score function)
"""

def avg_precision(predicted, actual):
  avg_prec = 0
  n = 0

  for i in range(len(actual)):
    if predicted[i] == actual[i]:
      avg_prec += precision_at_k(predicted, actual, i+1)
      n += 1
  
  if n>0:
    avg_prec /= n
  
  return avg_prec

time: 12.3 ms (started: 2021-12-09 04:32:13 +00:00)


In [25]:
"""
A function to calculate the Mean Average Precision (MAP) Score for the entire testing dataset.

Input: Two 2D Lists. 
      - 'predicted' is the list of list of file names that our algorithm generates. Each list corresponds to one input
      - 'actual' is the list of list of file names that we're supposed to get. Each list corresponds to one input

Output: A number denoting the map_score
"""

def score(predicted, actual):
  map_score = 0
  n = 0

  for i in range(len(actual)):
    map_score += avg_precision(predicted[i], actual[i])
    n += 1

  if n>0:
    map_score /= n
  
  return map_score

time: 8.44 ms (started: 2021-12-09 04:32:13 +00:00)


In [45]:
# For JD_2_JD
predicted, actual = generate_MAP_input(0)
print("JD-2-JD score: ", score(predicted, actual))

JD-2-JD score:  0.05444659776055125
time: 2min 49s (started: 2021-12-09 05:23:13 +00:00)


In [46]:
# For Resume_2_Resume
predicted, actual = generate_MAP_input(1)
print("Resume-2-Resume score: ", score(predicted, actual))

Resume-2-Resume score:  0.021722846441947566
time: 1min 49s (started: 2021-12-09 05:26:02 +00:00)


In [47]:
# For JD_2_Resume
predicted, actual = generate_MAP_input(2)
print("JD-2-Resume score: ", score(predicted, actual))

JD-2-Resume score:  0.0
time: 23.6 s (started: 2021-12-09 05:27:52 +00:00)


In [48]:
# For Resume_2_JD
predicted, actual = generate_MAP_input(3)
print("Resume-2-JD score: ", score(predicted, actual))

Resume-2-JD score:  0.0
time: 55.5 s (started: 2021-12-09 05:28:16 +00:00)
