<a href="https://colab.research.google.com/github/Zantorym/Aidi-capstone-I/blob/main/AIDI1003_Capstone_Similarity_MAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [1]:
# Timer to measure code execution time
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 1.99 ms (started: 2021-12-03 15:38:29 +00:00)


In [58]:
# Importing libraries
import pickle
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

time: 4.51 ms (started: 2021-12-03 16:31:54 +00:00)


In [3]:
# Constants
# Pickle Input: serialized JD and resume corpora on the mounted Google Drive.
# (The names end in _OUTPATH, but this notebook opens both files for reading.)
_DATASETS_DIR = '/content/drive/MyDrive/Durham College/Capstone - I/data/Datasets'
JD_FILES_PICKLE_OUTPATH = _DATASETS_DIR + '/jds.pickle'
RESUME_FILES_PICKLE_OUTPATH = _DATASETS_DIR + '/resumes.pickle'

time: 2.82 ms (started: 2021-12-03 15:38:31 +00:00)


In [4]:
# Mount Google Drive at /content/drive; every data path used in this notebook lives under that mount point.
# NOTE: google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
time: 23.2 s (started: 2021-12-03 15:38:31 +00:00)


In [5]:
# Load the pickled JD and resume collections.
# Later cells iterate the keys as file names and call .strip() on the values,
# so both objects are presumably dicts mapping file name -> document text.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open(JD_FILES_PICKLE_OUTPATH, 'rb') as fh:
  jd_files_dict = pickle.load(fh)
with open(RESUME_FILES_PICKLE_OUTPATH, 'rb') as fh:
  resume_files_dict = pickle.load(fh)

time: 7.77 s (started: 2021-12-03 15:38:54 +00:00)


In [6]:
# Stop words = the words that appear in BOTH the NLTK and the scikit-learn English stop-word lists.
nltk.download('stopwords')
stop_words = sklearn_stop_words.intersection(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
time: 167 ms (started: 2021-12-03 15:39:02 +00:00)


In [7]:
def tokenize(text):
  """
  Lower-cases the text, splits it on whitespace and keeps only the useful tokens.

  A token is kept when it is not in the notebook-level stop_words collection,
  is longer than one character, consists only of letters and digits, and
  contains at least one letter.

  returns: the kept tokens joined by single spaces (a string, ready for the TFIDF vectorizer)
  """
  def keep(word):
    # No stop words, Length > 1
    if word in stop_words or len(word) <= 1:
      return False
    # Only alphanumeric characters, at least one alphabet
    has_letter = False
    for char in word:
      if char.isalpha():
        has_letter = True
      elif not char.isdigit():
        return False
    return has_letter

  return ' '.join(filter(keep, text.lower().split()))

time: 5.92 ms (started: 2021-12-03 15:39:03 +00:00)


In [8]:
# Sanity check: number of entries loaded from each pickle
print('Count of JDs:', len(jd_files_dict))
print('Count of Resumes:', len(resume_files_dict))

Count of JDs: 151210
Count of Resumes: 50023
time: 6.59 ms (started: 2021-12-03 15:39:03 +00:00)


In [9]:
# Merge JDs and resumes into one dict. Keys get a 'jd:' / 'rs:' prefix so the
# two kinds stay distinguishable; documents that are empty or whitespace-only
# are left out. JDs are inserted first, then resumes.
combined_files_dict = {}
for prefix, files in (('jd:', jd_files_dict), ('rs:', resume_files_dict)):
  combined_files_dict.update(
    (prefix + name, text) for name, text in files.items() if text.strip()
  )

time: 275 ms (started: 2021-12-03 15:39:11 +00:00)


In [10]:
# Prefixed names of all JDs in the combined corpus, in insertion order
jd_filenames = [name for name in combined_files_dict if name.startswith('jd:')]

time: 44 ms (started: 2021-12-03 15:39:14 +00:00)


In [11]:
# Prefixed names of all resumes in the combined corpus, in insertion order
resume_filenames = [name for name in combined_files_dict if name.startswith('rs:')]

time: 35.5 ms (started: 2021-12-03 15:39:17 +00:00)


In [12]:
# Size of the merged corpus (JDs + resumes, empty documents excluded)
print('Count of Combines:', len(combined_files_dict))

Count of Combines: 201189
time: 1.34 ms (started: 2021-12-03 15:39:22 +00:00)


In [155]:
# Converting corpus from dictionary to dataframe:
# one row per document, indexed by the prefixed file name, with the document text in a single 'text' column
corpus_raw = pd.DataFrame.from_dict(combined_files_dict, orient='index', columns=['text'])

time: 102 ms (started: 2021-12-03 18:42:16 +00:00)


In [156]:
# Applying some pre-processing to the dataset
# NOTE: this overwrites the 'text' column, so from here on corpus_raw holds the
# output of tokenize() rather than the original document text.
corpus_raw['text'] = corpus_raw['text'].apply(tokenize)

time: 1min 17s (started: 2021-12-03 18:42:18 +00:00)


In [157]:
# Fit a TF-IDF model (scikit-learn default settings) on the whole corpus, JDs and resumes together,
# and vectorize every document. Row i of corpus_vectors corresponds to row i of corpus_raw.
vectorizer = TfidfVectorizer()
corpus_vectors = vectorizer.fit_transform(corpus_raw['text'])

time: 36.2 s (started: 2021-12-03 18:43:42 +00:00)


In [158]:
# (number of documents, number of TF-IDF features)
corpus_vectors.shape

(201189, 660008)

time: 3.82 ms (started: 2021-12-03 18:44:34 +00:00)


In [45]:
# Separating the vectorised corpus into JDs and Resumes
# File names of every document, in the same row order as corpus_vectors.
# find_similarity() reads this notebook-level array, so it has to be defined
# when the notebook is run top to bottom on a fresh kernel.
corpus_filenames = corpus_raw.index.values

# The row-by-row split below stays commented out; find_similarity() selects the
# JD / resume rows by name through jd_filenames and resume_filenames instead.
# jd_corpus = []
# resume_corpus = []

# for i in range(len(corpus_filenames)):
#   if corpus_filenames[i].startswith('jd'):
#     jd_corpus.append(corpus_vectors[i])
#   else:
#     resume_corpus.append(corpus_vectors[i])
#   i += 1

time: 22.9 s (started: 2021-12-03 16:14:36 +00:00)


### Test files for evaluation

In [159]:
EVAL_MATRIX_FILE_PATH = '/content/drive/MyDrive/Durham College/Capstone - I/Evaluation_Matrix.xlsx'

def _load_eval_sheet(workbook, sheet_name, contributor_column):
  """
  Reads one testing dataset from the evaluation workbook.

  params:
    workbook: an open pd.ExcelFile
    sheet_name: name of the sheet to read
    contributor_column: name of the contributor column to remove from that sheet

  returns: the sheet as a dataframe without the contributor column, indexed by Query_File_ID
  """
  sheet = pd.read_excel(workbook, sheet_name=sheet_name)
  return sheet.drop(contributor_column, axis=1).set_index('Query_File_ID')

# The context manager closes the workbook once all four sheets have been read.
with pd.ExcelFile(EVAL_MATRIX_FILE_PATH) as eval_matrix:
  # NOTE: the column is spelled 'Contributer' for JD_2_JD and 'Contributor' for
  # the other three sheets; presumably this mirrors the workbook headers.
  jd2jd = _load_eval_sheet(eval_matrix, 'JD_2_JD', 'Contributer') # JD_2_JD testing dataset
  r2r = _load_eval_sheet(eval_matrix, 'Resume_2_Resume', 'Contributor') # Resume_2_Resume testing dataset
  jd2r = _load_eval_sheet(eval_matrix, 'JD_2_Resume', 'Contributor') # JD_2_Resume testing dataset
  r2jd = _load_eval_sheet(eval_matrix, 'Resume_2_JD', 'Contributor') # Resume_2_JD testing dataset

time: 102 ms (started: 2021-12-03 18:46:34 +00:00)


In [160]:
# Preview of the JD_2_JD testing dataset: one row per query file, relevant files in the Relevent_File_ID_* columns
jd2jd

Unnamed: 0_level_0,Relevent_File_ID_1,Relevent_File_ID_2,Relevent_File_ID_3,Relevent_File_ID_4,Relevent_File_ID_5,Relevent_File_ID_6,Relevent_File_ID_7,Relevent_File_ID_8,Relevent_File_ID_9,Relevent_File_ID_10,Relevent_File_ID_11,Relevent_File_ID_12,Relevent_File_ID_13,Relevent_File_ID_14,Relevent_File_ID_15,Relevent_File_ID_16,Relevent_File_ID_17,Relevent_File_ID_18,Relevent_File_ID_19,Relevent_File_ID_20,Relevent_File_ID_21,Relevent_File_ID_22,Relevent_File_ID_23,Relevent_File_ID_24,Relevent_File_ID_25,Relevent_File_ID_26,Relevent_File_ID_27,Relevent_File_ID_28
Query_File_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Accommodation Mana_78084,Accommodation Mana_79161,Accommodation Mana_80346,Accommodation Mana_119245,Accommodation Mana_97716,Accommodation Mana_79861,,,,,,,,,,,,,,,,,,,,,,,
Accommodation Mana_79161,Accommodation Mana_78084,Accommodation Mana_79861,Accommodation Mana_97716,Accommodation Mana_80346,Accommodation Mana_119245,,,,,,,,,,,,,,,,,,,,,,,
Accommodation Mana_80346,Accommodation Mana_79861,Accommodation Mana_79161,Accommodation Mana_78084,Accommodation Mana_119245,Accommodation Mana_97716,,,,,,,,,,,,,,,,,,,,,,,
Accommodation Mana_119245,Accommodation Mana_80346,Accommodation Mana_97716,Accommodation Mana_78084,Accommodation Mana_79161,Accommodation Mana_79861,,,,,,,,,,,,,,,,,,,,,,,
Accommodation Mana_97716,Accommodation Mana_79161,Accommodation Mana_78084,Accommodation Mana_80346,Accommodation Mana_119245,Accommodation Mana_79861,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Graphic Designer_73758,Graphic Designer_65971,Graphic Designer_57970,Graphic Designer_59070,Graphic Designer_100270,Graphic Designer_72604,Graphic Designer_22119,,,,,,,,,,,,,,,,,,,,,,
Graphic Designer_99274,Graphic Designer_82710,Graphic Designer_72325,Graphic Designer_21275,Graphic Designer_101633,Graphic Designer_74965,Graphic Designer_73758,,,,,,,,,,,,,,,,,,,,,,
Graphic Designer_77557,Graphic Designer_72099,Graphic Designer_73758,Graphic Designer_75197,Graphic Designer_101633,Graphic Designer_72325,Graphic Designer_99274,,,,,,,,,,,,,,,,,,,,,,
Graphic Designer_75197,Graphic Designer_73758,Graphic Designer_103036,Graphic Designer_34790,Graphic Designer_100481,Graphic Designer_72556,Graphic Designer_72604,,,,,,,,,,,,,,,,,,,,,,


time: 59.9 ms (started: 2021-12-03 18:46:38 +00:00)


### Similarity score

In [161]:
def find_similarity(test_file, similarity_metric = 0, jd_or_resume = 0):
  """
  Given a test file, it returns the similarity score against that file for all JDs/Resumes

  Relies on the notebook-level variables corpus_filenames, corpus_vectors,
  jd_filenames and resume_filenames.

  params:
    test_file: One entry from the corpus representing a JD or Resume
               (the prefixed file name as it appears in corpus_filenames)
    similarity_metric: Which similarity metric to use (default = 0)
                       0 - Cosine Similarity
                       1 - Euclidean distance
    jd_or_resume: Whether to compare against JDs or to compare against resumes
                  0 - JDs
                  1 - Resumes

  returns: A pandas dataframe indexed by file name with a single 'similarity'
           column, ordered from the most similar file to the least similar one.
           For the Euclidean metric the column holds distances, so that
           ordering is ascending; for cosine similarity it is descending.

  raises: IndexError if test_file is not present in corpus_filenames
  """

  matches = np.where(corpus_filenames == test_file)[0]
  if len(matches) == 0:
    # Same exception type as the bare [0][0] lookup, but with a usable message
    raise IndexError(f"'{test_file}' was not found in corpus_filenames")
  sample = corpus_vectors[matches[0]]

  use_cosine = similarity_metric == 0
  if use_cosine:
    test_output = cosine_similarity(corpus_vectors, sample)
  else:
    test_output = euclidean_distances(corpus_vectors, sample)

  test = pd.DataFrame(test_output, index = corpus_filenames, columns = ['similarity'])

  # Keep only the rows of the requested document kind
  wanted_filenames = jd_filenames if jd_or_resume == 0 else resume_filenames
  test = test.loc[ wanted_filenames, : ]

  # A larger cosine similarity means "closer", a larger distance means "farther",
  # so the sort direction depends on the metric.
  return test.sort_values(by=['similarity'], ascending = not use_cosine)

time: 16.4 ms (started: 2021-12-03 18:46:43 +00:00)


### MAP score calculation

In [169]:
def generate_MAP_input(test_file_type = 0):
  """
  Generates the two inputs required for calculating the MAP score

  For every query row of the chosen testing dataset it collects the files the
  dataset lists as relevant, and the same number of top-ranked files returned
  by find_similarity (cosine similarity) with the query file itself excluded.

  params:
          test_file_type: which of the 4 test files it is (default = 0)
                          0 - JD_2_JD
                          1 - Resume_2_Resume
                          2 - JD_2_Resume
                          3 - Resume_2_JD

  returns:
          the two inputs for the MAP score function, as (predicted, actual):
          two separate lists holding one list of prefixed file names per query row
  """
  # Two independent lists. A chained `actual = predicted = []` would bind both
  # names to the same list and mix predictions with ground truth.
  actual = []
  predicted = []

  # query_prefix marks the kind of the query file, target_prefix the kind of
  # the files being retrieved; they differ for the two cross-type datasets.
  if test_file_type == 0:
    test_file = jd2jd
    query_prefix = 'jd:'
    target_prefix = 'jd:'
    jd_or_resume = 0
  elif test_file_type == 1:
    test_file = r2r
    query_prefix = 'rs:'
    target_prefix = 'rs:'
    jd_or_resume = 1
  elif test_file_type == 2:
    test_file = jd2r
    query_prefix = 'jd:'
    target_prefix = 'rs:'
    jd_or_resume = 1
  else:
    test_file = r2jd
    query_prefix = 'rs:'
    target_prefix = 'jd:'
    jd_or_resume = 0

  for index, row in test_file.iterrows():
    # List of files relevant to the query file in the testing document.
    # Sometimes there are multiple entries for one file (eg. line 141 and 142 of
    # JD_2_JD are the same); .loc then returns a dataframe and we select the
    # first entry from the list of entries in the testing dataset.
    relevant_row = test_file.loc[index]
    if isinstance(relevant_row, pd.DataFrame):
      relevant_row = relevant_row.iloc[0]
    relevant_files = [target_prefix + file for file in relevant_row.tolist() if not(pd.isnull(file))]

    # Finding files relevant to the query file using our code
    query_key = query_prefix + index
    test = find_similarity(query_key, 0, jd_or_resume)

    # Removing the query file from the results wherever it is ranked
    # (it is absent altogether for the cross-type datasets)
    test = test.drop(index=query_key, errors='ignore')

    predicted_files = test.head(len(relevant_files)).index.tolist() # Getting the top predicted files

    actual.append(relevant_files)
    predicted.append(predicted_files)

  return predicted, actual

time: 29.2 ms (started: 2021-12-03 19:00:24 +00:00)


In [163]:
"""
A function to calculate the precision@k.

Input: Two lists and a number.
      - 'predicted' is the list of file names that our algorithm generates in response to a specific query
      - 'actual' is the list of file names that our AI algorithm is supposed to return
      - 'k' is the k-index for which we're supposed to calculate the precision@k

Output: A number denoting the precision@k
"""

def precision_at_k(predicted, actual, k):
    """
    Returns the fraction of the first k predictions that are relevant.

    The count is set-based: the number of distinct file names that appear both
    in predicted[:k] and in actual, divided by k.
    """
    top_k = set(predicted[:k])
    hits = top_k.intersection(actual)
    return len(hits) / float(k)

time: 2.56 ms (started: 2021-12-03 18:46:48 +00:00)


In [164]:
"""
A function to calculate the average precision for a specific query.

Input: Two lists.
      - 'predicted' is the list of file names that our algorithm generates in response to a specific query
      - 'actual' is the list of file names that our AI algorithm is supposed to return

Output: A number denoting the average precision for a query.

Things to check for: the predicted list should be at least as long as the actual list (ideally this is verified before calling the MAP score function).
"""

def avg_precision(predicted, actual):
  """
  Calculates the average precision for a specific query.

  Walks the first len(actual) predictions in rank order. Every prediction that
  is one of the relevant files counts as a hit, the precision@k at that rank is
  accumulated, and the result is the mean of those precision values.

  params:
    predicted: ranked list of file names that our algorithm generates in response to a specific query
    actual: list of file names that our algorithm is supposed to return

  returns: the average precision for the query; 0.0 when none of the
           predictions is relevant (including empty input)
  """
  relevant = set(actual)
  hits_seen = set()  # distinct relevant files retrieved so far (same set-based count as precision_at_k)
  precision_sum = 0.0
  n_hits = 0

  # Slicing, rather than indexing up to len(actual), copes with a prediction
  # list that is shorter than the list of relevant files.
  for rank, candidate in enumerate(predicted[:len(actual)], start=1):
    # A hit is a prediction that is relevant at all; it does not have to sit at
    # the same position as in `actual`.
    if candidate in relevant:
      hits_seen.add(candidate)
      precision_sum += len(hits_seen) / rank  # precision@rank
      n_hits += 1

  if n_hits == 0:
    return 0.0
  return precision_sum / n_hits

time: 4.7 ms (started: 2021-12-03 18:46:49 +00:00)


In [165]:
"""
A function to calculate the Mean Average Precision (MAP) Score for the entire testing dataset.

Input: Two 2D Lists. 
      - 'predicted' is the list of list of file names that our algorithm generates. Each list corresponds to one input
      - 'actual' is the list of list of file names that we're supposed to get. Each list corresponds to one input

Output: A number denoting the map_score
"""

def score(predicted, actual):
  """
  Calculates the Mean Average Precision (MAP) score for the entire testing dataset.

  params:
    predicted: list of lists of file names that our algorithm generates, one inner list per query
    actual: list of lists of file names that we're supposed to get, one inner list per query

  returns: the mean of avg_precision over all queries in `actual`;
           0.0 when there are no queries
  """
  n_queries = len(actual)
  if n_queries == 0:
    # Nothing to average; avoids dividing by zero
    return 0.0

  total = sum(avg_precision(predicted[i], actual[i]) for i in range(n_queries))
  return total / n_queries

time: 4.71 ms (started: 2021-12-03 18:46:52 +00:00)


In [None]:
# For JD_2_JD (test_file_type 0): build the MAP inputs and print the resulting score
predicted, actual = generate_MAP_input(0)
print("JD-2-JD score: ", score(predicted, actual))

In [None]:
# For Resume_2_Resume (test_file_type 1): build the MAP inputs and print the resulting score
predicted, actual = generate_MAP_input(1)
print("Resume-2-Resume score: ", score(predicted, actual))

In [None]:
# For JD_2_Resume (test_file_type 2): build the MAP inputs and print the resulting score
predicted, actual = generate_MAP_input(2)
print("JD-2-Resume score: ", score(predicted, actual))

In [None]:
# For Resume_2_JD (test_file_type 3): build the MAP inputs and print the resulting score
predicted, actual = generate_MAP_input(3)
print("Resume-2-JD score: ", score(predicted, actual))