## Importing necessary modules

In [2]:
import os, json, sys
from glob import glob
import pandas as pd
import numpy as np
from google.colab import drive
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# nb_path = '/content/notebooks'
# os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
# sys.path.insert(0,nb_path)

pd.set_option('display.max_columns', None)

This notebook is run on Google Colab hence all files are stored in my Google Drive for portability and resources provided.

In [3]:
drive.mount('/content/drive')
candidate = '/content/drive/MyDrive/Datasets/Hiredly/Dataset/Candidate'
json_files = [candidate+'/'+pos_json for pos_json in os.listdir(candidate) if pos_json.endswith('.json')]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Read files


All JSON formatted candidate files are located in a folder. Here a function is written to read and extract/format the deeply nested JSON files into dataframe.

Some of the files have multiple column with the same name, hence suffix is added if the column name is duplicated. This is necessary step before appending all files into one single dataframe.



In [4]:
def read_files():
  counter = Counter()

  # Get filenames in folder and read as JSON file
  dfs = []
  for file_name in [file for file in os.listdir(candidate) if file.endswith('.json')]:
    with open(candidate + '/' + file_name) as json_file:
      dfs.append(json.load(json_file))
  
  to_concat = []
  
  # record path is for list of dicts
  # meta is for dict inside dict, add arg errors = 'ignore' to handle nan values, and sep='_'

  # Restructure key:value in nested JSON for each file read
  for x in dfs:
    df_1 = pd.json_normalize(x[0]['data']).drop(columns=['sections'])

    df_1 = pd.json_normalize(x[0]['data']).drop(columns=['sections'], errors='ignore')
    df_2 = pd.json_normalize(x[0]['data'], record_path=['sections'], errors='ignore').set_index('sectionType').T
    df_2.index = df_1.index
    df_3 = pd.concat([df_1, df_2], axis=1)
    df_3.certifications = ', '.join([x for x in df_3.certifications[0]])

    # Rename duplicated columns and add prefix
    counter = Counter()

    empty_list = []
    df_3.columns = df_3.columns.str.lower()
    for x in range(df_3.shape[1]):
      counter.update([df_3.columns[x]])
      if counter[df_3.columns[x]] == 1:
        empty_list.append(df_3.columns[x])
      else: 
        tx = counter[df_3.columns[x]] -1
        empty_list.append(df_3.columns[x].lower().replace('.', '_').replace('/','_') + '_' + str(tx))
    
    
    df_3.columns = empty_list
    to_concat.append(df_3)

  # Combine all files into single DataFrame
  final_df = pd.concat(to_concat, ignore_index=True)
  final_df.columns = [p.lower().replace('.', '_').replace('/','_') for p in final_df.columns]
  return final_df.sort_index(axis=1)

## Cleaning texts

Tokenization of all relevant columns where extra spaces, punctuation, etc are removed. We manually provide a list of stopwords for removal as they add no value to the model.

In [5]:
df = read_files()
obj_col = df.select_dtypes('object').columns
for x in obj_col: df[x] = df[x].fillna('empty')

skill_sw = ['core', 'skill', 'skills', 'professional', 'and', 'abilities', 'language', 'excellent', 'good', 'proficient','languages']

df.skills_interests_languages = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.skills_interests_languages]
df.skills_interests_languages = df.skills_interests_languages.apply(lambda row:[word for word in row if word not in skill_sw])
df.skills_interests_languages = [' '.join(word) for word in df.skills_interests_languages]

df.skills_interests_languages_1 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.skills_interests_languages_1]
df.skills_interests_languages_1 = df.skills_interests_languages_1.apply(lambda row:[word for word in row if word not in skill_sw])
df.skills_interests_languages_1 = [' '.join(word) for word in df.skills_interests_languages_1]

df.skills_interests_languages_2 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.skills_interests_languages_2]
df.skills_interests_languages_2 = df.skills_interests_languages_2.apply(lambda row:[word for word in row if word not in skill_sw])
df.skills_interests_languages_2 = [' '.join(word) for word in df.skills_interests_languages_2]

work_sw = ['professional', 'part', 'time', 'part time', 'working', 'experience', 'sdn', 'bhd', 'research', 'work', 'history', 'employment', 'career']

df.workexperience = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience]
df.workexperience = df.workexperience.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience = [' '.join(word) for word in df.workexperience]

df.workexperience_1 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience_1]
df.workexperience_1 = df.workexperience_1.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience_1 = [' '.join(word) for word in df.workexperience_1]

df.workexperience_2 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience_2]
df.workexperience_2 = df.workexperience_2.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience_2 = [' '.join(word) for word in df.workexperience_2]

df.workexperience_3 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience_3]
df.workexperience_3 = df.workexperience_3.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience_3 = [' '.join(word) for word in df.workexperience_3]

df.workexperience_4 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience_4]
df.workexperience_4 = df.workexperience_4.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience_4 = [' '.join(word) for word in df.workexperience_4]

df.workexperience_5 = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.workexperience_5]
df.workexperience_5 = df.workexperience_5.apply(lambda row:[word for word in row if word not in work_sw])
df.workexperience_5 = [' '.join(word) for word in df.workexperience_5]

training_sw = ['courses', 'and', 'certifications', 'certificate', 'professional', 'training', 'certificates']

df.training_certifications = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.training_certifications]
df.training_certifications = df.training_certifications.apply(lambda row:[word for word in row if word not in training_sw])
df.training_certifications = [' '.join(word) for word in df.training_certifications]

df.certifications = [re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', a).lower().split() for a in df.certifications]
df.certifications = df.certifications.apply(lambda row:[word for word in row if word not in training_sw + work_sw])
df.certifications = [' '.join(word) for word in df.certifications]

In [21]:
df.sample(3)

Unnamed: 0,achievements,achievements_1,additionalinformation,certifications,education,education_1,education_2,extracurriculars_leadership,extracurriculars_leadership_1,footer,languages,location,location_city,location_country,location_state,objective,organisations,profession,projects,publications,skills_interests_languages,skills_interests_languages_1,skills_interests_languages_2,summary,summary_1,summary_2,totalyearsexperience,training_certifications,training_certifications_1,workexperience,workexperience_1,workexperience_2,workexperience_3,workexperience_4,workexperience_5,features
16,MULTIPLE WINNING PROJECTS & COMPETITIONS As th...,empty,empty,,BACHELOR OF MASS COMMUNICATION (HONS) MAJOR IN...,EDUCATION INTI INTERNATIONAL UNIVERSITY NILAI ...,BACHELOR MASS COMMUNICATION (HONS) IN COLLABOR...,"Wear: Petronas Gallery (2018), INTI IU Fight N...",empty,empty,"[Korean, English]",empty,empty,Malaysia,empty,,empty,Videographer and Photographer based on the Gol...,FP AVANTE PETALING JAYA Located in Petaling Ja...,[],software photoshop illustrator premiere pro dr...,empty,empty,"An opportunity driven Fresh Graduate, who has ...","An opportunity driven Fresh Graduate, who has ...",empty,2,empty,empty,social media internship may sept creative cont...,empty,empty,empty,empty,empty,software photoshop illustrator premiere pro dr...
10,empty,empty,empty,,EDUCATION Infrastructure University of Kuala L...,empty,empty,empty,empty,empty,[English],empty,Neodesha,United States,Kansas,,empty,Associate,empty,[],audit express ms words ms excel ms access tabl...,cwc eng plt audit,empty,Committed person with an ability to generate a...,PROFILE CONTACT Committed person with an abili...,empty,2,empty,empty,skills,associate may may planning and conducting mana...,global savings group july current championed o...,empty,empty,empty,audit express ms words ms excel ms access tabl...
28,"Experiences and Accomplishments Coordinated, p...",Awards (1) Jan – Mar 2006 Awarded the MITI (Mi...,empty,ibm as iseries languages rpg ile programming s...,"(d) 2006-2008 IMU Education (M) Sdn. Bhd., Buk...",• 1993 –1995 South Bank University – BSc (Hons...,empty,empty,empty,empty,"[English, Bahasa, Malaysian]",empty,empty,empty,empty,Aspires to be an excellent software developer@...,Membership (1) Present ORACLE Users' Group (OU...,Consultant Engineer,EPOS Overview The EPOS Tool Tasks are the calc...,[],technical proficiency over years of software d...,english able to speak read write well proficie...,empty,,Objective: Aspires to be an excellent software...,empty,25,academic qualifications uipath diploma in rpa ...,• 1989 City & Guilds of Data Processing (UK) –...,conducted as ibm rpg iv programming on the ise...,devising and implementing user defined web ser...,analyst has not implemented dot net frameworkb...,f elken m kl malaysia project leader feasibili...,migration to oraclesql database i nestle m com...,j ait m bhdkl malaysia technical engineer syst...,technical proficiency over years of software d...


Combine the processed columns into one single column. This column will contain all texts required for prediction later.

In [7]:
df['features'] = df['skills_interests_languages'] + ' ' + df['skills_interests_languages_1'] + ' ' + df['skills_interests_languages_2'] + ' ' + df['workexperience'] + ' ' + df['workexperience_1'] + ' ' + df['workexperience_2'] + ' ' + df['workexperience_3'] + ' ' + df['workexperience_4'] + ' ' + df['workexperience_5'] + ' ' + df['training_certifications'] + ' ' + df['certifications']

In [8]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
stop_words = list(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Read Jobs files for matching with each candidate.

Multiple column of job desciption, requirements, and short description is combined into one single column, then tokenization is performed.

In [9]:
job_path = '/content/drive/MyDrive/Datasets/Hiredly/Dataset/Job/jobs.csv'
jobs = pd.read_csv(job_path)
jobs['combined'] = jobs.description + jobs.requirements + jobs.short_description

jobs['combined'] = jobs['combined'].apply(lambda a: re.sub(r'<.*?>', '', a))
jobs['combined'] = jobs['combined'].apply(lambda b: re.sub(r'[^\w\s]|\d|/^\s+|\s+$|\s+(?=\s)/g', '', b).lower())
jobs['combined'] = jobs['combined'].apply(lambda c: word_tokenize(c))
jobs['combined'] = [d for d in jobs['combined'] if d not in stop_words]
jobs['combined'] = [' '.join(a) for a in jobs.combined]


In [10]:
import spacy
# !python -m spacy download en_core_web_lg 
# Colab runtime must be restarted after installing the above package 

import en_core_web_lg
nlp = spacy.load('en_core_web_lg')

In [15]:
jobs.shape

(10, 5)

In [11]:
for listing in range(len(jobs['combined'])):
  job_title = jobs.iloc[listing]['title']
  print(job_title)

Business Development Executive
Data Scientist
Software Developer
Mobile App Developer
Graphic Designer
Customer Happiness Associate
Customer Happiness Officer
Digital Marketing Specialist 
Digital Marketing Executive
Sales Executive


## Job matching using Spacy similarity score

For each candidate, generate similarity score for all 10 jobs.

In [12]:
def get_matching():
  new_dict = {}
  for listing in range(len(jobs['combined'])):
    job_title = jobs.iloc[listing]['title']
    similarity = []
    for x in range(len(df['features'])):
      similarity.append(nlp(df.features[x]).similarity(nlp(jobs['combined'][listing])))
      new_dict[job_title] = similarity
      # print(listing, nlp(df.features[x]).similarity(nlp(jobs['combined'][listing])))
  return new_dict

The output is formatted in dataframe.

Note that the index [0,1,2...] is the candidate, as we dont have candidate name to label. 

In [19]:
is_similar = get_matching()
pd.DataFrame(is_similar).head(3)

Unnamed: 0,Business Development Executive,Data Scientist,Software Developer,Mobile App Developer,Graphic Designer,Customer Happiness Associate,Customer Happiness Officer,Digital Marketing Specialist,Digital Marketing Executive,Sales Executive
0,0.719867,0.804007,0.808903,0.786397,0.700468,0.711596,0.686411,0.765184,0.737422,0.743641
1,0.913068,0.897917,0.924614,0.880444,0.904011,0.908936,0.911794,0.928598,0.932058,0.952198
2,0.801031,0.836819,0.885743,0.846627,0.80036,0.806925,0.79143,0.836356,0.815838,0.844643


Get only top 3 matching job for each candidate.

In [13]:
def get_top_jobs(df_jobs=get_matching()):
  results = {}
  candidate_list = []
  positions_list = []
  score_list = []
  similarity_df = pd.concat([df, pd.DataFrame(df_jobs)], axis=1).iloc[:, -11:] # Combine applicant DF & job match DF
  for x in range(len(similarity_df)):
    candidate_list.append(x)
    temp_df = similarity_df.iloc[x,1:]
    top_jobs = np.argpartition(temp_df.values,1)[-3:].tolist() # Top 3 position index
    score_list.append(temp_df[top_jobs].sort_values(ascending=False).values.tolist()) # Score for the top 3 positions
    positions_list.append(temp_df[top_jobs].sort_values(ascending=False).index.tolist()) # The position titles
    # print(x, positions, score)
    results['candidate'] = candidate_list
    results['positions'] = positions_list
    results['score'] = score_list
    print(f'Candidate {x} suitable position is {positions_list[0]} with a score of {score_list[0]}')
  return results

In [14]:
ab = pd.DataFrame(get_top_jobs())
ab

Candidate 0 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.8089034970383495, 0.8040065446820304, 0.7863969854664729]
Candidate 1 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.8089034970383495, 0.8040065446820304, 0.7863969854664729]
Candidate 2 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.8089034970383495, 0.8040065446820304, 0.7863969854664729]
Candidate 3 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.8089034970383495, 0.8040065446820304, 0.7863969854664729]
Candidate 4 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.8089034970383495, 0.8040065446820304, 0.7863969854664729]
Candidate 5 suitable position is ['Software Developer', 'Data Scientist', 'Mobile App Developer'] with a score of [0.80890349

Unnamed: 0,candidate,positions,score
0,0,"[Software Developer, Data Scientist, Mobile Ap...","[0.8089034970383495, 0.8040065446820304, 0.786..."
1,1,"[Sales Executive, Digital Marketing Executive,...","[0.9521982433651818, 0.9320584467140007, 0.928..."
2,2,"[Software Developer, Mobile App Developer, Sal...","[0.8857434101938347, 0.8466267030623466, 0.844..."
3,3,"[Software Developer, Digital Marketing Special...","[0.9507125558934106, 0.9478914063685798, 0.943..."
4,4,"[Software Developer, Customer Happiness Associ...","[0.9523639062124267, 0.9403685004508087, 0.938..."
5,5,"[Software Developer, Digital Marketing Special...","[0.915549583040753, 0.9028286138044519, 0.8924..."
6,6,"[Software Developer, Sales Executive, Customer...","[0.9015724099530303, 0.8749587618660755, 0.872..."
7,7,"[Software Developer, Sales Executive, Data Sci...","[0.9045801702152848, 0.8766504135630276, 0.861..."
8,8,"[Sales Executive, Business Development Executi...","[0.9553868220993597, 0.95415726754408, 0.94228..."
9,9,"[Software Developer, Customer Happiness Associ...","[0.8664591189878569, 0.8573107874373469, 0.853..."
