In [37]:
# for candidate profile
import json
import pandas as pd
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

# Load the soft-skill lookup. project_skills indexes it by single words and
# concatenates the values, so it is used as a word -> list-of-skill-names mapping.
with open("softskills.json","r") as f:
        softskills = json.load(f)
# Load the hard-skill entries. They are handed to the entity ruler as patterns
# below, and each entry's 'label' becomes a column of the recruiting matrix.
with open("hardskills.json","r") as f:
    hardskills = json.load(f)

# English pipeline; the only component added to it here is the entity ruler.
nlp = English()
# NOTE(review): `ruler` is not referenced again in the visible notebook; the
# component that receives the patterns is the one returned by add_pipe below.
ruler = EntityRuler(nlp, validate=True)
entity_ruler = nlp.add_pipe("entity_ruler")
# Load the hard-skill patterns into the pipeline's entity ruler.
entity_ruler.initialize(lambda: [], nlp=nlp, patterns=hardskills)

class transfer_profile_into_matrix:
    """Convert one candidate profile from a JSON file into a "recruiting matrix" dict.

    The matrix starts with the descriptive fields copied from the profile
    (``work_availability``, ``degree&major``, ``yoe&industry``,
    ``posting_history``) followed by one 0/1 flag per hard-skill label and per
    soft-skill name.

    The class relies on the module-level ``hardskills`` (iterable of entries
    with a ``'label'`` key), ``softskills`` (mapping keyword -> list of skill
    names) and ``nlp`` (spaCy pipeline) objects.
    """

    # Template matrix. Instances created through __init__ work on their own
    # copy, so flags set for one candidate never show up in another one.
    recruiting_matrix = {"work_availability":[],"degree&major":[], "yoe&industry":[],"posting_history":[]}

    # One zeroed column per hard-skill label, then one per soft-skill name.
    recruiting_matrix.update((pattern['label'], 0) for pattern in hardskills)
    recruiting_matrix.update(
        (skill_name, 0)
        for skill_group in softskills.values()
        for skill_name in skill_group
    )

    def __init__(self,json_path,id):
        """Remember where the profiles live and which profile to transform.

        Args:
            json_path: Path of the JSON file holding the candidate profiles.
            id: Key of the candidate inside that JSON file.
        """
        self.json_path = json_path
        self.id = id
        # Copy the template (including its placeholder lists) so that the
        # class-level dict is not mutated and shared between candidates.
        self.recruiting_matrix = {
            key: list(value) if isinstance(value, list) else value
            for key, value in type(self).recruiting_matrix.items()
        }

    @staticmethod
    def read_source_file(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path,"r") as f:
            dataset = json.load(f)
        return dataset

    @staticmethod
    def most_frequent(lList):
        """Return the most frequent element of ``lList``.

        Ties are resolved in favour of the element that appears first in the
        list. Elements must be hashable.

        Raises:
            IndexError: If ``lList`` is empty.
        """
        if not lList:
            raise IndexError("most_frequent() called with an empty list")
        counts = {}
        for item in lList:
            counts[item] = counts.get(item, 0) + 1
        # dicts iterate in insertion order and max() keeps the first maximum,
        # so the earliest element wins a tie.
        return max(counts, key=counts.get)

    def _load_profile(self):
        """Read the JSON file once and return the entry stored under ``self.id``."""
        return self.read_source_file(self.json_path)[self.id]

    def fill_in_basic(self):
        """Copy the descriptive (non-skill) fields of the profile into the matrix.

        Raises:
            KeyError: If the candidate id or one of the expected fields is missing.
        """
        profile = self._load_profile()
        self.recruiting_matrix["work_availability"] = [profile['Country&City'], profile['Work_Autho']]
        self.recruiting_matrix["degree&major"] = profile['Degree&Major']
        self.recruiting_matrix["yoe&industry"] = profile['YOE&Industry']
        self.recruiting_matrix["posting_history"] = profile['posting_history']

    def project_skills(self,plist):
        """Set the matrix flag to 1 for every skill detected in ``plist``.

        Args:
            plist: Iterable of text snippets (one string per sentence/item).

        Hard skills are the entity labels the ``nlp`` pipeline assigns to a
        snippet. A soft skill is kept for a snippet only when the keywords of
        that snippet hit at least one skill more than once; the most frequent
        skill is then kept.
        """
        detected = set()
        for each_sentence in plist:
            # hard skill recognition
            detected.update(ent.label_ for ent in nlp(each_sentence).ents)
            # soft skill recognition: gather the skills of every known keyword
            candidates = []
            for each_word in each_sentence.split():
                if each_word in softskills:
                    candidates.extend(softskills[each_word])
            # No repeated skill (or no skill at all) -> nothing is retained.
            if len(candidates) != len(set(candidates)):
                detected.add(self.most_frequent(candidates))
        for item in detected:
            # Labels/skills without a matrix column are ignored.
            if item in self.recruiting_matrix:
                self.recruiting_matrix[item]=1

    def transfer_hard_and_soft(self):
        """Project project experience, work experience and skills onto the matrix."""
        profile = self._load_profile()
        # Look up all three sections first so a missing one fails before any flag is set.
        project_experience = profile['project_experiences']
        work_experience = profile['working_experiences']
        skills = profile['skills']
        self.project_skills(project_experience)
        self.project_skills(work_experience)
        self.project_skills(skills)

    def store_as_dict(self):
        """Fill the matrix and return it as ``{candidate id: matrix}``."""
        # this part can be modified to any format
        self.fill_in_basic()
        self.transfer_hard_and_soft()
        candidate_matrix = {self.id:self.recruiting_matrix}
        return candidate_matrix

        

In [38]:
# Build the matrix for the candidate stored under key "10000" in
# candidate_profile.json and print the resulting {id: matrix} dict.
# NOTE(review): the saved output below is keyed by a name rather than by the
# id passed here, so it looks stale relative to this code - confirm by re-running.
cand1 = transfer_profile_into_matrix("candidate_profile.json","10000")
print(cand1.store_as_dict())

{'lindsey weedon': {'work_availability': [44, 'None'], 'degree&major': [[14, ['business management', 'management', 'business']]], 'yoe&industry': [[123, ['marketing manager']], [365, ['manager']], [396, ['manager']], [303, ['marketing manager']], [395, ['marketing manager']], [853, ['founder']], [365, ['events manager']], [61, ['marketing manager']], [853, ['marketing specialist']], [123, ['manager']], [577, ['coordinator']]], 'posting_history': [], 'File versioning software': 0, 'Data mining software': 0, 'Project management software': 1, 'Label making software': 0, 'Text to speech conversion software': 0, 'Mailing and shipping software': 0, 'Internet protocol IP multimedia subsystem software': 0, 'Enterprise system management software': 0, 'Computer based training software': 1, 'Metadata management software': 0, 'Action games': 0, 'Medical software': 0, 'Charting software': 0, 'Compiler and decompiler software': 0, 'Data compression software': 0, 'Point of sale POS software': 0, 'Gra

In [39]:
# for work requirement
import json
import pandas as pd
from scipy import spatial
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

# Same setup as in the candidate-profile cell: running this cell rebinds
# softskills, hardskills, nlp, ruler and entity_ruler to freshly built objects.
# Soft-skill lookup, used by project_skills as word -> list-of-skill-names.
with open("softskills.json","r") as f:
        softskills = json.load(f)
# Hard-skill entries: passed to the entity ruler as patterns; each entry's
# 'label' becomes a column of the recruiting matrix.
with open("hardskills.json","r") as f:
    hardskills = json.load(f)

# English pipeline; the only component added to it here is the entity ruler.
nlp = English()
# NOTE(review): `ruler` is not referenced again in the visible notebook; the
# component that receives the patterns is the one returned by add_pipe below.
ruler = EntityRuler(nlp, validate=True)
entity_ruler = nlp.add_pipe("entity_ruler")
# Load the hard-skill patterns into the pipeline's entity ruler.
entity_ruler.initialize(lambda: [], nlp=nlp, patterns=hardskills)

class transfer_requirement_into_matrix:
    """Convert one job posting from a JSON file into a "recruiting matrix" dict.

    The matrix starts with the descriptive fields of the posting
    (``work_availability``, ``degree&major``, ``yoe&industry``, ``recruiter``,
    ``type``, ``seniority``, ``recruiting_history``) followed by one 0/1 flag
    per hard-skill label and per soft-skill name.

    The posting is looked up as ``data[company][position][id]``. The class
    relies on the module-level ``hardskills`` (iterable of entries with a
    ``'label'`` key), ``softskills`` (mapping keyword -> list of skill names)
    and ``nlp`` (spaCy pipeline) objects.
    """

    # Template matrix. Instances created through __init__ work on their own
    # copy, so flags set for one posting never show up in another one.
    recruiting_matrix = {"work_availability":[],"degree&major":[], "yoe&industry":[],"recruiter":"","type":"","seniority":"","recruiting_history":[]}

    # One zeroed column per hard-skill label, then one per soft-skill name.
    recruiting_matrix.update((pattern['label'], 0) for pattern in hardskills)
    recruiting_matrix.update(
        (skill_name, 0)
        for skill_group in softskills.values()
        for skill_name in skill_group
    )

    def __init__(self,json_path,company, position,id):
        """Remember where the postings live and which posting to transform.

        Args:
            json_path: Path of the JSON file holding the job postings.
            company: First-level key (company) inside the JSON file.
            position: Second-level key (position) under the company.
            id: Third-level key (posting id) under the position.
        """
        self.json_path = json_path
        self.company = company
        self.position = position
        self.id = id
        # Copy the template (including its placeholder lists) so that the
        # class-level dict is not mutated and shared between postings.
        self.recruiting_matrix = {
            key: list(value) if isinstance(value, list) else value
            for key, value in type(self).recruiting_matrix.items()
        }

    @staticmethod
    def read_source_file(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path,"r") as f:
            dataset = json.load(f)
        return dataset

    @staticmethod
    def most_frequent(lList):
        """Return the most frequent element of ``lList``.

        Ties are resolved in favour of the element that appears first in the
        list. Elements must be hashable.

        Raises:
            IndexError: If ``lList`` is empty.
        """
        if not lList:
            raise IndexError("most_frequent() called with an empty list")
        counts = {}
        for item in lList:
            counts[item] = counts.get(item, 0) + 1
        # dicts iterate in insertion order and max() keeps the first maximum,
        # so the earliest element wins a tie.
        return max(counts, key=counts.get)

    def _load_posting(self):
        """Read the JSON file once and return ``data[company][position][id]``."""
        return self.read_source_file(self.json_path)[self.company][self.position][self.id]

    def fill_in_basic(self):
        """Copy the descriptive (non-skill) fields of the posting into the matrix.

        Raises:
            KeyError: If the posting or one of the expected fields is missing.
        """
        posting = self._load_posting()
        self.recruiting_matrix["work_availability"] = [posting['Position_Country&City'], posting['Work_autho']]
        requirement = posting['Degree&YOE_Requirement']
        # [[basic degree, basic majors], [preferred degree, preferred majors]];
        # only the first element of each degree entry is kept.
        self.recruiting_matrix["degree&major"] = [
            [requirement['basic_degree'][0], requirement['basic_major']],
            [requirement['preferred_degree'][0], requirement['preferred_major']],
        ]
        self.recruiting_matrix["yoe&industry"] = [requirement['basic_yoe'], requirement['preferred_yoe']]
        self.recruiting_matrix["recruiter"] = posting['Recruiter']
        self.recruiting_matrix["type"] = posting['Type']
        self.recruiting_matrix["seniority"] = posting['Seniority']
        self.recruiting_matrix["recruiting_history"] = posting["Recruiting_History"]

    def project_skills(self,plist):
        """Set the matrix flag to 1 for every skill detected in ``plist``.

        Args:
            plist: Iterable of text snippets (one string per sentence/item).

        Hard skills are the entity labels the ``nlp`` pipeline assigns to a
        snippet. A soft skill is kept for a snippet only when the keywords of
        that snippet hit at least one skill more than once; the most frequent
        skill is then kept.
        """
        detected = set()
        for each_sentence in plist:
            # hard skill recognition
            detected.update(ent.label_ for ent in nlp(each_sentence).ents)
            # soft skill recognition: gather the skills of every known keyword
            candidates = []
            for each_word in each_sentence.split():
                if each_word in softskills:
                    candidates.extend(softskills[each_word])
            # No repeated skill (or no skill at all) -> nothing is retained.
            if len(candidates) != len(set(candidates)):
                detected.add(self.most_frequent(candidates))
        for item in detected:
            # Labels/skills without a matrix column are ignored.
            if item in self.recruiting_matrix:
                self.recruiting_matrix[item]=1

    def transfer_hard_and_soft(self):
        """Project responsibilities and both qualification lists onto the matrix."""
        posting = self._load_posting()
        # Look up all three sections first so a missing one fails before any flag is set.
        responsibility = posting["Responsibilities"]
        basic_quali = posting["Basic_Qualifications"]
        preferred_quali = posting["Preferred_Qualifications"]
        self.project_skills(responsibility)
        self.project_skills(basic_quali)
        self.project_skills(preferred_quali)

    def store_as_dict(self):
        """Fill the matrix and return it as ``{company: {position: {id: matrix}}}``."""
        # this part can be modified to any format
        self.fill_in_basic()
        self.transfer_hard_and_soft()
        # Fixed: the attribute is `company` (was misspelled, raising AttributeError).
        job_matrix = {self.company:{self.position:{self.id:self.recruiting_matrix}}}
        return job_matrix

In [None]:
class transfer_and_compare():
    """Compare one candidate with one job posting via cosine similarity.

    Only the constructor, ``read_source_file`` and ``cos_similarity`` contain
    logic; the ``compare_*`` / ``consider_*`` / ``project_*`` methods are
    placeholders that describe the intended steps and currently do nothing.
    """

    def __init__(self,cand_id,company,position,job_id,candidate_path,jd_path):
        """Store the identifiers and file paths of the pair to compare.

        Args:
            cand_id: Key of the candidate.
            company: Company of the job posting.
            position: Position of the job posting.
            job_id: Id of the job posting.
            candidate_path: Path of the candidate JSON file.
            jd_path: Path of the job-description JSON file.
        """
        # Fixed: these used to read `self.<name>` (not yet set) instead of the
        # constructor arguments, so creating an instance always failed.
        self.cand_id = cand_id
        self.company = company
        self.position = position
        self.job_id = job_id
        self.candidate_path = candidate_path
        self.jd_path = jd_path
        # Numeric vectors read by cos_similarity; nothing in this class fills
        # them yet, so they start out unset.
        self.candidate_vector = None
        self.jd_vector = None

    @staticmethod
    def read_source_file(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path,"r") as f:
            dataset = json.load(f)
        return dataset

    def compare_work_availibility(self):
        """Placeholder - not implemented yet."""
        # make judgement based on the location and work autho (always give a higher work autho 1)
        pass

    def compare_degree_major(self):
        """Placeholder - not implemented yet."""
        # make judgement based on the degree and major (always give a higher major 1)
        pass

    def compare_yoe_industry(self):
        """Placeholder - not implemented yet."""
        # make judgement based on the yoe and industry (always give a longer work experience 1) (but can also consider the seniority property of this position)
        # can also consider "type" field - which is the self-defined label by the OCBang
        pass

    def consider_other_requirements(self):
        """Placeholder - not implemented yet."""
        # consider the recruiting history and preferences of recruiters!
        pass

    def project_to_same_dimension(self):
        """Placeholder - not implemented yet."""
        # need to project the matrix of candidates and jd to the same dimension and get the numeric vector, prepare to do the cosine similarity
        pass

    def cos_similarity(self):
        """Return the cosine similarity of ``candidate_vector`` and ``jd_vector``.

        Computed as ``1 - scipy.spatial.distance.cosine(candidate, jd)``.

        Raises:
            ValueError: If either vector has not been set on the instance.
        """
        if self.candidate_vector is None or self.jd_vector is None:
            raise ValueError(
                "candidate_vector and jd_vector must be set before calling cos_similarity()"
            )
        # pass two vectors to the formula and do calculation!
        result = 1 - spatial.distance.cosine(self.candidate_vector, self.jd_vector)
        return result

In [None]:
# This file contains the transformation of skills in candidate profile and jd, workflow to transfer other features and the steps to get cosine similarity.
#
#
# DETAILED EXPLANATION AND POTENTIAL QUESTIONS OF EACH STEP:
#
# - Data Cleaning:
#
# For candidate part, using formal transformation process provided by file under:
# ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/clean_candidate_profile.ipynb
# The candidate profile after cleaning can be found at:
# ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/candidate_profile.json
# For JD part, using formal transformation process provided by file under:
# ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/clean_dataset_before_input.ipynb
# **** NOTE: There is not enough data yet to improve the JD cleaning process. It needs further improvement!
# **** NOTE: Please strictly follow the format required by each document before input!!
# **** The source files of candidate profiles can be found under folder: ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/parse from csv/
# **** Under this folder, the input source files are info1, info2, info3, and the transformation Python file is also included (contains a class)!
# **** The source files of JD can be found at: https://zstvp55f3z.larksuite.com/sheets/shtus9lKsd83Go8UlzZkolyLg1d?sheet=6dC85C
#
# This process contains two main datasets: Job Title AND Major: can be found at
# ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/create job title/title_final.xlsx
# ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/clean_data/create student major/temp_merged_major.xlsx
# **** NOTE: These two datasets can be (1) expanded and (2) labeled (NERD relationships can be considered) in order to improve accuracy; usage can be found at:
# **** https://kensho.com/solutions
# **** I currently use a fuzzy matching method when a certain major/title can't be found directly in the datasets, but if we label them, we can use the labels to do the matching!
#
#
# - Hard Skills: (135 dimensions)
# Project into hardskill dataset:
# ./ocbang/Matching System/O*NET web scraping/final_merge&analysis/hardskills.json
# This is a precise matching process, done with the spaCy package.
# **** NOTE: This dataset can be split further, depending on whether that improves accuracy.
# **** NOTE: Open question: how to identify words that cannot be recognized (i.e. are not included in the dataset). (2 potential methods - sliding window / manual classification)
# **** NOTE: In the future, visualization can be added to the UI. Example can be found under: ./ocbang/Matching System/O*NET web scraping/preliminary_attempt/first_attempt.ipynb
#
#
# - Soft Skills: (41 dimensions)
# Project into softskill dataset (generate with Google Resources): (This dataset can be expanded or using NERD Resources)
# ./ocbang/Matching System/O*NET web scraping/softskills dataset/softskills.json
# **** NOTE: This dataset can be split further and expanded, depending on whether that improves accuracy.
# **** NOTE: This dataset is generated with Google Resources. We can also generate it with NERD Resources using similar steps! NERD Resources can be found under: ./ocbang/NERD Resources/
# **** NOTE: This is a fuzzy matching process. The threshold used now is 2 (i.e. we only retain a soft skill when its corresponding keywords occur at least twice in a given sentence). We can alter this threshold based on the accuracy.
# **** NOTE: If we have enough data, we can build a self-trained dataset (with BERT or Word2Vec).
#
#
# **** NOTE: Things to be finished.
# - Other Requirements:
# We should store each candidate and JD with following format in the dataset:
# ------------------------------------------------------------------------------------------------------
# | Other Requirements (including major/degree/yoe/title/work_autho etc.) | Hardskills and Softskills  |
# | Non-Binary Format (may include string etc.)                           | Binary Format (Only 1 or 0)|
# | e.g: degree [13,["computer science","engineer"]]                      |                            |
# ------------------------------------------------------------------------------------------------------
# Before comparing, we need to convert each candidate profile into an all-binary profile based on the requirements of the JD.
# Major requirements may include work_autho, degree_major, yoe_industry. We need to evaluate them based on our own standard.
# Other requirements like posting history, recruiting history and preferences should be considered separately.
#
#
# **** NOTE: Things to be finished.
# - About accuracy and improvement:
# After transformation, we need to calculate all cosine similarities between the target JD and the candidates in our database and make recommendations based on (1) a threshold and (2) rankings.
# To test the accuracy of our model, we need: a JD & some resumes (both passed and failed included) / a resume & some JDs (both passed and failed included) to do bidirectional verification.
# Potential ways to improve accuracy across the whole process:
# (1) Improve the accuracy of parsing.
# (2) Give different aspects (degree/hard/soft etc.) different weights. (Domain experts' suggestions / allow users to choose the weights themselves)
# (3) Further expand / label / split the 4 core datasets. (major/title/hardskill/softskill)
# (4) Consider the behavioral factors when transforming (like recruiting history, posting history etc.)
# (5) Use different NLP methods to train dataset. (BERT/NERD/Google/Word2Vec/self-defined)
# (6) Change the threshold when considering softskills.