In [1]:
from pymongo import MongoClient
from bson import ObjectId
import pandas as pd
import json
import time, datetime

import numpy as np
import math
import gensim
from scipy import spatial


# Loading database: open a MongoDB client on port 27017 (no host is passed, so
# pymongo's default host applies) and select the "HireApp" database.
# `database` is the module-level handle read by the getter and *_to_vector
# functions defined in the following cells.
client = MongoClient(port=27017)
database = client["HireApp"]

# Global getters

These functions scan the whole database to collect every academic title, professional experience title, skill and language it contains. The goal is to build a standard vector format so that resumes and job offers can be compared.

In [2]:
#Get all academic
def get_all_academic():
    """Return the sorted, de-duplicated list of academic titles in the database.

    Titles come from two places:
      * the 'title' of every entry in users' resume['academic_cursus'];
      * the 'option' of every entry in posts' 'requiredDegrees'.

    Documents that lack the relevant key are ignored.
    """
    titles = set()

    # Degrees declared by users in their resumes.
    for user in database['users'].find({}):
        if 'resume' in user and 'academic_cursus' in user['resume'].keys():
            titles.update(entry['title'] for entry in user['resume']['academic_cursus'])

    # Degrees requested by job posts.
    for post in database['posts'].find({}):
        if 'requiredDegrees' in post:
            titles.update(entry['option'] for entry in post['requiredDegrees'])

    return sorted(titles)

In [3]:
#Get all experiences
def get_all_experiences():
    """Return the sorted, de-duplicated list of experience titles in the database.

    Titles come from the 'title' of every entry in users'
    resume['professionnal_cursus'] and of every entry in posts' 'requiredExp'.
    Documents that lack the relevant key are ignored.
    """
    titles = set()

    # Experiences declared by users in their resumes.
    for user in database['users'].find({}):
        if 'resume' in user and 'professionnal_cursus' in user['resume'].keys():
            titles.update(entry['title'] for entry in user['resume']['professionnal_cursus'])

    # Experiences requested by job posts.
    for post in database['posts'].find({}):
        if 'requiredExp' in post:
            titles.update(entry['title'] for entry in post['requiredExp'])

    return sorted(titles)

In [4]:
#Get all skills
def get_all_skills():
    """Return the sorted, de-duplicated list of skill names in the database.

    Names come from the 'skill' of every entry in users' resume['skills'] and
    of every entry in posts' 'requiredSkills'. Documents that lack the
    relevant key are ignored.
    """
    names = set()

    # Skills declared by users in their resumes.
    for user in database['users'].find({}):
        if 'resume' in user and 'skills' in user['resume'].keys():
            names.update(entry['skill'] for entry in user['resume']['skills'])

    # Skills requested by job posts.
    for post in database['posts'].find({}):
        if 'requiredSkills' in post:
            names.update(entry['skill'] for entry in post['requiredSkills'])

    return sorted(names)

In [5]:
#Get all languages
def get_all_languages():
    """Return the sorted, de-duplicated list of language names in the database.

    Names come from the 'lang' of every entry in users' resume['languages']
    and of every entry in posts' 'requiredLanguages'. Documents that lack the
    relevant key are ignored.
    """
    names = set()

    # Languages declared by users in their resumes.
    for user in database['users'].find({}):
        if 'resume' in user and 'languages' in user['resume'].keys():
            names.update(entry['lang'] for entry in user['resume']['languages'])

    # Languages requested by job posts.
    for post in database['posts'].find({}):
        if 'requiredLanguages' in post:
            names.update(entry['lang'] for entry in post['requiredLanguages'])

    return sorted(names)

# Evaluating user by his resume

To estimate how well a user fits a post, we evaluate their resume: skills, languages, academic background and professional experience are each converted to vectors and compared with those of the offer in question.

In [6]:
def user_academic_to_vector(resume):
    """Encode a resume's academic history as a numeric vector.

    The vector has one slot per title returned by get_all_academic(), in that
    order, initialised to 0. Each entry of resume['academic_cursus'] writes a
    score derived from its 'degree' into the slot of its 'title'. A title that
    is not in the global list is appended as an extra slot.

    Args:
        resume: mapping with an 'academic_cursus' list whose entries carry
            'title' and 'degree'.

    Returns:
        numpy array of the per-title scores.
    """
    # Degree labels grouped by the score they map to; anything else scores 0.
    score_tiers = (
        (('PHD',), 7),
        (('MBA', 'Master', 'Engineer'), 5),
        (('Bachelor', 'Technicien Spécialisé'), 3),
        (('DEUG', 'Technicien'), 2),
    )

    def _degree_score(entry):
        degree = entry['degree']
        for labels, score in score_tiers:
            if degree in labels:
                return score
        return 0

    scores = {title: 0 for title in get_all_academic()}
    for entry in resume['academic_cursus']:
        title = entry['title']
        scores[title] = _degree_score(entry)

    return np.array(list(scores.values()))

In [7]:
def user_experience_to_vector(resume):
    """Encode a resume's professional history as a vector of durations in years.

    The vector has one slot per title returned by get_all_experiences(), in
    that order, initialised to 0. Each entry of resume['professionnal_cursus']
    writes its duration into the slot of its 'title'.

    Args:
        resume: mapping with a 'professionnal_cursus' list whose entries carry
            'title', 'begin_date' and 'end_date'. Both dates are strings
            shaped like '2020-01-31T12:00:00.000Z'.

    Returns:
        numpy array of per-title durations, in years rounded to one decimal.

    Raises:
        ValueError: if a date string does not match the expected format.
    """
    # Explicit %H:%M:%S rather than %X: %X means "the locale's time
    # representation", so parsing depended on the process locale.
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"

    def calculate_exp_duration(x):
        begin_date = datetime.datetime.strptime(x['begin_date'], date_format)
        end_date = datetime.datetime.strptime(x['end_date'], date_format)

        # Month-granular difference; the day of month is deliberately ignored.
        xp_years = (end_date.year - begin_date.year) + (end_date.month - begin_date.month) / 12

        return round(xp_years, 1)

    _xp = dict((x, 0) for x in get_all_experiences())
    u_xp = dict((x['title'], calculate_exp_duration(x)) for x in resume['professionnal_cursus'])
    _xp.update(u_xp)
    return np.array(list(_xp.values()))

In [8]:
def user_lang_to_vector(resume):
    """Encode a resume's languages as a vector of levels.

    One slot per name returned by get_all_languages(), in that order,
    initialised to 0. Each entry of resume['languages'] writes its 'level'
    into the slot of its 'lang'.

    Returns:
        numpy array of the per-language levels.
    """
    levels = {name: 0 for name in get_all_languages()}
    for entry in resume['languages']:
        name, level = entry['lang'], entry['level']
        levels[name] = level
    return np.array(list(levels.values()))

In [9]:
def user_skills_to_vector(resume):
    """Encode a resume's skills as a vector of levels.

    One slot per name returned by get_all_skills(), in that order,
    initialised to 0. Each entry of resume['skills'] writes its 'level' into
    the slot of its 'skill'.

    Returns:
        numpy array of the per-skill levels.
    """
    levels = {name: 0 for name in get_all_skills()}
    for entry in resume['skills']:
        name, level = entry['skill'], entry['level']
        levels[name] = level
    return np.array(list(levels.values()))

# Evaluating a job offer

In [10]:
def job_academic_to_vector(job):
    """Encode a job post's degree requirements as a vector of levels.

    One slot per title returned by get_all_academic(), in that order,
    initialised to 0. Each entry of job['requiredDegrees'] writes its 'level'
    into the slot of its 'option'.

    Returns:
        numpy array of the per-title required levels.
    """
    required = {title: 0 for title in get_all_academic()}
    for entry in job['requiredDegrees']:
        option, level = entry['option'], entry['level']
        required[option] = level
    return np.array(list(required.values()))

In [11]:
def job_experience_to_vector(job):
    """Encode a job post's experience requirements as a vector of levels.

    One slot per title returned by get_all_experiences(), in that order,
    initialised to 0. Each entry of job['requiredExp'] writes its 'level'
    into the slot of its 'title'.

    Returns:
        numpy array of the per-title required levels.
    """
    required = {title: 0 for title in get_all_experiences()}
    for entry in job['requiredExp']:
        title, level = entry['title'], entry['level']
        required[title] = level
    return np.array(list(required.values()))

In [12]:
def job_lang_to_vector(job):
    """Encode a job post's language requirements as a vector of levels.

    One slot per name returned by get_all_languages(), in that order,
    initialised to 0. Each entry of job['requiredLanguages'] writes its
    'level' into the slot of its 'lang'.

    Returns:
        numpy array of the per-language required levels.
    """
    required = {name: 0 for name in get_all_languages()}
    for entry in job['requiredLanguages']:
        name, level = entry['lang'], entry['level']
        required[name] = level
    return np.array(list(required.values()))

In [13]:
def job_skills_to_vector(job):
    """Encode a job post's skill requirements as a vector of levels.

    One slot per name returned by get_all_skills(), in that order,
    initialised to 0. Each entry of job['requiredSkills'] writes its 'level'
    into the slot of its 'skill'.

    Returns:
        numpy array of the per-skill required levels.
    """
    required = {name: 0 for name in get_all_skills()}
    for entry in job['requiredSkills']:
        name, level = entry['skill'], entry['level']
        required[name] = level
    return np.array(list(required.values()))

# Building comparison dictionaries

Each dictionary holds four vectors: academic, experience, languages and skills.
The same structure is built for both job offers and resumes.

In [14]:
def resume_to_vector(user_id):
    """Fetch a user by id and vectorise the four facets of their resume.

    Args:
        user_id: value accepted by ObjectId(), used to look up the user
            document in the 'users' collection.

    Returns:
        dict with keys 'academic', 'experience', 'lang' and 'skills', each
        holding the numpy vector produced by the matching user_*_to_vector
        function.
    """
    user_doc = database['users'].find_one({'_id': ObjectId(user_id)})
    resume = dict(user_doc)['resume']
    return {
        'academic': user_academic_to_vector(resume),
        'experience': user_experience_to_vector(resume),
        'lang': user_lang_to_vector(resume),
        'skills': user_skills_to_vector(resume),
    }

In [15]:
def job_to_vector(post_id):
    """Fetch a job post by id and vectorise the four facets of its requirements.

    Args:
        post_id: value accepted by ObjectId(), used to look up the document
            in the 'posts' collection.

    Returns:
        dict with keys 'academic', 'experience', 'lang' and 'skills', each
        holding the numpy vector produced by the matching job_*_to_vector
        function.
    """
    post_doc = database['posts'].find_one({'_id': ObjectId(post_id)})
    job = dict(post_doc)
    return {
        'academic': job_academic_to_vector(job),
        'experience': job_experience_to_vector(job),
        'lang': job_lang_to_vector(job),
        'skills': job_skills_to_vector(job),
    }

# Demo: build the four requirement vectors for one hard-coded post id.
job_to_vector('5e92f8b03bc187865692b516')

{'academic': array([0, 0, 0, 0, 0, 0, 0, 0, 2]),
 'experience': array([0, 0, 0, 1, 0]),
 'lang': array([0, 0, 0]),
 'skills': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3])}

# Comparators

In [16]:
def academic_compare(res_a, res_b):
    """Cosine distance between the 'academic' vectors of two vector dicts.

    Args:
        res_a, res_b: mappings whose 'academic' entry is a numeric sequence
            (as produced by resume_to_vector / job_to_vector).

    Returns:
        1 minus the cosine similarity of the two vectors, or 1. when either
        vector has zero norm (the distance is undefined in that case).

    Raises:
        ValueError: if the two vectors differ in length.
    """
    if len(res_a['academic']) != len(res_b['academic']):
        # `throw` does not exist in Python; the former call raised NameError.
        raise ValueError('Unequal vectors exception')

    a = np.asarray(res_a['academic'])
    b = np.asarray(res_b['academic'])

    # Keep only positions where at least one side is non-zero. A mask avoids
    # the pop-while-indexing loop, which skipped the element after each removal.
    keep = (a != 0) | (b != 0)
    a, b = a[keep], b[keep]

    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 1.

    return spatial.distance.cosine(a, b)

In [17]:
def experience_compare(res_a, res_b):
    """Cosine distance between the 'experience' vectors of two vector dicts.

    Args:
        res_a, res_b: mappings whose 'experience' entry is a numeric sequence
            (as produced by resume_to_vector / job_to_vector).

    Returns:
        1 minus the cosine similarity of the two vectors, or 1. when either
        vector has zero norm (the distance is undefined in that case).

    Raises:
        ValueError: if the two vectors differ in length.
    """
    if len(res_a['experience']) != len(res_b['experience']):
        # `throw` does not exist in Python; the former call raised NameError.
        raise ValueError('Unequal vectors exception')

    a = np.asarray(res_a['experience'])
    b = np.asarray(res_b['experience'])

    # Keep only positions where at least one side is non-zero. A mask avoids
    # the pop-while-indexing loop, which skipped the element after each removal.
    keep = (a != 0) | (b != 0)
    a, b = a[keep], b[keep]

    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 1.

    return spatial.distance.cosine(a, b)

In [18]:
def lang_compare(res_a, res_b):
    """Cosine distance between the 'lang' vectors of two vector dicts.

    Args:
        res_a, res_b: mappings whose 'lang' entry is a numeric sequence
            (as produced by resume_to_vector / job_to_vector).

    Returns:
        1 minus the cosine similarity of the two vectors, or 1 when that
        distance is undefined (either vector has zero norm, or the result
        is NaN).

    Raises:
        ValueError: if the two vectors differ in length.
    """
    if len(res_a['lang']) != len(res_b['lang']):
        # `throw` does not exist in Python; the former call raised NameError.
        raise ValueError('Unequal vectors exception')

    a = np.asarray(res_a['lang'])
    b = np.asarray(res_b['lang'])

    # Keep only positions where at least one side is non-zero. A mask avoids
    # the pop-while-indexing loop, which skipped the element after each removal.
    keep = (a != 0) | (b != 0)
    a, b = a[keep], b[keep]

    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 1.

    value = spatial.distance.cosine(a, b)
    # NaN never compares equal to anything, so `value != np.nan` was always
    # True and the fallback was unreachable; np.isnan is the real test.
    return 1 if np.isnan(value) else value

In [19]:
def skills_compare(res_a, res_b):
    """Cosine distance between the 'skills' vectors of two vector dicts.

    Args:
        res_a, res_b: mappings whose 'skills' entry is a numeric sequence
            (as produced by resume_to_vector / job_to_vector).

    Returns:
        1 minus the cosine similarity of the two vectors, or 1. when either
        vector has zero norm (the distance is undefined in that case).

    Raises:
        ValueError: if the two vectors differ in length.
    """
    if len(res_a['skills']) != len(res_b['skills']):
        # `throw` does not exist in Python; the former call raised NameError.
        raise ValueError('Unequal vectors exception')

    a = np.asarray(res_a['skills'])
    b = np.asarray(res_b['skills'])

    # Keep only positions where at least one side is non-zero. A mask avoids
    # the pop-while-indexing loop, which skipped the element after each removal.
    keep = (a != 0) | (b != 0)
    a, b = a[keep], b[keep]

    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 1.

    return spatial.distance.cosine(a, b)

Comparison methods
<ul>
<li>User to User</li>
<li>User to Offer</li>
<li>Company to User</li>
<li>Company to Company</li>
</ul>

In [28]:
#User similarity
def user_to_user(user_a, user_b):
    """Similarity score between two users' resumes.

    Both ids are vectorised with resume_to_vector. The four per-facet cosine
    distances (academic, experience, lang, skills) are averaged and the
    result is returned as 1 minus that mean.
    """
    res_a, res_b = resume_to_vector(user_a), resume_to_vector(user_b)
    distances = [
        academic_compare(res_a, res_b),
        experience_compare(res_a, res_b),
        lang_compare(res_a, res_b),
        skills_compare(res_a, res_b),
    ]
    return 1 - np.mean(distances)

# Demo: similarity between two hard-coded user ids.
user_to_user('5e92f8b03bc187865692b519', '5e92f8b03bc187865692b517')

0.685437049920216

In [30]:
#Match score
def user_to_offer(user, job):
    """Match score between a user's resume and a job post.

    `user` is vectorised with resume_to_vector and `job` with job_to_vector.
    The four per-facet cosine distances (academic, experience, lang, skills)
    are averaged and the result is returned as 1 minus that mean.
    """
    res_a, res_b = resume_to_vector(user), job_to_vector(job)
    distances = [
        academic_compare(res_a, res_b),
        experience_compare(res_a, res_b),
        lang_compare(res_a, res_b),
        skills_compare(res_a, res_b),
    ]
    return 1 - np.mean(distances)

# Demo: match score of one hard-coded user id against one hard-coded post id.
user_to_offer('5e92f8b03bc187865692b517', '5e92f8b03bc187865692b516')

0.011969778551569443

In [None]:
def company_to_user(company, user):
    """Match score between a company-side post and a user's resume.

    Args:
        company: forwarded to job_to_vector, i.e. treated as the id of a
            document in the 'posts' collection.
            NOTE(review): the name suggests a company-level comparison may
            have been intended (e.g. over all of a company's posts) - confirm.
        user: id forwarded to resume_to_vector.

    Returns:
        1 minus the mean of the four per-facet cosine distances
        (academic, experience, lang, skills).
    """
    # This used to call job_to_vector(job), but `job` is neither a parameter
    # nor a local of this function, and `company` was never used.
    res_a, res_b = resume_to_vector(user), job_to_vector(company)
    comparison = {}
    comparison['academic'] = academic_compare(res_a, res_b)
    comparison['experience'] = experience_compare(res_a, res_b)
    comparison['lang'] = lang_compare(res_a, res_b)
    comparison['skills'] = skills_compare(res_a, res_b)
    return 1 - np.mean(list(comparison.values()))

# NOTE(review): this cell calls user_to_offer, not the company_to_user
# function defined just above - confirm which one was meant to be exercised.
user_to_offer('5e92f8b03bc187865692b517', '5e92f8b03bc187865692b516')