## 1. Importing Libraries/Tools

In [1]:
import gensim
import multiprocessing
import nltk
import numpy as np
import pandas as pd
import re
import sklearn
import spacy
import pickle

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

## 2. Load Relevant Data

In [2]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

skills_api = pd.read_excel('all_skills_emsi.xlsx')
skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
skills_api['lemmatized_name'] = skills_api['name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
skills_api = set(skills_api['name']).union(set(skills_api['lemmatized_name']))

df1 = pd.read_csv('Resume.csv')
# standardise column names
df1['Text'] = df1['Resume_str']
df1 = df1[['ID','Category','Text']]

df2 = pd.read_csv('Job Description.csv')
# get only english job descriptions
df2 = df2[df2['job_post_lang'].str.lower() == 'en-gb'] 
# standardise column names
df2['ID'] = df2['uniq_id']
df2['Category'] = df2['category']
df2['Text'] = df2['job_requirements']
df2 = df2[['ID','Category','Text']]
# drop empty rows
df2 = df2.dropna().reset_index(drop=True) 

In [3]:
df1["Category"].value_counts()

INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: Category, dtype: int64

In [4]:
df2["Category"].value_counts()

Registered Nurses                                    2513
Personal Care Aides                                   508
Social and Human Service Assistants                   415
Home Health Aides                                     393
Medical and Health Services Managers                  253
                                                     ... 
Health Technologists and Technicians, All Other         1
Electronics Engineering Technicians                     1
Production Workers, All Other                           1
Insurance Adjusters, Examiners, and Investigators       1
Computer Network Architects                             1
Name: Category, Length: 291, dtype: int64

## 3. Skills Extraction

Common Skills Extraction Functions

In [5]:
def preprocess(txt):
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    # then these
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub('<.*?>',' ',txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words
    txt = [token.text for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def generate_list_of_skills(text):
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for N-grams that SpaCy missed in the noun_chuncks
    for n in range(2, 5):
        for token in n_grams(tokens, n):
            token = token.lower().strip()
            skillset.append(token)

    return skillset

### 3.1 User Skills Extraction

Functions

The following functions are used to iudentify and extract skills from a user's resume, calculate the years of experience candidate has in said skills based on the dates provided and the final result will be a dictionary where the key is the identified skill and the value is the number of years of experience the candidate has. 

In [6]:
def section_break(original_resume_text):
    pattern = r'\b[A-Z]+\b'
    res = list(re.finditer(pattern, original_resume_text))
    ans = []

    if len(res) == 0:
        ans.append(original_resume_text)
    elif len(res) == 1:
        ans.append(original_resume_text[res[0].span()[1]:])
    else:
        i = 1
        while i < len(res):
            ans.append(original_resume_text[res[i-1].span()[1]:res[i].span()[0]])
            i += 1
        ans.append(original_resume_text[res[i-1].span()[1]:])
    return ans

def extract_skills(resume_text, skills_api, clean=True):
    if clean == True:
        resume_text = preprocess(resume_text)
    
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    return skillset.intersection(skills_api)

def date_search(resume):
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list):
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1]
        cur = date_list[i]
        
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            

            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:

            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume, skills_api, clean=True):
    res = {}
    sections = section_break(resume)

    for section in sections:
        date_list = date_search(section)
        experience_sec = experience_tagging(date_list)
        for key in experience_sec:
            for start, end in experience_sec[key]:
                skills_list = extract_skills(section[start:end], skills_api, clean)
                for ele in skills_list:
                    if ele not in res:
                        res[ele] = key
                    else:
                        res[ele] = max(key, res[ele])

    # for all the skills which do not have any level of experience, assign a default value of 1
    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

Test Cases

In [7]:
# Test Case, Output will be a dictionary showing the skills as the key and the years of experience as the value. 
test_user = df1.iloc[300,2]
test_user_skills = skills_experience_level_identification(test_user, skills_api)
print(df1.iloc[300,1])
test_user_skills

INFORMATION-TECHNOLOGY


{'operations': 18,
 'desktop support': 1,
 'technical support': 1,
 'information systems': 1,
 'coordinate': 1,
 'management': 1,
 'microsoft certified technology specialist': 1,
 'network support': 1,
 'web development': 1,
 'security policies': 1,
 'r': 1,
 'streamlining': 1,
 'adware': 1,
 'ghost': 1,
 'web servers': 1,
 'virus': 1,
 'application deployment': 1,
 'business technology': 1,
 'disaster recovery': 1,
 'adobe acrobat': 1,
 'operating systems': 1,
 'reduction': 1,
 'capacity planning': 1,
 'planning': 1,
 'windows desktop': 1,
 'color': 1,
 'group policy': 1,
 'troubleshooting': 1,
 'laser printers': 1,
 'citrix certified administrator': 1,
 'project management': 1,
 'business continuity': 1,
 'netscaler': 1,
 'desktop environment': 1,
 'communication': 1,
 'active directory': 1,
 'b': 1,
 'laser': 1,
 'plan': 1,
 'spyware': 1,
 'reliability': 1,
 'management information systems': 1,
 'system center configuration manager': 1,
 'security': 1,
 'information technology': 1}

In [14]:
# extract skills from all rows > store in dictionary > pickle dump
'''
df1["Skills"] = df1["Text"].apply(lambda x: skills_experience_level_identification(x,skills_api))
resume_skills0 = {}
for index, row in df1.iterrows():
  resume_skills0[row.ID] = row.Skills
with open('resume_skills.pkl', 'wb') as handle:
  pickle.dump(resume_skills0, handle)
'''

'\ndf1["Skills"] = df1["Text"].apply(lambda x: skills_experience_level_identification(x,skills_api))\nskills0 = {}\nfor index, row in df1.iterrows():\n  skills0[row.ID] = row.Skills\nwith open(\'user_skills.pkl\', \'wb\') as handle:\n  pickle.dump(skills0, handle)\n'

### 3.2 Required Skills Extraction

The next portion will be to extract the skills needed for a particular job. Since there are no dates for us to parse in job descriptions, we will determine the proficiency needed for each skill by the percentage of resume/job postings having certain skills. 

Common Functions

In [16]:
def find_significant_skills(agg_table):

    # gauge skill levels according to percentiles
    skills_required = {}
    for col in agg_table.columns:
        skills = agg_table[col][agg_table[col] > 0]

        # no skills extracted > skills required = empty dictionary
        if len(skills) == 0: 
            skills_required[col] = dict()
        
        # some skills extracted > skills required = some dictionary
        else:
            series = agg_table[col][agg_table[col] >= np.percentile(skills, 95)]

            # if all skills above 95th percentile have same frequency, then scale them to 1 (max)
            if len(series.unique()) == 1:
                scaled_series = series.apply(lambda x: 1)
            # if skills above 95th percentile have different frequency, then scale to values between 0 and 1
            else:
                scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))

            # bin skills according to percentiles
            binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
            
            # convert series to dictionary form
            skills_required[col] = binned_series.to_dict()

    return skills_required

def get_significance_table(df,skills_dic):
    
    # creates a binary matrix indicating the presence of each skill
    df['Skills'] = df['ID'].apply(lambda x: list(skills_dic[x].keys()))
    mlb = MultiLabelBinarizer()
    table = pd.DataFrame(mlb.fit_transform(df['Skills']),
                         columns=mlb.classes_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # sum by category column and divide by total number of instances
    agg_table = table.groupby(['y']).sum()
    agg_table = agg_table.T / table.groupby(['y']).size()

    return agg_table

##### 3.2.1 From Resume

Test Case

In [17]:
resume_df = df1.copy()

with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills_dic = pickle.load(pickle_file)

resume_table = get_significance_table(resume_df,resume_skills_dic)
resume_table

y,ACCOUNTANT,ADVOCATE,AGRICULTURE,APPAREL,ARTS,AUTOMOBILE,AVIATION,BANKING,BPO,BUSINESS-DEVELOPMENT,...,DIGITAL-MEDIA,ENGINEERING,FINANCE,FITNESS,HEALTHCARE,HR,INFORMATION-TECHNOLOGY,PUBLIC-RELATIONS,SALES,TEACHER
abaqus,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.016949,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
ablation,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.008621,0.000000
ableton live,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.010417,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
abnormal psychology,0.0,0.008475,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.008696,0.009091,0.0,0.0,0.000000,0.000000
absorption,0.0,0.000000,0.015873,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoning,0.0,0.000000,0.000000,0.030928,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.009804
zoology,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
zoom,0.0,0.000000,0.015873,0.000000,0.000000,0.0,0.0,0.0,0.0,0.008333,...,0.000000,0.000000,0.0,0.008547,0.000000,0.000000,0.0,0.0,0.000000,0.000000
zumba,0.0,0.000000,0.000000,0.000000,0.009709,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.051282,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [18]:
resume_skills_required = find_significant_skills(resume_table)


with open('resume_skills_required.pkl', 'wb') as handle:
  pickle.dump(resume_skills_required, handle)


resume_skills_required

{'ACCOUNTANT': {'audit': 4,
  'auditing': 3,
  'balance': 4,
  'balance sheet': 3,
  'bank reconciliations': 3,
  'banking': 3,
  'billing': 4,
  'bookkeeping': 3,
  'budget': 4,
  'budgeting': 3,
  'business administration': 4,
  'cash flow': 3,
  'close': 3,
  'closing': 4,
  'collections': 3,
  'communication': 4,
  'customer service': 3,
  'depreciation': 3,
  'detail oriented': 3,
  'file': 3,
  'filing': 3,
  'finance': 4,
  'financial analysis': 3,
  'financial statement': 3,
  'financial statements': 4,
  'forecast': 3,
  'forecasting': 3,
  'general ledger': 4,
  'invoice': 3,
  'invoicing': 3,
  'journal': 4,
  'leadership': 3,
  'ledger': 5,
  'level': 3,
  'm': 3,
  'management': 5,
  'microsoft office': 4,
  'operations': 4,
  'organizational skills': 3,
  'plan': 3,
  'planning': 3,
  'problem solving': 3,
  'purchase': 3,
  'quickbooks': 4,
  'r': 3,
  'reconciliation': 4,
  'research': 3,
  'sales': 4,
  'sales tax': 3,
  'spreadsheets': 3,
  'tax returns': 3,
  'time m

##### 3.2.2 From Job Descriptions

Functions

In [19]:
def extract_skills_frequency(job_text, skills_api, clean=True):
  if clean == True:
      job_text = preprocess(job_text)

  # get all skills
  skillset = [i.lower() for i in generate_list_of_skills(job_text)]

  # get frequency of each skill
  skillset_count = {}
  for skill in skillset:
    if skill in skills_api:
      if skill in skillset_count:
        skillset_count[skill] = skillset_count[skill] + 1
      else:
        skillset_count[skill] = 1
  
  return dict(sorted(skillset_count.items(), key = lambda x:x[1], reverse = True))


Test Cases

In [20]:
test_job = df2.iloc[13,2]
test_job_skills = extract_skills_frequency(test_job,skills_api,True)
print(df2.iloc[13,1])
test_job_skills

Software Developers, Applications


{'management': 3,
 'c': 2,
 'security': 2,
 'software module': 2,
 'programming': 1,
 'linux': 1,
 'collaboration': 1,
 'software development': 1,
 'architectural patterns': 1,
 'configuration management': 1,
 'requirements management': 1,
 'cyber security': 1,
 'software engineering': 1,
 'technical design': 1,
 'software development methodologies': 1,
 'software configuration management': 1}

In [60]:
# extract skills from all rows > store in dictionary > pickle dump
'''
df2["Skills"] = df2["Text"].apply(lambda x: extract_skills_frequency(x,skills_api,True))
job_skills0 = {}
for index, row in df2.iterrows():
  job_skills0[row.ID] = row.Skills
with open('job_skills.pkl', 'wb') as handle:
  pickle.dump(job_skills0, handle)
'''

In [21]:
job_df = df2.copy()

with open('job_skills.pkl', 'rb') as pickle_file:
    job_skills_dic = pickle.load(pickle_file)

job_table = get_significance_table(job_df,job_skills_dic)
job_table

y,Accountants,Actuaries,Acute Care Nurses,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Advanced Practice Psychiatric Nurses,Advertising Sales Agents,Advertising and Promotions Managers,Aerospace Engineers,Anesthesiologists,...,Telemarketers,Training and Development Managers,Training and Development Specialists,Treasurers and Controllers,Tutors,Upholsterers,Veterinary Technologists and Technicians,"Vocational Education Teachers, Postsecondary",Waiters and Waitresses,Web Developers
abdomen,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
absorption,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
academic standards,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
acoustic,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
acoustics,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yoga,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zone,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zoning,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zoology,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


The output below is a dictionary where the key is the role the candidate is seeking and the value tagged to it is a dictionary with the required skill as the key and the proficiency level tagged as the value. Since the courses dataset that is explored further on has a competency level of either 3, 4 or 5, all proficiency levels will either be 3, 4 or 5 to ease comparision of skills gap.  

In [22]:
job_skills_required = find_significant_skills(job_table)


# with open('job_skills_required.pkl', 'wb') as handle:
#   pickle.dump(job_skills_required, handle)


job_skills_required

{'Accountants': {'audit': 3,
  'communication': 5,
  'finance': 4,
  'management': 4},
 'Actuaries': {'actuarial exams': 5,
  'communication': 5,
  'covenant': 5,
  'investment': 5},
 'Acute Care Nurses': {'management': 5,
  'nursing': 3,
  'planning': 3,
  'registration': 3,
  'rehabilitation': 3,
  'scheme': 4},
 'Administrative Law Judges, Adjudicators, and Hearing Officers': {'advise': 5,
  'comment': 5,
  'management': 5,
  'triage': 5},
 'Administrative Services Managers': {'c': 5,
  'customer engagement': 5,
  'customer service': 5,
  'grade': 5,
  'head': 5,
  'procurement': 5,
  'scheme': 5},
 'Advanced Practice Psychiatric Nurses': {'act': 4,
  'mental health': 5,
  'rehabilitation': 3,
  'supervision': 3},
 'Advertising Sales Agents': {'b': 5, 'sales': 5},
 'Advertising and Promotions Managers': {'brand': 5,
  'brand identity': 5,
  'budget': 5,
  'facebook': 5,
  'licensing': 5,
  'linkedin': 5,
  'management': 5,
  'marketing': 5,
  'marketing copy': 5,
  'merchandise': 5,

## 4. Skills Gap Identification

Skills Gap Functions

The following portion is our algorithm to idenitfy exactly how far the candidate is from obtaining their desired occupation. 

In [2]:
# Map years of experience to competency level of either 3, 4 or 5
def user_level_deduction(years):
    if years <= 2:
        return 3
    elif years <= 5:
        return 4
    else:
        return 5

# return skills gap grouped by skills
def skills_gap_identification(skills, skills_required):
    diff = {}
    for key in skills_required:
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        else:
            user_level = user_level_deduction(skills[key])
            if user_level < skills_required[key]:
                diff[key] = [x for x in range(user_level + 1, skills_required[key] + 1)]
    return diff

### 4.1 Gaps between User and Job Requirements

In [49]:
with open('job_skills.pkl', 'rb') as pickle_file:
    job_skills = pickle.load(pickle_file)

with open('resume_skills.pkl', 'rb') as pickle_file:
    resume_skills = pickle.load(pickle_file)

with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required = pickle.load(pickle_file)

skills_gap_jobs = skills_gap_identification(resume_skills[31605080], job_skills_required['Web Developers'])
skills_gap_jobs

{'angular': [3],
 'c': [3, 4],
 'git': [3],
 'javascript': [3, 4, 5],
 'php': [3],
 'sql': [3]}

### 4.2 Gaps between User and Other Candidates

We also added a portion that makes comparison between candidates as we believe that it is also crucial for the candidate to understand where he/she stands in the current market in order for them to obtain their desired occupation. 

In [48]:
with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required = pickle.load(pickle_file)

skills_gap_cand = skills_gap_identification(resume_skills[31605080], resume_skills_required['AVIATION'])
skills_gap_cand

{'aircraft maintenance': [3],
 'ammunition': [3],
 'aviation': [4, 5],
 'b': [3],
 'budget': [3],
 'business administration': [3],
 'c': [3],
 'communication': [3, 4],
 'communications': [3],
 'control systems': [3],
 'coordinate': [3],
 'coordinating': [3],
 'critical thinking': [3],
 'data entry': [3],
 'electronic': [3],
 'electronics': [3],
 'filing': [3],
 'hydraulic': [3],
 'inventory control': [3],
 'leadership': [3, 4],
 'level': [3, 4],
 'license': [3],
 'm': [3],
 'machine': [3],
 'management': [4, 5],
 'marketing': [3],
 'mechanic': [3],
 'merchandise': [3],
 'microsoft office': [3],
 'operation': [3, 4],
 'operations': [3, 4],
 'paperwork': [3],
 'plan': [3],
 'planning': [3, 4],
 'presentations': [3],
 'process improvement': [3],
 'procurement': [3],
 'project management': [3],
 'purchase': [3],
 'purchasing': [3],
 'quality assurance': [3],
 'quality control': [3],
 'r': [3],
 'reduction': [3],
 'repair': [3, 4],
 'resource': [3],
 'safety training': [3],
 'sales': [3],
 

## 5. Pathway Generation

Now that the gaps have been identified, it is time for us to generate a learning path of courses that the user can take in order for them to bridge it. There are 2 different methods used in this notebook, Dot2Vec and Spacy.

In [39]:
courses = pd.read_excel('Courses.xlsx')
courses = courses.fillna('')
courses['Description'] = courses['jobFamily'] + ' ' \
                         + courses['Marketing Name'] + ' ' \
                         + courses['courseName'] + ' ' \
                         + courses ['moduleName'] + ' ' \
                         + courses['courseDesc'] + ' ' \
                         + courses['Outcome Description'] +  ' ' \
                         + courses['competencyUnitDesc']
courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
courses['Description'] = courses['Description'].astype(str)
courses['Description'] = courses['Description'].apply(preprocess)

In [50]:
courses.jobFamily.value_counts()

Software Design & Development                 514
Infocomm Sales & Marketing                    195
Infrastructure Support                        105
Business Analytics                            105
Digital Advertising / Digital Distribution    100
Infrastructure Architecture                    37
Enterprise Mobility                            17
Project Management                             16
Strategy and Architecture                      15
Service Innovation Design                      14
Infocomm Security                              13
Sales and Marketing                            12
Cloud Computing                                 9
Data Centre Management                          9
Product Management                              7
IT Management                                   5
Enterprise Network Design Management            5
IT Outsourcing Management                       1
Generic Skills                                  1
Name: jobFamily, dtype: int64

Functions

In [4]:
# group skills gap by level
def skills_gap_by_level(skills_gap):
    new_skills_gap = {}
    for skill in skills_gap:
        for level in skills_gap[skill]:
            if level in new_skills_gap:
                new_skills_gap[level].append(skill)
            else:
                new_skills_gap[level] = [skill]
    return new_skills_gap

### 5.1 Doc2Vec

In [40]:
def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

corpus = tagcol_paragraph_embeddings_features(courses)
model = Doc2Vec(dm=0, vector_size=50, workers=multiprocessing.cpu_count(), min_count=2, epochs=100, hs=1, negative=0)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
# model.save("d2v_model.pkl")

Test Case

In [47]:
def course_suggestion_d2v(doc2vec_model, skills_gap_cand, skills_gap_jobs, courses_dataset):
    ans = {'job': {}, 'continuous learning': {}}

    for key in skills_gap_jobs:
        vector = doc2vec_model.infer_vector(skills_gap_jobs[key])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(key) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
        ans['job'][key] = course_list[:5]

    for key in skills_gap_cand:
        vector = doc2vec_model.infer_vector(skills_gap_cand[key])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(key) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
        ans['continuous learning'][key] = course_list[:5]

    return ans

In [None]:
course_suggestion_d2v(model, skills_gap_by_level(skills_gap_cand), skills_gap_by_level(skills_gap_jobs), courses)

### 5.2 SpaCy

In [3]:
def course_suggestion_spacy(skills_gap_cand, skills_gap_jobs, courses_dataset):
    # load pre-trained model
    nlp = spacy.load('en_core_web_sm')
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        skills_gap_text = " ".join(skills_gap_jobs[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
        ans['job'][key] = course_list[:5]

    for level in skills_gap_cand:
        skills_gap_text = " ".join(skills_gap_cand[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
        ans['continuous learning'][key] = course_list[:5]

    return ans
    

In [None]:
course_suggestion_spacy(model, skills_gap_by_level(skills_gap_cand), skills_gap_by_level(skills_gap_jobs), courses)

## 6. Final Function

For the final portion, we complie all our functions into a single function and the final output from our model is a dictionary where the key is the competency level (3, 4 or 5) and the value is the recommended courses for the user at said skill level. 

Make sure that you have `all_skills_emsi.xlsx`, `Courses.xlsx`, `resume_skills_required.pkl`, `job_skills_required.pkl` and `d2v_model.pkl`, and a sample resume `11592605.pdf`

### 6.1 Functions and Libraries Required

In [8]:
import gensim
import multiprocessing
import nltk
import numpy as np
import pandas as pd
import re
import sklearn
import spacy
import pickle

from gensim import corpora
import aspose.words as aw
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokenizer import Tokenizer

In [15]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

def preprocess(txt):
    txt = txt.lower()
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications
    # then these
    txt = txt.replace("'","").replace("’","") # remove apostrophes
    txt = re.sub('<.*?>',' ',txt) # remove <> tags
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', '  ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English characters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words
    txt = [token.text for token in txt if not token.is_stop]

    return ' '.join(txt)

def n_grams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def generate_list_of_skills(text):
    nlp_text = nlp(text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # all the resume skills will be saved here
    skillset = []
        
    # check for one-grams (example: python)
    for token in tokens:
        skillset.append(token)
        
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for N-grams that SpaCy missed in the noun_chuncks
    for n in range(2, 10):
        for token in n_grams(tokens, n):
            token = token.lower().strip()
            skillset.append(token)

    return skillset

def create_skills_api(skills_api_filename):
    skills_api = pd.read_excel(skills_api_filename)
    skills_api['name'] = skills_api['name'].apply(lambda x: re.sub("\W?\(.*?\)","",x))
    skills_api['name'] = skills_api['name'].apply(lambda x: x.strip().lower())
    skills_api['lemmatized_name'] = skills_api['name'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))
    skills_api = set(skills_api['name']).union(set(skills_api['lemmatized_name']))
    return skills_api

def read_resume(resume_filename):
    resume = aw.Document(resume_filename)
    resume_string = resume.to_string(aw.SaveFormat.TEXT).split('\r\n')
    resume_string = ' '.join(resume_string[1:-3])
    return resume_string

def create_courses_dataset(courses_dataset_filename):
    courses = pd.read_excel(courses_dataset_filename)
    courses = courses.fillna("")
    courses['Description'] = courses['jobFamily'] + " " \
                            + courses['Marketing Name'] + " " \
                            + courses['courseName'] + " " \
                            + courses ['moduleName'] + " " \
                            + courses['courseDesc'] + " " \
                            + courses['Outcome Description'] + " " \
                            + courses['competencyUnitDesc']
    courses = courses[['productId', 'Marketing Name', 'Description', 'jobFamily', 'competencyLevel']]
    courses['Description'] = courses['Description'].astype(str)
    courses['Description'] = courses['Description'].apply(preprocess)
    return courses

def section_break(original_resume_text):
    pattern = r'\b[A-Z]+\b'
    res = list(re.finditer(pattern, original_resume_text))
    ans = []

    if len(res) == 0:
        ans.append(original_resume_text)
    elif len(res) == 1:
        ans.append(original_resume_text[res[0].span()[1]:])
    else:
        i = 1
        while i < len(res):
            ans.append(original_resume_text[res[i-1].span()[1]:res[i].span()[0]])
            i += 1
        ans.append(original_resume_text[res[i-1].span()[1]:])
    return ans

def extract_skills(resume_text, skills_api, clean=True):
    if clean == True:
        resume_text = preprocess(resume_text)
    
    skillset = set([i for i in set([i.lower() for i in generate_list_of_skills(resume_text)])])
    return skillset.intersection(skills_api)

def date_search(resume):
    ans = []

    # find all the date occurrence based on the regular expression
    pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

def experience_tagging(date_list):
    i = 1
    cleaned_section = {}

    while i < len(date_list):

        prev = date_list[i-1]
        cur = date_list[i]
        
        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            
            # section starts at the (end index of the current date) + 1
            frm = cur[1]+1

            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                # else the section will be until the end of the resume
                until = -1
            

            # Multiple projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2

        else:

            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

def skills_experience_level_identification(resume, skills_api, clean=True):
    res = {}
    sections = section_break(resume)

    for section in sections:
        date_list = date_search(section)
        experience_sec = experience_tagging(date_list)
        for key in experience_sec:
            for start, end in experience_sec[key]:
                skills_list = extract_skills(section[start:end], skills_api, clean)
                for ele in skills_list:
                    if ele not in res:
                        res[ele] = key
                    else:
                        res[ele] = max(key, res[ele])

    # for all the skills which do not have any level of experience, assign a default value of 1
    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    return dict(sorted(res.items(), key = lambda x:x[1], reverse = True))

def find_significant_skills(agg_table):

    # gauge skill levels according to percentiles
    skills_required = {}
    for col in agg_table.columns:
        skills = agg_table[col][agg_table[col] > 0]

        # no skills extracted > skills required = empty dictionary
        if len(skills) == 0: 
            skills_required[col] = dict()
        
        # some skills extracted > skills required = some dictionary
        else:
            series = agg_table[col][agg_table[col] >= np.percentile(skills, 95)]

            # if all skills above 95th percentile have same frequency, then scale them to 1 (max)
            if len(series.unique()) == 1:
                scaled_series = series.apply(lambda x: 1)
            # if skills above 95th percentile have different frequency, then scale to values between 0 and 1
            else:
                scaled_series = series.apply(lambda x: (x - series.min()) / (series.max() - series.min()))

            # bin skills according to percentiles
            binned_series = scaled_series.apply(lambda x: 5 if x > 0.7 else 4 if x > 0.3 else 3)
            
            # convert series to dictionary form
            skills_required[col] = binned_series.to_dict()

    return skills_required

def get_significance_table(df,skills_dic):
    
    # creates a binary matrix indicating the presence of each skill
    df['Skills'] = df['ID'].apply(lambda x: list(skills_dic[x].keys()))
    mlb = MultiLabelBinarizer()
    table = pd.DataFrame(mlb.fit_transform(df['Skills']),
                         columns=mlb.classes_,
                         index=df['Skills'].index)
    
    # add category column as y
    y = df['Category']
    table['y'] = y

    # sum by category column and divide by total number of instances
    agg_table = table.groupby(['y']).sum()
    agg_table = agg_table.T / table.groupby(['y']).size()

    return agg_table

def extract_skills_frequency(job_text, skills_api, clean=True):
  if clean == True:
      job_text = preprocess(job_text)

  # get all skills
  skillset = [i.lower() for i in generate_list_of_skills(job_text)]

  # get frequency of each skill
  skillset_count = {}
  for skill in skillset:
    if skill in skills_api:
      if skill in skillset_count:
        skillset_count[skill] = skillset_count[skill] + 1
      else:
        skillset_count[skill] = 1
  
  return dict(sorted(skillset_count.items(), key = lambda x:x[1], reverse = True))

# Map years of experience to competency level
def user_level_deduction(years):
    if years <= 2:
        return 3
    elif years <= 5:
        return 4
    else:
        return 5

# return skills gap grouped by skills
def skills_gap_identification(skills, skills_required):
    diff = {}
    for key in skills_required:
        if key not in skills:
            diff[key] = [x for x in range(3, skills_required[key] + 1)]
        else:
            user_level = user_level_deduction(skills[key])
            if user_level < skills_required[key]:
                diff[key] = [x for x in range(user_level + 1, skills_required[key] + 1)]
    return diff

# group skills gap by level
def skills_gap_by_level(skills_gap):
    new_skills_gap = {}
    for skill in skills_gap:
        for level in skills_gap[skill]:
            if level in new_skills_gap:
                new_skills_gap[level].append(skill)
            else:
                new_skills_gap[level] = [skill]
    return new_skills_gap

def tagcol_paragraph_embeddings_features(train_data):

    # Expects a dataframe with a 'Description' column
    train_data_values = train_data['Description'].values
    
    # Remember to use token.text to get the raw string, otherwise doc2vec cannot build vocabulary
    columns = [TaggedDocument([token.text for token in nlp(text) if token is not token.is_stop] , [i]) for i, text in enumerate(train_data_values)]
    
    return columns

def course_suggestion_d2v(doc2vec_model, skills_gap_cand, skills_gap_jobs, courses_dataset):
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        vector = doc2vec_model.infer_vector(skills_gap_jobs[level])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
        ans['job'][level] = course_list[:5]

    for level in skills_gap_cand:
        vector = doc2vec_model.infer_vector(skills_gap_cand[level])
        res = doc2vec_model.dv.most_similar([vector], topn=20)
        course_unique = set()
        course_list = []
        for i, prob in res:
            if courses_dataset.loc[i, 'competencyLevel'][0] == str(level) and courses_dataset.loc[i, 'Marketing Name'] not in course_unique:
                course_unique.add(courses_dataset.loc[i, 'Marketing Name'])
                course_list.append((courses_dataset.loc[i, 'productId'], courses_dataset.loc[i, 'Marketing Name'], courses_dataset.loc[i, 'competencyLevel']))
        ans['continuous learning'][level] = course_list[:5]

    return ans

def course_suggestion_spacy(skills_gap_cand, skills_gap_jobs, courses_dataset):
    # load pre-trained model
    nlp = spacy.load('en_core_web_sm')
    ans = {'job': {}, 'continuous learning': {}}

    for level in skills_gap_jobs:
        skills_gap_text = " ".join(skills_gap_jobs[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
        ans['job'][level] = course_list[:5]

    for level in skills_gap_cand:
        skills_gap_text = " ".join(skills_gap_cand[level])

        # get courses of same competency level
        df = courses_dataset[courses_dataset["competencyLevel"].str.contains(str(level))]
        df = df.copy()

        # get similarity score
        df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
        top_courses = df.nlargest(20,'Similarity',keep='all')
        course_unique = set()
        course_list = []
        for index, row in top_courses.iterrows():
            if row['Marketing Name'] not in course_unique:
                course_unique.add(row['Marketing Name'])
                course_list.append((row['productId'], row['Marketing Name'], row['competencyLevel']))
        ans['continuous learning'][level] = course_list[:5]

    return ans


### 6.2 High-level Extraction for The Final Function

In [10]:
def final_course_suggestion_d2v(resume_text, skills_api, resume_skills_required_pickle, peer_industry, job_skills_required_pickle, job_name, doc2vec_model, courses_dataset, clean=True):
    """
    This function returns list of courses corresponding to different competency levels for both job requirements and continuous learning.

    Parameters
    ----------
    resume_text : string
        The extracted text from a resume.
    skills_api : set
        A set that contains all the recognized skills. Created based on EMSI skills API.
    resume_skills_required_pickle: dictionary
        A dictionary that stores all the good-to-have skills and the corresponding competency levels for each industry.
    peer_industry: string
        The industry that you want to compare yourself with. 
    job_skills_required_pickle: dictionary
        A dictionary that stores all the job specific skills and the corresponding competency levels for each role.
    job_name: string
        The particular role that applicants are applying to.
    doc2vec_model: doc2vec model
        The pretrained doc2vec model to compare the similarity between skills required and courses description
    courses_dataset: pd.DataFrame
        The dataset that contains all the information of courses. Provided by Sambaash
    clean: boolean
        The boolean that indicates whether we shall clean the text
      

    Returns
    -------
    dict (a nested dictionary that contains all the courses information)

    See Also
    --------
    final_course_suggestion_spacy : A similar method, but it used spacy pre-trained model instead of doc2vec model

    Examples
    --------
    >>> final_course_suggestion_d2v(resume_text, skills_api, resume_skills_required_pickle, 'INFORMATION-TECHNOLOGY', 
                        job_skills_required_pickle, 'Software Developers, Applications', doc2vec_model, courses_dataset, True)
    {'job': 
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
     'continuous learning':
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
    }
    """

    if peer_industry not in resume_skills_required_pickle:
        return 'Industry Not Found'
    if job_name not in job_skills_required_pickle:
        return 'Job Not Found'
    
    my_resume_skills = skills_experience_level_identification(resume_text, skills_api, clean)

    skills_gap_cand = skills_gap_by_level(skills_gap_identification(my_resume_skills, resume_skills_required_pickle[peer_industry]))
    skills_gap_jobs = skills_gap_by_level(skills_gap_identification(my_resume_skills, job_skills_required_pickle[job_name]))
    
    return course_suggestion_d2v(doc2vec_model, skills_gap_cand, skills_gap_jobs, courses_dataset)


def final_course_suggestion_spacy(resume_text, skills_api, resume_skills_required_pickle, peer_industry, job_skills_required_pickle, job_name, courses_dataset, clean=True):
    """
    This function returns list of courses corresponding to different competency levels for both job requirements and continuous learning.

    Parameters
    ----------
    resume_text : string
        The extracted text from a resume.
    skills_api : set
        A set that contains all the recognized skills. Created based on EMSI skills API.
    resume_skills_required_pickle: dictionary
        A dictionary that stores all the good-to-have skills and the corresponding competency levels for each industry.
    peer_industry: string
        The industry that you want to compare yourself with. 
    job_skills_required_pickle: dictionary
        A dictionary that stores all the job specific skills and the corresponding competency levels for each role.
    job_name: string
        The particular role that applicants are applying to.
    courses_dataset: pd.DataFrame
        The dataset that contains all the information of courses. Provided by Sambaash
    clean: boolean
        The boolean that indicates whether we shall clean the text
      

    Returns
    -------
    dict (a nested dictionary that contains all the courses information)

    See Also
    --------
    final_course_suggestion_d2v: A similar method, but it used doc2vec model instead of spacy pre-trained model

    Examples
    --------
    >>> final_course_suggestion_d2v(resume_text, skills_api, resume_skills_required_pickle, 'INFORMATION-TECHNOLOGY', 
                        job_skills_required_pickle, 'Software Developers, Applications', doc2vec_model, courses_dataset, True)
    {'job': 
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
     'continuous learning':
        {
            3: [(7203,'Innovation and Entrepreneurship Capstone','3 - Entrant Level'), ...],
            4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'), ...],
            5: [(12502, 'Cyber Security Management Capstone Project','5 - Expert Level'), ...]
        }
    }
    """
    
    if peer_industry not in resume_skills_required_pickle:
        return 'Industry Not Found'
    if job_name not in job_skills_required_pickle:
        return 'Job Not Found'
    
    my_resume_skills = skills_experience_level_identification(resume_text, skills_api, clean)

    skills_gap_cand = skills_gap_by_level(skills_gap_identification(my_resume_skills, resume_skills_required_pickle[peer_industry]))
    skills_gap_jobs = skills_gap_by_level(skills_gap_identification(my_resume_skills, job_skills_required_pickle[job_name]))
    
    return course_suggestion_spacy(skills_gap_cand, skills_gap_jobs, courses_dataset)

### 6.3 Test Run

In [12]:
%%time
skills_api = create_skills_api('all_skills_emsi.xlsx')
resume_text = read_resume('resume_5.pdf')
courses_dataset = create_courses_dataset('Courses.xlsx')

with open('job_skills_required.pkl', 'rb') as pickle_file:
    job_skills_required_pickle = pickle.load(pickle_file)

with open('resume_skills_required.pkl', 'rb') as pickle_file:
    resume_skills_required_pickle = pickle.load(pickle_file)

with open('d2v_model.pkl', 'rb') as pickle_file:
    doc2vec_model = pickle.load(pickle_file)

CPU times: total: 2min 33s
Wall time: 3min 5s


In [13]:
%%time
final_course_suggestion_d2v(resume_text, skills_api, resume_skills_required_pickle, 'INFORMATION-TECHNOLOGY', 
                        job_skills_required_pickle, 'Software Developers, Applications', doc2vec_model, courses_dataset, True)

CPU times: total: 141 ms
Wall time: 184 ms


{'job': {3: [(7203,
    'Innovation and Entrepreneurship Capstone',
    '3 - Entrant Level')],
  4: [(4602, 'Express Data Base Administrator', '4 - Specialist Level'),
   (5401, 'Express CRM (Salesforce)', '4 - Specialist Level'),
   (12706, 'Data Analysis & Visualization', '4 - Specialist Level'),
   (6506, 'Express CRM <p> (with Salesforce CRM) ', '4 - Specialist Level'),
   (1302,
    'Strategic Usage of Business Intelligence (SAP)',
    '4 - Specialist Level')],
  5: []},
 'continuous learning': {3: [(2901,
    'Capstone Project using Java (SF)',
    '3 - Entrant Level'),
   (12601, 'Digital Fluency', '3 - Entrant Level')],
  4: [(13803,
    'Applied Degree in Digital Business <p> Quest International University Perak',
    '4 - Specialist Level'),
   (12107,
    'BBA Ecommerce Management <p> from Sage University, Indore',
    '4 - Specialist Level'),
   (13722,
    'Applied Degree in Digital Business <p> Institut Teknologi Sains dan Kesehatan Sugeng Hartono',
    '4 - Specialist Le

In [16]:
%%time
final_course_suggestion_spacy(resume_text, skills_api, resume_skills_required_pickle, 'INFORMATION-TECHNOLOGY', 
                        job_skills_required_pickle, 'Software Developers, Applications', courses_dataset, True)

  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))
  df["Similarity"] = df["Description"].apply(lambda x: nlp(skills_gap_text).similarity(nlp(str(x))))


CPU times: total: 57.5 s
Wall time: 1min


{'job': {3: [(5001, 'Data Intelligence', '3 - Entrant Level'),
   (7316, 'Front End Web Development', '3 - Entrant Level'),
   (10110, 'Omnicom Sales & Marketing  ', '3 - Entrant Level'),
   (1901, 'Web Development Foundations (SF)', '3 - Entrant Level'),
   (7318, 'Web Development Foundations ', '3 - Entrant Level')],
  4: [(4401, 'Data Analytics with Hadoop', '4 - Specialist Level'),
   (9601,
    'Applied Master in Analytics & Artificial  Intelligence',
    '4 - Specialist Level'),
   (13601, 'AWS - Restart Program\xa0\xa0', '4 - Specialist Level'),
   (1306,
    'Business Intelligence Queries & Reports (SAP)',
    '4 - Specialist Level'),
   (8705, 'Innovation Implementation Insights', '4 - Specialist Level')],
  5: [(12502,
    'Cyber Security Management Capstone Project',
    '5 - Expert Level'),
   (703, 'Business Process Engineering', '5 - Expert Level'),
   (7401, 'NICF-Information System Professional (SF)', '5 - Expert Level'),
   (9502, 'Digital Work Place', '5 - Expert Leve