# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import gensim
from gensim import corpora
import spacy
import aspose.words as aw
from spacy.tokenizer import Tokenizer

# Define Preprocessing Functions

In [3]:
# Load spaCy's small pre-trained English pipeline; every function below shares this `nlp` object.
nlp = spacy.load('en_core_web_sm')
# Replace the default tokenizer with one that is given no prefix/suffix/infix rules
# and whose token_match accepts any run of non-whitespace characters, so text is
# split on whitespace only.
# Hyphenated terms such as 'N-grams' are therefore kept whole instead of being split into 'N' and '-grams'.
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

def preprocess(txt):
    """Clean raw resume text and return it as a lemmatised, stop-word-free string.

    The noise filters (URLs, hashtags, mentions, the stand-alone tokens
    ``RT``/``cc``) run *before* lowercasing and before non-letters are blanked
    out. They rely on characters such as ``:``, ``/``, ``#`` and ``@`` and on
    the original casing, so running them afterwards would make them no-ops.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        Space-joined lemmas of the tokens that are not stop words.
    """
    txt = re.sub(r'http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub(r'#\S+', '', txt)  # remove hashtags
    txt = re.sub(r'@\S+', '  ', txt)  # remove mentions
    # Remove RT and cc only as whole words; an unanchored 'cc' would also eat
    # the letters inside words such as "success" or "account".
    txt = re.sub(r'\b(?:RT|cc)\b', ' ', txt)

    txt = txt.lower()
    # Remove non-English characters (anything that is not a letter)
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)
    txt = re.sub(r'\s+', ' ', txt)  # remove extra whitespace

    # tokenize with the shared whitespace-only spaCy pipeline
    doc = nlp(txt)

    # remove stop words and lemmatize
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    return ' '.join(lemmas)

# STOP = nltk.corpus.stopwords.words('english')

def topic_modelling(resume_text, num_words=10, min_prob = 10**-2):
    """Return the most probable words of a single-topic LDA model fitted on the text.

    Parameters
    ----------
    resume_text : str
        Text whose whitespace-separated words form the one and only document.
    num_words : int, default 10
        Maximum number of topic words to consider.
    min_prob : float, default 0.01
        Only words whose topic probability is strictly greater than this are kept.

    Returns
    -------
    list of str
        The selected topic words; an empty list when the text contains no words.
    """
    words = resume_text.split()
    if not words:
        # A corpus without any term cannot be fitted (gensim raises ValueError),
        # and there is nothing to report for it anyway.
        return []
    doc_clean = [words]

    # term dictionary
    dictionary = corpora.Dictionary(doc_clean)

    # Convert the list of documents (corpus) into a document-term matrix using the dictionary above.
    doc_term_mat = [dictionary.doc2bow(doc) for doc in doc_clean]

    # latent Dirichlet allocation model with a single topic
    Lda = gensim.models.ldamodel.LdaModel
    ldamodel = Lda(doc_term_mat, num_topics=1, id2word = dictionary, passes=50)

    # Return only the topic words whose probability exceeds min_prob
    return [token for token, prob in ldamodel.show_topic(0, topn=num_words) if prob > min_prob]

def n_grams(tokens, n):
    """Return every run of ``n`` consecutive tokens, each joined by a single space.

    The result is empty when ``tokens`` holds fewer than ``n`` items.
    """
    grams = []
    window_count = len(tokens) - n + 1
    for start in range(window_count):
        window = tokens[start:start + n]
        grams.append(' '.join(window))
    return grams

# Load all the skills from EMSI Skills API dataset

In [4]:
skills = pd.read_excel("all_skills_emsi.xlsx")
# Normalise the skill names: drop any description in round brackets (together
# with the whitespace character in front of it), then lowercase.
# The pattern is a raw string so that "\s" and "\(" are not treated as
# (invalid) Python string escapes, and the vectorised .str accessors leave a
# missing name as NaN instead of raising the way a per-row re.sub would.
skills['name'] = (
    skills['name']
    .str.replace(r"\s\(.*?\)", "", regex=True)
    .str.lower()
)
skills.head()

Unnamed: 0.1,Unnamed: 0,id,infoUrl,name,type.id,type.name
0,0,KS126XS6CQCFGC3NG79X,https://skills.emsidata.com/skills/KS126XS6CQC...,.net assemblies,ST1,Specialized Skill
1,1,ES50D03AC9CFC1A0BC93,https://skills.emsidata.com/skills/ES50D03AC9C...,.net development,ST1,Specialized Skill
2,2,KS1200B62W5ZF38RJ7TD,https://skills.emsidata.com/skills/KS1200B62W5...,.net framework,ST1,Specialized Skill
3,3,KS126XW78QJCF4TRV2X7,https://skills.emsidata.com/skills/KS126XW78QJ...,.net framework 1,ST1,Specialized Skill
4,4,KS126XY68BNKXSBSLPYS,https://skills.emsidata.com/skills/KS126XY68BN...,.net framework 3,ST1,Specialized Skill


In [5]:
# Collect the normalised skill names into a set so that the extracted
# candidates can be matched against them with a set intersection.
skills_api = set(skills['name'])

In [6]:
def extract_skills(resume_text, skills_api, clean=True):
    """Return the entries of ``skills_api`` that occur in the resume text.

    Candidate phrases are gathered from five sources: single tokens, spaCy
    noun chunks, bigrams, trigrams and the LDA topic words. They are
    lowercased, de-duplicated and intersected with ``skills_api``.

    Parameters
    ----------
    resume_text : str
        Text to search.
    skills_api : set of str
        Known skill names.
    clean : bool, default True
        When True, the text is first passed through ``preprocess``.

    Returns
    -------
    set of str
        Candidates that are also present in ``skills_api``.
    """
    text = preprocess(resume_text) if clean == True else resume_text
    doc = nlp(text)

    # word tokens with stop words removed
    words = [tok.text for tok in doc if not tok.is_stop]

    candidates = set()

    # one-grams (example: python); a word that is tagged VBN when parsed on its own is skipped
    candidates.update(word.lower() for word in words if nlp(word)[0].tag_ != 'VBN')

    # noun chunks (example: machine learning)
    candidates.update(chunk.text.lower().strip() for chunk in doc.noun_chunks)

    # bigrams and trigrams that spaCy missed in the noun chunks
    for size in (2, 3):
        candidates.update(gram.lower().strip() for gram in n_grams(words, size))

    # LDA topic modelling result
    candidates.update(topic.lower() for topic in topic_modelling(text))

    return candidates.intersection(skills_api)

# Read any resume

In [7]:
resume = aw.Document("34740556.pdf")
# Export the document as plain text, split it on CRLF line breaks, drop the
# first line and the last three (presumably boilerplate added by the
# converter - TODO confirm) and join the remaining lines with single spaces.
resume_string = ' '.join(
    resume.to_string(aw.SaveFormat.TEXT).split('\r\n')[1:-3]
)

In [8]:
# Preview the first 1000 characters of the extracted resume text.
resume_string[:1000]

'SENIOR HR BUSINESS PARTNER Summary Human Resources Professional with 8 years of experience in human resources and recruiting. Expertise in Employee Relations and Recruiting. Highly driven to achieve company goals. Highlights · HUMAN RESOURCES MANAGER · Extensive background in HR Business Partner affairs, including experience in, staff development, mediation, conflict resolution, benefits and compensation, HR records management, HR policies development and legal compliance. · Demonstrated success in negotiating win-win compromises, developing teambuilding programs, and writing policies, job descriptions and management reports. · HR SKILLS · Employment Law · FMLA/ADA/EEO/WC · HR Policies & Procedures *Staff Recruitment & Retention · Employee Relations · Benefits Administration *Orientation & On-Boarding  Training & Development  Organizational Development  MS Office (Word, Excel, PowerPoint, Access, Outlook) Experience Senior HR Business Partner  August 2013 to Current Company Name ï¼ Ci

# Extract Skills

In [9]:
# Extract the skills from the whole resume; clean=True runs preprocess() on the text first.
extract_skills(resume_text=resume_string, skills_api=skills_api, clean=True)

{'ada',
 'business administration',
 'coaching',
 'conflict resolution',
 'environmental quality',
 'labor law',
 'management',
 'mediation',
 'organizational development',
 'policy development',
 'poultry',
 'reduction',
 'session',
 'strong work ethic',
 'teamwork',
 'workplace safety'}

However, this treats every skill as if the candidate had the same level of experience with it.

We are going to identify each section of the resume by its date range, then label every skill found within a section with the years of experience computed from that section's date range.

# Date range identification

In [13]:
# Map all the possible cases of date format in a resume.
# Date formats include:
# Aug 2022 (every month from January to December, abbreviated or in full)
# 05/2023
# Present
# Current
# 05 / 2023
# Aug / 2023
# 03-2023
# 03 - 2023
pattern = (
    r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|June?|July?|Aug(ust)?'
    r'|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)'
    r'|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})'
    r'\s?[-/ ]?\s?\d{4})'
    r'|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'
)

def date_search(resume):
    """Locate the dates mentioned in a resume.

    Parameters
    ----------
    resume : str
        Resume text.

    Returns
    -------
    list of [int, int, datetime.date]
        One ``[match_start_index, match_end_index, date]`` entry per accepted
        match, in order of appearance. "Present"/"Current" are mapped to
        today's date. The list is empty when the pattern matches fewer than
        two times.
    """
    matches = list(re.finditer(pattern, resume))
    if len(matches) <= 1:
        return []

    # "present" and "current" both stand for today's date
    today = pd.to_datetime('today').date()

    ans = []
    for match in matches:
        text = match.group().strip()
        # A match of five characters or fewer is a year without a month; skip it
        if len(text) <= 5:
            continue
        if text.lower() in ('present', 'current'):
            day = today
        else:
            try:
                day = pd.to_datetime(text).date()
            except ValueError:
                # The pattern is permissive (it can pick up e.g. "- 2013");
                # a match that cannot be parsed is not a usable date, so it
                # is skipped rather than aborting the whole search.
                continue
        ans.append([match.start(), match.end(), day])

    return ans

# Find every date in the resume and display the [start_index, end_index, date] entries.
date_list = date_search(resume_string)
date_list

[[958, 969, datetime.date(2013, 8, 1)],
 [973, 980, datetime.date(2023, 2, 14)],
 [1066, 1080, datetime.date(2013, 9, 1)],
 [1851, 1858, datetime.date(2023, 2, 14)],
 [1969, 1980, datetime.date(2010, 8, 1)],
 [1984, 1995, datetime.date(2013, 8, 1)],
 [3608, 3619, datetime.date(2008, 8, 1)],
 [3623, 3634, datetime.date(2010, 8, 1)]]

In [17]:
def experience_tagging(date_list):
    """Group resume sections by the years of experience of their date range.

    Two consecutive entries of ``date_list`` are treated as one date range
    when the second starts fewer than 10 characters after the first ends.
    The section belonging to a range starts one character after the range
    and runs up to the start of the next date entry, or to -1 when no
    further date follows.

    Parameters
    ----------
    date_list : list of [int, int, datetime.date]
        ``[start_index, end_index, date]`` entries in order of appearance.

    Returns
    -------
    dict
        Maps ``days // 365 + 1`` of each range to a list of
        ``(section_start, section_end)`` index pairs.
    """
    sections = {}
    total = len(date_list)
    idx = 1
    while idx < total:
        earlier, later = date_list[idx - 1], date_list[idx]

        if later[0] >= earlier[1] + 10:
            # Too far apart to be the two ends of one range; ignore the
            # earlier date and move on by one entry.
            idx += 1
            continue

        # Whole years between the two dates, plus one
        years = (later[2] - earlier[2]).days // 365 + 1

        # The section ends where the next date begins, if there is one
        section_end = date_list[idx + 1][0] if idx + 1 < total else -1

        # Multiple sections with the same years of experience are chained in one list
        sections.setdefault(years, []).append((later[1] + 1, section_end))

        # Both dates of the range are consumed
        idx += 2

    return sections

# Display the {years_of_experience: [(section_start, section_end), ...]} mapping for this resume.
experience_tagging(date_list)

{10: [(981, 1066)], 4: [(1996, 3608)], 3: [(3635, -1)]}

In [19]:
def skills_experience_level_identification(resume):
    """Map every skill found in a resume to a number of years of experience.

    Skills found inside a dated section get that section's years of
    experience; a skill found in several sections keeps the largest value.
    Skills that only turn up when the whole resume is scanned get 1.

    Parameters
    ----------
    resume : str
        Resume text.

    Returns
    -------
    dict
        ``{skill: years_of_experience}``; empty for a blank resume.
    """
    res = {}
    if not resume.strip():
        # Nothing to analyse
        return res

    date_list = date_search(resume)
    experience_sec = experience_tagging(date_list)

    for years, spans in experience_sec.items():
        for start, end in spans:
            # experience_tagging marks "runs to the end of the resume" with -1;
            # slicing with -1 would silently drop the final character.
            section = resume[start:] if end is None or end == -1 else resume[start:end]
            if not section.strip():
                # Back-to-back date ranges leave an empty section: there is
                # nothing to extract, and the LDA step cannot fit an empty document.
                continue
            for skill in extract_skills(section, skills_api, True):
                res[skill] = max(years, res.get(skill, years))

    # Skills outside every dated section default to one year
    for skill in extract_skills(resume, skills_api, True):
        res.setdefault(skill, 1)

    return res


# Display the {skill: years_of_experience} mapping for this resume.
skills_experience_level_identification(resume_string)

{'reduction': 4,
 'environmental quality': 4,
 'teamwork': 4,
 'management': 4,
 'poultry': 4,
 'strong work ethic': 4,
 'business administration': 3,
 'coaching': 3,
 'conflict resolution': 3,
 'workplace safety': 3,
 'mediation': 3,
 'organizational development': 3,
 'session': 3,
 'ada': 3,
 'labor law': 1,
 'policy development': 1}