# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import gensim
from gensim import corpora
import spacy
import aspose.words as aw
from spacy.tokenizer import Tokenizer
import pickle

# Define Preprocessing Functions

In [2]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
# Tokenize words only with the whitespace rule
# N-grams will no longer be treated as 'N' and '-grams'
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)

def preprocess(txt):
    txt = txt.lower()
    
    # these must come first
    txt = re.sub('b\.\S*', '', txt) # remove all bachelor qualifications
    txt = re.sub('m\.\S*', '', txt) # remove all master qualifications


    txt = re.sub("'", "", txt) # Remove apostrophe
    txt = re.sub("’", "", txt) # Remove apostrophe
    txt = re.sub('http\S+\s*', ' ', txt)  # remove URLs
    txt = re.sub('RT|cc', ' ', txt)  # remove RT and cc
    txt = re.sub('#\S+', '', txt)  # remove hashtags
    txt = re.sub('@\S+', ' ', txt)  # remove mentions
    txt = re.sub('[^a-zA-Z]', ' ', txt) # Remove non-English charcters
    txt = re.sub('\s+', ' ', txt)  # remove extra whitespace

    # tokenize word
    txt = nlp(txt)

    # remove stop words and lemmatization
    txt = [token.lemma_ for token in txt if not token.is_stop]

    return ' '.join(txt)

# STOP = nltk.corpus.stopwords.words('english')

# Suggestion:
# def topic_modelling(resume_text, num_words=10, min_prob = 10**-2):
#     doc_clean = [resume_text.split()]   
    
#     # term dictionary
#     dictionary = corpora.Dictionary(doc_clean)

#     # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
#     doc_term_mat = [dictionary.doc2bow(doc) for doc in doc_clean]

#     # latent dirichlet allocation model 
#     Lda = gensim.models.ldamodel.LdaModel

#     ldamodel = Lda(doc_term_mat, num_topics=1, id2word = dictionary, passes=50)
    
#     # Return only the topic words that have the probability of larger than .01
#     return [token for token, prob in ldamodel.show_topic(0, topn=num_words) if prob > min_prob ]

def n_grams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

# Load all the skills from EMSI Skills API dataset

In [3]:
skills = pd.read_excel("all_skills_emsi.xlsx")
# remove all the additional descriptions in round brackets
skills['name'] = skills['name'].apply(lambda x: re.sub("\W\(.*?\)","",x))
skills['name'] = skills['name'].apply(lambda x: x.lower())
skills.head()

Unnamed: 0.1,Unnamed: 0,id,infoUrl,name,type.id,type.name
0,0,KS126XS6CQCFGC3NG79X,https://skills.emsidata.com/skills/KS126XS6CQC...,.net assemblies,ST1,Specialized Skill
1,1,ES50D03AC9CFC1A0BC93,https://skills.emsidata.com/skills/ES50D03AC9C...,.net development,ST1,Specialized Skill
2,2,KS1200B62W5ZF38RJ7TD,https://skills.emsidata.com/skills/KS1200B62W5...,.net framework,ST1,Specialized Skill
3,3,KS126XW78QJCF4TRV2X7,https://skills.emsidata.com/skills/KS126XW78QJ...,.net framework 1,ST1,Specialized Skill
4,4,KS126XY68BNKXSBSLPYS,https://skills.emsidata.com/skills/KS126XY68BN...,.net framework 3,ST1,Specialized Skill


In [4]:
# Convert all the skills into a big hashset
skills_api = set(skills['name'])

In [19]:
def extract_skills(resume_text, skills_api, clean=True):
    if clean == True:
        resume_text = preprocess(resume_text)
        
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # all the resume skills will be saved here
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if nlp(token)[0].tag_ != 'VBN':
            skillset.append(token)
    
    # check for noun_chunks (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        skillset.append(token)

    # check for bigrams that SpaCy missed in the noun_chuncks
    for token in n_grams(tokens, 2):
        token = token.lower().strip()
        skillset.append(token)

    # check for trigrams that SpaCy missed in the noun_chuncks
    for token in n_grams(tokens, 3):
        token = token.lower().strip()
        skillset.append(token)
    
    # get only the unique skills in lowercase
    skillset = set([i for i in set([i.lower() for i in skillset])])

    return skillset.intersection(skills_api)

# Read any resume

In [6]:
resume = aw.Document("31605080.pdf")
resume_string = resume.to_string(aw.SaveFormat.TEXT).split('\r\n')
resume_string = ' '.join(resume_string[1:-3])

In [9]:
resume_string

'GEEK SQUAD AGENT Professional Profile IT support specialist with experience across multiple disciplines including technical support, customer service, computer repair, and military service. I am hard working, willing to learn, team oriented, and comfortable working independently as well. Qualifications Windows / Mac / IOS / Android Technical Support Hardware & Software Maintenance User Training Malware Detection & Removal Customer Service Entry Level Active Directory & Ticketing Problem Solving & Research Experience Company Name August 2014 to October 2016 Geek Squad Agent City , State · Provided technical support in person and over the phone. · Performed hardware and software installation and repair. · Refurbished and setup PCs and peripheral devices. Company Name January 2013 to January 2014 Shipping & Receiving Associate City , State · Performed shipping and receiving of product. · Assisted with inventory management. · General logistics and warehouse duties. Company Name January 20

# Extract Skills

In [10]:
extract_skills(resume_text=resume_string, skills_api=skills_api, clean=True)

{'active directory',
 'android',
 'aviation',
 'computer repair',
 'customer service',
 'information technology',
 'installation',
 'inventory management',
 'logistics',
 'malware detection',
 'management',
 'marshalling',
 'research',
 'software installation',
 'software maintenance',
 'technical support',
 'troubleshooting'}

However, this is actually assuming all the skills have the same level of experience.

We are going to identify each section using the date range in resume, then label all the skills within the section with the year of experience computed from the date range in resume. 

# Date range identification

In [11]:
# Map all the possible cases of date format in resume
# Date formats include:
# Aug 2022
# 05/2023
# Present
# Current
# 05 / 2023
# Aug / 2023
# 03-2023
# 03 - 2023
pattern = r'(((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|June?|July?|Aug(ust)?|Sep(tember)?|Nov(ember)?|Dec(ember)?)|(\d{1,2}\s?\/){0,2}|(\d{1,2}\s?\-){0,2})\s?[-/ ]?\s?\d{4}?)|\bPresent\b|\bpresent\b|\bCurrent\b|\bcurrent\b'

def date_search(resume):
    
    ans = []

    # find all the date occurrence based on the regular expression
    res = list(re.finditer(pattern, resume))

    if len(res) > 1:
        for ele in res:
            # this is to eradicate the results of having only year but without month
            if len(ele.group().strip()) > 5:
                ans.append([ele.start(), ele.end(), ele.group().strip()])

    res = []
    # Convert "present" and "current" to today's date
    for ele in ans:
        if ele[2].lower() == 'present' or ele[2].lower() == 'current':
            today = pd.to_datetime('today').date()
            ele[2] = today
            res.append(ele)
        else:
            # catch DateParse Error here
            try:
                day = pd.to_datetime(ele[2]).date()
                ele[2] = day
                res.append(ele)
            except :
                print('Cannot parse the date: ', ele[2])

    
    # all the date results are given in the form of [datetime_start_index, datetime_end_index, datetime]
    return res

date_list = date_search(resume_string)
date_list

[[535, 546, datetime.date(2014, 8, 1)],
 [776, 788, datetime.date(2013, 1, 1)],
 [792, 804, datetime.date(2014, 1, 1)],
 [989, 1001, datetime.date(2009, 1, 1)],
 [1005, 1017, datetime.date(2012, 1, 1)],
 [1232, 1244, datetime.date(2005, 1, 1)],
 [1248, 1260, datetime.date(2009, 1, 1)]]

In [12]:
def experience_tagging(date_list):
    i = 1
    cleaned_section = {}
    while i < len(date_list):
        cur = date_list[i]
        prev = date_list[i-1]

        if cur[0] < prev[1] + 10:
            # Taking ceiling of the year of experience
            key = ((cur[2] - prev[2]).days // 365) + 1
            if i < len(date_list) - 1:
                # if there is another date that appears later, then the section will be until the start index of the next date
                until = date_list[i+1][0]
            else:
                until = -1
            frm = cur[1]+1

            # Multiplie projects with same year of experience, we do 'chaining' here
            if key in cleaned_section:
                cleaned_section[key].append((frm, until))
            else:
                cleaned_section[key] = [(frm, until)]
            i += 2
        else:
            # ignore the current date, possibly it is useless
            i += 1

    return cleaned_section

experience_tagging(date_list)

{2: [(805, 989)], 4: [(1018, 1232)], 5: [(1261, -1)]}

In [13]:
def skills_experience_level_identification(resume):
    res = {}
    date_list = date_search(resume)
    experience_sec = experience_tagging(date_list)

    for key in experience_sec:
        for start, end in experience_sec[key]:
            skills_list = extract_skills(resume[start: end], skills_api, True)
            for ele in skills_list:
                if ele not in res:
                    res[ele] = key
                else:
                    res[ele] = max(key, res[ele])

    skills_list = extract_skills(resume, skills_api, True)
    for ele in skills_list:
        if ele not in res:
            res[ele] = 1

    return res


skills_experience_level_identification(resume_string)

{'management': 5,
 'inventory management': 5,
 'installation': 5,
 'troubleshooting': 5,
 'logistics': 5,
 'customer service': 5,
 'marshalling': 5,
 'software installation': 5,
 'active directory': 5,
 'technical support': 5,
 'aviation': 5,
 'information technology': 5,
 'research': 5,
 'computer repair': 1,
 'software maintenance': 1,
 'android': 1,
 'malware detection': 1}

# Identify skills experience on the dataset

In [14]:
data = pd.read_csv('Resume.csv')
data['skills_experience'] = np.nan
data

Unnamed: 0,ID,Resume_str,Resume_html,Category,skills_experience
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,
...,...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,


In [20]:
experience_dict = {}
for i in range(data.shape[0]):
    print('Currently running: ' + str(i))
    experience_dict[data.loc[i, 'ID']] = skills_experience_level_identification(data.loc[i, 'Resume_str'])

Currently running: 0
Currently running: 1
Currently running: 2
Currently running: 3
Currently running: 4
Currently running: 5
Currently running: 6
Currently running: 7
Currently running: 8
Currently running: 9
Currently running: 10
Currently running: 11
Currently running: 12
Currently running: 13
Currently running: 14
Currently running: 15
Currently running: 16
Currently running: 17
Currently running: 18
Currently running: 19
Currently running: 20
Currently running: 21
Currently running: 22
Currently running: 23
Currently running: 24
Currently running: 25
Currently running: 26
Currently running: 27
Currently running: 28
Currently running: 29
Currently running: 30
Currently running: 31
Currently running: 32
Currently running: 33
Currently running: 34
Currently running: 35
Currently running: 36
Currently running: 37
Currently running: 38
Currently running: 39
Currently running: 40
Currently running: 41
Currently running: 42
Currently running: 43
Currently running: 44
Currently running: 4

  day = pd.to_datetime(ele[2]).date()


Currently running: 992
Currently running: 993
Currently running: 994
Currently running: 995
Currently running: 996
Currently running: 997
Currently running: 998
Currently running: 999
Currently running: 1000
Currently running: 1001
Currently running: 1002
Currently running: 1003
Currently running: 1004
Currently running: 1005
Currently running: 1006
Currently running: 1007
Currently running: 1008
Cannot parse the date:  - 4035
Currently running: 1009
Currently running: 1010
Currently running: 1011
Currently running: 1012
Currently running: 1013
Currently running: 1014
Currently running: 1015
Currently running: 1016
Currently running: 1017
Currently running: 1018
Currently running: 1019
Currently running: 1020
Currently running: 1021
Currently running: 1022
Currently running: 1023
Currently running: 1024
Currently running: 1025
Currently running: 1026
Currently running: 1027
Currently running: 1028
Currently running: 1029
Currently running: 1030
Currently running: 1031
Currently running

In [21]:
skills_file = open('skills.pkl', 'wb')
pickle.dump(experience_dict, skills_file)
skills_file.close()

In [None]:
with open('<directory>', 'rb') as pickle_file:
    content = pickle.load(pickle_file)