# Resume Parsing

## 1. Load data

In [1]:
import pandas as pd
import numpy as np

df_resume = pd.read_csv("resume.csv")

In [2]:
df_resume.Category.unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [3]:
df_resume.shape

(2484, 4)

In [5]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume = df_resume.copy().iloc[:1000, ]
df_resume.shape

(1000, 4)

## 2. Load skill data

If we define patterns for all the skill, we gonna be too tired.

So spacy knows that, so it allows you to give you a list of words, then it will automatically create pattern.

In [6]:
import spacy

nlp = spacy.load('en_core_web_md')
skill_path = 'skills_educations.jsonl'

In [7]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [8]:
doc = nlp("Chaky loves ajax.")
doc.ents

(Chaky, ajax)

## 3. Let's try to extract skills from this resume.csv

In [9]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [12]:
#clean our data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [13]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

'         HR GENERALIST       Summary     Dedicated and focused Administrative Assistant who excels at prioritizing, completing multiple tasks simultaneously and following through to achieve project goals. Seeking a role of increased responsibility and authority.       Highlights         Microsoft Of'

In [14]:
preprocessing(random_resume[:300])

'hr generalist summary dedicated focus administrative assistant excel prioritize complete multiple task simultaneously follow achieve project goal seek role increase responsibility authority highlight microsoft of'

In [15]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

  df_resume.at[i, 'Clean_resume'] = clean_text


In [16]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
456,49486820,BENEFIT ADVOCATE Career Ove...,"<div class=""fontsize fontface vmargins hmargin...",ADVOCATE,benefit advocate career overview customer serv...
1265,34319869,GLOBAL DIGITAL SERVICING ~ DIGITAL ON...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA,global digital servicing digital onboarding co...
1925,82649935,SENIOR ACCOUNTANT Summary ...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT,senior accountant summary a highly competent m...
1031,22047665,SALES MANAGER Summary Se...,"<div class=""fontsize fontface vmargins hmargin...",SALES,sales manager summary service focus profession...
1749,16911115,BIOMEDICAL ENGINEERING TECHNICIAN II ...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,biomedical engineer technician ii professional...


## 4. Let's really extract skills!!

In [52]:
from spacy.matcher import Matcher

def extract_emails(text):
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    email_pattern = [{'TEXT': {'REGEX': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'}}]
    matcher.add('EMAIL', [email_pattern])

    emails = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        emails.append(doc[start:end].text)

    return emails

In [58]:
def extract_entities(text):
    
    doc = nlp(text)
    
    skills = []
    educations = []
    
    for ent in doc.ents:
        if ent.label_ == "EDUCATION":
            educations.append(ent.text)
        if ent.label_ == "SKILL":
            skills.append(ent.text)
        
            
    skills = set(skills)
    education = set(educations)
    # Extract emails
    emails = extract_emails(text)
            
    dict1 = {'education':education,'skills':skills, 'emails': emails}
    #df = pd.DataFrame.from_dict(dict1, orient='index')
    return dict1

def unique_skills(x):
    return list(set(x))

## 7. Let's load the PDF - add some realism

In [63]:
from PyPDF2 import PdfReader
import pickle

def readPDF(cv_path, page=0):
    reader = PdfReader(cv_path)
    page = reader.pages[page]
    text = page.extract_text()
    text = preprocessing(text)
    doc = nlp(text)
    entities = extract_entities(text)
    return entities
    

In [60]:
if __name__=="__main__":
    df = readPDF('Ashmita_Phuyal_Resume.pdf')
    print(df)

{'education': {'master'}, 'skills': {'database', 'interaction', 'business intelligence', 'redis', 'latex', 'visualization', 'python', 'component', 'data science', 'computer science', 'business', 'security', 'data analysis', 'mysql', 'software', 'design', 'tableau', 'network security', 'neo4j', 'wireless', 'mongodb', 'data mining', 'data warehouse'}, 'emails': ['ashmitaphuyal9@gmail.com'], 'work_experience': ['work experience']}


In [None]:
with open('resume_entities.pkl', 'wb') as file:
        pickle.dump(entities, file)