# Resume Parsing

## 1. Load data

In [2]:
import pandas as pd
import numpy as np

df_resume = pd.read_csv("resume.csv")

In [3]:
df_resume.Category.unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [4]:
df_resume.shape

(2484, 4)

In [5]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume = df_resume.copy().iloc[:1000, ]
df_resume.shape

(1000, 4)

## 2. Load skill data


In [6]:
import spacy

nlp = spacy.load('en_core_web_md')
skill_path = 'skills_educations.jsonl'

In [7]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [8]:
doc = nlp("Chaky loves ajax.")
doc.ents

(Chaky, ajax)

## 3. Let's try to extract skills from this resume.csv

In [9]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
2094,37375999,OWNER Summary Dynamic eve...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS
2009,12666174,REGIONAL SCHEDULE MANAGER S...,"<div class=""fontsize fontface vmargins hmargin...",CONSTRUCTION
496,37560528,DOMESTIC VIOLENCE COUNSELOR ADVOCATE ...,"<div class=""fontsize fontface vmargins hmargin...",ADVOCATE
1653,22249155,EMBROIDERY MACHINE OPERATOR Sum...,"<div class=""fontsize fontface vmargins hmargin...",APPAREL
1097,34131484,SALES ASSOCIATE Summary ...,"<div class=""fontsize fontface vmargins hmargin...",SALES


In [16]:
#clean our data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [17]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

'         PATIENT ADVOCATE           Summary    Seeking an opportunity in an Management/HR Department\xa0where my professional\xa0experience and education will allow me to make an immediate contribution, as an integral part of a progressive organization.      Education and Training        HealthCare Admini'

In [12]:
preprocessing(random_resume[:300])

'patient advocate summary seek opportunity management hr department professional experience education allow immediate contribution integral progressive organization education training healthcare admini'

In [13]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

  df_resume.at[i, 'Clean_resume'] = clean_text


In [16]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
456,49486820,BENEFIT ADVOCATE Career Ove...,"<div class=""fontsize fontface vmargins hmargin...",ADVOCATE,benefit advocate career overview customer serv...
1265,34319869,GLOBAL DIGITAL SERVICING ~ DIGITAL ON...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA,global digital servicing digital onboarding co...
1925,82649935,SENIOR ACCOUNTANT Summary ...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT,senior accountant summary a highly competent m...
1031,22047665,SALES MANAGER Summary Se...,"<div class=""fontsize fontface vmargins hmargin...",SALES,sales manager summary service focus profession...
1749,16911115,BIOMEDICAL ENGINEERING TECHNICIAN II ...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,biomedical engineer technician ii professional...


## 4. Let's really extract skills!!

In [18]:
from spacy.matcher import Matcher

def extract_emails(text):
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    email_pattern = [{'TEXT': {'REGEX': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'}}]
    matcher.add('EMAIL', [email_pattern])

    emails = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        emails.append(doc[start:end].text)

    return emails

In [19]:
def extract_entities(text):
    
    doc = nlp(text)
    
    skills = []
    educations = []
    
    for ent in doc.ents:
        if ent.label_ == "EDUCATION":
            educations.append(ent.text)
        if ent.label_ == "SKILL":
            skills.append(ent.text)
        
            
    skills = set(skills)
    education = set(educations)
    # Extract emails
    emails = extract_emails(text)
            
    dict1 = {'education':education,'skills':skills, 'emails': emails}
    #df = pd.DataFrame.from_dict(dict1, orient='index')
    return dict1

def unique_skills(x):
    return list(set(x))

## 7. Let's load the PDF - add some realism

In [20]:
from PyPDF2 import PdfReader
import pickle

def readPDF(cv_path, page=0):
    reader = PdfReader(cv_path)
    page = reader.pages[page]
    text = page.extract_text()
    text = preprocessing(text)
    doc = nlp(text)
    entities = extract_entities(text)
    return entities
    

In [21]:
if __name__=="__main__":
    df = readPDF('Ashmita_Phuyal_Resume.pdf')
    print(df)

{'education': {'master'}, 'skills': {'wireless', 'database', 'python', 'redis', 'security', 'interaction', 'latex', 'tableau', 'computer science', 'component', 'data science', 'visualization', 'software', 'business', 'network security', 'data analysis', 'data warehouse', 'mongodb', 'business intelligence', 'neo4j', 'design', 'data mining', 'mysql'}, 'emails': ['ashmitaphuyal9@gmail.com']}
