# Building a Resume Parser using NLP(Spacy) and Machine Learning
## by Bhushan Sonawane

In [1]:
import docx2txt
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger, PdfReader

In [2]:
#Extracting text from docx
def doctotext(m):
    temp = docx2txt.process(m)
    resume_text = [line.replace('\t', '') for line in temp.split('\n') if line]
    text = ''.join(resume_text)
    return(text)

In [3]:
#Extracting text from PDF
def pdftotext(m):
    pdfFileObj = open(m, 'rb')
    pdfFileReader = PdfReader(pdfFileObj)
    num_pages = len(pdfFileReader.pages)
    currentPageNumber = 0
    text = ''
    while(currentPageNumber < num_pages):
        pdfPage = pdfFileReader.pages[currentPageNumber]
        text = text + pdfPage.extract_text()
        currentPageNumber += 1
    return (text)

In [4]:
if __name__ == '__main__':
    FilePath = 'C:/Users/dell/Downloads/Stockholm-Resume-Template-Simple.pdf'
    FilePath.lower().endswith(('png','docx'))
    if FilePath.endswith('.docx'):
        textinput = doctotext(FilePath)
    elif FilePath.endswith('.pdf'):
        textinput = pdftotext(FilePath)
    else:
        print('File Not Supported')

In [5]:
import re
import nltk

In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

### Extracting Name

In [7]:
import spacy

In [8]:
import en_core_web_sm
from spacy.matcher import Matcher

In [9]:
# load pre-trained model
nlp = en_core_web_sm.load()
# initialize matcher
matcher = Matcher(nlp.vocab)

In [10]:
def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    pattern = [{'POS': 'PROPN'},{'POS': 'PROPN'}]
    matcher.add('NAME', [pattern])
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text
print('Name: ', extract_name(textinput))

Name:  Jason Miller


### Extracting Qualifications

In [11]:
STOPWORDS = set(stopwords.words('english'))

In [12]:
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E.', 'M.E', 'M.B.A', 'MBA', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'SSLC', 'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
]

In [13]:
def extract_education(resume_text):
    nlp_text = nlp(resume_text)
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]
    edu = {}
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]
                
    #Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{})))'), edu[key])
        if year:
            education.append((key, '', join(year[0])))
        else:
            education.append(key)
    return education

In [14]:
print('Qualifications: ', extract_education(textinput))

Qualifications:  []


### Extracting Skills

In [15]:
import pandas as pd
import spacy

In [16]:
nlp = spacy.load('en_core_web_sm')

In [None]:
def extract_skills(resume_text):
    nlp_text = nlp(resume_text)
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]
    
    #removing stopwords and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    colnames = ['skill']
    
    #extract values
    skills = textinput.skill.tolist()
    print(skills)
    skillset = []
    
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
            
    noun_chunks = nlp.noun_chunks
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    return [i.capitalize() for i in set([i.lower() for i in skillset])]

print('Skills', extract_skills(textinput))

### Extract Mobile Number

In [18]:
def extract_mobile_number(resume_text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'),resume_text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return number
        else:
            return number
print('Mobile Number: ', extract_mobile_number(textinput))

Mobile Number:  386868344


### Extract email

In [19]:
def extract_email_address(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)
print('Mail id: ', extract_email_address(textinput))

Mail id:  ['email@email.com']
