In [1]:
!pip install pdfminer.six
!pip install docx2txt
!pip install spacy
!pip install nltk



In [2]:
#python -m spacy download en_core_web_sm
#python -m nltk nltk.download('words')

In [3]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import spacy 
from spacy.matcher import Matcher
import re
import pandas as pd
from nltk.corpus import stopwords
import os
import io
import nltk
import docx2txt

In [4]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()

In [5]:
def extract_text_from_doc(doc_path):
    '''
    Helper function to extract plain text from .doc or .docx files
    :param doc_path: path to .doc or .docx file to be extracted
    :return: string of extracted text
    '''
    temp = docx2txt.process(doc_path)
    text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    return ' '.join(text)

In [6]:
# Rule based matching to find
nlp = spacy.load('en_core_web_sm')

# initalize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]]
    
    matcher.add('NAME', None, *pattern)
    
    print(matcher)
    print(pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text
#As you can observe above, we have first defined a pattern that we want 
# to search in our text. Here, we have created a simple pattern based
# on the fact that First Name and Last Name of a person is always a
# Proper Noun. Hence we have specified spacy that searches for a 
# pattern such that two continuous words whose part of speech tag 
# is equal to PROPN (Proper Noun).

### Third Step: Extracting Phone Numbers

For extracting phone numbers, we will be making use of regular expressions. Phone numbers also have multiple forms such as (+91) 1234567890 or +911234567890 or +91 123 456 7890 or +91 1234567890. Hence, we need to define a generic regular expression that can match all similar combinations of phone numbers. Thanks to this blog, I was able to extract phone numbers from resume text by making slight tweaks.

Our phone number extraction function will be as follows:

In [7]:
def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return '+' + number
        else:
            return number

### Fourth Step: Extracting Email

For extracting Email IDs from resume, we can use a similar approach that we used for extracting mobile numbers. Email IDs have a fixed form i.e. an alphanumeric string should follow a @ symbol, again followed by a string, followed by a . (dot) and a string at the end. We can use regular expression to extract such expression from text.

In [8]:
def extract_email(email):
    email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None

### Fifth Step: Extracting Skills
Now that we have extracted some basic information about the person, lets extract the thing that matters the most from a recruiter point of view, i.e. skills. We can extract skills using a technique called tokenization. Tokenization simply is breaking down of text into paragraphs, paragraphs into sentences, sentences into words. Hence, there are two major techniques of tokenization: Sentence Tokenization and Word Tokenization.

Before implementing tokenization, we will have to create a dataset against which we can compare the skills in a particular resume. For this we will make a comma separated values file (.csv) with desired skillsets. For example, if I am the recruiter and I am looking for a candidate with skills including NLP, ML, AI then I can make a csv file with contents:

Assuming we gave the above file, a name as skills.csv, we can move further to tokenize our extracted text and compare the skills against the ones in skills.csv file. For reading csv file, we will be using the pandas module. After reading the file, we will removing all the stop words from our resume text. In short, a stop word is a word which does not change the meaning of the sentence even if it is removed

In [37]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')
def extract_skills(resume_text):
    '''
    Helper function to extract skills from spacy nlp text
    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :param noun_chunks: noun chunks extracted from nlp text
    :return: list of skills extracted
    '''
    nlp_text = nlp(resume_text)
    noun_chunks = nlp_text.noun_chunks
    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # Reading the csv file
    data = pd.read_csv("skills.csv")
    
    # extract values
    skills = list(data.columns.values)
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    # check for bi-grams and tri-grams (example: machine learning)
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
        
    return [i.capitalize() for i in set([i.lower() for i in skillset])]

In [38]:
print(extract_skills(resume_text))

['Javascript', 'Design', 'Pandas', 'Software engineering', 'Mobile', 'Analytics', 'R', 'Tensorflow', 'Prototype', 'Python', 'Js', 'Css', 'Ui', 'Algorithms', 'System', 'Docker', 'Analyze', 'Transportation', 'Html', 'Github', 'Json', 'Programming', 'Linux', 'C', 'Statistics', 'Jira', 'Java', 'Architecture', 'Numpy', 'Automation', 'Cloud', 'Research', 'Vmware', 'Engineering', 'Computer science', 'Rest', 'Spark', 'Sql']


### Sixth Step: Extracting Education:
Now, moving towards the last step of our resume parser, we will be extracting the candidates education details. The details that we will be specifically extracting are the degree and the year of passing. For example, XYZ has completed MS in 2018, then we will be extracting a tuple like ('MS', '2018'). For this we will be requiring to discard all the stop words. We will be using nltk module to load an entire list of stopwords and later on discard those from our resume text.

Recruiters are very specific about the minimum education/degree required for a particular job. Hence, we will be preparing a list EDUCATION that will specify all the equivalent degrees that are as per requirements.

In [18]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII', 'Bachelors of Science'
        ]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education

In [19]:
def extract_text(file_path, extension):
    '''
    Wrapper function to detect the file extension and call text extraction function accordingly
    :param file_path: path of file of which text is to be extracted
    :param extension: extension of file `file_name`
    '''
    text = ''
    if extension == '.pdf':
        for page in extract_text_from_pdf(file_path):
            text += ' ' + page
    elif extension == '.docx' or extension == '.doc':
        text = extract_text_from_doc(file_path)
    return text

In [20]:
resume = "TimothyNguyen2022.pdf"
resume_text = extract_text(resume, os.path.splitext(resume)[1])
#ans = extract_text_from_pdf(pdf_path)

In [21]:
print(extract_education(resume_text))
print(extract_mobile_number(resume_text))
print(extract_name(resume_text))

[]
6178883076
<spacy.matcher.matcher.Matcher object at 0x0000026E1F541348>
[[{'POS': 'PROPN'}, {'POS': 'PROPN'}]]
Timothy Nguyen


In [14]:
print(resume_text)

 Timothy Nguyen
quynhthoa1972@gmail.com · 617-888-3076 · www.linkedin.com/in/timothy-nguyen-414525144 · https://timothynguyen.github.io/TimothyNguyen
EDUCATION
University of Massachusetts, Amherst
Bachelor of Science in Computer Science, Statistics & Data Science
GPA: 3.835
Relevant Coursework: Algorithms, Data Structures, Software Engineering, Scalable Web Systems, Computer Network Theory,
Databases, Information Retrieval, Data Science, Natural Language Processing, Artificial Intelligence

Amherst, MA
Graduation: May 2022

WORK EXPERIENCE
MathWorks
Incoming Software Engineering Intern

Sept 2021 - Dec 2021

May 2021 - Aug 2021

● Modernizing & Developing JavaScript Web Components for MathWorks products with HTML, CSS, JS & React.

Dell Technologies
Software Engineering Intern for the Edge Solutions Business Unit

● Collaborated to prototype edge-computing platforms with Litmus Edge and VMware to optimize the manufacturing industry.
● Capture, analyze, visualize, and manage industrial 