In [1]:
from nltk.corpus import stopwords
class constants:
    '''
    Shared lookup tables and regular-expression fragments used by the
    resume parsing helpers.
    '''
    # spaCy Matcher pattern: two consecutive proper nouns (e.g. Omkar Pathak)
    NAME_PATTERN = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]

    # Education (Upper Case Mandatory)
    EDUCATION = [
                'BE', 'B.E.', 'B.E', 'BS', 'B.S', 'ME', 'M.E',
                'M.E.', 'MS', 'M.S', 'BTECH', 'MTECH',
                'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
            ]

    NOT_ALPHA_NUMERIC = r'[^a-zA-Z\d]'

    NUMBER = r'\d+'

    # For finding date ranges.
    # The month patterns are assembled from adjacent string literals so that
    # no newline or indentation ends up inside the regex: they are not
    # compiled with re.VERBOSE, so any whitespace would be matched literally.
    MONTHS_SHORT = (
        r'(jan)|(feb)|(mar)|(apr)|(may)|(jun)|(jul)'
        r'|(aug)|(sep)|(oct)|(nov)|(dec)'
    )
    MONTHS_LONG = (
        r'(january)|(february)|(march)|(april)|(may)|(june)|(july)|'
        r'(august)|(september)|(october)|(november)|(december)'
    )
    MONTH = r'(' + MONTHS_SHORT + r'|' + MONTHS_LONG + r')'
    # Four-digit year starting with 19 or 20
    YEAR = r'(((20|19)(\d{2})))'

    # English stop words from the NLTK corpus (read when the class is defined)
    STOPWORDS = set(stopwords.words('english'))

    # Section headings looked for in resumes of professionals
    RESUME_SECTIONS_PROFESSIONAL = [
                        'experience',
                        'education',
                        'interests',
                        'professional experience',
                        'publications',
                        'skills',
                        'certifications',
                        'objective',
                        'career objective',
                        'summary',
                        'leadership'
                    ]

    # Section headings looked for in resumes of graduates / undergraduates
    RESUME_SECTIONS_GRAD = [
                        'accomplishments',
                        'experience',
                        'education',
                        'interests',
                        'projects',
                        'professional experience',
                        'publications',
                        'skills',
                        'certifications',
                        'objective',
                        'career objective',
                        'summary',
                        'leadership'
                    ]


In [2]:
# Author: Omkar Pathak

import io
import os
import re
import nltk
import pandas as pd
import docx2txt
from datetime import datetime
from dateutil import relativedelta

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Module-level instance of `constants`; the helpers below read the shared
# patterns and section lists through it (e.g. `cs.RESUME_SECTIONS_GRAD`).
cs = constants()

class Utils():
    def extract_text_from_pdf(pdf_path):
        '''
        Helper function to extract the plain text from .pdf files

        :param pdf_path: path to PDF file to be extracted (remote or local)
        :return: iterator of string of extracted text
        '''
        # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
        if not isinstance(pdf_path, io.BytesIO):
            # extract text from local pdf file
            with open(pdf_path, 'rb') as fh:
                try:
                    for page in PDFPage.get_pages(
                            fh,
                            caching=True,
                            check_extractable=True
                    ):
                        resource_manager = PDFResourceManager()
                        fake_file_handle = io.StringIO()
                        converter = TextConverter(
                            resource_manager,
                            fake_file_handle,
                            codec='utf-8',
                            laparams=LAParams()
                        )
                        page_interpreter = PDFPageInterpreter(
                            resource_manager,
                            converter
                        )
                        page_interpreter.process_page(page)

                        text = fake_file_handle.getvalue()
                        yield text

                        # close open handles
                        converter.close()
                        fake_file_handle.close()
                except PDFSyntaxError:
                    return
        else:
            # extract text from remote pdf file
            try:
                for page in PDFPage.get_pages(
                        pdf_path,
                        caching=True,
                        check_extractable=True
                ):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(
                        resource_manager,
                        fake_file_handle,
                        codec='utf-8',
                        laparams=LAParams()
                    )
                    page_interpreter = PDFPageInterpreter(
                        resource_manager,
                        converter
                    )
                    page_interpreter.process_page(page)

                    text = fake_file_handle.getvalue()
                    yield text

                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return


    def get_number_of_pages(file_name):
        try:
            if isinstance(file_name, io.BytesIO):
                # for remote pdf file
                count = 0
                for page in PDFPage.get_pages(
                            file_name,
                            caching=True,
                            check_extractable=True
                ):
                    count += 1
                return count
            else:
                # for local pdf file
                if file_name.endswith('.pdf'):
                    count = 0
                    with open(file_name, 'rb') as fh:
                        for page in PDFPage.get_pages(
                                fh,
                                caching=True,
                                check_extractable=True
                        ):
                            count += 1
                    return count
                else:
                    return None
        except PDFSyntaxError:
            return None


    @staticmethod
    def extract_text_from_docx(doc_path):
        '''
        Helper function to extract plain text from .docx files

        :param doc_path: path to .docx file to be extracted
        :return: string of extracted text with tabs replaced by spaces and
                 non-empty lines joined by single spaces; a single space
                 when `docx2txt` raises `KeyError`
        '''
        # Only the docx2txt call is guarded; the string clean-up below
        # cannot raise KeyError.
        try:
            temp = docx2txt.process(doc_path)
        except KeyError:
            return ' '
        text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(text)


    def extract_text_from_doc(doc_path):
        '''
        Helper function to extract plain text from .doc files

        :param doc_path: path to .doc file to be extracted
        :return: string of extracted text
        '''
        try:
            try:
                import textract
            except ImportError:
                return ' '
            text = textract.process(doc_path).decode('utf-8')
            return text
        except KeyError:
            return ' '


    def extract_text(file_path, extension):
        '''
        Wrapper function to detect the file extension and call text
        extraction function accordingly

        :param file_path: path of file of which text is to be extracted
        :param extension: extension of file `file_name`
        '''
        text = ''
        if extension == '.pdf':
            for page in extract_text_from_pdf(file_path):
                text += ' ' + page
        elif extension == '.docx':
            text = extract_text_from_docx(file_path)
        elif extension == '.doc':
            text = extract_text_from_doc(file_path)
        return text


    @staticmethod
    def extract_entity_sections_grad(text):
        '''
        Helper function to extract all the raw text from sections of
        resume specifically for graduates and undergraduates

        A line starts a section when one of its whitespace-separated,
        lower-cased words equals an entry of `cs.RESUME_SECTIONS_GRAD`;
        multi-word entries therefore never match. Every following
        non-empty line is stored under that section until the next
        heading. A heading that occurs again resets its list.

        :param text: Raw text of resume
        :return: dictionary mapping section name to its list of lines
        '''
        sections = cs.RESUME_SECTIONS_GRAD
        entities = {}
        current = None
        for phrase in (line.strip() for line in text.split('\n')):
            words = set(phrase.lower().split())
            # When several section names occur on one line, take the first
            # in the order of the constant list so the result is stable.
            heading = next(
                (section for section in sections if section in words),
                None
            )
            if heading is not None:
                entities[heading] = []
                current = heading
            elif current is not None and phrase:
                entities[current].append(phrase)
        return entities


    def extract_entities_wih_custom_model(custom_nlp_text):
        '''
        Helper function to extract different entities with custom
        trained model using SpaCy's NER

        :param custom_nlp_text: object of `spacy.tokens.doc.Doc`
        :return: dictionary of entities
        '''
        entities = {}
        for ent in custom_nlp_text.ents:
            if ent.label_ not in entities.keys():
                entities[ent.label_] = [ent.text]
            else:
                entities[ent.label_].append(ent.text)
        for key in entities.keys():
            entities[key] = list(set(entities[key]))
        return entities


    def get_total_experience(experience_list):
        '''
        Wrapper function to extract total months of experience from a resume

        :param experience_list: list of experience text extracted
        :return: total months of experience
        '''
        exp_ = []
        for line in experience_list:
            experience = re.search(
                r'(?P<fmonth>\w+.\d+)\s*(\D|to)\s*(?P<smonth>\w+.\d+|present)',
                line,
                re.I
            )
            if experience:
                exp_.append(experience.groups())
        total_exp = sum(
            [get_number_of_months_from_dates(i[0], i[2]) for i in exp_]
        )
        total_experience_in_months = total_exp
        return total_experience_in_months


    def get_number_of_months_from_dates(date1, date2):
        '''
        Helper function to extract total months of experience from a resume

        :param date1: Starting date
        :param date2: Ending date
        :return: months of experience from date1 to date2
        '''
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%b %Y')
        try:
            if len(date1.split()[0]) > 3:
                date1 = date1.split()
                date1 = date1[0][:3] + ' ' + date1[1]
            if len(date2.split()[0]) > 3:
                date2 = date2.split()
                date2 = date2[0][:3] + ' ' + date2[1]
        except IndexError:
            return 0
        try:
            date1 = datetime.strptime(str(date1), '%b %Y')
            date2 = datetime.strptime(str(date2), '%b %Y')
            months_of_experience = relativedelta.relativedelta(date2, date1)
            months_of_experience = (months_of_experience.years
                                    * 12 + months_of_experience.months)
        except ValueError:
            return 0
        return months_of_experience


    @staticmethod
    def extract_entity_sections_professional(text):
        '''
        Helper function to extract all the raw text from sections of
        resume specifically for professionals

        A line starts a section when one of its whitespace-separated,
        lower-cased words equals an entry of
        `cs.RESUME_SECTIONS_PROFESSIONAL`; multi-word entries therefore
        never match. Every following non-empty line is stored under that
        section until the next heading. A heading that occurs again
        resets its list.

        :param text: Raw text of resume
        :return: dictionary mapping section name to its list of lines
        '''
        sections = cs.RESUME_SECTIONS_PROFESSIONAL
        entities = {}
        current = None
        for phrase in (line.strip() for line in text.split('\n')):
            words = set(phrase.lower().split())
            # When several section names occur on one line, take the first
            # in the order of the constant list so the result is stable.
            heading = next(
                (section for section in sections if section in words),
                None
            )
            if heading is not None:
                entities[heading] = []
                current = heading
            elif current is not None and phrase:
                entities[current].append(phrase)
        return entities


    def extract_email(text):
        '''
        Helper function to extract email id from text

        :param text: plain text extracted from resume file
        '''
        email = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", text)
        if email:
            try:
                return email[0].split()[0].strip(';')
            except IndexError:
                return None


    @staticmethod
    def extract_name(nlp_text, matcher):
        '''
        Helper function to extract name from spacy nlp text

        :param nlp_text: object of `spacy.tokens.doc.Doc`
        :param matcher: object of `spacy.matcher.Matcher`; the `NAME`
                        pattern is registered on it by this call
        :return: text of the first match that does not contain the word
                 `name` (case-insensitive), or None when nothing matches
        '''
        # NOTE(review): this is the positional (key, on_match, *patterns)
        # form of Matcher.add - confirm it matches the installed spaCy
        # version.
        matcher.add('NAME', None, cs.NAME_PATTERN)

        for _, start, end in matcher(nlp_text):
            span = nlp_text[start:end]
            # skip matches such as a "Name ..." label in front of the name
            if 'name' not in span.text.lower():
                return span.text
        return None


    def extract_mobile_number(text, custom_regex=None):
        '''
        Helper function to extract mobile number from text

        :param text: plain text extracted from resume file
        :return: string of extracted mobile numbers
        '''
        # Found this complicated regex on :
        # https://zapier.com/blog/extract-links-email-phone-regex/
        # mob_num_regex = r'''(?:(?:\+?([1-9]|[0-9][0-9]|
        #     [0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|
        #     [2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|
        #     [0-9]1[02-9]|[2-9][02-8]1|
        #     [2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|
        #     [2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{7})
        #     (?:\s*(?:#|x\.?|ext\.?|
        #     extension)\s*(\d+))?'''
        if not custom_regex:
            mob_num_regex = r'''(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)
                            [-\.\s]*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'''
            phone = re.findall(re.compile(mob_num_regex), text)
        else:
            phone = re.findall(re.compile(custom_regex), text)
        if phone:
            number = ''.join(phone[0])
            return number


    def extract_skills(nlp_text, noun_chunks, skills_file=None):
        '''
        Helper function to extract skills from spacy nlp text

        :param nlp_text: object of `spacy.tokens.doc.Doc`
        :param noun_chunks: noun chunks extracted from nlp text
        :return: list of skills extracted
        '''
        tokens = [token.text for token in nlp_text if not token.is_stop]
        if not skills_file:
            data = pd.read_csv(
                os.path.join(os.path.dirname(__file__), 'skills.csv')
            )
        else:
            data = pd.read_csv(skills_file)
        skills = list(data.columns.values)
        skillset = []
        # check for one-grams
        for token in tokens:
            if token.lower() in skills:
                skillset.append(token)

        # check for bi-grams and tri-grams
        for token in noun_chunks:
            token = token.text.lower().strip()
            if token in skills:
                skillset.append(token)
        return [i.capitalize() for i in set([i.lower() for i in skillset])]


    def cleanup(token, lower=True):
        if lower:
            token = token.lower()
        return token.strip()


    @staticmethod
    def extract_education(nlp_text):
        '''
        Helper function to extract education from resume sentences

        :param nlp_text: indexable sequence of sentence strings (every
                         item is split on whitespace)
        :return: list with one entry per degree found: a
                 `(degree, year)` tuple when a year occurs in the
                 degree's sentence or the one after it, else the degree
        '''
        edu = {}
        # Extract education degree
        for index, text in enumerate(nlp_text):
            # The year is searched in this sentence plus the next one; the
            # last sentence has no successor and is used on its own.
            if index + 1 < len(nlp_text):
                context = text + nlp_text[index + 1]
            else:
                context = text
            for tex in text.split():
                tex = re.sub(r'[?|$|.|!|,]', r'', tex)
                if tex.upper() in cs.EDUCATION and tex not in cs.STOPWORDS:
                    edu[tex] = context

        # Extract year
        year_regex = re.compile(cs.YEAR)
        education = []
        for degree, context in edu.items():
            year = year_regex.search(context)
            if year:
                education.append((degree, year.group(0)))
            else:
                education.append(degree)
        return education


    @staticmethod
    def extract_experience(resume_text):
        '''
        Helper function to extract experience from resume text

        :param resume_text: Plain resume text
        :return: list of strings: for every chunk of two or more
                 consecutive proper nouns that contains the word
                 `experience`, the text following that word
        '''
        wordnet_lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        # word tokenization
        word_tokens = nltk.word_tokenize(resume_text)

        # remove stop words, also when only the lemma is a stop word
        filtered_sentence = [
            w for w in word_tokens
            if w not in stop_words
            and wordnet_lemmatizer.lemmatize(w) not in stop_words
        ]
        sent = nltk.pos_tag(filtered_sentence)

        # chunk runs of proper nouns; the tree gets its own name so it does
        # not shadow the module-level `cs` constants instance
        chunk_parser = nltk.RegexpParser('P: {<NNP>+}')
        tree = chunk_parser.parse(sent)

        phrases = []
        for subtree in tree.subtrees(filter=lambda x: x.label() == 'P'):
            leaves = subtree.leaves()
            # single-word chunks are ignored
            if len(leaves) >= 2:
                phrases.append(' '.join(word for word, _ in leaves))

        # Search the word 'experience' in the chunk and
        # then keep the text after it
        marker = 'experience'
        return [
            phrase[phrase.lower().index(marker) + len(marker):]
            for phrase in phrases
            if marker in phrase.lower()
        ]


In [None]:
# Author: Omkar Pathak

import os
import multiprocessing as mp
import io
import spacy
import pprint
from spacy.matcher import Matcher


# Shared `Utils` instance through which `ResumeParser` calls the helpers.
utils = Utils()

class ResumeParser(object):
    '''
    Parse one resume and collect the extracted fields in a dictionary
    that is returned by `get_extracted_data`.
    '''

    def __init__(
        self,
        resume,
        skills_file=None,
        custom_regex=None
    ):
        '''
        :param resume: path to the resume file, or an `io.BytesIO` object
                       that has a `name` attribute carrying the file name
        :param skills_file: optional CSV of skills handed to
                            `extract_skills`
        :param custom_regex: optional phone number pattern handed to
                             `extract_mobile_number`
        '''
        nlp = spacy.load('en_core_web_sm')
        # NOTE(review): the custom model is loaded from the directory of
        # this file; `__file__` is not defined when the code runs as a
        # notebook cell - confirm where the model lives.
        custom_nlp = spacy.load(os.path.dirname(os.path.abspath(__file__)))
        self.__skills_file = skills_file
        self.__custom_regex = custom_regex
        self.__matcher = Matcher(nlp.vocab)
        self.__details = {
            'name': None,
            'email': None,
            'mobile_number': None,
            'skills': None,
            'college_name': None,
            'degree': None,
            'designation': None,
            'experience': None,
            'company_names': None,
            'no_of_pages': None,
            'total_experience': None,
        }
        self.__resume = resume
        if not isinstance(self.__resume, io.BytesIO):
            file_name = self.__resume
        else:
            file_name = self.__resume.name
        # splitext takes the text after the last dot, so names with several
        # dots work and a name without extension yields '' instead of
        # raising IndexError; lower-casing lets '.PDF' match as well
        ext = os.path.splitext(file_name)[1].lower()
        self.__text_raw = utils.extract_text(self.__resume, ext)
        # collapse every run of whitespace into a single space
        self.__text = ' '.join(self.__text_raw.split())
        self.__nlp = nlp(self.__text)
        self.__custom_nlp = custom_nlp(self.__text_raw)
        self.__noun_chunks = list(self.__nlp.noun_chunks)
        self.__get_basic_details()

    def get_extracted_data(self):
        '''
        :return: dictionary of the details extracted from the resume
        '''
        return self.__details

    def __get_basic_details(self):
        '''
        Run every extractor on the resume text and fill the details
        dictionary; fields that are not found keep the value None
        (`total_experience` becomes 0).
        '''
        cust_ent = utils.extract_entities_wih_custom_model(
                            self.__custom_nlp
                        )
        name = utils.extract_name(self.__nlp, matcher=self.__matcher)
        email = utils.extract_email(self.__text)
        mobile = utils.extract_mobile_number(self.__text, self.__custom_regex)
        skills = utils.extract_skills(
                    self.__nlp,
                    self.__noun_chunks,
                    self.__skills_file
                )
        entities = utils.extract_entity_sections_grad(self.__text_raw)

        # extract name: prefer the custom model, fall back to the matcher
        try:
            self.__details['name'] = cust_ent['Name'][0]
        except (IndexError, KeyError):
            self.__details['name'] = name

        # extract email
        self.__details['email'] = email

        # extract mobile number
        self.__details['mobile_number'] = mobile

        # extract skills
        self.__details['skills'] = skills

        # extract college name
        # The section dictionary only has lower-case section names as keys,
        # so 'College Name' can never be found there.
        # NOTE(review): presumably 'College Name' is a label of the custom
        # model like 'Degree' and 'Designation' - confirm against the model.
        self.__details['college_name'] = cust_ent.get(
            'College Name',
            entities.get('College Name')
        )

        # extract education Degree
        self.__details['degree'] = cust_ent.get('Degree')

        # extract designation
        self.__details['designation'] = cust_ent.get('Designation')

        # extract company names
        self.__details['company_names'] = cust_ent.get('Companies worked at')

        # extract experience section and total experience in years
        experience = entities.get('experience')
        if experience is None:
            self.__details['total_experience'] = 0
        else:
            self.__details['experience'] = experience
            self.__details['total_experience'] = round(
                utils.get_total_experience(experience) / 12,
                2
            )

        self.__details['no_of_pages'] = utils.get_number_of_pages(
                                            self.__resume
                                        )


def resume_result_wrapper(resume):
    '''
    Parse a single resume and hand back its extracted details.

    :param resume: resume passed unchanged to `ResumeParser`
    :return: result of `ResumeParser.get_extracted_data`
    '''
    return ResumeParser(resume).get_extracted_data()


if __name__ == '__main__':
    # The batch driver below is commented out; `pass` gives the guard a
    # body, because an `if` followed only by comments is a syntax error.
    pass
#     pool = mp.Pool(mp.cpu_count())

#     resumes = []
#     data = []
#     for root, directories, filenames in os.walk('C:\\Users\\SAKSOFT093.SAKSOFT093PDC\\Documents\\pyresparser-master\\pyresparser-master\\resumes'):
#         for filename in filenames:
#             file = os.path.join(root, filename)
#             resumes.append(file)

#     results = [
#         pool.apply_async(
#             resume_result_wrapper,
#             args=(x,)
#         ) for x in resumes
#     ]

#     results = [p.get() for p in results]

#     pprint.pprint(results)


In [None]:
# Example usage: parse one resume and keep the extracted details in `data`.
# NOTE: this import rebinds the name `ResumeParser` to the class from the
# `pyresparser` package, replacing any class of that name defined earlier.
from pyresparser import ResumeParser
data = ResumeParser('C:\\Users\\SAKSOFT093.SAKSOFT093PDC\\Documents\\pyresparser-master\\pyresparser-master\\OmkarResume.pdf').get_extracted_data()

In [None]:
{'college_name': None,
 'company_names': None,
 'degree': ['B.E. IN COMPUTER ENGINEERING'],
 'designation': ['TECHNICAL CONTENT WRITER',
                 'Machine Learning',
                 'Schlumberger\nDATA ENGINEER'],
 'email': 'omkarpathak27@gmail.com',
 'experience': ['Schlumberger',
                'DATA ENGINEER',
                'July 2018 - Present',
                '• Responsible for implementing and managing an end-to-end '
                'CI/CD Pipeline with custom validations for Informatica '
                'migrations which',
                'Pune, Maharashtra, India',
                'brought migration time to 1.5 hours from 9 hours without any '
                'manual intervention',
                '• Enhancing, auditing and maintaining custom data ingestion '
                'framework that ingest around 1TB of data each day to over 70 '
                'business',
                'units',
                '• Working with L3 developer team to ensure the discussed '
                'Scrum PBI’s are delivered on time for data ingestions',
                '• Planning and Executing QA and Production Release Cycle '
                'activities',
                'Pune, Maharashtra, India',
                'June 2018 - July 2018',
                'Truso',
                'FULL STACK DEVELOPER INTERN',
                '• Created RESTful apis',
                '• Tried my hands on Angular 5/6',
                '• Was responsible for Django backend development',
                'Propeluss',
                'DATA ENGINEERING INTERN',
                '• Wrote various automation scripts to scrape data from '
                'various websites.',
                '• Applied Natural Language Processing to articles scraped '
                'from the internet to extract different entities in these '
                'articles using entity',
                'Pune, Maharashtra, India',
                'October 2017 - January 2018',
                'extraction algorithms and applying Machine Learning to '
                'classify these articles.',
                '• Also applied KNN with LSA for extracting relevant tags for '
                'various startups based on their works.',
                'GeeksForGeeks',
                'TECHNICAL CONTENT WRITER',
                '• Published 4 articles for the topics such as Data Structures '
                'and Algorithms and Python',
                'Pune, Maharashtra, India',
                'July 2017 - September 2017',
                'Softtestlab Technologies',
                'WEB DEVELOPER INTERN',
                '• Was responsible for creating an internal project for the '
                'company using PHP and Laravel for testing purposes',
                '• Worked on a live project for creating closure reports using '
                'PHP and Excel',
                'Pune, Maharashtra, India',
                'June 2017 - July 2017'],
 'mobile_number': '8087996634',
 'name': '| \uf0e0',
 'no_of_pages': 3,
 'skills': ['Php',
            'Cloud',
            'Reports',
            'Linux',
            'Training',
            'Windows',
            'Engineering',
            'System',
            'Parser',
            'Content',
            'Unix',
            'Writing',
            'Excel',
            'Website',
            'Machine learning',
            'Github',
            'Operating systems',
            'Programming',
            'Css',
            'C',
            'Api',
            'Photography',
            'Flask',
            'Opencv',
            'Scrum',
            'Shell',
            'Technical',
            'Javascript',
            'Django',
            'Mysql',
            'Testing',
            'Python',
            'Html',
            'Migration',
            'Security',
            'Auditing',
            'Algorithms',
            'C++',
            'Apis',
            'Automation',
            'Analytics'],
 'total_experience': 4.0}