In [29]:
!pip install pdfminer3



## Installing Spacy and loading spacy english model


In [135]:
!pip install spacy
!pip install pandas
!pip install nltk
!python3 -m spacy download en_core_web_sm
!python3 -m nltk nltk.download('words')

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 2.1 MB/s eta 0:00:01


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
zsh:1: unknown sort specifier


## Importing all the necessary librabries
### 1.  pdfminer3
#### For processing pdf, i had gone through many libraries by google searches and i found every library has its own strength and weakness.Some of them were pypdf2, pdfplumber, textract and pdfminer3. The result with the pdfminer3 was good as expected.
### 2. Spacy 
#### For extracting informations from Resume using Matcher
### 3. NLTK
#### For stopwords

In [152]:
import re
import spacy
from nltk.corpus import stopwords
import pandas as pd
from spacy.matcher import Matcher
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdullahmazhar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## File Path

In [138]:
file_path = 'Amircv.pdf'

## Extracting Text from PDF using PDFMINER

In [139]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io


def extract_text_pdf(file_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(file_path, 'rb') as fh:

        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()

    # close open handles
    converter.close()
    fake_file_handle.close()
   
    return text




## Extracting Name using Spacy Matcher

In [140]:


# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    
    matcher.add('NAME', [pattern])
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        
        return span.text

## Extracting Mobile Number using Regex

In [163]:
def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)

    if phone:
        number = ''.join(phone[0])
        
        if len(number) > 10:
            return '+' + number
        else:
            return number

## Extracting Email using Regex

In [164]:
def extract_email(text):
    email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", text)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None




## Extracting Skills

In [165]:
def extract_skills(resume_text):
    nlp_text = nlp(resume_text)
    noun_chunks = nlp_text.noun_chunks
    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    # reading txt file of skills
    data = pd.read_csv("skills.csv") 
  
    # extract values
    skills = list(data.columns)
    
    skillset = []
    # check for one-grams (example: python)
    for token in tokens:
        
        if token.lower() in skills:
            skillset.append(token)
            
    
#     # check for bi-grams and tri-grams (example: machine learning)
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    
   
    return [i.capitalize() for i in set([i.lower() for i in skillset])]



## Extracting Education using NLTK

In [166]:

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII', 'MBA'
        ]

def extract_education(text):
    nlp_text = nlp(text)

    # Sentence Tokenizer
    nlp_text = [sent.text.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education

## Main function to call every other functions

In [167]:
def resumeparser(file_path):
    resume_text = extract_text_pdf(file_path)
    name = extract_name(resume_text)
    print('1. Name:',name)
    mobile_number = extract_mobile_number(resume_text)
    print('2. Mobile Number:',mobile_number)
    email_ID = extract_email(resume_text)
    print('2. Email ID:',email_ID)
    skills = extract_skills(resume_text)
    print('3. Skills:',skills)
    education = extract_education(resume_text)
    print('4. Education:',education)
    

## Calling Main Function

In [168]:
resumeparser(file_path)

1. Name: Amir Mazhar
2. Mobile Number: 9026297309
2. Email ID: amirmazhar126@gmail.com
3. Skills: ['Nlp', 'Ml']
4. Education: [('MBA', '2019'), ('BTech', '2015')]
