In [1]:
import pandas as pd
import PyPDF2
from PIL import Image 
import pytesseract
from pdf2image import convert_from_path
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

In [2]:
def textPytesseract(filename, dpi=500):
    """OCR every page of a PDF and return the combined text, lowercased.

    Each page is rendered to a PIL image with ``convert_from_path`` and that
    image is handed directly to ``pytesseract.image_to_string``.  No
    intermediate ``page_n.jpg`` files are written, so nothing is left behind
    in the working directory, no image file handles stay open, and the OCR
    input is not put through a lossy JPEG round trip.

    Parameters
    ----------
    filename : str
        Path of the PDF to read.
    dpi : int, optional
        Resolution passed to ``convert_from_path`` when rendering pages.
        Defaults to 500, the value that used to be hard-coded.

    Returns
    -------
    str
        The recognised text of all pages, concatenated in page order and
        lowercased.  Within each page, a hyphen immediately followed by a
        newline is removed so words broken across lines are rejoined.
    """
    # One PIL image per PDF page.
    pages = convert_from_path(filename, dpi)

    page_texts = []
    for page in pages:
        # Recognise the text of this page directly from the in-memory image.
        page_text = str(pytesseract.image_to_string(page))

        # In many PDFs a word that does not fit on a line is split with a
        # trailing hyphen; dropping '-\n' glues the two halves back together.
        # This is done per page, before concatenation.
        page_texts.append(page_text.replace('-\n', ''))

    return ''.join(page_texts).lower()

In [3]:
def extract_phone_numbers(text):
    """Find phone-number-like substrings in ``text`` and return their digits.

    Three shapes are recognised, tried in this order at each position:
    ten digits in 3-3-4 groups, a parenthesised three-digit prefix followed
    by 3-4 groups, and seven digits in 3-4 groups.  Groups may be separated
    by at most one hyphen, dot or whitespace character.

    Returns a list of strings, one per match in order of appearance, with
    every non-digit character removed.
    """
    phone_pattern = r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
    strip_non_digits = re.compile(r'\D').sub

    digits_only = []
    for candidate in re.findall(phone_pattern, text):
        # Keep just the digits: separators and parentheses are discarded.
        digits_only.append(strip_non_digits('', candidate))
    return digits_only

In [4]:
def extract_email_addresses(text):
    """Return the e-mail-like substrings found in ``text``.

    A candidate is a run of word characters, dots and hyphens, an ``@``, and
    another such run.  Because the domain part accepts dots and hyphens, an
    address at the end of a sentence would otherwise carry the sentence's
    punctuation with it (``"a@b.com."``); trailing dots and hyphens are
    therefore stripped from every match.  A match that has nothing left
    after the ``@`` once stripped is discarded.

    Returns a list of strings in order of appearance.
    """
    candidates = re.findall(r'[\w\.-]+@[\w\.-]+', text)

    addresses = []
    for candidate in candidates:
        # A domain cannot end in '.' or '-'; those are surrounding punctuation.
        address = candidate.rstrip('.-')
        if address.endswith('@'):
            # Only punctuation followed the '@' - not an address.
            continue
        addresses.append(address)
    return addresses

In [5]:
def ie_preprocess(text):
    """Turn raw text into POS-tagged sentences for information extraction.

    Steps, in order:
      1. split ``text`` on whitespace and drop every token that exactly
         equals an entry of NLTK's English stop-word list (the comparison is
         case-sensitive);
      2. re-join the remaining tokens with single spaces;
      3. split the result into sentences, tokenize each sentence into words
         and POS-tag the words.

    Returns a list with one entry per sentence; each entry is whatever
    ``nltk.pos_tag`` returns for that sentence's tokens.
    """
    # A set makes each membership test constant-time; the list returned by
    # stopwords.words() would be scanned linearly once per token.
    stop_words = set(stopwords.words('english'))

    document = ' '.join(
        token for token in text.split() if token not in stop_words
    )

    tagged_sentences = []
    for sentence in nltk.sent_tokenize(document):
        tokens = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences

In [6]:
def extract_names(text):
    """Collect the PERSON entities that NLTK's chunker finds in ``text``.

    The text is first run through ``ie_preprocess``; each tagged sentence is
    then passed to ``nltk.ne_chunk``.  Every resulting subtree labelled
    ``'PERSON'`` contributes one name, built by joining the first element of
    each of its leaves with single spaces.

    Returns a list of name strings in order of appearance.
    """
    return [
        ' '.join(leaf[0] for leaf in subtree)
        for tagged in ie_preprocess(text)
        for subtree in nltk.ne_chunk(tagged)
        # Plain (word, tag) tuples are tokens outside any entity; only Tree
        # nodes carry a label.
        if type(subtree) == nltk.tree.Tree and subtree.label() == 'PERSON'
    ]

In [7]:
# OCR the resume once, then run each extractor over the same text.
RESUME_PDF = 'aditya bhartia (copy).pdf'
text = textPytesseract(RESUME_PDF)
phone, email = extract_phone_numbers(text), extract_email_addresses(text)
names = extract_names(text)

In [8]:
# Show the three extraction results, one per line.
# Need to train with Indian names: the names list comes back empty here.
# NOTE(review): textPytesseract returns text.lower(); the chunker may rely on
# capitalisation, so the lowercasing could be the real cause - confirm.
print(phone, email, names, sep='\n')

['9820929220']
['adityabhartia@yahoo.com']
[]


In [9]:
# Print the full OCR text returned by textPytesseract.
print(text)

aditya bhartia

g-3, tandon apartment, charat singh colony, andheri (e), mumbai
ph: +91 9820929220, e-mail: adityabhartia@yahoo.com

 

professional experience
noble group/clear capital, mumbai june 2007 — till date

equity research analyst at noble group, a uk-based investment bank specializing in small and mid—cap
equities. noble’s clients include some offhe uk ’s top institutional investors such as scottish widows, f idelily,

gartmore, aberforth, and l&g.

- held joint responsibility with the team leader for analysing the uk support services sector, which
encompasses a wide array of business models like equipment rental, accident management, social housing
and engineering consultancy.

- involved in all stages of equity research, starting with company meetings and developing sophisticated
earning models and ending with writing research notes and servicing clients through roadshows, meetings,
and ad-hoc projects.

- prepared thematic industry notes on the uk rental industry and the 

In [10]:
from collections import Counter

import en_core_web_sm
import spacy
from spacy import displacy

# Load the en_core_web_sm model once; `nlp` is reused by the cells below.
nlp = en_core_web_sm.load()

In [11]:
# Run the loaded model (`nlp`) over the OCR text; the resulting `doc` exposes
# the recognised entities through `doc.ents`, which the next cells inspect.
doc = nlp(text)

In [12]:
# List every recognised entity as a (text, label) pair.
print([(ent.text, ent.label_) for ent in doc.ents])

[('aditya bhartia', 'PERSON'), ('charat singh colony', 'PERSON'), ('mumbai', 'GPE'), ('june 2007', 'DATE'), ('uk', 'GPE'), ('uk', 'GPE'), ('scottish', 'NORP'), ('l&g', 'ORG'), ('uk', 'GPE'), ('the uk rental industry', 'ORG'), ('s&p', 'ORG'), ('2005', 'DATE'), ('may 2007', 'DATE'), ('european', 'NORP'), ('first', 'ORDINAL'), ('quarterly', 'DATE'), ('lochan & c0', 'ORG'), ('new delhi', 'GPE'), ('2003', 'DATE'), ('2005', 'DATE'), ('p. k', 'PERSON'), ('narula & c0', 'ORG'), ('new delhi', 'GPE'), ('2002', 'DATE'), ('2003', 'DATE'), ('may 2005', 'DATE'), ('21st', 'ORDINAL'), ('india', 'GPE'), ('l9th', 'ORG'), ('india', 'GPE'), ('shri ram college of commerce', 'ORG'), ('delhi', 'GPE'), ('76%', 'PERCENT'), ('2004', 'DATE'), ('2nd', 'ORDINAL'), ('delhi university', 'ORG'), ('lst year', 'PERSON'), ('ufj foundations', 'GPE'), ('srcc alumni', 'PERSON'), ('xii)', 'ORG'), ('delhi', 'GPE'), ('92%', 'PERCENT'), ('xii', 'PERSON'), ('secured lst rank', 'PERSON'), ('r. s. asiads 2000', 'GPE'), ('mahavir'

In [13]:
# Print only the entities labelled as organisations, one per line.
for ent in doc.ents:
    if ent.label_ != 'ORG':
        continue
    print(ent.text)

l&g
the uk rental industry
s&p
lochan & c0
narula & c0
l9th
shri ram college of commerce
delhi university
xii)
mahavir
xxxxii national school games
