In [1]:
import pandas as pd
import PyPDF2
from PIL import Image 
import pytesseract
from pdf2image import convert_from_path
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
def textPyPDF2(filename, count):
    
    pdfFileObj = open(filename,'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)#The pdfReader variable is a readable object that will be parsed
    num_pages = pdfReader.numPages
    
                          #The while loop will read each page
    pageObj = pdfReader.getPage(count)
    text = pageObj.extractText()
        
    return text.lower()

In [3]:
def textPytesseract(filename, count):
    
    #convert PDF to images
    # Store the page of the PDF in a variable 
    page = convert_from_path(filename, 500, first_page=count, last_page=count) 
    
    # Declaring filename for each page of PDF as JPG 
    # For each page, filename will be: 
    # PDF page n -> page.jpg 
    filename = "page"+".jpg"

    # Save the image of the page in system 
    page[0].save(filename, 'JPEG') 
    
    #Recognizing text from the images using OCR
    # Recognize the text as string in image using pytesseract 
    text = str(((pytesseract.image_to_string(Image.open(filename))))) 

    # The recognized text is stored in variable text 
    # Any string processing may be applied on text 
    # Here, basic formatting has been done: 
    # In many PDFs, at line ending, if a word can't 
    # be written fully, a 'hyphen' is added. 
    # The rest of the word is written in the next line 
    # To remove this, we replace every '-\n' to ''. 
    text = text.replace('-\n', '')
    
    return text.lower()

In [4]:
def listkeywords(text):
    
    tokens = word_tokenize(text)
    punctuations = ['(',')',';',':','[',']',',','.','-','/']
    stop_words = stopwords.words('english')
    keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
    
    return keywords

In [5]:
def count_keyword_matches(text, word):
    
    text = text.lower()
    word = word.lower()
    match = re.findall(word, text)
    
    return len(match)

In [6]:
def rankpages(filename, func, words):
    
    pdfFileObj = open(filename,'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)#The pdfReader variable is a readable object that will be parsed
    num_pages = pdfReader.numPages
    
    df = pd.DataFrame([i for i in range(1, num_pages+1)],columns=['Page'])
    
    for word in words:
        count = 0
        word_count = []
        while count < num_pages:  #The while loop will read each page

            if func == "PyPDF2":
                text = textPyPDF2(filename, count)
            if func == "OCR":
                text = textPytesseract(filename, count)

            keywords = listkeywords(text)
            
            #word_count.append(keywords.count(word))
            word_count.append(count_keyword_matches(text, word))
            
            count +=1
        df[word] = word_count
            
    return df

In [7]:
df = rankpages("Lesson-42.pdf", "PyPDF2", ["embedded", "objective"])



In [8]:
total = df.iloc[:, 1:].sum(axis=1)
df["total_count"] = total
print (df.sort_values("total_count"))

    Page  embedded  objective  total_count
4      5         0          0            0
5      6         0          0            0
7      8         0          0            0
8      9         0          0            0
9     10         0          0            0
10    11         0          0            0
11    12         0          0            0
0      1         1          0            1
1      2         1          0            1
6      7         2          0            2
3      4         7          0            7
2      3         9          1           10


In [9]:
%%timeit
df = rankpages("Lesson-42.pdf", "PyPDF2", ["embedded", "objective"])



695 ms ± 34.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%time
df = rankpages("Lesson-42.pdf", "OCR", ["embedded", "objective"])

CPU times: user 25.7 s, sys: 3.82 s, total: 29.5 s
Wall time: 2min 37s


In [11]:
#text not extracted from a train ticket using pypdf2 but pytesseract does
text = textPyPDF2("kupdf.net_irctc-ticket-format.pdf", 0)
print (text)




In [12]:
text = textPytesseract("kupdf.net_irctc-ticket-format.pdf", 0)
print (text)

irctcs e-ticketing service

electronic reservation sli personal user

 

. this ticket will only be valid with an id proof in original. if found travelling without id proof, passenger will
be treated as without ticket and charged as per extant railway rules.

. at least one passenger should travel with his/her id card in original which is indicated on the ers/vrm. in
case he/she is not travelling, all other passenger(s) booked on that ticket, if found travelling in train will be
treated as travelling without ticket and charged accordingly.

. valid ids to be presented during train journey by one of the passenger booked on an e-ticket :- voter
identity card / passport / pan card / driving license/ photo id card issued by central / state govt /
public sector undertakings of state / central government ,district administrations , muncipal bodies and
panchayat administrations which are having serial number/ student identity card with photograph issued
by recognized school or college for the