In [1]:
import pandas as pd
import PyPDF2
import fitz
from io import StringIO

In [3]:
# Read the text of every page of the PDF and export it to an Excel file.
# NOTE(review): the file name suggests a single-page PDF; if only one specific
# page is wanted, index that page directly instead of looping.

# The context manager closes the file handle even if text extraction fails.
with open('POL140298453100-lastpage_annots.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Visit each page object in turn so every page contributes its own text;
    # the newline separator keeps the last row of one page from fusing with
    # the first row of the next.
    text = '\n'.join(page.extract_text() for page in pdf_reader.pages)

# Parse the extracted text as tab-separated rows without a header line.
df = pd.read_csv(StringIO(text), sep='\t', header=None)

df.to_excel('output.xlsx', index=False, header=False)

In [2]:
# Extract the page content as positioned words with PyMuPDF (fitz).
# A helper defined afterwards turns such words back into readable text,
# which is then used to print what lies inside the annotation rectangles.

# Open the annotated PDF, load its first page (index 0) and request the
# "words" extraction of that page.

doc = fitz.open('document_with_annotations.pdf')
pageone = doc.load_page(0)
words = pageone.get_text("words")

def make_text(words):
    """Rebuild readable text from a collection of word tuples.

    Words are ordered left to right by their x0 coordinate (item 0), grouped
    into lines by their y1 coordinate (item 3) rounded to one decimal place,
    and the lines are emitted from the smallest y1 to the largest.

    Args:
        words: Iterable of sequences where item 0 is x0, item 3 is y1 and
            item 4 is the word string. The input is not modified.

    Returns:
        str: Lines separated by a newline character, with the words of each
        line separated by a single space. Empty input yields "".
    """
    line_dict = {}
    # sorted() works on a copy, so the caller's list keeps its order.
    for w in sorted(words, key=lambda w: w[0]):
        # Words sharing the same rounded y1 are treated as one text line.
        line_dict.setdefault(round(w[3], 1), []).append(w[4])
    # Keys are unique, so sorting the items orders the lines by y1 only.
    return "\n".join(" ".join(line_words) for _, line_words in sorted(line_dict.items()))

In [9]:
# Keep the words whose bounding box lies inside the rectangle of the page's
# first annotation (mywords), rebuild their text with make_text and store
# the result in first_annots.

rec = pageone.first_annot.rect
mywords = list(filter(lambda w: fitz.Rect(w[:4]) in rec, words))
ann = make_text(mywords)
first_annots = [ann]

print(rec)
print(first_annots)

Rect(76.98999786376953, 155.42999267578125, 223.1999969482422, 172.25)
['RAMON SANCHEZ GARAY']


In [3]:
# Extract the text inside the rectangle of every annotation on the page,
# not only the first one.

all_annots = []

for annot in pageone.annots():
    # Identity comparison is the idiomatic None check and cannot be altered
    # by a custom __ne__ on the annotation object.
    if annot is not None:
        rec = annot.rect
        # Keep only the words whose bounding box lies inside the rectangle.
        mywords = [w for w in words if fitz.Rect(w[:4]) in rec]
        ann = make_text(mywords)
        all_annots.append(ann)

print(all_annots)

['RAMON SANCHEZ GARAY', 'SAGR431217XXX', '150203067000', 'Contado']


In [4]:
# Build a dictionary from the annotation texts: each column name is paired
# with the position of its value in all_annots, and every value is wrapped
# in a one-element list.

pol_dic = {
    column: [all_annots[position]]
    for column, position in (
        ('NO.POLIZA', 2),
        ('NOMBRE', 0),
        ('RFC', 1),
        ('FORMA PAGO', 3),
    )
}

pol_dic

{'NO.POLIZA': ['150203067000'],
 'NOMBRE': ['RAMON SANCHEZ GARAY'],
 'RFC': ['SAGR431217XXX'],
 'FORMA PAGO': ['Contado']}

In [5]:
# Turn the dictionary into a pandas DataFrame: keys become the columns and
# each one-element list becomes the single row.

pol_data = pd.DataFrame(pol_dic)
pol_data


Unnamed: 0,NO.POLIZA,NOMBRE,RFC,FORMA PAGO
0,150203067000,RAMON SANCHEZ GARAY,SAGR431217XXX,Contado


In [6]:
# Write the DataFrame to an Excel workbook without the index column.
pol_data.to_excel(excel_writer='pol_final.xlsx', index=False)