# Processing PDFs using Python

# Imports

In [None]:
import re

Main package used here:

In [None]:
import PyPDF2

Alternative packages for extracting text from PDFs:

In [None]:
import textract
import tika

# Read PDFs

In [None]:
# Path of the PDF that the cells below read from.
file = 'lorem_ipsum.pdf'

## Get file info

In [None]:
# Print the document metadata and the page count of the PDF.
with open(file, 'rb') as fid:
    pdf = PyPDF2.PdfReader(fid)
    # `metadata` is None when the PDF carries no document-information
    # dictionary. Look every field up defensively so the report prints
    # "None" for it instead of raising AttributeError.
    info = pdf.metadata
    n_pages = len(pdf.pages)
    meta_fields = ('author', 'creator', 'producer', 'subject', 'title')
    meta = {name: getattr(info, name, None) for name in meta_fields}

    print(f"""
        filename: {str(file)},
        author:   {meta['author']},
        creator:  {meta['creator']},
        producer: {meta['producer']},
        subject:  {meta['subject']},
        title:    {meta['title']},
        pages:    {n_pages}""")

## Read text

Read the contents of the file and display the extracted text.

### Using `PyPDF2`

In [None]:
# Extract the text of every page and concatenate it in page order.
with open(file, 'rb') as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    # Iterate over the pages directly and join once, rather than indexing
    # by position and growing the string with repeated `+=`.
    text = ''.join(page.extract_text() for page in reader.pages)

### Using `tika`

Tika needs some time to start up the first time it is used.

In [None]:
from tika import parser

# Let Tika parse the file; the extracted text is stored under 'content'
# in the returned dict.
raw = parser.from_file(str(file))
# 'content' can be missing or None when Tika extracts no text. Normalise
# to an empty string so the later regex and word-count cells keep working.
text = raw.get('content') or ''

### Using `textract`

In [None]:
import textract

# textract hands back the extracted text as bytes; decode them
# (UTF-8 is the default codec of bytes.decode) to obtain a str.
extracted_bytes = textract.process(str(file))
text = extracted_bytes.decode()

### Printing it

In [None]:
# Collapse every run of blank or whitespace-only lines into a single
# empty line, then show the cleaned-up text.
blank_run = re.compile(r'\n\s*\n')
text = blank_run.sub('\n\n', text)
print(text)

## Word count

In [None]:
# Split on whitespace once: the number of tokens is the word count, and the
# summed token lengths give the number of non-whitespace characters.
words = text.split()
n_characters = sum(len(word) for word in words)
print(f"""words: {len(words)}
characters: {n_characters}""")

# Edit PDFs

## Split PDF

In [None]:
# Copy a selection of pages from the source PDF into a new document.
# The values index into `pdf_reader.pages`, so 1 picks the second page.
pages_to_keep = [1]

pdf_writer = PyPDF2.PdfWriter()
pdf_reader = PyPDF2.PdfReader(str(file))

for page_index in pages_to_keep:
    selected_page = pdf_reader.pages[page_index]
    pdf_writer.add_page(selected_page)

with open('output.pdf', 'wb') as out_file:
    pdf_writer.write(out_file)

In [None]:
# Show the extracted page via an IPython shell escape.
# NOTE(review): `open` is presumably the macOS command - confirm on other platforms.
!open output.pdf

## Write on PDF

This is slightly more involved. Here's a function that might work for you:

In [None]:
def overwrite_PDF(inputfile, outputfile, x, y, text, fontsize=14, color=(0, 0, 0), angle=0):
    """Write text on the first page of a PDF and save that page to a new file.

    The text is drawn with ReportLab on an in-memory A4 overlay, which is
    then merged onto the first page of ``inputfile``. Only that first page
    is written to ``outputfile``; further pages of the input are not copied.

    Arguments:
    ----------
    inputfile : str
        path of the existing PDF on which we write

    outputfile : str
        path/name of the output file into which we write

    x, y : int or float
        position of the text on the PDF page in cm; the text is centred
        horizontally on this point

    text : str
        the text to draw

    fontsize : int
        font size in pt

    color : tuple
        rgb tuple on a 0 ... 1 scale for each value

    angle : float
        rotate text by this many degrees around the point (x, y)

    Returns:
    --------
    None
        the result is written to ``outputfile``

    """
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.units import cm
    import io

    # Draw the text onto a new single-page PDF held in memory.
    packet = io.BytesIO()
    can = canvas.Canvas(packet, pagesize=A4)
    can.setFont("Helvetica", fontsize)
    can.setFillColorRGB(*color)

    # Move the origin to the target position before rotating, so the text
    # turns around its own anchor point rather than the page origin.
    can.saveState()
    can.translate(x * cm, y * cm)
    can.rotate(angle)
    can.drawCentredString(0, 0, text)
    can.restoreState()
    can.save()

    # Rewind the BytesIO buffer so it can be read back as a PDF.
    packet.seek(0)
    new_pdf = PyPDF2.PdfReader(packet)

    # Keep the input file open until the output has been written, and make
    # sure both files are closed even if merging or writing raises.
    with open(inputfile, "rb") as source:
        existing_pdf = PyPDF2.PdfReader(source)
        output = PyPDF2.PdfWriter()
        # Add the "watermark" (the new PDF) on top of the existing first page.
        page = existing_pdf.pages[0]
        page.merge_page(new_pdf.pages[0])
        output.add_page(page)
        # Finally, write the stamped page to a real file.
        with open(outputfile, "wb") as output_stream:
            output.write(output_stream)

In [None]:
overwrite_PDF('output.pdf', 'output_stamped.pdf', 10, 16, 'COPY', fontsize=50, color=(0.9, 0, 0), angle=45)
!open output_stamped.pdf

## Merge PDFs

In [None]:
# PDFs to merge, listed in the order their pages are appended to the output.
files = ['output.pdf', 'output_stamped.pdf']

In [None]:
# Append all pages of every input PDF, in list order, to one new document.
pdf_writer = PyPDF2.PdfWriter()

# The loop variable is deliberately not named `file`: that name holds the
# source PDF path used by the earlier cells and must not be overwritten.
for pdf_path in files:
    source_pdf = PyPDF2.PdfReader(str(pdf_path))
    for page in source_pdf.pages:
        pdf_writer.add_page(page)

with open('output_merged.pdf', 'wb') as output_pdf:
    pdf_writer.write(output_pdf)

In [None]:
# Show the merged document via an IPython shell escape.
# NOTE(review): `open` is presumably the macOS command - confirm on other platforms.
!open output_merged.pdf