## OCR with Tesseract

## Conversion of a single digitized PDF

### Use case: [historical party programmes](https://visuals.manifesto-project.wzb.eu/mpdb-shiny/cmp_dashboard_dataset/)

1. Convert each page of the PDF to an image
2. Run optical character recognition (OCR) on each page image

In [None]:
import pytesseract
from pdf2image import convert_from_path
from pathlib import Path

# Path to the scanned PDF (relative to the current working directory).
pdf_path = Path("../data/scanned_pdf_sample/fdp_scan_party_programme_1947.pdf")

# Convert the PDF to images (one image per page).
pages = convert_from_path(pdf_path, fmt="png")

# Run Tesseract OCR with the German language data ("deu") on every page image
# and concatenate the page texts, each prefixed with a single space.
# Joining once avoids rebuilding the growing string for every page.
text = "".join(
    " " + pytesseract.image_to_string(page_image, lang="deu")
    for page_image in pages
)

# Preview the first 100 characters of the recognized text.
print(text[:100])

NameError: name 'Path' is not defined

## Extract the text from all PDFs in a folder

In [None]:
# Input folder with the scanned PDFs and output folder for the extracted text.
indir = Path("../data/scanned_pdf_sample/")
outdir = Path("../data/scanned_pdf_sample/extracted")

# Create the output folder (and any missing parents) if it does not exist.
outdir.mkdir(parents=True, exist_ok=True)

# Process every PDF located directly in the input folder. Sorting makes the
# processing order reproducible, since glob order depends on the filesystem.
for infile in sorted(indir.glob("*.pdf")):
    print(f"Reading PDF file: {infile}")

    # Convert the PDF to images (one image per page).
    pages = convert_from_path(infile, fmt="png")

    # OCR every page with the German language data and concatenate the page
    # texts, each prefixed with a single space.
    text = "".join(
        " " + pytesseract.image_to_string(page_image, lang="deu")
        for page_image in pages
    )

    # Derive the name of the output file (name.pdf -> name.txt).
    outfile = outdir / (infile.stem + ".txt")

    # Write explicitly as UTF-8: the platform default encoding may not be able
    # to represent every character the OCR produces (e.g. umlauts, ß).
    outfile.write_text(text, encoding="utf-8")

    print(f"Extracted text to: {outfile}")

## OCR with Surya

In [None]:
# %%
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor


from pdf2image import convert_from_path

# Same sample PDF as in the Tesseract example. A relative path is used so the
# cell does not depend on one user's home directory.
# NOTE(review): assumes the working directory is a sibling of `data/`, as in
# the Tesseract cells above — confirm when running from elsewhere.
pdf_file = "../data/scanned_pdf_sample/fdp_scan_party_programme_1947.pdf"

# Convert the PDF to images (one image per page).
pages = convert_from_path(pdf_file, fmt="png")

# Only the first page is passed to OCR below.
# Alternative: load a single image directly with Image.open("IMAGE_PATH").
image = pages[0]

# Alternative: fetch a sample image from the web instead.
# import requests
# url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
# image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

langs = ["de"]  # Replace with your languages - optional but recommended

# Load the detection and recognition models together with their processors.
det_processor, det_model = load_det_processor(), load_det_model()
rec_model, rec_processor = load_rec_model(), load_rec_processor()

# One image and one language list are passed, as parallel lists.
predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)

# Print the recognized text lines of each prediction as a list of strings.
for prediction in predictions:
    print([line.text for line in prediction.text_lines])