# Turn Handelsregister PDFs into tables
## 1. Load PDF files

In [1]:
import os

In [2]:
# Folder that is searched (recursively) for scraped documents
directory = '../scraped_pdfs'

# Full path of every file below `directory` whose name ends in ".pdf",
# compared case-insensitively so ".PDF" is picked up as well
pdf_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(directory)
    for name in names
    if name.lower().endswith(".pdf")
]

In [3]:
# Take the first PDF that was found as the working sample
file = pdf_files[0]
# Print only the file name. os.path.basename follows the platform's path
# separator, whereas splitting on '/' would leave directory components in
# place wherever os.path.join used a different separator (e.g. on Windows).
print(os.path.basename(file))

Liste der Gesellschafter - Aufnahme in den Registerordner am 17.12.2021-HappieHaus Wellbeing GmbH.pdf


## 2. Turn PDF into Images & OCR

In [4]:
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
import pandas as pd
import cv2
import numpy as np
import pytesseract
from pytesseract import Output

In [5]:
def correct_orientation(image):
    """Return `image` turned by the angle Tesseract's OSD suggests.

    Orientation and script detection (OSD) is run on the image. If it
    reports a non-zero 'rotate' value, a rotated copy (canvas expanded so
    nothing is cropped) is returned; otherwise the input image is returned
    as is. If Tesseract raises a TesseractError during detection, a message
    is printed and the input image is returned unchanged.
    """
    try:
        osd_info = pytesseract.image_to_osd(image, output_type=Output.DICT)
    except pytesseract.TesseractError as e:
        # Detection failed: report it and fall back to the untouched image
        print(f"An error occurred during orientation detection: {e}")
        return image

    # The OSD angle is negated before it is handed to PIL's rotate().
    # NOTE(review): PIL rotates counter-clockwise; the negation assumes OSD
    # reports a clockwise correction -- confirm against the Tesseract docs.
    angle = -osd_info['rotate']
    if angle == 0:
        return image
    return image.rotate(angle, expand=True)

def pdf_to_text(pdf_path, lang=None):
    """OCR a PDF and return the recognised text, one string per page image.

    Args:
        pdf_path: Path of the PDF file; handed to pdf2image's
            convert_from_path to render it into images.
        lang: Optional Tesseract language code(s), e.g. "deu" or "deu+eng",
            forwarded to pytesseract.image_to_string. The default None
            leaves the language choice to pytesseract/Tesseract, which is
            what this function did before the parameter existed. A
            non-default language presumably requires the matching
            Tesseract language data to be installed -- verify locally.

    Returns:
        A list of strings in the order of the images returned by
        convert_from_path, each holding the OCR text of one image.
    """
    # Render the PDF into a list of images
    images = convert_from_path(pdf_path)

    # Fix each image's orientation first, then run OCR on the corrected image
    return [
        pytesseract.image_to_string(correct_orientation(image), lang=lang)
        for image in images
    ]

In [6]:
# Run OCR on the selected PDF; pdf_to_text returns a list of text strings,
# one per image rendered from the PDF
extracted_text = pdf_to_text(file)

## 3. Send it to OpenAI

In [7]:
# Display the raw OCR result for inspection
extracted_text

['Liste der Gesellschafter\n\nder HappieHaus Wellbeing GmbH mit Sitz in Berlin\n\n(Reg.-Nr. HRB neu, Amtsgericht Charlottenburg)\n\nHappie Haus UG AG Charlottenburg\n(haftungsbeschrankt) | HRB 233422 B\n\n3H Ventures UG AG Charlottenburg\n(haftungsbeschrankt) | HRB 236167 B\n\nBerlin, den 30.11.2021\n\nStephanie Neumann\n\n17.501 bis 25.000\n\nGesamtumfang der\nBeteiligung des\nGesellschafters am\nStammkapital in %\n\nje 1,00 EUR; dies\nentspricht fir jeden\nGeschaftsanteil 0,004 %\nam Stammkapital der\nGesellschaft\n\nje 1,00 EUR; dies\nentspricht fir jeden\nGesch√©aftsanteil 0,004 %\nam Stammkapital der\nGesellschaft\n\n',
 'Hiermit beglaubige ich die Ubereinstimmung der in dieser Datei enthaltenen Bilddaten (Abschrift) mit\n\ndem mir vorliegenden Papierdokument (PDF-Ausdruck).\n\nBerlin, 10.12.2021\n\nKlaus Bienmiller, Notar\n']