In [47]:
%%capture
# Setup
# NOTE(review): each `!` command runs in its own subshell, so the
# `source venv/bin/activate` line below most likely has no effect on the running
# kernel; the `%pip` lines install into the environment the kernel itself uses.
# Confirm whether the virtualenv steps are needed at all.
%pip install virtualenv
!virtualenv venv
!source venv/bin/activate
# Project requirements first, then the extra packages this notebook imports directly.
# NOTE(review): none of these installs pin a version.
%pip install -r ../cr_extraction/requirements.txt
%pip install python-dotenv
%pip install tiktoken
%pip install pandas


In [48]:
import io
import os
import urllib.parse
from typing import Optional
from typing import Tuple
from urllib.parse import unquote

import pandas as pd
import pytesseract
import requests
import tiktoken
from pdf2image import convert_from_bytes

In [15]:
# CONSTANTS
# Model name passed as `model=` to the chat-completions call in
# get_structured_shareholder_list.
OPENAI_MODEL = "gpt-3.5-turbo-1106"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` occupies under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding`` (the notebook
            calls this with ``"cl100k_base"``).

    Returns:
        Length of the token sequence produced by encoding `string`.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [34]:
def parse_filename(encoded_filename):
    """Decode a filename value of the form ``UTF-8''<percent-encoded name>``.

    Everything after the first ``UTF-8''`` marker is URL-decoded with
    ``urllib.parse.unquote_plus`` (so ``%20`` and ``+`` both become a space).

    Args:
        encoded_filename: Raw filename value, e.g. ``"UTF-8''My%20File.pdf"``.

    Returns:
        The decoded filename. When the ``UTF-8''`` marker is absent the input is
        returned unchanged instead of raising ``ValueError``, so plain filenames
        such as ``"report.pdf"`` pass through safely.
    """
    # partition() never fails: `marker` is empty when "UTF-8''" does not occur.
    _, marker, encoded_part = encoded_filename.partition("UTF-8''")
    if not marker:
        # Nothing to decode - treat the value as an already-plain filename.
        return encoded_filename

    return urllib.parse.unquote_plus(encoded_part)

def get_pdf_for_company(company: str, document_type: str, bypass_storage: bool = False, timeout=None) -> Optional[Tuple[str, bytes]]:
    """Download a company document and store it under ``output/``.

    Posts ``company``, ``[document_type]`` and ``bypass_storage`` as JSON to the
    ``download_files`` cloud function and writes the response body to
    ``output/<filename>``.

    The filename is taken from the ``Content-Disposition`` header when it
    contains ``attachment; filename=``; otherwise it falls back to
    ``default_filename.<subtype>`` derived from ``Content-Type`` (``.bin`` when
    that header is missing as well).

    Args:
        company: Company identifier sent to the service.
        document_type: Single document type; sent as a one-element list.
        bypass_storage: Forwarded unchanged in the request payload.
        timeout: Forwarded to ``requests.post``. ``None`` (default) keeps the
            previous behaviour of not setting a timeout.

    Returns:
        ``(filename, content)`` where ``content`` is the raw response bytes, or
        ``None`` when the status code is not 200 (the status is printed).
    """
    # get pdf stream directly
    file_response = requests.post(
        'https://europe-west3-lumpito.cloudfunctions.net/download_files',
        json={'company': company, 'documents': [document_type], 'bypass_storage': bypass_storage},
        timeout=timeout,
    )

    # Best-effort contract: report the failure and hand back None.
    if file_response.status_code != 200:
        print(f'Failed to retrieve PDF: Status code {file_response.status_code}')
        return None

    content_disposition = file_response.headers.get('Content-Disposition', '')
    if 'attachment; filename=' in content_disposition:
        filename = content_disposition.split('filename=')[1]
        if '"' in filename or "'" in filename:
            # Remove any quotes around the filename
            filename = filename.strip("\"'")
            # Only values carrying the UTF-8'' marker are percent-encoded; a
            # plain quoted name such as "report.pdf" must not be sent through
            # the decoder.
            if "UTF-8''" in filename:
                filename = parse_filename(filename)
    else:
        # Content-Type may be absent or carry parameters ("application/pdf; charset=...").
        mime_type = file_response.headers.get('Content-Type') or ''
        subtype = mime_type.split(';')[0].strip().partition('/')[2]
        filename = f'default_filename.{subtype or "bin"}'

    # The name comes from a response header, so keep only its final path
    # component to make sure the file cannot be written outside output/.
    filename = os.path.basename(filename.replace('\\', '/')) or 'default_filename.bin'

    # Write the binary payload to a file with the obtained filename
    os.makedirs("output", exist_ok=True)
    with open(os.path.join("output", filename), 'wb') as f:
        f.write(file_response.content)

    return filename, file_response.content

## Pre-Processing


In [9]:
# Optimization1: Correct orientation
def correct_orientation(image):
    """Return `image` rotated according to Tesseract's orientation detection.

    Runs ``pytesseract.image_to_osd`` on the image and, when the reported
    ``'rotate'`` value is non-zero, calls ``image.rotate`` with the negated
    value and ``expand=True``. The input image is returned untouched when no
    rotation is reported or when Tesseract raises ``TesseractError`` (the error
    is printed).
    """
    try:
        detection = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
        # The OSD value is negated before being handed to the image's rotate().
        angle = -detection['rotate']
        return image if angle == 0 else image.rotate(angle, expand=True)
    except pytesseract.TesseractError as e:
        print(f"An error occurred during orientation detection: {e}")
        return image

In [41]:
def pdf_to_text_per_page(pdf_binary, lang=None):
    """OCR a PDF and return the recognised text of every page.

    Each page is rendered to an image with ``convert_from_bytes``, passed
    through ``correct_orientation`` and then read with
    ``pytesseract.image_to_string``.

    Args:
        pdf_binary: PDF content as bytes.
        lang: Optional Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string`` (for example ``"deu"``). ``None``
            (default) leaves the language unset, as before.
            NOTE(review): the recorded output contains misread umlauts
            ("Miinchen"), so a German language pack would presumably help -
            confirm it is installed before passing ``lang="deu"``.

    Returns:
        A list of strings, one per page, in page order.
    """
    # Convert PDF to a list of page images
    page_images = convert_from_bytes(pdf_binary)

    # Fix orientation first, then OCR; order of pages is preserved.
    return [
        pytesseract.image_to_string(correct_orientation(page_image), lang=lang)
        for page_image in page_images
    ]

## Extracting Structured Data

In [18]:
from openai import OpenAI
import json
from dotenv import load_dotenv
import os

# Load the environment variables from .env file
# (the relative path points one directory above the current working directory).
load_dotenv("../.env")

# OpenAI client used by get_structured_shareholder_list. The key is read from the
# OPENAI_API_KEY environment variable; os.environ.get yields None when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [116]:
# Define Function Calling
def get_structured_shareholder_list(input_text: str, company_name: str = "Tanso Technologies GmbH"):
    """Ask the chat model to extract the shareholders table as structured data.

    Sends one user message containing `input_text` together with a single tool
    definition (``get_shareholders_list``) and forces the model to call that
    tool, so the structured result arrives as the tool call's JSON arguments.

    Args:
        input_text: Text of the shareholders list (the notebook passes the OCR
            output of all pages).
        company_name: Company named in the prompt. Defaults to the company the
            prompt was originally hard-coded for.

    Returns:
        The first choice's message object from the chat-completions response;
        callers read ``tool_calls[0].function.arguments`` from it.
    """
    # Plain {…} interpolation - a "$" prefix would end up literally in the prompt.
    content = f"Here's a shareholders list for {company_name}: {input_text} \n Retrieve the shareholders from the shareholders table. "
    messages = [{"role": "user", "content": content}]

    # JSON Schema describing one shareholder row.
    shareholder_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "First and last name of the shareholder or company name of the shareholder"},
            "registry_details": {"type": "string", "description": "Registry details of the shareholder, e.g. AG München, HRB 123456"},
            "date_of_birth": {"type": "string", "description": "Date of birth of the shareholder"},
            "location": {"type": "string", "description": "Location of the shareholder"},
            "nominal_value_per_share": {"type": "number", "description": "Nominal value per share in Euro. Usually 1 Euro."},
            "total_nominal_value": {"type": "number", "description": "Total nominal value of all shares per shareholder. Usually contains a '€' sign"},
            "total_participation_in_percent": {"type": "string", "description": "Total participation in percent for each shareholder. This is a percentage value between 0 and 100. It is usually different for every shareholder. It often contains a '%' symbol."},
        },
        "required": ["name", "location", "nominal_value_per_share", "total_nominal_value", "total_participation_in_percent"],
    }

    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_shareholders_list",
                "description": "Retrieve the shareholders from the shareholders table",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "shareholders": {
                            "type": "array",
                            "description": "The individual shareholders of the company",
                            "items": shareholder_schema,
                        },
                    },
                    # `required` is part of the parameters schema, not of the
                    # function object itself.
                    "required": ["shareholders"],
                },
            },
        }
    ]

    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        tools=tools,
        # Force the model to answer through the tool rather than free text.
        tool_choice={"type": "function", "function": {"name": "get_shareholders_list"}},
        temperature=0.0,
    )
    return response.choices[0].message


In [92]:
# sum up the total participation in percent for all shareholders
# NOTE(review): `shareholders` is not assigned in any cell visible here (the final
# flow names its results `shareholders_object` / `shareholders_list`), so this cell
# appears to depend on leftover kernel state - confirm before Restart & Run All.
# NOTE(review): the tool schema declares total_participation_in_percent as a string;
# sum() over strings raises TypeError, so the recorded output presumably predates
# that schema.
total_participation_in_percent = sum([shareholder["total_participation_in_percent"] for shareholder in shareholders["shareholders"]])
total_participation_in_percent

100.00000000000001

In [93]:
# Rough cost estimate for one extraction call, based on token counts.
# NOTE(review): `content` is a local variable of get_structured_shareholder_list and
# is not assigned at notebook scope in the visible cells - presumably left over from
# an earlier version of this notebook.
input_tokens = num_tokens_from_string(content, "cl100k_base")
output_tokens = num_tokens_from_string(response_message.tool_calls[0].function.arguments, "cl100k_base")
# 0.001 and 0.002 are hard-coded rates per 1,000 input / output tokens - presumably
# the USD prices for OPENAI_MODEL; TODO confirm against current pricing.
cost = input_tokens / 1000 * 0.001 + output_tokens / 1000 * 0.002
cost

0.0032259999999999997

## Final Flow v1

In [53]:
# Empty placeholder frame; `df` is reassigned from `shareholders_list` further down.
df = pd.DataFrame()

In [108]:
# Download document type 'gs' for company 'Tanso' with bypass_storage=True.
# NOTE(review): get_pdf_for_company returns None on a non-200 response, in which
# case this tuple unpacking raises TypeError.
file_name, file_content = get_pdf_for_company('Tanso', 'gs', True)

In [109]:
# OCR every page, join the page texts with single spaces, and report how many
# cl100k_base tokens the combined text occupies.
text_all_pages = " ".join(pdf_to_text_per_page(file_content))
text_tokens = num_tokens_from_string(text_all_pages, "cl100k_base")
print("Input Text tokens: " + str(text_tokens) + " tokens")

Input Text tokens: 1670 tokens


In [117]:
# Ask the model for the structured shareholder list, then decode the JSON
# arguments of the first tool call and keep the "shareholders" array.
response_message = get_structured_shareholder_list(text_all_pages)
shareholders_object = json.loads(response_message.tool_calls[0].function.arguments)
shareholders_list = shareholders_object["shareholders"]

In [118]:
# Tabulate the extracted shareholders: one row per entry of shareholders_list.
df = pd.DataFrame(shareholders_list)

In [119]:
shareholders_list

[{'name': 'FYS Ventures UG (haftungsbeschrankt)',
  'registry_details': 'AG Miinchen, HRB 268779',
  'location': 'Miinchen',
  'nominal_value_per_share': 1,
  'total_nominal_value': 1110,
  'total_participation_in_percent': '0,0021 %'},
 {'name': 'Tanso Technologies AG',
  'registry_details': 'Munchen, HRB 269123',
  'location': 'Miinchen',
  'nominal_value_per_share': 1,
  'total_nominal_value': 6973,
  'total_participation_in_percent': '14,73 %'},
 {'name': 'Wiechmann Ventures UG (haftungsbeschrankt)',
  'registry_details': 'AG Miinchen, HRB 268778',
  'location': 'Miinchen',
  'nominal_value_per_share': 1,
  'total_nominal_value': 8083,
  'total_participation_in_percent': '17,07 %'},
 {'name': 'Gyri Ventures UG (haftungsbeschrankt)',
  'registry_details': 'AG Miinchen, HRB 268777',
  'location': 'Miinchen',
  'nominal_value_per_share': 1,
  'total_nominal_value': 8083,
  'total_participation_in_percent': '17,07 %'},
 {'name': 'Hetzel Ventures UG (haftungsbeschrankt)',
  'registry_de

In [120]:
df

Unnamed: 0,name,registry_details,location,nominal_value_per_share,total_nominal_value,total_participation_in_percent,date_of_birth
0,FYS Ventures UG (haftungsbeschrankt),"AG Miinchen, HRB 268779",Miinchen,1,1110,"0,0021 %",
1,Tanso Technologies AG,"Munchen, HRB 269123",Miinchen,1,6973,"14,73 %",
2,Wiechmann Ventures UG (haftungsbeschrankt),"AG Miinchen, HRB 268778",Miinchen,1,8083,"17,07 %",
3,Gyri Ventures UG (haftungsbeschrankt),"AG Miinchen, HRB 268777",Miinchen,1,8083,"17,07 %",
4,Hetzel Ventures UG (haftungsbeschrankt),"AG Disseldorf, HRB 94685",Dusseldorf,1,751,"1,59 %",
5,Unternehmertum VC AG,Munchen,Garching b.,1,9362,"19,77 %",
6,Fonds Ill GmbH & Co. KG,HRA 111267 Minchen,Landkreis Munchen,1,34670,KE: 34.670 - 38.949,
7,Picus Capital GmbH,"AG Munchen, HRB 220147",Miinchen,1,1840,"3,89 %",
8,Possible Ventures II GmbH & Co. KG,HRA 114517,Miinchen,1,686,"1,45 %",
9,Salma Vogel,,Miinchen,1,137,"0,29 %",19. Januar 1990
