In [1]:
%%capture
# Setup
%pip install virtualenv
!virtualenv venv
!source venv/bin/activate
%pip install -r ../cr_extraction/requirements.txt
%pip install python-dotenv
%pip install tiktoken
%pip install pandas


In [2]:
import requests
from pdf2image import convert_from_bytes
import pytesseract
import io
from urllib.parse import unquote
import tiktoken
from typing import Tuple
import urllib.parse
import pandas as pd

In [3]:
# CONSTANTS
OPENAI_MODEL = "gpt-3.5-turbo-1106"

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [4]:
from PIL import Image
from reportlab.pdfgen import canvas


# Check if the file is a TIFF
def tiff_to_PDF(image):
    # Create a new PDF file
    pdf = canvas.Canvas('output.pdf')
    # Draw the image on the PDF, resizing it to fit the page
    pdf.drawInlineImage(image, 0, 0, width=595, height=842)  # A4 size

    # Save the PDF
    pdf.save()
    return pdf


In [5]:
file_path = 'output/ASSEMBLY GmbH_Berlin (Charlottenburg).tiff'

# Open the file using PIL
image = Image.open(file_path)

tiff_to_PDF(image)

FileNotFoundError: [Errno 2] No such file or directory: 'output/ASSEMBLY GmbH_Berlin (Charlottenburg).tiff'

In [None]:
def parse_filename(encoded_filename):
    """
    This function takes an encoded filename string and returns the decoded filename.
    """
    decoded_filename = encoded_filename
    # The filename is after the "UTF-8''" part, so split on that
    try:
        _, encoded_filename = encoded_filename.split("UTF-8''", 1)

    # Now decode the filename
        decoded_filename = urllib.parse.unquote_plus(encoded_filename)
    except:
        pass
    return decoded_filename

def get_pdf_for_company(company: str, document_type: str, bypass_storage: bool = False) -> Tuple[str, str]:
    # get pdf stream directly 
    file_response = requests.post('https://europe-west3-lumpito.cloudfunctions.net/download_files', json={'company': company, 'documents': [document_type], 'bypass_storage': bypass_storage})
    # Make sure the request was successful
    if file_response.status_code == 200:
        if(file_response.content.format=='TIFF'):
            file_response.content = tiff_to_PDF(file_response.content)
        # Try to extract filename from the Content-Disposition header
        content_disposition = file_response.headers.get('Content-Disposition', '')
        if 'attachment; filename=' in content_disposition:
            # Extract filename from Content-Disposition header
            filename = content_disposition.split('filename=')[1]
            # Remove any quotes around the filename
            if '"' in filename or "'" in filename:
                filename = parse_filename(filename.strip("\"'"))
        else:
            mime_type = file_response.headers.get('Content-Type')
            # If the filename is not in the header or you want to set a default
            filename = f'default_filename.{mime_type.split("/")[1]}'

        # Write the PDF binary to a file with the obtained filename
        with open("output/" + filename, 'wb') as f:
            f.write(file_response.content)
        
        return filename, file_response.content
    else:
        print(f'Failed to retrieve PDF: Status code {file_response.status_code}')

In [235]:
file_name, file_content = get_pdf_for_company('SellerX', 'gs', True)
print(file_name)
print(file_content)


SellerX Commerce GmbH_Berlin (Charlottenburg).pdf
b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n23 0 obj\r<</E 291454/H [ 1005 202 ]/L 324707/Linearized 1/N 2/O 25/T 324119>>\rendobj\r                          \rxref\r23 26\r0000000016 00000 n\r\n0000000822 00000 n\r\n0000001207 00000 n\r\n0000001350 00000 n\r\n0000001576 00000 n\r\n0000001611 00000 n\r\n0000002044 00000 n\r\n0000002071 00000 n\r\n0000002095 00000 n\r\n0000002131 00000 n\r\n0000002151 00000 n\r\n0000002269 00000 n\r\n0000002784 00000 n\r\n0000005432 00000 n\r\n0000006232 00000 n\r\n0000012840 00000 n\r\n0000014455 00000 n\r\n0000034951 00000 n\r\n0000055392 00000 n\r\n0000079441 00000 n\r\n0000126169 00000 n\r\n0000166711 00000 n\r\n0000190468 00000 n\r\n0000206597 00000 n\r\n0000218565 00000 n\r\n0000001005 00000 n\r\ntrailer\r<</ID[(dF\\025T@\\271\\262=\\030X\\341,4!\\222\\253) (dF\\025T@\\271\\262=\\030X\\341,4!\\222\\253)]/Info 20 0 R/Prev 324110/Root 24 0 R/Size 49>>startxref\r 0\r%%EOF\r\r\r\r\r24 0 obj\r<</Metadata 21 0 R/Ou

## Pre-Processing


In [5]:
# Optimization1: Correct orientation
def correct_orientation(image):
    try:
        # Use pytesseract to detect orientation and script detection (OSD)
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
        # Rotate the image based on the angle suggested by the OSD
        # The angle is given in degrees counterclockwise, we need to negate it for PIL rotation
        rotation_angle = -osd['rotate']
        if rotation_angle != 0:
            corrected_image = image.rotate(rotation_angle, expand=True)
            return corrected_image
        else:
            return image
    except pytesseract.TesseractError as e:
        print(f"An error occurred during orientation detection: {e}")
        return image

In [6]:
def pdf_to_text_per_page(pdf_binary):
    # Convert PDF to a list of images
    images = convert_from_bytes(pdf_binary)

    text_content = []

    for i, image in enumerate(images):
        # Correct orientation of the image if necessary
        corrected_image = correct_orientation(image)

        # Use PyTesseract to do OCR on the corrected image
        text = pytesseract.image_to_string(corrected_image)

        # Append the text to the list
        text_content.append(text)

    return text_content

## Extracting Structured Data

In [10]:
from openai import OpenAI
import json
from dotenv import load_dotenv
import os

# Load the environment variables from .env file
load_dotenv("../.env")

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [152]:
# Define Function Calling



def get_structured_shareholder_list(input_text: str, company_name: str):
    content = f"Here's a shareholders list for ${company_name}: ${input_text} \n Retrieve the shareholders from the shareholders table. "
    messages = [{"role": "user", "content": content }]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "get_shareholders_list",
                "description": "Retrieve the shareholders from the shareholders table",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "shareholders": {
                            "type": "array",
                            "description": "The individual shareholders of the company",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "name": {"type": "string", "description": "First and last name of the shareholder or company name of the shareholder. Do not invent something, double check for correct name"},
                                    "registry_details": {"type": "string", "description": "Registry details of the shareholder, e.g. AG München, HRB 123456"},
                                    "date_of_birth": {"type": "string", "description": "Date of birth of the shareholder"},
                                    "location": {"type": "string", "description": "Location of the shareholder. It is German cities written in German"},
                                    "nominal_value_per_share": {"type": "number", "description": "Nominal value per share in Euro. Usually 1 Euro."},
                                    "total_nominal_value": {"type": "number", "description": "Total nominal value of all shares per shareholder. Usually contains a '€' sign"},
                                    "participation_per_share": {"type": "string", "description": "Participation per share in percent. Is a number value, sometimes with percentag sign. Is the same for all shareholder"},
                                    "total_participation_in_percent": {"type": "string", "description": "Summe der prozentualen Beteiligung des Shareholders"},
                                },
                                "required": ["name", "location", "nominal_value_per_share", "total_nominal_value",  "total_participation_in_percent"],
                            },
                        },
                    },
                },
                "required": ["shareholders"],
            },
        }
    ]
    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "get_shareholders_list"}},
        temperature=0.0,
    )
    response_message = response.choices[0].message
    return response_message


#transform text to json

In [9]:
# sum up the total participation in percent for all shareholders
total_participation_in_percent = sum([shareholder["total_participation_in_percent"] for shareholder in shareholders["shareholders"]])
total_participation_in_percent

NameError: name 'shareholders' is not defined

In [None]:
import contextlib


input_tokens = num_tokens_from_string(contextlib, "cl100k_base")
output_tokens = num_tokens_from_string(response_message.tool_calls[0].function.arguments, "cl100k_base")
cost = input_tokens / 1000 * 0.001 + output_tokens / 1000 * 0.002
cost

0.0032259999999999997

## Final Flow v1

In [79]:
df = pd.DataFrame()

In [134]:
file_name, file_content = get_pdf_for_company('Sento', 'gs', True)


In [135]:
print(file_name)
print(file_content)

sento GmbH_München.pdf
b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n1 0 obj\r<</Metadata 70 0 R/OutputIntents[72 0 R]/Pages 2 0 R/Type/Catalog>>\rendobj\r2 0 obj\r<</Count 9/Kids[4 0 R 15 0 R 24 0 R 34 0 R 40 0 R 49 0 R 55 0 R 61 0 R 73 0 R]/Type/Pages>>\rendobj\r3 0 obj\r<</Font<</F01 13 0 R>>/ProcSet[/PDF /Text /ImageB /ImageC /ImageI]/XObject<</XIPLAYER0 6 0 R/XIPLAYER_CM1 7 0 R/XIPLAYER_CM2 9 0 R/XIPLAYER_CM3 10 0 R/XIPLAYER_CM4 11 0 R/XIPLAYER_CM5 12 0 R>>>>\rendobj\r4 0 obj\r<</Contents 5 0 R/MediaBox[0 0 842 596]/Parent 2 0 R/Resources 3 0 R/Rotate 270/Type/Page>>\rendobj\r5 0 obj\r<</Filter/FlateDecode/Length 2111>>\rstream\nx\x9c\x8dY\xdbn\x1bG\x12}\xd7W\xccK\x16\x0e \x8d\xfb~Y\xec\x8b\x14\xcb\x17\xac\x1c8\x96\x9cM\x16\x02\x02\xca\x1a\xd1LDrwHz\x01\x7f\xfdVU_f\xd83-K\x80\r\x9a>]]\x97S\xa7\xab[M\xd34\xff=i\xe8\xc7)\xd5:\x86?\rkY\xf1A{\xdf2u\xfc\xdd\x99L_}^G\x1b/\x7f{\xf7\xe1\xea\xfc\xf7\xcb\x8f\xacy\xb5\xa5\xef~\xa1\xbf\xd3\x1e\xac\x95Z\xf8f\x19\xffi\xbcn\x85\xabl)5kU\xdc]\xb7&\xc0\x94o\xa

In [136]:
text_all_pages = " ".join(pdf_to_text_per_page(file_content))
text_tokens = num_tokens_from_string(text_all_pages, "cl100k_base")
print("Input Text tokens: " + str(text_tokens) + " tokens")

Input Text tokens: 3287 tokens


In [137]:
print(text_all_pages)

UVZNr. 1666 L/2023

Name, Wohnort, Geburtsda-
tum bzw. Firma, Sitz und
Registernummer des
Gesellschafters

M&L Investments UG
(haftungsbeschrankt)
mit dem Sitz in Horben,
AG Freiburg i.Breisgau
HRB 722284

Q.E.D. Holding UG
(haftungsbeschrankt)

mit dem Sitz in Deggendorf,
AG Deggendorf HRB 5327

blume.vision UG
(haftungsbeschrankt)
mit dem Sitz in Hamburg,
AG Hamburg HRB 166719

ee Se

Lfd. Nrn.
Geschiaftsan-
teile

1-2.354

5.233 —
10.464

25.002 —
25.038

10.465 —
15.696

25.039 —

25.041

Kenn-Nr. 43890-sento GmbH/Share Sale and Puchase Agreement SB: Lime

LISTE DER GESELLSCHAFTER

der Firma

sento GmbH
mit dem Sitz in Miinchen,
AG Miinchen HRB 264243

Prozentuale
Beteiligung je
Anteil (%)

Summe der Nenn-
betrage

Nennbetrag
GeschAaftsantei-
le

1,--€ 2.354,-- €

1,-€ 5.767,-- €

Summe der
prozentua-
len Beteili-
gung (%)

Vermerke Uber
Veranderungen

Share Sale and
Purchase Agreement
vom 09.06.2023

Share Sale and
Purchase Agreement
vom 09.06.2023

 2,853 — 3.348

Sebastian Schuo

In [128]:
# import PyPDF2

# reader = PyPDF2.PdfReader("/Users/annabellschafer/Desktop/GitHub/cr_exploration/notebooks/output/"+file_name)
# number_of_pages = len(reader.pages)
# text =""
# for page in reader.pages:
#    text = text + page.extract_text() 

# print(text)

 
Dok-ID 217071  
Akte: 2023 -1282 N  Liste der Gesellschafter der  
 
SUMM AI GmbH  
 
mit Sitz in München  
Amtsgericht München, HRB 275514  
 
Vor- und Nachname des Gesellschafters, 
Geburtsdatum, Wohnort bzw. Firma, Registernr. 
u. Amtsgericht, Sitz  lfd. Nummer  Nennbetrag der 
Geschäftsanteile  prozentualer An-
teil des Ges-
chäftsanteiles am 
Stammkapital  Summe der 
Nennbeträge des 
Gesellschafters  prozentualer An-
teil des 
Gesamtumfangs 
der Geschäftsan-
teile am Stam-
mkapital  Veränderung  
floral ventures UG (haftungsbeschränkt)  
AG Hamburg HRB 175232  
Hamburg  1 - 8.334  je 1,00 €  je 0,0031%  8.334,00 €  26,23%   
Dataflow Ventures UG (haftungsbeschränkt)  
AG Schweinfurt HRB 8953  
Burkardroth  8.335 - 
16.668  je 1,00 €  je 0,0031%  8.334,00 €  26,23%   
WHY NOT adVentures UG (haftungsbeschränkt)  
AG München HRB 275415  
München  16.669 - 
25.002  je 1,00 €  je 0,0031%  8.334,00 €  26,23%   
Uhlig Capital GmbH  
AG München HRB 226493  
München  25.003 - 
25.128  je

In [153]:
import json
# text_all_pages = text
response_message = get_structured_shareholder_list(text_all_pages, file_name[:-4])
shareholders_object = json.loads(response_message.tool_calls[0].function.arguments)
shareholders_list = shareholders_object["shareholders"]



In [154]:

#add company column
df_temp = pd.DataFrame(shareholders_list)
file_name = file_name[:-4]
df_temp['company'] = file_name


In [155]:
#add df to exiting df as additional values
df = df.append(df_temp, ignore_index=True)

  df = df.append(df_temp, ignore_index=True)


In [156]:
df


Unnamed: 0,name,registry_details,location,nominal_value_per_share,total_nominal_value,participation_per_share,total_participation_in_percent,date_of_birth,company
0,FYS Ventures UG (haftungsbeschr\u00e4nkt),"AG M\u00fcnchen, HRB 268779",M\u00fcnchen,1,1110.0,0.0021 %,2.34 %,,Tacto Technology GmbH_Mün
1,Tanso Technologies GmbH,"AG M\u00fcnchen, HRB 269123",M\u00fcnchen,1,6973.0,0.0021 %,14.73 %,,Tacto Technology GmbH_Mün
2,Wiechmann Ventures UG (haftungsbeschr\u00e4nkt),"AG M\u00fcnchen, HRB 268778",M\u00fcnchen,1,8083.0,0.0021 %,17.07 %,,Tacto Technology GmbH_Mün
3,Gyri Ventures UG (haftungsbeschr\u00e4nkt),"AG M\u00fcnchen, HRB 268777",M\u00fcnchen,1,8083.0,0.0021 %,17.07 %,,Tacto Technology GmbH_Mün
4,Hetzel Ventures UG (haftungsbeschr\u00e4nkt),"AG D\u00fcsseldorf, HRB 94685",D\u00fcsseldorf,1,751.0,0.0021 %,1.59 %,,Tacto Technology GmbH_Mün
...,...,...,...,...,...,...,...,...,...
206,Insight Luxembourg Xil S.ar.l.,Handels- und Gesellschaftsregister Luxemburg N...,Luxemburg,1,7487.0,,"<1 20,92",,sento GmbH_Mün
207,Discovery Ventures Ill GmbH & Co. KG,AG Charlottenburg HRA 59004 B,Berlin,1,1.0,,"<1 1,08",,sento GmbH_Mün
208,Herr Miguel Eduardo Burger,,"Brooklyn, New York",1,28.0,,"<1 0,08",,sento GmbH_Mün
209,September Tech Ventures UG (haftungsbeschrankt),AG Charlottenburg HRB 234235 B,Berlin,1,24.0,,"<1 0,08",,sento GmbH_Mün


In [157]:
# safe df as csv
df.to_csv('shareholders_list.csv', index=False)

## All in one function



In [6]:
df = pd.DataFrame()

#open csv with company names
companies = pd.read_csv('initial_hundred.csv', header=None)
companies = companies[0].tolist()
#take the first 30 companies, remove first row
companies = companies[0:100]
print(companies)

FileNotFoundError: [Errno 2] No such file or directory: 'initial_hundred.csv'

In [238]:

#loop over companies to retrieve their documents and store them in a dataframe

def get_shareholder_table_for_companies(companies: list, document_type: str):
    df = pd.DataFrame()
    for company in companies:
        try:
            file_name, file_content = get_pdf_for_company(company, document_type, True)
            text_all_pages = " ".join(pdf_to_text_per_page(file_content))
            response_message = get_structured_shareholder_list(text_all_pages, file_name)
            print(file_name)
            shareholders_object = json.loads(response_message.tool_calls[0].function.arguments)
            shareholders_list = shareholders_object["shareholders"]
            df_temp = pd.DataFrame(shareholders_list)
            file_name = file_name[:-4]
            df_temp['company'] = file_name
            print(file_name)
            df = df.append(df_temp, ignore_index=True)
        except Exception as e:
            print("Error for company: " + company)
            print(str(e))
    return df

result = get_shareholder_table_for_companies(companies, 'gs')
result.to_csv('shareholders_list.csv', index=False)

ACTIMI GmbH_Stuttgart.pdf
ACTIMI GmbH_Stuttgart


  df = df.append(df_temp, ignore_index=True)


Error for company: EcoPals GmbH
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

alfatier GmbH_Hamburg.pdf
alfatier GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


Dealcode GmbH_Hamburg.pdf
Dealcode GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


Milkly GmbH_Hamburg.pdf
Milkly GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


Unstoppable Finance GmbH_Berlin (Charlottenburg).pdf
Unstoppable Finance GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Betriebsmittelhelden
cannot unpack non-iterable NoneType object
MicroHarvest GmbH_Hamburg.pdf
MicroHarvest GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


Noyes Technologies GmbH_München.pdf
Noyes Technologies GmbH_München


  df = df.append(df_temp, ignore_index=True)


Countercheck GmbH_Berlin (Charlottenburg).pdf
Countercheck GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 502
Error for company: The Exploration Company
cannot unpack non-iterable NoneType object
Elixion Medical GmbH_Düsseldorf.pdf
Elixion Medical GmbH_Düsseldorf


  df = df.append(df_temp, ignore_index=True)


Error for company: Content Bay
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Hopper Mobility GmbH_Augsburg.pdf
Hopper Mobility GmbH_Augsburg


  df = df.append(df_temp, ignore_index=True)


SOLAR MATERIALS GmbH_Stendal.pdf
SOLAR MATERIALS GmbH_Stendal


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: AeroSys
cannot unpack non-iterable NoneType object
Software Defined Automation GmbH_München.pdf
Software Defined Automation GmbH_München


  df = df.append(df_temp, ignore_index=True)


MARA Solutions GmbH_Mannheim.pdf
MARA Solutions GmbH_Mannheim


  df = df.append(df_temp, ignore_index=True)


Elona Health GmbH_Düsseldorf.pdf
Elona Health GmbH_Düsseldorf


  df = df.append(df_temp, ignore_index=True)


Coinformation GmbH_Düsseldorf.pdf
Coinformation GmbH_Düsseldorf


  df = df.append(df_temp, ignore_index=True)


Fusion Bionic GmbH_Dresden.pdf
Fusion Bionic GmbH_Dresden


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 404
Error for company: Passionfroot
cannot unpack non-iterable NoneType object
Atlas Metrics GmbH_Berlin (Charlottenburg).pdf
Atlas Metrics GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Sub Capitals GmbH_München.pdf
Sub Capitals GmbH_München


  df = df.append(df_temp, ignore_index=True)


Hoaly Foods GmbH_München.pdf
Hoaly Foods GmbH_München


  df = df.append(df_temp, ignore_index=True)


BUILD & CODE GmbH_Potsdam.pdf
BUILD & CODE GmbH_Potsdam


  df = df.append(df_temp, ignore_index=True)


REFLEX aerospace GmbH_München.pdf
REFLEX aerospace GmbH_München


  df = df.append(df_temp, ignore_index=True)


Additive Scale GmbH_München.pdf
Additive Scale GmbH_München


  df = df.append(df_temp, ignore_index=True)


Capreolos GmbH_Frankfurt am Main.pdf
Capreolos GmbH_Frankfurt am Main


  df = df.append(df_temp, ignore_index=True)


RAMPmedical GmbH_Berlin (Charlottenburg).pdf
RAMPmedical GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Error for company: VISIONME GmbH
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Error for company: Zana Technologies GmbH
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Error for company: New Solutions GmbH
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

iotis GmbH_Hannover.pdf
iotis GmbH_Hannover


  df = df.append(df_temp, ignore_index=True)


Briink Intelligence GmbH_Berlin (Charlottenburg).pdf
Briink Intelligence GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Freshflow GmbH_Berlin (Charlottenburg).pdf
Freshflow GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Onstruc UG (haftungsbeschränkt)_Berlin (Charlottenburg).pdf
Onstruc UG (haftungsbeschränkt)_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Priv
cannot unpack non-iterable NoneType object
Cryptoticker.io GmbH_Berlin (Charlottenburg).pdf
Cryptoticker.io GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Voltfang GmbH_Aachen.pdf
Voltfang GmbH_Aachen


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Dance
cannot unpack non-iterable NoneType object
greenforce Beteiligungsgesellschaft mbH_Bad Oeynhausen.pdf
greenforce Beteiligungsgesellschaft mbH_Bad Oeynhausen


  df = df.append(df_temp, ignore_index=True)


Casculate GmbH_Kaiserslautern.pdf
Casculate GmbH_Kaiserslautern


  df = df.append(df_temp, ignore_index=True)


Error for company: Hive
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Urbyo GmbH_Berlin (Charlottenburg).pdf
Urbyo GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: resonanz energy
cannot unpack non-iterable NoneType object
Yababa GmbH_Berlin (Charlottenburg).pdf
Yababa GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Recunited GmbH_Berlin (Charlottenburg).pdf
Recunited GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


igetnow GmbH_Kiel.pdf
igetnow GmbH_Kiel


  df = df.append(df_temp, ignore_index=True)


Packmatic GmbH_Berlin (Charlottenburg).pdf
Packmatic GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Error for company: CURE
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Failed to retrieve PDF: Status code 500
Error for company: Composable Finance
cannot unpack non-iterable NoneType object
Beautinda GmbH_Essen.pdf
Beautinda GmbH_Essen


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Share
cannot unpack non-iterable NoneType object
Failed to retrieve PDF: Status code 500
Error for company: Vay
cannot unpack non-iterable NoneType object
Beamer & more GmbH_Stuttgart.pdf
Beamer & more GmbH_Stuttgart


  df = df.append(df_temp, ignore_index=True)


Error for company: Actio
Unable to get page count.
Syntax Error (48): Illegal character <1a> in hex string
Syntax Error (49): Illegal character <a5> in hex string
Syntax Error (50): Illegal character <aa> in hex string
Syntax Error (51): Illegal character <1c> in hex string
Syntax Error (53): Illegal character <40> in hex string
Syntax Error (54): Illegal character <ac> in hex string
Syntax Error (55): Illegal character <74> in hex string
Syntax Error (56): Illegal character <3a> in hex string
Syntax Error (57): Illegal character <d9> in hex string
Syntax Error (58): Illegal character <a2> in hex string
Syntax Error (59): Illegal character <1d> in hex string
Syntax Error (60): Illegal character <0e> in hex string
Syntax Error (61): Illegal character <06> in hex string
Syntax Error (63): Illegal character <1d> in hex string
Syntax Error (64): Illegal character <40> in hex string
Syntax Error (66): Illegal character <b5> in hex string
Syntax Error (68): Illegal character <4e> in hex stri

  df = df.append(df_temp, ignore_index=True)


Event Management Gesellschaft für Freizeit- & Veranstaltungsorganisation GmbH_Freiburg.pdf
Event Management Gesellschaft für Freizeit- & Veranstaltungsorganisation GmbH_Freiburg


  df = df.append(df_temp, ignore_index=True)


Payla Services GmbH_München.pdf
Payla Services GmbH_München


  df = df.append(df_temp, ignore_index=True)


Mondu GmbH_Berlin (Charlottenburg).pdf
Mondu GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


flowplace GmbH_Leipzig.pdf
flowplace GmbH_Leipzig


  df = df.append(df_temp, ignore_index=True)


Atheneum Partners GmbH_Berlin (Charlottenburg).pdf
Atheneum Partners GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


x-cardiac GmbH_Berlin (Charlottenburg).pdf
x-cardiac GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Error for company: BITA
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Failed to retrieve PDF: Status code 500
Error for company: Pectus Finance
cannot unpack non-iterable NoneType object
Error for company: Ryte
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

CYCLE Sport-Handels-GmbH_Stuttgart.pdf
CYCLE Sport-Handels-GmbH_Stuttgart


  df = df.append(df_temp, ignore_index=True)


Error for company: ARIVE
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Tupu Farming Solutions GmbH_Berlin (Charlottenburg).pdf
Tupu Farming Solutions GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Hivebuy GmbH_Berlin (Charlottenburg).pdf
Hivebuy GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


SQUAKE.earth GmbH_Berlin (Charlottenburg).pdf
SQUAKE.earth GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


GRID Invest GmbH_Hamburg.pdf
GRID Invest GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


eleva GmbH_Stuttgart.pdf
eleva GmbH_Stuttgart


  df = df.append(df_temp, ignore_index=True)


MAIT GmbH_Offenbach am Main.pdf
MAIT GmbH_Offenbach am Main


  df = df.append(df_temp, ignore_index=True)


Capabox GmbH_Berlin (Charlottenburg).pdf
Capabox GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Circula GmbH_Berlin (Charlottenburg).pdf
Circula GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Raus
cannot unpack non-iterable NoneType object
Finanzchef24 GmbH_München.pdf
Finanzchef24 GmbH_München


  df = df.append(df_temp, ignore_index=True)


Upminster GmbH_Frankfurt am Main.pdf
Upminster GmbH_Frankfurt am Main


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: AVA
cannot unpack non-iterable NoneType object
Error for company: Optalio
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Failed to retrieve PDF: Status code 500
Error for company: deineStudienfinanzierung
cannot unpack non-iterable NoneType object
touch smoke GmbH_Ulm.pdf
touch smoke GmbH_Ulm


  df = df.append(df_temp, ignore_index=True)


Elopage GmbH_Berlin (Charlottenburg).pdf
Elopage GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


FinanzRitter GmbH_Iserlohn.pdf
FinanzRitter GmbH_Iserlohn


  df = df.append(df_temp, ignore_index=True)


Failed to retrieve PDF: Status code 500
Error for company: Transolt
cannot unpack non-iterable NoneType object
Triviar Education GmbH_Hamburg.pdf
Triviar Education GmbH_Hamburg


  df = df.append(df_temp, ignore_index=True)


LODGEA GmbH_München.pdf
LODGEA GmbH_München


  df = df.append(df_temp, ignore_index=True)


Error for company: ryd
Unable to get page count.
Syntax Error (21): Illegal character ')'
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Friday Finance GmbH_Berlin (Charlottenburg).pdf
Friday Finance GmbH_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


Charge & More Entwicklungs- und Servicegesellschaft für Elektromobilität UG (haftungsbeschränkt)_Berlin (Charlottenburg).pdf
Charge & More Entwicklungs- und Servicegesellschaft für Elektromobilität UG (haftungsbeschränkt)_Berlin (Charlottenburg)


  df = df.append(df_temp, ignore_index=True)


bitsCrunch GmbH_München.pdf
bitsCrunch GmbH_München


  df = df.append(df_temp, ignore_index=True)


Error for company: Aware
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Helsing Germany GmbH_München.pdf
Helsing Germany GmbH_München


  df = df.append(df_temp, ignore_index=True)


Error for company: tink
Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

Failed to retrieve PDF: Status code 500
Error for company: re cap
cannot unpack non-iterable NoneType object
ZenML GmbH_München.pdf
ZenML GmbH_München


  df = df.append(df_temp, ignore_index=True)


In [219]:

# import PyPDF2

# # Open the PDF file in binary mode
# with open('errors.pdf', 'rb') as file:
#     # Create a PDF file reader object
#     pdf_reader = PyPDF2.PdfReader(file)

#     # Initialize an empty string to hold the text
#     text = ''

#     # Loop through each page in the PDF and extract the text
#     for page_num in range(len(pdf_reader.pages)):
#         page = pdf_reader.pages[page_num]
#         text += page.extract_text()

# # Now 'text' contains the text extracted from the PDF
# print(text)


---------------------------------------------------------------------------NameError                                 Traceback (most recent call last)/Users/annabellschafer/Desktop/GitHub/cr_exploration/notebooks/e2e_retrieve_file_get_shareholder_Niklas.ipynb Cell 11 line 2      1 # sum up the total participation in percent for all shareholders----> 2 total_participation_in_percent = sum([shareholder["total_participation_in_percent"] for shareholder in shareholders["shareholders"]])      3 total_participation_in_percentNameError: name 'shareholders' is not defined0.0032259999999999997Input Text tokens: 3287 tokensnameregistry_detailslocationnominal_value_per_sharetotal_nominal_valueparticipation_per_sharetotal_participation_in_percentdate_of_birthcompany0FYS Ventures UG (haftungsbeschr\u00e4nkt)AG M\u00fcnchen, HRB 268779M\u00fcnchen11110.00.0021 %2.34 %NaNTacto Technology GmbH_Mün1Tanso Technologies GmbHAG M\u00fcnchen, HRB 269123M\u00fcnchen16973.00.0021 %14.73 %NaNTacto Technology G

In [226]:
import re

# Sample text

# Regular expression pattern to match 'error:' followed by any 15 characters
pattern = r'Error.{0,70}'

# Use re.findall to find all substrings that match the pattern
error_substrings = re.findall(pattern, text)

# Print the list of error substrings
for substring in error_substrings:
    print(substring)


Error                                 Traceback (most recent call last)/Use
Error: name 'shareholders' is not defined0.0032259999999999997Input Text to
Error for company: SyncRealitycannot unpack non-iterable NoneType objectiot
Error for company: Yunex Trafficcannot unpack non-iterable NoneType objectB
Error for company: Bestgameprice.netcannot unpack non-iterable NoneType obj
Error: Couldn't find trailer dictionarySyntax Error: Couldn't find trailer 
Error: Couldn't read xref tableFreshflow GmbH_Berlin (Charlottenburg).pdf  
Error: Couldn't find trailer dictionarySyntax Error: Couldn't find trailer 
Error: Couldn't read xref tablewildplastic GmbH_Hamburg.pdf                
Error for company: Agrivero.aicannot unpack non-iterable NoneType objectErr
Error: Couldn't find trailer dictionarySyntax Error: Couldn't find trailer 
Error: Couldn't read xref tableFailed to retrieve PDF: Status code 500Error
Error for company: Privcannot unpack non-iterable NoneType objectrubarb Gmb
Error: Could