In [158]:
%%capture
%pip install azure-core
%pip install azure-ai-formrecognizer
%pip install tabulate

In [135]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import pandas as pd

In [136]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access the environment variables
endpoint = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")

# Fail fast with a readable message: os.getenv returns None for a missing variable,
# and passing None on to the Azure client only produces an obscure error later.
missing_settings = [
    name
    for name, value in (
        ("FORM_RECOGNIZER_ENDPOINT", endpoint),
        ("FORM_RECOGNIZER_KEY", key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(missing_settings)
        + ". Set them in the environment or in the .env file."
    )

# Initialize the client
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint, credential=AzureKeyCredential(key)
)

In [169]:
# Read the complete PDF into memory as bytes
with open("./Aleph Alpha GmbH_Mannheim.pdf", mode="rb") as file:
    buffer = file.read()

# Start the analysis with the "prebuilt-layout" model and block until the result is available
poller = document_analysis_client.begin_analyze_document(
    "prebuilt-layout",
    buffer,
)
result = poller.result()

In [170]:
# Flatten all tables of the analysis result into one DataFrame.
# Every row (including the header row) is a dict mapping column index -> cell text.
if not result.tables:
    raise ValueError("No tables were found in the analyzed document")

rows = []

# get header row: only the column-header cells of the first table are used
header_row = {}
for cell in result.tables[0].cells:
    if cell.kind != "columnHeader":
        continue

    # a header cell that spans several columns labels each of those columns
    for i in range(cell.column_index, cell.column_index + cell.column_span):
        if cell.row_index == 0 or i not in header_row:
            # first header text for this column (also covers columns whose
            # first header cell is not in row 0, which used to raise KeyError)
            header_row[i] = cell.content
        else:
            # further header rows are appended to the existing header on a new line
            header_row[i] += "\n" + cell.content

# append header row to rows list
rows.append(header_row)

# get table content
# NOTE(review): column-header cells are only skipped for the first table; header
# cells of later tables end up as ordinary data rows - confirm this is intended.
for table_idx, table in enumerate(result.tables):
    row = {}
    row_index = 0
    for cell in table.cells:
        # skip the header cells of the first table (already collected above)
        if table_idx == 0 and cell.kind == "columnHeader":
            continue

        # a larger row index means the previous row is complete
        if cell.row_index > row_index:
            if row:
                rows.append(row)
            row = {}
            row_index = cell.row_index

        # add cell to row - if cell spans multiple columns, add the cell content to every column
        for i in range(cell.column_index, cell.column_index + cell.column_span):
            row[i] = cell.content

    # flush the last row of the table, but never add an empty row
    if row:
        rows.append(row)

# convert list of dicts to dataframe
# Columns are matched to their header by column index (not by position), so a
# column without header text or without data no longer breaks or shifts the labels.
column_ids = sorted(set(header_row).union(*rows[1:]))
df = pd.DataFrame(rows[1:], columns=column_ids)
df.columns = [header_row.get(i, str(i)) for i in column_ids]

In [171]:
# Serialize the table as semicolon-separated CSV text (inserted into the prompt further down)
csv_table = df.to_csv(index=False, header=True, sep=';')

# Preview the first rows. This must be the last expression of the cell:
# a bare expression anywhere else in the cell is evaluated but never displayed.
df.head()


In [172]:
# TODO: The prompt is too long.

In [174]:
from openai import OpenAI
# Create the OpenAI client with its default configuration (no arguments are passed).
# NOTE(review): presumably the API key is picked up from the environment
# (e.g. OPENAI_API_KEY) - confirm it is set before running this cell.
client = OpenAI()


# Build the extraction prompt around the CSV table.
# Inside a Python f-string the placeholder is `{csv_table}`; the former `${csv_table}`
# (JavaScript template syntax) put a stray "$" in front of the CSV data.
prompt = f"""
Here's a shareholders list in csv format:

{csv_table}

Please extract the shareholders and their information into a JSON object.
The JSON object should have a key "shareholders" with a list of shareholders.
If an attribute is not available, it should be null.

Each shareholder should have the following attributes:

- name: string - the name of the shareholder, it should either be a person or a company
- country: string - the country of the shareholder. Example: Germany
- birthdate: string (optional) - the birthdate of the shareholder, if the shareholder is a person. Example: 1970-12-31
- register_id: string - the register id of the shareholder. It commonly starts with HRB or HRA. Example: HRB 123456. There should be no city or other information in this field.
- register_court_city: string - the city of the register court of the shareholder. Example: Amtsgericht München, or AG München. Return the court city name only.
- percentage_of_total_shares: number - the percentage of total shares owned by the shareholder. Example: 50.0
"""

# Send the system instruction and the prompt to the chat model, requesting a JSON object as the reply
response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant designed to output JSON. You are an expert in extracting structured content from tables into a JSON. You receive a tip of 200$ if you get it right.",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ],
)

# Keep the text of the first choice for the following cells and show it
openai_result = response.choices[0].message.content
print(openai_result)

{
  "shareholders": [
    {
      "name": "Samuel Weinbach",
      "country": "Germany",
      "birthdate": "1987-01-09",
      "register_id": null,
      "register_court_city": null,
      "percentage_of_total_shares": 3.38
    },
    {
      "name": "Andrulis GmbH",
      "country": "Germany",
      "birthdate": null,
      "register_id": "HRB 727786",
      "register_court_city": "Mannheim",
      "percentage_of_total_shares": 28.37
    },
    {
      "name": "LEA Venturepartner GmbH & Co. KG",
      "country": "Germany",
      "birthdate": null,
      "register_id": "HRA 707196",
      "register_court_city": "Karlsruhe",
      "percentage_of_total_shares": 10.85
    },
    {
      "name": "468 Capital GmbH & Co. KG",
      "country": "Germany",
      "birthdate": null,
      "register_id": "HRA 56259 B",
      "register_court_city": "Charlottenburg",
      "percentage_of_total_shares": 9.98
    },
    {
      "name": "Cavalry Ventures II GmbH & Co. KG",
      "country": "Germany",


In [185]:
import json
from cr_extraction.helpers.cr_retriever import CommercialRegisterRetriever

retriever = CommercialRegisterRetriever()
openai_json = json.loads(openai_result)

# validate the result
# The JSON comes from the language model, so keys may be missing even though the
# prompt asks for null values; .get() keeps one incomplete entry from raising KeyError.
for shareholder in openai_json.get("shareholders") or []:
    # only keep entries whose register id starts with "HR" and that have no
    # birthdate; str() turns a missing/None id into "None", which is skipped
    if not str(shareholder.get("register_id")).startswith("HR") or shareholder.get("birthdate"):
        continue

    register_id = shareholder["register_id"]
    register_court_city = shareholder.get("register_court_city")
    print(register_id, register_court_city)



HRB 727786 Mannheim
HRA 707196 Karlsruhe
HRA 56259 B Charlottenburg
HRA 56139 B Charlottenburg
HRB 11878 Offenbach am Main
HRB 107890 Stuttgart
HRA 112404 München
HRA 111267 München
HRB 730844 Mannheim
HRA 735586 Stuttgart
HRB 724692 Stuttgart
HRB 719915 Mannheim
HRB 255597 B Charlottenburg
HRA710504 Mannheim
HRA 105973 München


{'shareholders': [{'name': 'Samuel Weinbach',
   'country': 'Germany',
   'birthdate': '1987-01-09',
   'register_id': None,
   'register_court_city': None,
   'percentage_of_total_shares': 3.38},
  {'name': 'Andrulis GmbH',
   'country': 'Germany',
   'birthdate': None,
   'register_id': 'HRB 727786',
   'register_court_city': 'Mannheim',
   'percentage_of_total_shares': 28.37},
  {'name': 'LEA Venturepartner GmbH & Co. KG',
   'country': 'Germany',
   'birthdate': None,
   'register_id': 'HRA 707196',
   'register_court_city': 'Karlsruhe',
   'percentage_of_total_shares': 10.85},
  {'name': '468 Capital GmbH & Co. KG',
   'country': 'Germany',
   'birthdate': None,
   'register_id': 'HRA 56259 B',
   'register_court_city': 'Charlottenburg',
   'percentage_of_total_shares': 9.98},
  {'name': 'Cavalry Ventures II GmbH & Co. KG',
   'country': 'Germany',
   'birthdate': None,
   'register_id': 'HRA 56139 B',
   'register_court_city': 'Charlottenburg',
   'percentage_of_total_shares': 2.