## verzeichnisse_extract_data.ipynb

In this notebook, we will extract the text data enclosed in the Verzeichnisse. The data has been pre-OCR'ed, with the layout-ordered text present in the pdf. We will use an algorithm to extract this layout-specific text to order the data in rows and column. 

In [1]:
import re
import os
import fitz
import glob
import json
import ollama
import pdfplumber
from fpdf import FPDF

# OpenAI
from openai import OpenAI
api_key = os.getenv('OPENAI_API_KEY')

headers = {
  "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
}

client = OpenAI()

ollama.pull("llama3.1")
ollama.pull("philsaysss/german-language")

# Function to crop the .pdf
def crop_pdf(pdf_path, output_path, x1=0, x2=180):
    doc = fitz.open(pdf_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        rect = page.mediabox
        crop_rect = fitz.Rect(x1, rect.y0, x2, rect.y1)
        page.set_cropbox(crop_rect)
        
    doc.save(output_path)
    doc.close()    
    
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        all_text = ""
        
        # Loop through all the pages in the PDF
        for page in pdf.pages:
            # Extract text from each page
            text = page.extract_text()
            if text:
                all_text += text
        
    return all_text

def split_text_by_pattern(text):
    # Regex pattern to match a digit followed by a space
    pattern = r'\n\d+\s'
    
    # Split the text using the regex pattern
    split_list = re.split(pattern, text)
    
    return split_list

def crop_pdf_and_extract_text(pdf_path, output_path, x1=0, y1=0, x2=180, y2=722):
    # Open the PDF file
    with pdfplumber.open(pdf_path) as pdf:
        # Initialize the text container
        extracted_text = ""
        
        # Create a new PDF with fpdf to store cropped text
        pdf_writer = FPDF()
        
        # Iterate over all the pages in the PDF
        for page_num in range(len(pdf.pages)):
            page = pdf.pages[page_num]
            
            # Define the cropping box (x1, y1, x2, y2)
            crop_box = (x1, y1, x2, y2)
            
            # Crop the page to the defined box
            cropped_page = page.within_bbox(crop_box)
            
            # Extract the text from the cropped area
            cropped_text = cropped_page.extract_text()
            
            # Add the cropped text to the total extracted text
            if cropped_text:
                cropped_text = cropped_text.encode('latin-1', 'replace').decode('latin-1')
                extracted_text += cropped_text + "\n"
                
                # Add a new page to the new PDF document
                pdf_writer.add_page()
                pdf_writer.set_xy(10, 10)
                pdf_writer.set_font("Arial", size=12)
                pdf_writer.multi_cell(0, 10, cropped_text)
        
        # Save the new PDF with cropped text
        pdf_writer.output(output_path)

        # Optionally, return the extracted text
        return extracted_text

def request_city_llm(extracted_text):
    responses = []
    for text in extracted_text:
        prompt = f"""
        Welche Stadt wird in diesem Text genannt - der Städtename kann durch einige Füllwörter getrennt werden. 
        Gib mir nur den Städtename, nichts anderes: {text}
        """
        response = ollama.chat(model='philsaysss/german-language', messages=[
            {
                'role': 'user',
                'content': prompt,
                },
            ],
                               options={'temperature':0})
        
        output = response['message']['content']
        responses.append(output)
    return responses


def request_city_gpt(extracted_text):
    responses = []
    for text in extracted_text:
        prompt = f"""
        Welche Stadt wird in diesem Text genannt? Der Städtename kann durch einige Füllwörter getrennt sein. Gib mir nur den Städtenamen, nichts anderes: {text}
        """
        message = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }
        ]
        response = client.chat.completions.create(
        model="gpt-4o",
        messages=message,
        max_tokens=2500
        )
        responses.append(response)
        
    return responses
        

## Cropping Pipeline

- We crop the file, then extract the text from the cropped file. Then we use a LLM pipeline to discover the city mentioned. 

In [7]:
# Example usage:
pdf_file_path = '../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1853_A.pdf'
output_path = '../../data/patent_data/raw_patent_data/Verzeichnisse/cropped_output.pdf'
extracted_text = crop_pdf_and_extract_text(pdf_file_path, output_path, x1=0, y1=0, x2=180, y2=722)

print(extracted_text)
splitted_text = split_text_by_pattern(extracted_text)
print(splitted_text)

Fortlaufende
Vor- und Zuname, Charakter
Bahl
und
Wohnort des Privilegirten
1 Adensamer Joseph, lan Ver
desbefugter Bandfabrikant Be
in Wien ( Schottenfeld) . in
Au
he
2 Adler Carl , Fabrikant Ver
chemisch. Producte in Wien te
( Gumpendorf Nr. 190) . un
bin
ku
Fe
Er
miadar Ar
ste
3 Alcan Michael , Civil -I n Erfi
genieur, und Peter Hypolit ein
Limet in Paris ( Bevoll Se
mächtigter Jacob Franz Ab
Heinrich Hemberger, Fri
Verwaltungs- Director in bar
Wien) . Tig
4 Allan Thomas , Ingenieur Erfi
in Edinburg , durch sei tris
nen Bevollmächtigten Georg
Märkl, in Wien (J oseph
stadt Nr. 65 ) .ad
5 Altlechner Laurenz , Bür Erfi
ger in Wien (L aimgrube un
Nr. 182) .
miadar
6 Amstötter Mathias , und Erfi
Johann Schredl , in Wie vo
ner Neustadt Nr . 172. chi
7 Anthon Ernst Friedrich , Erfi
technisch . Chemiker in Prag, zur
Nr. 70 - IV . höc
mistop Ve
sec
int zu
KaC
8 Armengaud Jacob Eugen V
sen. , Civil Ingenieur in sch
Paris (R ue St. Sebastien zus
Nr. 45) , durch Jacob Franz Er
Heinrich Hemberger

In [4]:
# For loop implementation
files = glob.glob(os.path.join("../../data/patent_data/raw_patent_data/Verzeichnisse/", "Verzeichniss*"))
verzeichnisse_texts = []
for file in files:
    print(file)
    output_path = '../../data/patent_data/raw_patent_data/Verzeichnisse/cropped_output.pdf'
    extracted_text = crop_pdf_and_extract_text(file, output_path, x1=0, y1=0, x2=180, y2=722)
    splitted_text = split_text_by_pattern(extracted_text)
    verzeichnisse_texts.append(splitted_text)
    

../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1855_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1854_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1859_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1856_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1858_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1863_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1864_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1853_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1860_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1857_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1867_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1865_A.pdf
../../data/patent_data/raw_patent_data/Verzeichnisse/Verzeichniss_1862_A.pdf

In [12]:
# Flatten the resulting lists
verzeichnisse_texts_fl = [item for sub_list in verzeichnisse_texts for item in sub_list]

In [23]:
# Create a .json file to send them to GPT Batch API
list_of_dictionaries = []

for num, text in enumerate(verzeichnisse_texts_fl):
    # Set up the dictionary for each text
    batch_dictionary = {}
    batch_dictionary['custom_id'] = f"request-{num}"
    batch_dictionary['method'] = "POST"
    batch_dictionary['url'] = "/v1/chat/completions"
    # Create the prompt we want GPT to answer
    prompt = f"Welche Stadt wird in diesem Text genannt? Der Städtename kann durch einige Füllwörter getrennt sein. Gib mir nur den Städtenamen, nichts anderes. Falls es keinen Namen gibt, gib mir 'NA' zurück: {text}"
    batch_dictionary['body'] = {"model": "gpt-4o", "messages": [{"role": "user", "content": prompt}], "max_tokens": 100}
    list_of_dictionaries.append(batch_dictionary)

In [38]:
# With this list of dictionaries, we create a jsonlite file and then a GPT Batch Object:
with open("../../data/batch_verzeichnisse.jsonl", "w") as outfile: 
    for i in list_of_dictionaries:
        json.dump(i, outfile)
        outfile.write("\n")

In [None]:
# Create a batch object:
batch_input_file = client.files.create(
  file=open("../../data/batch_verzeichnisse.jsonl", "rb"),
  purpose="batch"
)

In [40]:
# Send it off
batch_input_file_id = batch_input_file.id

client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "verzeichnisse 24h"
    }
)

Batch(id='batch_672e0291bfe481908842cfbd1e2d95d6', completion_window='24h', created_at=1731068561, endpoint='/v1/chat/completions', input_file_id='file-Yhyl6wvim8OiEZKcBNabXPt9', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1731154961, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'verzeichnisse 24h'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [2]:
client.batches.retrieve("batch_672e0291bfe481908842cfbd1e2d95d6")

Batch(id='batch_672e0291bfe481908842cfbd1e2d95d6', completion_window='24h', created_at=1731068561, endpoint='/v1/chat/completions', input_file_id='file-Yhyl6wvim8OiEZKcBNabXPt9', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1731070878, error_file_id=None, errors=None, expired_at=None, expires_at=1731154961, failed_at=None, finalizing_at=1731070035, in_progress_at=1731068564, metadata={'description': 'verzeichnisse 24h'}, output_file_id='file-kDkXMelUuXPXx7NtbWzQxu3S', request_counts=BatchRequestCounts(completed=10255, failed=0, total=10255))

In [3]:
file_response = client.files.content("file-kDkXMelUuXPXx7NtbWzQxu3S")
print(file_response.text)

{"id": "batch_req_672e08539d3081908145cca2a01039c3", "custom_id": "request-0", "response": {"status_code": 200, "request_id": "b8c89e6f48542376c2a5dc99c68b66bc", "body": {"id": "chatcmpl-ARIIhfLyIytZT0PmLo3Zp8N6VTKLX", "object": "chat.completion", "created": 1731068567, "model": "gpt-4o-2024-08-06", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Paris", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 111, "completion_tokens": 1, "total_tokens": 112, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0}}, "system_fingerprint": "fp_45cf54deae"}}, "error": null}
{"id": "batch_req_672e0853b7988190933a5c86a940fd80", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "20aca727f4905a77198f08e323131e99", "body": {"id": "chatcmpl-ARIIhhri6rsG8XpOSFDEvgtVW8Aa9", 

In [22]:
# Save this to .jsonl
# Split the string into separate JSON objects by newline
json_strings = file_response.text.splitlines()

# Open the output file in write mode
with open('../../data/batch_verzeichnisse_output.jsonl', 'w') as file:
    for json_str in json_strings:
        # Parse each JSON string into a dictionary
        json_obj = json.loads(json_str)
        # Write each dictionary as a JSON line
        file.write(json.dumps(json_obj) + '\n')
        
#with open("../../data/batch_verzeichnisse_output.jsonl", "w") as outfile: 
#    json.dump(file_response.text, outfile)
