In [2]:
import pandas as pd
import google.generativeai as genai
from scripts.schemes import ExtractedContent
import os
import json

# Definieren einer Config

- Diese Config kann für genau einen Task verwendet werden. Sie benötigt genau die Keys, die darin definiert sind. Diese müssen exakt übereinstimmen, da sie im Code hardcoded sind.

In [3]:
# Configuration for the marriage-request extraction task.
# LLMWorker reads exactly these keys: "API_KEY", "PROMPT" (its default prompt key),
# "model_name" and "generation_config".
configs = {
  "API_KEY" : os.environ.get("GEMINI_API_KEY"),

  # Adjacent string literals are joined without any separator, so every fragment
  # that is followed by another one has to carry its own separating space.
  "PROMPT":"Your porpouse is it to extract a pre-defined part of the text i will provide you. The text i'll provide you is an ocr read "
  "historical newspaper. Your task is it to extract the marriage request from the text. A marriage request is defined a person, male or female, stating their interest in finding "
  "a partner for the porpuse of marriage either for themself or for relatives of them in a newspaper."
  " It is possible that there are more than one in the text "
  "in this case extract all of them. If you are done, reevaluate your work and check your results. Revavluate if the extracted text is really a marriage request and not some sort of other requst. Return the results in the format of a python list. "
  "The response should be just the text you extracted. It just should contain a list of the extracted texts, with no changes made to the text If there is no marriage request return an empty string.",

  "model_name":"gemini-2.5-flash",

  # Passed unchanged to genai.GenerativeModel(generation_config=...).
  "generation_config" : {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 65536,
    "response_mime_type": "application/json",
    "response_schema": ExtractedContent,
    }
}



# Definieren einer Klasse LLMWorker

- Diese Klasse versucht, das, was in `Scripts/information_extractor_lib` als `Information_Extractor` definiert wurde, zu vereinfachen: Anstatt "relativ umständlich" mit DataFrames zu arbeiten, wird hier direkt mit einem Dictionary gearbeitet, das iterativ ergänzt und am Ende zu einem DataFrame exploded wird.

- Mittels dieser Klasse können wir durch Vererbung den gesamten Workflow in einer zweiten Klasse automatisieren. D.h. wir müssen nicht mehr drei verschiedene Objekte initialisieren (das ist aber nicht zwingend notwendig - LLMWorker kann auch allein für einen einzigen Task verwendet werden). 

In [4]:
class LLMWorker:
    """Send a batch of texts, one request per text, to a google.generativeai model.

    The config dict must contain the keys read by the methods below:
    ``"API_KEY"``, ``"model_name"``, ``"generation_config"`` and the prompt stored
    under ``self.prompt_key`` (``'PROMPT'`` by default).

    Results are collected in ``self.output["results"]``, a dict that maps a
    unique key derived from the input id to the ``"content"`` value of the
    parsed model response (or to an error message string if a request failed).
    """

    def __init__(self, configs:dict, input_id:list[str], text:list[str]):
        self.config:dict = configs
        self.initialized_model:bool = False
        # A single id / text is accepted too and wrapped into a one-element list.
        self.input_id:list[str] = input_id if isinstance(input_id, list) else [input_id]
        self.text:list[str] = text if isinstance(text, list) else [text]
        self.model = None
        self.output = {"results": { } }
        # Name of the config key whose value is used as the prompt.
        self.prompt_key = 'PROMPT'
        
    def set_config(self) -> None:
        """Configure the genai client with the API key stored in the config dict."""
        API_KEY = self.config["API_KEY"]
        genai.configure(api_key=API_KEY)

    def load_model(self) -> None:
        """Create the model from ``model_name`` and ``generation_config`` of the config dict."""
        self.model = genai.GenerativeModel(
            model_name=self.config["model_name"],
            generation_config=self.config["generation_config"])
        self.initialized_model = True

    def create_model_input(self,page_id:str, input_text:str) -> str:
        """Build the prompt string that is sent to the model API.

        Parameters:
        page_id - id of the input
        input_text - text that is used as input for the model

        Returns:
        string that combines the configured prompt and the input text
        """
        # The input is appended as the string form of a one-entry dict {page_id: input_text}.
        model_input = {page_id:input_text}
        input_combined = f"{self.config[self.prompt_key]}\n{model_input}"
        return input_combined
    
    
    def extract_single_page(self, page_id:str, input_text:str) -> dict:
        """Call the model API for one input and parse the JSON response.

        Parameters:
        page_id - id of the input
        input_text - text that is used as input for the model

        Returns:
        dict with a ``"content"`` key. On success this is the parsed model
        response; if building the input, calling the API or parsing fails, the
        error is printed and ``"content"`` holds the error message instead.
        """
        try:
            input_combined = self.create_model_input(page_id=page_id, input_text=input_text)
            
        except Exception as e:
            print(f"Error while generating the model input: {e}")
            return {"content":f"Error while generating the model input: {e}"}
        
        try:
            response = self.model.generate_content([input_combined])
        except Exception as e:
            print(f"Error while receiving the response: {e} ")
            return {"content":f"Error while receiving the response: {e} "}
        
        try:
            res_parsed = json.loads(str(response.text))
            # extract_content reads res_parsed["content"], so any other JSON shape
            # is reported as a parsing error here instead of crashing the batch later.
            if not isinstance(res_parsed, dict) or "content" not in res_parsed:
                raise ValueError("response JSON has no 'content' field")
            return res_parsed
        except Exception as e: 
            print(f"Error while parsing the response - Exeption {e}")
            return {"content":f"Error while parsing the response - Exeption {e}"}
        
    def extract_content(self):
        """Process all inputs and return ``self.output``.

        The model is configured and loaded on the first call. Each id is printed
        before its request. The first result for an id is stored under ``str(id)``;
        further results for the same id are stored under ``"<id>anz_<n>"`` with
        ``n`` counted per id, so no earlier entry is overwritten.

        Raises:
        ValueError - if ``input_id`` and ``text`` have different lengths
        """
        # zip() would silently drop the surplus entries of the longer list.
        if len(self.input_id) != len(self.text):
            raise ValueError(
                f"input_id and text must have the same length, "
                f"got {len(self.input_id)} ids and {len(self.text)} texts"
            )
        if self.initialized_model is False:
            self.set_config()
            self.load_model()
        results = self.output["results"]
        # Next suffix index per id; kept per id so that repeated ids which are not
        # adjacent in the input still receive distinct keys.
        next_suffix = {}
        for p_, t_ in zip(self.input_id, self.text):
            print(p_)
            result = self.extract_single_page(p_, t_)
            base_key = str(p_)
            key = base_key
            if base_key in results:
                idx = next_suffix.get(base_key, 0)
                key = f"{base_key}anz_{idx}"
                # Skip suffixes that are already taken, e.g. by an id that
                # literally ends in "anz_<n>" from an earlier run.
                while key in results:
                    idx += 1
                    key = f"{base_key}anz_{idx}"
                next_suffix[base_key] = idx + 1
            results[key] = result["content"]
            
        return self.output
    

def output_to_df(output:dict):
    """Turn a worker output dict into a long DataFrame.

    Parameters:
    output - dict with a "results" entry that maps ids to the extracted content

    Returns:
    DataFrame with the columns "ids" and "content"; list-valued content is
    exploded into one row per list element and the index is renumbered.
    """
    results = output["results"]
    frame = pd.DataFrame(
        data={
            "ids": [key for key in results],
            "content": [results[key] for key in results],
        }
    )
    exploded = frame.explode('content')
    return exploded.reset_index(drop=True)
 

### Laden der Test Daten

In [5]:
# Load the pickled test data and take the first five page ids and page texts as lists.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle("data/test_data.pkl")

p = df["page_id"][0:5].tolist()
t = df["plainpagefulltext"][0:5].tolist()

### Exemplarische Anwendung des LLM-Worker für die Anzeigen Extraktion

In [6]:
# Run the extraction prompt from `configs` on the loaded pages; each id is printed as it is processed.
extractor = LLMWorker(
    configs=configs,
    input_id=p,
    text=t,
)
result = extractor.extract_content()

222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_DDB_FULLTEXT
2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2TDMUDSOIG6M4UQSDA6QJ4D77NNJFUVG-ALTO6314075_DDB_FULLTEXT


### Transformieren des Output in einen lesbareren Dataframe

In [7]:
# One row per extracted text; ids repeat when a page yielded several texts.
extracted_content = output_to_df(output=result)
extracted_content

Unnamed: 0,ids,content
0,222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_D...,"Reelles Heiratsgesuch! Ein junger Mann, 30 Jah..."
1,2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_D...,Heiratsgesuch . In gusen Vertätrissen stehende...
2,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,"Damen jeden Standes und Alters , mit und ohne ..."
3,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,Heiratsgesuch ! Junger Mann ( Handwirker ) 23 ...
4,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Manusakturift , 27 Jahre , 1,68 groß , ansehn ..."
5,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Akademiker 28½ Jahre all , natur = und sportfr..."
6,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Aleinstehender Hofbesitzer , 125 Morgen , 30 J..."
7,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Heiratsgesuch . Lediger , kath . Beam tenanwär..."
8,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Sollder Mann , 26 . , kath . , 2000 4l bar , s..."
9,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,Heirat Für 22jähr . kathol . Gutsbesitzerstoch...


### Exemplarische Anwendung für die OCR-Error-Korrektur

In [12]:
# Configuration for the OCR-correction task; same key layout as the extraction config.
fw_config = {

    "API_KEY" : os.environ.get("GEMINI_API_KEY"),

    "model_name":"gemini-2.5-flash",
  

    # Adjacent string literals are joined without any separator, so each fragment
    # has to bring its own separating space.
    "PROMPT": "You're an ocr corrector. You are an expert in correcting ocr errors in the german language."
    " The text you will receive can contain ocr-errors. There may be special characters that dont belong in the sentence, for example #."
    " Remove those characters. Please correct those and return the corrected string and the page_id in form of the scheme you received.",
    
    # Passed unchanged to genai.GenerativeModel(generation_config=...).
    "generation_config" : {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 65536,
        "response_mime_type": "application/json",
        "response_schema": ExtractedContent,
    }
}

In [13]:
# Feed the extracted texts, with their repeated page ids, into the OCR-correction prompt.
# NOTE(review): rows whose content is missing are sent to the model as well — confirm this is intended.
extractor = LLMWorker(
    configs=fw_config,
    input_id=extracted_content.ids.tolist(),
    text=extracted_content.content.tolist(),
)
result_corrected = extractor.extract_content()

222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_DDB_FULLTEXT
2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2TDMUDSOIG6M4UQSDA6QJ4D77NNJFUVG-ALTO6314075_DDB_FULLTEXT


In [14]:
# Flatten the corrected output; duplicate ids were stored under "<id>anz_<n>" keys by the worker.
result_corrected_df = output_to_df(output=result_corrected)
result_corrected_df

Unnamed: 0,ids,content
0,222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_D...,"Reelles Heiratsgesuch! Ein junger Mann, 30 Jah..."
1,2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_D...,Heiratsgesuch. In guten Verhältnissen stehende...
2,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,"Damen jeden Standes und Alters, mit und ohne V..."
3,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,"Heiratsgesuch! Junger Mann (Handwerker) 23 J.,..."
4,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Manufakturist, 27 Jahre, 1,68 groß, ansehnlich..."
5,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Akademiker 28½ Jahre alt, natur- und sportfreu..."
6,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Alleinstehender Hofbesitzer, 125 Morgen, 30 Ja..."
7,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Heiratsgesuch . Lediger , kath . Beamtenanwärt..."
8,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Soll der Mann, 26, kath., 2000 M bar, spät. Ve..."
9,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,Heirat Für 22jähr . kathol . Gutsbesitzerstoch...


### Exemplarische Anwendung für die Textvervollständigung

In [15]:
# Configuration for the abbreviation-expansion task; same key layout as the other configs.
fw_config2 = {
    "API_KEY": os.environ.get("GEMINI_API_KEY"),

    # The two fragments are joined by implicit string concatenation.
    "PROMPT": (
        "You're an text corrector. You are an expert in correcting texts. You will receive a text in the german language that contain abreviated words, "
        "write those abriviated words out and return the corrected sentence in the schemes you had been given. If there are no abriviated words return the sentence as it is."
    ),

    "model_name": "gemini-2.5-flash",

    # Passed unchanged to genai.GenerativeModel(generation_config=...).
    "generation_config": dict(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=65536,
        response_mime_type="application/json",
        response_schema=ExtractedContent,
    ),
}

In [16]:
# Run the abbreviation-expansion prompt on the OCR-corrected texts.
# The worker only reads its config when extract_content() is called, so the
# completion config is passed to the constructor directly.
extractor = LLMWorker(
    configs=fw_config2,
    input_id=result_corrected_df.ids.tolist(),
    text=result_corrected_df.content.tolist(),
)
result_completed = extractor.extract_content()

222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_DDB_FULLTEXT
2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXTanz_0
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXTanz_0
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXTanz_1
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXTanz_2
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXTanz_3
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXTanz_4
2TDMUDSOIG6M4UQSDA6QJ4D77NNJFUVG-ALTO6314075_DDB_FULLTEXT


In [17]:
# Show the completed texts as a DataFrame (the value is only displayed, not stored).
output_to_df(output=result_completed)

Unnamed: 0,ids,content
0,222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_D...,"Reelles Heiratsgesuch! Ein junger Mann, 30 Jah..."
1,2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_D...,Heiratsgesuch. In guten Verhältnissen stehende...
2,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,"Damen jeden Standes und Alters, mit und ohne V..."
3,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,Heiratsgesuch! Junger Mann (Handwerker) 23 Jah...
4,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Manufakturist, 27 Jahre, 1,68 groß, ansehnlich..."
5,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Akademiker 28½ Jahre alt, natur- und sportfreu..."
6,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Alleinstehender Hofbesitzer, 125 Morgen, 30 Ja..."
7,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Heiratsgesuch. Lediger, katholischer Beamtenan..."
8,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Soll der Mann, 26, katholisch, 2000 Mark bar, ..."
9,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,Heirat Für 22-jährige katholische Gutsbesitzer...


### Versuch, alle Arbeitsschritte in einer Klasse zu automatisieren

- Risiko: Hier können sich Fehler einschleichen, die sich bis zum Ende durchziehen, d.h. es ist noch kein Dropout implementiert, wenn wir einen Error bekommen. 

- Das Ganze ist so implementiert, dass wir Folgendes als Input übergeben:
    - `configs:dict` - beinhaltet alle Prompts, die ausgeführt werden sollen, den Modellnamen und die Modellspezifikationen
    - `input_id:list[str]` - Liste der Input-Ids
    - `text:list[str]` - Liste der Texte, die verarbeitet werden sollen 
    - `prompt_list_ordered:list[str]` - geordnete Liste der Prompt-Keys aus `configs`, die nacheinander ausgeführt werden sollen.


In [None]:
# Reload the test data so the workflow starts from the raw pages again.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle("data/test_data.pkl")

p = df["page_id"][0:5].tolist()
t = df["plainpagefulltext"][0:5].tolist()

In [None]:
class FullWorkflow(LLMWorker):
    """Chain several prompts: the output of one prompt is the input of the next.

    ``prompt_list_ordered`` holds the config keys of the prompts in the order in
    which they are executed.
    """

    def __init__(self, configs, input_id, text, prompt_list_ordered:list):
        super().__init__(configs, input_id, text)
        self.prompt_list = prompt_list_ordered
    

    def reset_output(self):
        """Replace the collected results with an empty result dict."""
        self.output = {"results": { } }
        

    def output_to_df(self, output:dict):
        """Turn an output dict into a DataFrame with the columns "ids" and "content".

        List-valued content is exploded into one row per element and the index
        is renumbered.
        """
        results = output["results"]
        df = pd.DataFrame(data={"ids": list(results.keys()), "content": list(results.values())})
        df = df.explode('content').reset_index(drop=True)
        return df



    def run_workflow(self):
        """Run every prompt of ``self.prompt_list`` in order and return the last result.

        After each step ``self.input_id`` and ``self.text`` are replaced by the ids
        and contents of that step's DataFrame and the collected output is reset.

        Returns:
        DataFrame produced by the last prompt

        Raises:
        ValueError - if the prompt list is empty
        KeyError - if a prompt key is not present in the config
        """
        # Without this guard an empty list would fail at the return statement
        # because no result was ever assigned.
        if not self.prompt_list:
            raise ValueError("prompt_list_ordered must contain at least one prompt key")

        # An unknown key would otherwise turn every row into an error message
        # that is then passed on to the following prompts.
        missing = [key for key in self.prompt_list if key not in self.config]
        if missing:
            raise KeyError(f"Prompt key(s) not found in config: {missing}")

        result = None
        for prompt in self.prompt_list:
            
            self.prompt_key = prompt
            
            result = self.extract_content()
            result = self.output_to_df(result)
            
            # The rows of this step become the inputs of the next step.
            # NOTE(review): error messages and missing content are passed on as text too.
            self.input_id = result.ids.tolist()
            self.text = result.content.tolist()

            self.reset_output()


        return result
        

In [None]:
# Combined configuration for FullWorkflow: one entry per prompt key plus the shared model settings.
# Adjacent string literals are joined without any separator, so every fragment
# that is followed by another one has to carry its own separating space.
test_config = {
    "API_KEY" : os.environ.get("GEMINI_API_KEY"),

    "PROMPT":"Your porpouse is it to extract a pre-defined part of the text i will provide you. The text i'll provide you is an ocr read "
    "historical newspaper. Your task is it to extract the marriage request from the text. A marriage request is defined a person, male or female, stating their interest in finding "
    "a partner for the porpuse of marriage either for themself or for relatives of them in a newspaper."
    " It is possible that there are more than one in the text "
    "in this case extract all of them. If you are done, reevaluate your work and check your results. Revavluate if the extracted text is really a marriage request and not some sort of other requst. Return the results in the format of a python list. "
    "The response should be just the text you extracted. It just should contain a list of the extracted texts, with no changes made to the text If there is no marriage request return an empty string.",
    
    
    "CORRECTION_PROMPT": "You're an ocr corrector. You are an expert in correcting ocr errors in the german language."
    " The text you will receive can contain ocr-errors. There may be special characters that dont belong in the sentence, for example #."
    " Remove those characters. Please correct those and return the corrected string and the page_id in form of the scheme you received.",
    
    "COMPLETION_PROMPT":"You're an text corrector. You are an expert in correcting texts. You will receive a text in the german language that contain abreviated words, "
    "write those abriviated words out and return the corrected sentence in the schemes you had been given. If there are no abriviated words return the sentence as it is.",
    
    
    "model_name":"gemini-2.5-flash",
    
    # Passed unchanged to genai.GenerativeModel(generation_config=...).
    "generation_config" : {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 65536,
        "response_mime_type": "application/json",
        "response_schema": ExtractedContent,
    }
}

In [None]:
# Run extraction, OCR correction and abbreviation expansion in one go.
wf = FullWorkflow(
    configs=test_config,
    input_id=p,
    text=t,
    prompt_list_ordered=["PROMPT", "CORRECTION_PROMPT", "COMPLETION_PROMPT"],
)
r = wf.run_workflow()

222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_DDB_FULLTEXT
2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2TDMUDSOIG6M4UQSDA6QJ4D77NNJFUVG-ALTO6314075_DDB_FULLTEXT
222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_DDB_FULLTEXT
2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_DDB_FULLTEXT
Error while parsing the response - Exeption Unterminated string starting at: line 1 column 14 (char 13)
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_DDB_FULLTEXT
2TDMUDSOIG6M4UQSDA6QJ4D77N

In [None]:
# Display the DataFrame returned by the last prompt of the workflow.
r

Unnamed: 0,ids,content
0,222L6GN7RLURYX36SG5DC7OGNEQUHPEQ-ALTO6294738_D...,"Reelles Heiratsgesuch ! Ein junger Mann , 30 J..."
1,2I6N3O6FQVREJAANZILPB7KV3FPJ3AJC-ALTO6282283_D...,
2,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,"Damen jeden Standes und Alters , mit und ohne ..."
3,2SD5IW4WF3O7OXZOSVUN3UZSYNUSDZRU-ALTO6311708_D...,Heiratsgesuch ! Junger Mann ( Handwirker ) 23 ...
4,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Manusakturift , 27 Jahre , 1,68 groß , ansehn ..."
5,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Akademiker 28 Jahre all , natur = und sportfr..."
6,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Aleinstehender Hofbesitzer , 125 Morgen , 30 J..."
7,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Heiratsgesuch . Lediger , kath . Beam tenanwär..."
8,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,"Sollder Mann , 26 . , kath . , 2000 4l bar , s..."
9,2SVA6DKFXF6DLHZQDX4K7FL3CZ2SBJW6-ALTO6338444_D...,Heirat Für 22jähr . kathol . Gutsbesitzerstoch...
