In [None]:
import pandas as pd
import google.generativeai as genai
from scripts.schemes import ExtractedContent
import os
import json

In [2]:
# Settings consumed by LLMWorker: API credentials, the extraction prompt,
# the model name and the generation parameters passed to the model.
configs = {
  # Read from the environment so the key is never stored in the notebook.
  # os.environ.get returns None when GEMINI_API_KEY is not set.
  "API_KEY" : os.environ.get("GEMINI_API_KEY"),

  # Adjacent string literals are concatenated verbatim, so every fragment
  # that continues a sentence must end with (or start with) its own space;
  # otherwise words from neighbouring fragments are fused in the prompt.
  "PROMPT": (
    "Your porpouse is it to extract a pre-defined part of the text i will provide you. The text i'll provide you is an ocr read "
    "historical newspaper. Your task is it to extract the marriage request from the text. A marriage request is defined a person, male or female, stating their interest in finding "
    "a partner for the porpuse of marriage either for themself or for relatives of them in a newspaper."
    " It is possible that there are more than one in the text "
    "in this case extract all of them. If you are done, reevaluate your work and check your results. Revavluate if the extracted text is really a marriage request and not some sort of other requst. Return the results in the format of a python list. "
    "The response should be just the text you extracted. It just should contain a list of the extracted texts, with no changes made to the text If there is no marriage request return an empty string."
  ),

  "model_name":"gemini-2.5-flash",

  # Passed unchanged to genai.GenerativeModel(generation_config=...).
  "generation_config" : {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 65536,
    # The reply is requested as JSON; LLMWorker parses it with json.loads
    # and reads its "content" field.
    "response_mime_type": "application/json",
    # Schema imported from scripts.schemes (definition not visible here).
    "response_schema": ExtractedContent,
    }
}



In [4]:
class LLMWorker:
    """Send a batch of texts to a generative model and collect the extracted content per id.

    Parameters:
    configs - dict providing the keys "API_KEY", "PROMPT", "model_name" and "generation_config"
    input_id - a single id or a list of ids, one per text
    text - a single text or a list of texts, aligned with input_id

    Results accumulate in self.output["results"], keyed by str(id).
    """

    def __init__(self, configs:dict, input_id:list[str], text:list[str]):
        self.config:dict = configs
        self.initialized_model:bool = False
        # A single id / text is wrapped in a list so the rest of the class can always iterate.
        self.input_id:list[str] = input_id if isinstance(input_id, list) else [input_id]
        self.text:list[str] = text if isinstance(text, list) else [text]
        self.model = None
        self.output = {"results": { } }
        
    def set_config(self) -> None:
        """Configure the API client with the API key stored in the config dict."""
        API_KEY = self.config["API_KEY"]
        genai.configure(api_key=API_KEY)

    def load_model(self) -> None:
        """Create the model from the "model_name" and "generation_config" entries of the config dict."""
        self.model = genai.GenerativeModel(
            model_name=self.config["model_name"],
            generation_config=self.config["generation_config"])
        self.initialized_model = True

    def create_model_input(self,page_id:str, input_text:str) -> str:
        """Build the prompt string that is sent to the model API.

        Parameters:
        page_id - id of the input
        input_text - text used as input for the model

        Returns:
        the configured prompt, a newline, and the string form of {page_id: input_text}
        """
        model_input = {page_id:input_text}
        input_combined = f"{self.config['PROMPT']}\n{model_input}"
        return input_combined
    
    
    def extract_single_page(self, page_id:str, input_text:str) -> dict:
        """Call the model API for one input and parse its JSON reply.

        Parameters:
        page_id - id of the input
        input_text - text used as input for the model

        Returns:
        the parsed model response; if building the prompt, calling the model or
        parsing the reply fails, a dict {"content": <error message>} instead
        """
        try:
            input_combined = self.create_model_input(page_id=page_id, input_text=input_text)
            
        except Exception as e:
            print(f"Error while generating the model input: {e}")
            return {"content":f"Error while generating the model input: {e}"}
        
        try:
            response = self.model.generate_content([input_combined])
        except Exception as e:
            print(f"Error while receiving the response: {e} ")
            return {"content":f"Error while receiving the response: {e} "}
        
        try:
            res_parsed = json.loads(str(response.text))

            return res_parsed
        except Exception as e: 
            print(f"Error while parsing the response - Exeption {e}")
            return {"content":f"Error while parsing the response - Exeption {e}"}
        
    def extract_content(self):
        """Run the extraction for every (id, text) pair and return the collected output.

        Returns:
        self.output, whose "results" dict maps str(id) to the "content" field of the
        model response, or to an error message string when no content could be obtained

        Raises:
        ValueError - if the number of ids and texts differ (zip would otherwise
        silently drop the surplus entries)
        """
        if len(self.input_id) != len(self.text):
            raise ValueError(
                "input_id and text must have the same length, "
                f"got {len(self.input_id)} ids and {len(self.text)} texts")

        self.set_config()
        self.load_model()
        for p_, t_ in zip(self.input_id, self.text):
            print(p_)
            result = self.extract_single_page(p_, t_)
            # The reply may be valid JSON without the expected field (or not a dict
            # at all); record that as an error for this id instead of aborting the batch.
            if isinstance(result, dict) and "content" in result:
                content = result["content"]
            else:
                content = f"Error while reading the response - no 'content' field in {result!r}"
                print(content)
            # Stored per iteration, so results gathered so far stay in self.output
            # if the loop is interrupted.
            self.output["results"].update({str(p_):content})
            
        return self.output
    

def output_to_df(output:dict):
    """Flatten a worker output dict into a long DataFrame.

    Parameters:
    output - dict with a "results" entry mapping each id to its extracted content

    Returns:
    DataFrame with the columns "ids" and "content"; list-valued content is
    exploded into one row per element and the index is renumbered from 0
    """
    per_page = output["results"]
    frame = pd.DataFrame(
        data={"ids": list(per_page.keys()), "content": list(per_page.values())}
    )
    return frame.explode('content').reset_index(drop=True)

    
            

    
        
 

In [5]:
# Load the page table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle("data/ddbapi_köln.pkl")

# Take the 0:15 slice of the id column and the full-text column as two parallel lists.
p, t = df.page_id[0:15].tolist(), df.plainpagefulltext[0:15].tolist()

In [6]:
# Build the worker for the selected ids/texts and run the extraction (one model call per text).
extractor = LLMWorker(configs=configs, input_id= p,text= t)
result = extractor.extract_content()

22KVFOV36KTZS67POXRNU5DJKD4XYSPI-ALTO1934911_DDB_FULLTEXT
Error while parsing the response - Exeption name 'json' is not defined
23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_DDB_FULLTEXT


KeyboardInterrupt: 

In [17]:
# Display the extraction results as a table with one extracted text per row.
output_to_df(result)

Unnamed: 0,ids,content
0,22KVFOV36KTZS67POXRNU5DJKD4XYSPI-ALTO1934911_D...,"Aufricht . Heiratsgesuch . Gut gebildeter , de..."
1,22KVFOV36KTZS67POXRNU5DJKD4XYSPI-ALTO1934911_D...,"Junger Mann . 2 J hre , kath . , sucht aufrich..."
2,22KVFOV36KTZS67POXRNU5DJKD4XYSPI-ALTO1934911_D...,"Zwei junge Herren , 19 # . 21 Jahre , suchen d..."
3,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"Regierungsbaumeister a . D . für Tietbau , 29 ..."
4,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"Gewissenhafter deutscher Herr , # 8 . 30 Jahre..."
5,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"Sr Rittergutsbesitzers - Sohn 30 J . alt , Erb..."
6,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"Heiratsgesuch . Ingenieur , 31 Jahre alt , in ..."
7,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"übsches 11 Mädchen aus Hanges . Fam . , ma Zeu..."
8,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"IVritwe . kath . , m . 2 . , wünscht d . Bekan..."
9,23GSP3TO42E3USTLALB2X5EHUMBSEOWM-ALTO8960309_D...,"teiraf 27jährig . Herr , ev . , aus erster rhe..."
