In [8]:
import re
import json
import pandas as pd
from ollama import Client
# Shared Ollama client used by the extraction functions further down.
OLLAMA_HOST = "http://intern.schlaubox.de:11434"
OLLAMA_TIMEOUT = 500  # handed to Client as `timeout`; presumably seconds — confirm against the ollama docs
llama_3_3 = Client(host=OLLAMA_HOST, timeout=OLLAMA_TIMEOUT)
from pathlib import Path

# Directory used as the anchor for relative data paths.
# `__file__` is not defined inside a notebook kernel, which raises NameError;
# in that case fall back to the current working directory.
try:
    HERE = Path(__file__).parent
except NameError:
    HERE = Path.cwd()

In [10]:
# 74 extracts failed in the initial run. Empirically, most of these failures
# occur because the ingredients were not parsed correctly, so we write our own
# prompts to extract the ingredients.
failed_extracts_csv = HERE.parent / "failed_extracts_initial.csv"
data = pd.read_csv(failed_extracts_csv, dtype=str)

In [13]:
## PROMPT 1

def extract_ingredients_json(ingr_txt: str, model: str = 'llama3.3:70b') -> list:
    """Extract ingredients (and their properties) from a text via the LLM (PROMPT 1).

    The prompt asks the model for JSON objects with an "ingredient" key and an
    optional "properties" array. The answer is searched for fenced code blocks
    (with or without a `json` tag); if there are none, the whole answer is
    tried as JSON, so an unfenced reply is not silently lost.

    Args:
        ingr_txt: Raw ingredient text. Non-string values (e.g. NaN from a
            missing CSV cell) and blank strings are not sent to the model.
        model: Name of the model passed to `llama_3_3.chat`.

    Returns:
        A list of dicts parsed from the answer; empty if nothing parseable
        was found. A JSON array in the answer contributes one entry per
        contained object; non-object values are dropped.

    Side effects:
        Prints a message for every fenced block that is not valid JSON.
    """
    # Nothing to extract from a missing/blank cell; skip the (slow) LLM call.
    if not isinstance(ingr_txt, str) or not ingr_txt.strip():
        return []

    prompt = f"""Your task is to identify cooking ingredients in texts and output them in the nominative singular.
If characteristics such as color, condition, processing, quality, or origin are mentioned for an
ingredient, output these characteristics as well. Generate a json object for the output.
Always include a "ingredient" as the key for the ingredient name. Also add a "properties" array
for all previously mentioned properties. If none were found, do not use this key.
If you cannot find an ingredient in the text, simply write "None" instead of a JSON.

Make sure all values are in the German basic form, like in the following example:
input: "garnelen frisch groß"
output: ```json
    {{
        "ingredient": "Garnele",
        "properties": ["frisch", "groß"]
    }}
```

If the ingredient is oddly specific, use the basic supergroup for that ingredient, e.g.:
input: "frisches seelachsfilet"
output: ```json
    {{
        "ingredient": "Lachsfilet",
        "properties": ["frisch"]
    }}
```

Now identify the ingredients in this text: {ingr_txt}"""

    messages = [{"role": "user", "content": prompt}]
    llm_analysis = llama_3_3.chat(model=model, messages=messages)
    answer = llm_analysis['message']['content']

    # Prefer fenced blocks; the `json` tag is optional because models omit it at times.
    candidates = [
        block.strip()
        for block in re.findall(r"```(?:json)?\s*(.*?)\s*```", answer, re.DOTALL)
    ]
    fenced = bool(candidates)
    if not fenced:
        # No fences at all: the answer may be bare JSON (or the literal "None").
        candidates = [answer.strip()]

    json_dicts = []
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as err:
            # An unfenced non-JSON answer (e.g. "None") is expected, so stay quiet there.
            if fenced:
                print(f"Could not parse JSON for {ingr_txt!r}: {err}")
            continue
        items = parsed if isinstance(parsed, list) else [parsed]
        # Keep objects only; scalars such as null or "None" carry no ingredient.
        json_dicts.extend(item for item in items if isinstance(item, dict))
    return json_dicts


# Run PROMPT 1 on every ingredient text. Only one column is needed, so apply
# on that Series instead of iterating whole rows with axis=1.
data['ingr_json'] = data['ingredient'].apply(extract_ingredients_json)
data

Unnamed: 0.1,Unnamed: 0,ingredient,amount,ingr_annotation,amount_annotation,norm_value,norm_unit,nutrition,ingr_json
0,15,bio hähnchenschlegel,4 stk.,"{""anzahl"": 4, ""einheit"": ""stk.""}","{""zutat"": ""H\u00e4hnchenschlegel"", ""eigenschaf...",4.0,stück,,"[{'ingredient': 'Hähnchenschlegel', 'propertie..."
1,31,schokoriegel,1 stk.,"{""anzahl"": 1, ""einheit"": ""stk.""}","{""zutat"": ""Schokoriegel""}",1.0,stück,,[{'ingredient': 'Schokoriegel'}]
2,32,amarettini mandelkeksezerstoßen,2 esslöffel,"{""anzahl"": 2, ""einheit"": ""essl\u00f6ffel""}","{""zutat"": ""Amarettini"", ""eigenschaft"": ""zersto...",0.03,liter,,"[{'ingredient': 'Amarettini', 'properties': ['..."
3,33,deko schirmchen,2 stk.,"{""anzahl"": 2, ""einheit"": ""stk.""}","{""zutat"": ""Schirmchen"", ""eigenschaft"": ""Deko""}",2.0,stück,,[]
4,40,chilisoße sambal oelek,0.5 tl,"{""volumen"": 0.5, ""einheit"": ""tl""}","{""zutat"": ""Chiliso\u00dfe"", ""eigenschaft"": ""Sa...",0.0025,liter,,"[{'ingredient': 'Sambal Oelek', 'properties': ..."
...,...,...,...,...,...,...,...,...,...
68,889,alaska-seelachsfilet imbackteig,2 stück,"{""anzahl"": 2, ""einheit"": ""st\u00fcck""}","{""zutat"": ""Seelachsfilet"", ""eigenschaft"": ""im ...",2.0,stück,,"[{'ingredient': 'Lachsfilet', 'properties': ['..."
69,891,küchentuch,1 stk.,"{""anzahl"": 1, ""einheit"": ""stk.""}","{""zutat"": ""K\u00fcchentuch"", ""eigenschaft"": ""S...",1.0,stück,,[]
70,896,sellerieknolle gegart,50 g,"{""gewicht"": 50, ""einheit"": ""g""}","{""zutat"": ""Sellerieknolle"", ""eigenschaft"": ""ge...",50.0,g,,"[{'ingredient': 'Sellerieknolle', 'properties'..."
71,898,spitzpaprika rot,30 g,"{""gewicht"": 30, ""einheit"": ""g""}","{""zutat"": ""Spitzpaprika"", ""eigenschaft"": ""rot""}",30.0,g,,"[{'ingredient': 'Paprika', 'properties': ['rot..."


In [14]:
def extract_quantities_json(amount_txt: str, model: str = 'llama3.3:70b') -> list:
    """Extract quantities, weights and volumes from an amount text via the LLM.

    The prompt asks the model for JSON objects keyed by 'amount', 'weight' or
    'volume' plus a 'unit'. All few-shot examples use numeric values and
    properly closed code fences so the model sees one consistent format.
    The answer is searched for fenced code blocks (with or without a `json`
    tag); if there are none, the whole answer is tried as JSON.

    Args:
        amount_txt: Raw amount text, e.g. "2 esslöffel". Non-string values
            (e.g. NaN from a missing CSV cell) and blank strings are not sent
            to the model.
        model: Name of the model passed to `llama_3_3.chat`.

    Returns:
        A list of dicts parsed from the answer; empty if nothing parseable
        was found. A JSON array in the answer contributes one entry per
        contained object; non-object values are dropped.

    Side effects:
        Prints a message for every fenced block that is not valid JSON.
    """
    # Nothing to extract from a missing/blank cell; skip the (slow) LLM call.
    if not isinstance(amount_txt, str) or not amount_txt.strip():
        return []

    prompt = f"""Your task is to identify quantities, weights, and volumes in cooking ingredients in texts.
Generate a json object for the output.
Always use 'amount' as the key for quantities, 'weight' for weights, and 'volume' for volumes. Always
specify a 'unit' key for the unit found, e.g., l, g, Stück, Prise, msp, Tafel, Riegel or Esslöffel!
Do not fabricate or include any other information in the output!
If you cannot find a quantity in the text, simply write "None" instead of a JSON.
Make sure all values are in the German basic form, like in the following example.

A few examples for this is:
Example 1:
input: "1 Teelöffel Salz"
output: ```json
    {{
        "amount": 1,
        "unit": "Teelöffel"
    }}
```

Example 2:
input: "2 Müsliriegel"
output: ```json
    {{
        "amount": 2,
        "unit": "Riegel"
    }}
```

Example 3:
input: "2 Tafeln Schokolade"
output: ```json
    {{
        "amount": 2,
        "unit": "Tafel"
    }}
```

Example 4:
input: "3 Kleckse Butter"
output: ```json
    {{
        "amount": 3,
        "unit": "Klecks"
    }}
```

Example 5:
input: "1 Schuss Rapsöl"
output: ```json
    {{
        "amount": 1,
        "unit": "Schuss"
    }}
```

Example 6:
input: "3 Päckchen frische Heidelbeeren"
output: ```json
    {{
        "amount": 3,
        "unit": "Pack"
    }}
```

Example 7:
input: "1/2 Frucht Orangensaft frisch gepresst"
output: ```json
    {{
        "amount": 0.5,
        "unit": "Stück"
    }}
```

Example 8:
input: "1/2 Kopf frischer Salat"
output: ```json
    {{
        "amount": 0.5,
        "unit": "Kopf"
    }}
```

Example 9:
input: "1 Spritzer trockener Wein"
output: ```json
    {{
        "amount": 1,
        "unit": "Spritzer"
    }}
```

Now identify quantities, weights, and volumes in this text: {amount_txt}"""

    messages = [{"role": "user", "content": prompt}]
    llm_analysis = llama_3_3.chat(model=model, messages=messages)
    answer = llm_analysis['message']['content']

    # Prefer fenced blocks; the `json` tag is optional because models omit it at times.
    candidates = [
        block.strip()
        for block in re.findall(r"```(?:json)?\s*(.*?)\s*```", answer, re.DOTALL)
    ]
    fenced = bool(candidates)
    if not fenced:
        # No fences at all: the answer may be bare JSON (or the literal "None").
        candidates = [answer.strip()]

    json_dicts = []
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as err:
            # An unfenced non-JSON answer (e.g. "None") is expected, so stay quiet there.
            if fenced:
                print(f"Could not parse JSON for {amount_txt!r}: {err}")
            continue
        items = parsed if isinstance(parsed, list) else [parsed]
        # Keep objects only; scalars such as null or "None" carry no quantity.
        json_dicts.extend(item for item in items if isinstance(item, dict))
    return json_dicts

# Run the quantity prompt on every amount text. Only one column is needed, so
# apply on that Series instead of iterating whole rows with axis=1.
data['amount_json'] = data['amount'].apply(extract_quantities_json)
data

Unnamed: 0.1,Unnamed: 0,ingredient,amount,ingr_annotation,amount_annotation,norm_value,norm_unit,nutrition,ingr_json,amount_json
0,15,bio hähnchenschlegel,4 stk.,"{""anzahl"": 4, ""einheit"": ""stk.""}","{""zutat"": ""H\u00e4hnchenschlegel"", ""eigenschaf...",4.0,stück,,"[{'ingredient': 'Hähnchenschlegel', 'propertie...","[{'amount': 4, 'unit': 'Stück'}]"
1,31,schokoriegel,1 stk.,"{""anzahl"": 1, ""einheit"": ""stk.""}","{""zutat"": ""Schokoriegel""}",1.0,stück,,[{'ingredient': 'Schokoriegel'}],"[{'amount': 1, 'unit': 'Stück'}]"
2,32,amarettini mandelkeksezerstoßen,2 esslöffel,"{""anzahl"": 2, ""einheit"": ""essl\u00f6ffel""}","{""zutat"": ""Amarettini"", ""eigenschaft"": ""zersto...",0.03,liter,,"[{'ingredient': 'Amarettini', 'properties': ['...","[{'amount': 2, 'unit': 'Esslöffel'}]"
3,33,deko schirmchen,2 stk.,"{""anzahl"": 2, ""einheit"": ""stk.""}","{""zutat"": ""Schirmchen"", ""eigenschaft"": ""Deko""}",2.0,stück,,[],"[{'amount': '2', 'unit': 'Stück'}]"
4,40,chilisoße sambal oelek,0.5 tl,"{""volumen"": 0.5, ""einheit"": ""tl""}","{""zutat"": ""Chiliso\u00dfe"", ""eigenschaft"": ""Sa...",0.0025,liter,,"[{'ingredient': 'Sambal Oelek', 'properties': ...","[{'amount': 0.5, 'unit': 'teelöffel'}]"
...,...,...,...,...,...,...,...,...,...,...
68,889,alaska-seelachsfilet imbackteig,2 stück,"{""anzahl"": 2, ""einheit"": ""st\u00fcck""}","{""zutat"": ""Seelachsfilet"", ""eigenschaft"": ""im ...",2.0,stück,,"[{'ingredient': 'Lachsfilet', 'properties': ['...","[{'amount': '2', 'unit': 'Stück'}]"
69,891,küchentuch,1 stk.,"{""anzahl"": 1, ""einheit"": ""stk.""}","{""zutat"": ""K\u00fcchentuch"", ""eigenschaft"": ""S...",1.0,stück,,[],"[{'amount': 1, 'unit': 'Stück'}]"
70,896,sellerieknolle gegart,50 g,"{""gewicht"": 50, ""einheit"": ""g""}","{""zutat"": ""Sellerieknolle"", ""eigenschaft"": ""ge...",50.0,g,,"[{'ingredient': 'Sellerieknolle', 'properties'...","[{'weight': 50, 'unit': 'g'}]"
71,898,spitzpaprika rot,30 g,"{""gewicht"": 30, ""einheit"": ""g""}","{""zutat"": ""Spitzpaprika"", ""eigenschaft"": ""rot""}",30.0,g,,"[{'ingredient': 'Paprika', 'properties': ['rot...","[{'weight': 30, 'unit': 'g'}]"


In [15]:
# Persist the annotations generated by PROMPT 1 so they can be tested for improvements.
# NOTE(review): the file name says "gemma" although the extraction calls use
# 'llama3.3:70b' — confirm which consumers read this file before renaming it.
prompt1_csv = "gemma_annotated_prompt1.csv"
data.to_csv(prompt1_csv)

In [16]:
## PROMPT 2

# another approach to extract ingredients

def extract_ingredients_json(ingr_txt: str, model: str = 'llama3.3:70b') -> str:
    """Reduce an ingredient text to a single ingredient word via the LLM (PROMPT 2).

    NOTE: this definition replaces the PROMPT 1 function of the same name for
    every cell executed after it. Despite the `_json` suffix it returns plain
    text, not parsed JSON.

    Args:
        ingr_txt: Raw ingredient text. Non-string values (e.g. NaN from a
            missing CSV cell) and blank strings are not sent to the model.
        model: Name of the model passed to `llama_3_3.chat`.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when there was no usable input.
    """
    # Nothing to identify in a missing/blank cell; skip the (slow) LLM call.
    if not isinstance(ingr_txt, str) or not ingr_txt.strip():
        return ""

    prompt = f"""
                    Your task is to identify cooking ingredients in texts and output them in
                    the nominative singular. If characteristics such as color,  condition,
                    processing, quality, or origin are mentioned for an ingredient, ignore them.

                    You must output as single word, which should be the ingredient.
                    DO not output
                    anything else. You must always output a valid ingredient.

                    Please carefully observe these examples:

                        input: schokoriegel
                        output: riegel

                        input: amarettini mandelkeksezerstoßen
                        output: mandelkekse

                        input: deko schirmchen
                        output: schirmchen

                        input: asiatische fischsoße
                        output: fischsoße

                        input: bisquits rund
                        output: bisquits

                        input: agavensirup
                        output: sirup

                        input: prinzessbohnen gewürfelt
                        output: bohnen

                        input: wurzelgemüse
                        output: gemüse

                        input: jostabeeren
                        output: beeren

                        input: curcuma gelbwurz
                        output: curcuma

                        input: pankomehl
                        output: mehl

                        input: stevia flüssig
                        output: stevia

                        input: weißkohl frisch
                        output: kohl

                        input: stevia flüssig
                        output: stevia

                        input: speiseöl
                        output: öl

                    Now identify the ingredients in this text: {ingr_txt}
                """

    messages = [{"role": "user", "content": prompt}]
    llm_analysis = llama_3_3.chat(model=model, messages=messages)
    answer = llm_analysis['message']['content']

    # The prompt asks for a single word; drop stray spaces/newlines around it.
    return answer.strip()

In [None]:
## Run PROMPT 2 on every ingredient text; evaluate the result with the
## testing.ipynb notebook to see whether the errors are reduced.
# Only one column is needed, so apply on that Series instead of iterating rows.
data['ingr_identified'] = data['ingredient'].apply(extract_ingredients_json)

