In [5]:
# Load the annotated ingredient table. In the rows displayed below,
# "ingr_annotation" holds the quantity/unit JSON and "amount_annotation"
# holds the ingredient JSON.
annotated_data = pd.read_csv("gemma_annotation.csv")
annotated_data

Unnamed: 0.1,Unnamed: 0,ingredient,amount,ingr_annotation,amount_annotation
0,0,zucchini,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}","{""zutat"": ""Zucchini""}"
1,1,paprika rot,2,"{""anzahl"": 2, ""einheit"": ""St\u00fcck""}","{""zutat"": ""Paprika"", ""eigenschaft"": ""rot""}"
2,2,feta,200 g,"{""gewicht"": 200, ""einheit"": ""g""}","{""zutat"": ""Feta""}"
3,3,schinken,250 g,"{""gewicht"": 250, ""einheit"": ""g""}","{""zutat"": ""Schinken""}"
4,4,zwiebel frisch,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}","{""zutat"": ""Zwiebel"", ""eigenschaft"": ""frisch""}"
...,...,...,...,...,...
905,905,weinessig,1 tl,"{""volumen"": 1, ""einheit"": ""tl""}","{""zutat"": ""Essig"", ""eigenschaft"": ""wei\u00df""}"
906,906,milch,1 liter,"{""volumen"": 1, ""einheit"": ""liter""}","{""zutat"": ""Milch"", ""eigenschaft"": ""1 Liter""}"
907,907,milchreis,250 g,"{""gewicht"": 250, ""einheit"": ""g""}","{""zutat"": ""Milchreis""}"
908,908,salz,1 prise,"{""anzahl"": 1, ""einheit"": ""Prise"", ""zutat"": ""sa...","{""zutat"": ""Salz"", ""eigenschaft"": ""Prise""}"


In [38]:
import json
import pandas as pd
from collections import defaultdict
from statistics import mean

# Single-character vulgar fractions; float() cannot parse these on its own.
_VULGAR_FRACTIONS = {"½": 0.5, "¼": 0.25, "¾": 0.75}


def num(x):
    """Parse a quantity into a float, or return None when it is not a number.

    The input is converted to a string, stripped, and "," is treated as the
    decimal separator. Accepted forms:

    * plain numbers: "2", "2,5"
    * simple fractions: "3/4"
    * mixed fractions: "1 1/2"
    * a trailing vulgar fraction (½, ¼, ¾), alone or after a whole number,
      with or without a space: "½", "1½", "1 ½"

    Returns:
        The parsed float, or None for None, a float NaN, or any text that
        matches none of the forms above (including division by zero).
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).strip().replace(",", ".")

    if "/" in s:
        try:
            if " " in s:
                whole, frac = s.split(" ", 1)
                n, d = frac.split("/", 1)
                return float(whole) + float(n) / float(d)
            n, d = s.split("/", 1)
            return float(n) / float(d)
        except (ValueError, ZeroDivisionError):
            # Not a well-formed fraction; fall through to the other forms.
            pass

    if s and s[-1] in _VULGAR_FRACTIONS:
        # Add the fraction to the whole part instead of splicing its decimal
        # text into the string, which would turn "1½" into "10.5".
        frac_value = _VULGAR_FRACTIONS[s[-1]]
        whole = s[:-1].strip()
        if not whole:
            return frac_value
        try:
            return float(whole) + frac_value
        except ValueError:
            return None

    try:
        return float(s)
    except ValueError:
        return None

def clean_unit(u):
    """Normalise a unit label: lower-cased, stripped, with every "." removed.

    None and float NaN are mapped to the empty string.
    """
    if u is None:
        return ""
    if isinstance(u, float) and pd.isna(u):
        return ""
    label = str(u).lower()
    label = label.strip()
    return label.replace(".", "")
def parse_ann(raw):
    """Return an annotation as a dict, or None when no usable dict exists.

    A dict is returned unchanged. A string is decoded as JSON and returned
    only if the decoded value is a dict. Everything else yields None: other
    input types (None, NaN, numbers), strings that are not valid JSON, and
    JSON whose top-level value is not an object.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return None
    try:
        parsed = json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. A malformed
        # annotation is reported as "missing" rather than raised, so a
        # single bad row cannot abort processing of the whole table.
        return None
    # Callers use .get() on the result, so only JSON objects are usable.
    return parsed if isinstance(parsed, dict) else None

def pick_amount(d):
    """Return the first parseable quantity of an annotation dict.

    The keys "gewicht", "volumen", "menge" and "anzahl" are tried in that
    order. The result is ``(key, value)`` for the first key whose value
    ``num`` can parse, or ``(None, None)`` when none of them can.
    """
    for key in ("gewicht", "volumen", "menge", "anzahl"):
        parsed = num(d.get(key))
        if parsed is None:
            continue
        return key, parsed
    return None, None


# Units that build_unit_map_from_data converts to "grams".
WEIGHT_LIKE = {"el", "esslöffel", "tl", "teelöffel", "prise", "msp"}
# Units that build_unit_map_from_data converts to "liters".
VOLUME_LIKE = {"glas", "tasse", "becher"}

# Default factor per unit, used when the data provides no estimate for it.
# WEIGHT_LIKE entries are paired with "grams", VOLUME_LIKE entries with "liters".
FALLBACK = {
    # weight-like
    "el": 15.0,
    "esslöffel": 15.0,
    "tl": 5.0,
    "teelöffel": 5.0,
    "prise": 0.5,
    "msp": 0.1,
    # volume-like
    "glas": 0.2,
    "tasse": 0.24,
    "becher": 0.2,
}

# Fixed conversions: cleaned unit -> (factor, target unit).
BASE = {
    # mass
    "g": (1.0, "grams"),
    "gramm": (1.0, "grams"),
    "kg": (1000.0, "grams"),
    "mg": (0.001, "grams"),
    # volume
    "l": (1.0, "liters"),
    "liter": (1.0, "liters"),
    "ml": (0.001, "liters"),
    # countable items
    "stk": (1.0, "pieces"),
    "stück": (1.0, "pieces"),
    "piece": (1.0, "pieces"),
}

def build_unit_map_from_data(df, ann_col="ingr_annotation"):
    """Build the ``unit -> (factor, target unit)`` table for convert_unit.

    The table starts from BASE and adds one entry per unit in WEIGHT_LIKE
    (target "grams") and VOLUME_LIKE (target "liters"). For each such unit
    the factor is the mean, rounded to 4 decimals, of the values collected
    from the annotations in ``df[ann_col]``; units with no collected values
    get their FALLBACK factor.

    Args:
        df: DataFrame holding the annotations.
        ann_col: Name of the annotation column (dicts or JSON strings).
            A missing column is treated as having no annotations.

    Returns:
        A new dict; entries derived here override BASE on key collisions.

    Rows whose annotation is missing, empty, malformed JSON, or not a JSON
    object are skipped rather than raising.
    """
    weight_vals, volume_vals = defaultdict(list), defaultdict(list)

    # Only the annotation column is needed, so iterate it directly instead
    # of materialising every row with iterrows().
    annotations = df[ann_col] if ann_col in df.columns else ()
    for raw in annotations:
        try:
            ann = parse_ann(raw)
        except ValueError:
            # Malformed JSON (json.JSONDecodeError is a ValueError): one bad
            # row must not abort the whole table.
            continue
        if not ann or not isinstance(ann, dict):
            continue
        unit = clean_unit(ann.get("einheit"))
        kind, value = pick_amount(ann)
        if not unit or value is None:
            continue
        # NOTE(review): `value` is the quantity stored in the annotation
        # (e.g. the 2 of {"gewicht": 2, "einheit": "el"}), so the factor
        # computed below is the mean annotated quantity for the unit, not a
        # measured per-unit weight/volume. Confirm this is the intended
        # estimate before relying on the converted amounts.
        if unit in WEIGHT_LIKE and kind == "gewicht":
            weight_vals[unit].append(value)
        if unit in VOLUME_LIKE and kind == "volumen":
            volume_vals[unit].append(value)

    est = {}
    est.update({u: (round(mean(vals), 4), "grams") for u, vals in weight_vals.items() if vals})
    est.update({u: (round(mean(vals), 4), "liters") for u, vals in volume_vals.items() if vals})
    # Units never observed in the data fall back to the fixed defaults.
    for u in WEIGHT_LIKE:
        est.setdefault(u, (FALLBACK[u], "grams"))
    for u in VOLUME_LIKE:
        est.setdefault(u, (FALLBACK[u], "liters"))
    return {**BASE, **est}

def convert_unit(amount, unit_raw, unit_map):
    """Scale an amount by the factor registered for its unit.

    Args:
        amount: Quantity to convert; None or a float NaN counts as 1.0,
            anything else is passed through ``float()``.
        unit_raw: Unit label, normalised with ``clean_unit`` before lookup.
        unit_map: Mapping of cleaned unit -> (factor, target unit).

    Returns:
        ``(amount * factor, target unit)`` when the cleaned unit is in
        ``unit_map``; otherwise ``(amount, cleaned unit)`` unchanged.
    """
    unit = clean_unit(unit_raw)
    is_missing = amount is None or (isinstance(amount, float) and pd.isna(amount))
    quantity = 1.0 if is_missing else float(amount)
    if unit not in unit_map:
        return quantity, unit or ""
    factor, target = unit_map[unit]
    return quantity * factor, target


# Conversion table: BASE units plus the factors derived from the annotations.
UNIT_MAP = build_unit_map_from_data(annotated_data, ann_col="ingr_annotation")

# Normalise every row: use the parsed annotation when it is usable, otherwise
# fall back to the raw "amount" text and tag the row as "json_error".
out = []
for _, row in annotated_data.iterrows():
    try:
        ann = parse_ann(row["ingr_annotation"])
        if not ann:
            raise ValueError("json")
        _, amt = pick_amount(ann)
        if amt is None:
            amt = 1.0  # no parseable quantity in the annotation: assume 1
        val, u = convert_unit(amt, ann.get("einheit"), UNIT_MAP)
    except (ValueError, TypeError, AttributeError):
        # Only data problems are handled here (invalid/missing JSON, an
        # annotation that is not a dict, unparseable values). Anything else,
        # such as a missing column, should surface instead of being
        # reported as a JSON error on every row.
        fb = num(row.get("amount"))
        val = fb if fb is not None else 1.0
        u = "json_error"
    out.append({"normalized_amount": val, "normalized_unit": u})

# Explicit columns keep the lookups below valid when the frame has no rows.
res = pd.DataFrame(out, columns=["normalized_amount", "normalized_unit"])
annotated_data["normalized_amount"] = res["normalized_amount"].values
annotated_data["normalized_unit"] = res["normalized_unit"].values

print(annotated_data[["amount","ingr_annotation","normalized_amount","normalized_unit"]].head(10))


     amount                         ingr_annotation  normalized_amount  \
0         1  {"anzahl": 1, "einheit": "St\u00fcck"}                1.0   
1         2  {"anzahl": 2, "einheit": "St\u00fcck"}                2.0   
2     200 g        {"gewicht": 200, "einheit": "g"}              200.0   
3     250 g        {"gewicht": 250, "einheit": "g"}              250.0   
4         1  {"anzahl": 1, "einheit": "St\u00fcck"}                1.0   
5     300 g        {"gewicht": 300, "einheit": "g"}              300.0   
6     500 g        {"gewicht": 500, "einheit": "g"}              500.0   
7     120 g        {"gewicht": 120, "einheit": "g"}              120.0   
8     300 g        {"gewicht": 300, "einheit": "g"}              300.0   
9  1 becher     {"volumen": 1, "einheit": "becher"}                1.0   

  normalized_unit  
0          pieces  
1          pieces  
2           grams  
3           grams  
4          pieces  
5           grams  
6           grams  
7           grams  
8    