# In [None]:
import json, re, pandas as pd
from statistics import median

# Load the annotation table from CSV. The code below reads its "amount" and
# "ingr_annotation" columns, and the final preview also selects "ingredient".
annotated_data = pd.read_csv("gemma_annotation.csv")

# Helper functions used by the learning and normalization steps below.
def num(x):
    """Parse a quantity into a float, or return None if it is not numeric.

    Accepted forms: plain numbers, decimal strings using "." or "," as the
    separator, simple fractions ("3/4"), mixed fractions ("1 1/2") and
    Unicode vulgar fractions, either alone or attached to a whole number
    ("½", "1½", "1 ½").

    Args:
        x: Raw value (str, int, float, None or NaN).

    Returns:
        The parsed value as a float, or None for None, NaN and anything
        that cannot be parsed.
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = str(x).strip().replace(",", ".")

    if "/" in s:
        try:
            if " " in s:
                # Mixed fraction: "<whole> <numerator>/<denominator>".
                whole, frac = s.split(" ", 1)
                n, d = frac.split("/", 1)
                return float(whole) + float(n) / float(d)
            n, d = s.split("/", 1)
            return float(n) / float(d)
        except (ValueError, ZeroDivisionError):
            # Not a well-formed fraction; fall through to the other parsers.
            pass

    vulgar = {
        "½": 0.5, "¼": 0.25, "¾": 0.75,
        "⅓": 1 / 3, "⅔": 2 / 3,
        "⅛": 0.125, "⅜": 0.375, "⅝": 0.625, "⅞": 0.875,
    }
    if s and s[-1] in vulgar:
        # Add the fraction to the leading whole number. Substituting the
        # glyph textually would turn "1½" into "10.5".
        frac_value = vulgar[s[-1]]
        head = s[:-1].strip()
        if not head:
            return frac_value
        try:
            whole_value = float(head)
        except ValueError:
            return None
        if head.startswith("-"):
            return whole_value - frac_value
        return whole_value + frac_value

    try:
        return float(s)
    except ValueError:
        return None

def clean_unit(u):
    """Normalize a unit label for table lookup.

    The value is stringified, lower-cased, stripped of surrounding
    whitespace and then has every "." removed. None and float NaN yield
    the empty string.
    """
    is_missing = u is None or (isinstance(u, float) and pd.isna(u))
    if is_missing:
        return ""
    lowered = str(u).lower()
    trimmed = lowered.strip()
    return trimmed.replace(".", "")

def parse_ann(raw):
    """Return the annotation in parsed form.

    A dict is passed through unchanged, a string is decoded with
    json.loads (which raises on malformed JSON), and any other type
    yields None.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        return json.loads(raw)
    return None

def pick_amount(d):
    """Return the first parseable amount in an annotation dict.

    The keys "gewicht", "volumen", "menge" and "anzahl" are tried in that
    order; the first one whose value num() can parse wins.

    Returns:
        A (key, value) tuple, or (None, None) when no key parses.
    """
    # Lazy generator: num() is evaluated key by key and stops at the first hit.
    parsed = (
        (field, num(d.get(field)))
        for field in ("gewicht", "volumen", "menge", "anzahl")
    )
    return next(
        ((field, value) for field, value in parsed if value is not None),
        (None, None),
    )

# Unit tables.
# Canonical target units used as the second element of every table entry.
CANON_G, CANON_L, CANON_P = "g", "l", "stk"

# Unit spelling -> (multiplier into the canonical unit, canonical unit).
BASE = {
    alias: (factor, canon)
    for canon, factors in (
        (CANON_G, {"g": 1.0, "gramm": 1.0, "gram": 1.0, "gr": 1.0,
                   "kg": 1000.0, "kilogramm": 1000.0, "mg": 0.001}),
        (CANON_L, {"l": 1.0, "liter": 1.0, "ml": 0.001, "milliliter": 0.001}),
        (CANON_P, {"stk": 1.0, "stück": 1.0, "stueck": 1.0, "piece": 1.0,
                   "anzahl": 1.0}),
    )
    for alias, factor in factors.items()
}

# Units with no entry in BASE; each has a default ml value in DEFAULT_ML.
AMB = {
    "el", "esslöffel", "essloeffel",
    "tl", "teelöffel", "teeloeffel",
    "prise", "msp", "messerspitze",
    "glas", "tasse", "becher",
}

# Default millilitres per single unit, keyed by unit spelling.
DEFAULT_ML = {
    alias: ml
    for ml, aliases in (
        (15, ("el", "esslöffel", "essloeffel")),
        (5, ("tl", "teelöffel", "teeloeffel")),
        (0.5, ("prise", "msp", "messerspitze")),
        (200, ("glas",)),
        (250, ("tasse", "becher")),
    )
    for alias in aliases
}

# Learn ml-per-unit from the free-text 'amount' column wherever a unit from
# AMB is stated together with a metric volume, e.g. "2 EL (30 ml)".
learned_lists = {k: [] for k in AMB}
if "amount" in annotated_data.columns:
    pat = re.compile(
        r"(\d+[.,]?\d*|\d+/\d+|[½¼¾⅓⅔⅛⅜⅝⅞])\s*"
        r"(el|esslöffel|essloeffel|tl|teelöffel|teeloeffel|tasse|becher|glas|msp|messerspitze|prise)"
        # The trailing \b stops "l" from matching the first letter of an
        # unrelated word ("3 Lorbeerblätter"); the spelled-out forms are
        # listed so that "0,5 Liter" is still recognised.
        r".*?(\d+[.,]?\d*)\s*(milliliter[ns]?|ml|liter[ns]?|l)\b",
        re.IGNORECASE,
    )
    # Drop missing cells before stringifying: astype(str) would turn NaN
    # into the literal text "nan", which a later fillna cannot see.
    for txt in annotated_data["amount"].dropna().astype(str):
        for m in pat.finditer(txt):
            cnt = num(m.group(1))
            u = clean_unit(m.group(2))
            ml = float(m.group(3).replace(",", "."))
            # "l", "liter", "liters", "litern" are litres; the rest are ml.
            if m.group(4).lower().startswith("l"):
                ml *= 1000.0
            if cnt is not None and cnt > 0 and u in learned_lists:
                learned_lists[u].append(ml / cnt)

# Use the median of the observations when a unit has at least three of them,
# otherwise fall back to the table default (None if there is no default).
ML_PER = {k: (median(v) if len(v) >= 3 else DEFAULT_ML.get(k)) for k, v in learned_lists.items()}

# Final lookup table: the fixed conversions from BASE plus one entry per unit
# in ML_PER that has a value, converted from ml to litres. An ML_PER entry
# replaces a BASE entry with the same key.
unit_map = {
    **BASE,
    **{
        name: (per_unit_ml / 1000.0, CANON_L)
        for name, per_unit_ml in ML_PER.items()
        if per_unit_ml is not None
    },
}

# Normalization step: derive one (value, canonical unit) pair per row.
def _normalize_row(row):
    """Return (value, canonical_unit) for one row of annotated_data.

    The amount comes from the parsed "ingr_annotation" (first parseable of
    gewicht/volumen/menge/anzahl), then from the row's "amount" cell, then
    defaults to 1.0. A unit found in unit_map is converted with its factor;
    an unknown unit keeps the raw amount and takes its canonical unit from
    the annotation key that supplied it. Any exception along the way falls
    back to the "amount" cell (or 1.0) counted in pieces.
    """
    # Canonical unit per annotation key when the unit itself is unknown;
    # "menge", "anzahl" and a missing key all count as pieces.
    fallback_unit = {"gewicht": CANON_G, "volumen": CANON_L}
    try:
        annotation = parse_ann(row.get("ingr_annotation"))
        if not annotation:
            raise ValueError("no_json")

        source_key, amount = pick_amount(annotation)
        if amount is None:
            amount = num(row.get("amount"))
        if amount is None:
            amount = 1.0

        unit_name = clean_unit(annotation.get("einheit"))
        if unit_name in unit_map:
            factor, canonical = unit_map[unit_name]
            return float(amount) * factor, canonical
        return float(amount), fallback_unit.get(source_key, CANON_P)
    except Exception:
        # Best-effort fallback: never fail the whole run on one bad row.
        raw_amount = num(row.get("amount"))
        return (raw_amount if raw_amount is not None else 1.0), CANON_P


_normalized = [_normalize_row(row) for _, row in annotated_data.iterrows()]
vals = [value for value, _ in _normalized]
units = [unit for _, unit in _normalized]

annotated_data["norm_value"] = vals
annotated_data["norm_unit"] = units

# Preview the source columns next to the computed norm_value / norm_unit.
# NOTE(review): `display` is not defined or imported in this file; presumably
# the IPython/Jupyter builtin — confirm before running as a plain script.
display(annotated_data[["ingredient","amount","ingr_annotation","norm_value","norm_unit"]])
