In [2]:
# - Normalizes to canonical units: g, l, stk
# - Learns ml-per-unit for EL/TL/Tasse/Becher/Glas/Prise/MSP from co-mentions in free-text 'amount' when present
# - Uses NFKC normalization + tolerant numeric parsing

import json, re, unicodedata
import pandas as pd
from statistics import median


# Load the annotated ingredient table. The code below reads its 'ingredient',
# 'amount' and 'ingr_annotation' columns and appends 'norm_value'/'norm_unit'.
annotated_data = pd.read_csv("gemma_annotation.csv")

# ------------------ helpers ------------------
def _nfkc(s: str) -> str:
    """Return *s* converted to Unicode normalization form NFKC."""
    normalized = unicodedata.normalize("NFKC", s)
    return normalized

def num(x):
    """
    Parse the leading quantity out of a free-text value.

    Accepted notations:
    - Unicode compatibility forms via NFKC (full-width digits, etc.)
    - decimals written with ',' or '.'
    - ascii fractions "1/2" and mixed numbers "1 1/2"
    - unicode fractions ½ ¼ ¾ ⅓ ⅔ ⅛ ⅜ ⅝ ⅞, standalone or attached:
      "½", "1 ½", "1½"
    - the fraction slash '⁄' (U+2044)
    - trailing unit words separated by whitespace are dropped
      ("1 ½ liter" -> 1.5)

    Whitespace-separated numeric tokens are summed, so "1 1/2" gives 1.5.
    Parsing stops at the first token that is not a finite number.

    Returns a float, or None when x is None/NaN or when no leading numeric
    token is found. A unit glued directly to the digits ("200g") is not split
    off and therefore yields None.
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None

    # Detach vulgar fractions BEFORE normalizing: NFKC rewrites '½' as '1⁄2',
    # so an attached "1½" would otherwise turn into "11⁄2" and parse as 5.5.
    text = re.sub(r"([½¼¾⅓⅔⅛⅜⅝⅞])", r" \1 ", str(x))
    s = unicodedata.normalize("NFKC", text).strip().lower()
    s = s.replace(",", ".").replace("⁄", "/")

    # keep only the leading numeric chunk (stop at first word-only token)
    # e.g., "1 1/2 liter" -> "1 1/2"
    s = re.split(r"\s+[a-zäöüß]+", s, maxsplit=1)[0]

    total, seen = 0.0, False
    for tok in s.split():
        try:
            if re.fullmatch(r"\d+/\d+", tok):
                n, d = tok.split("/")
                value = float(n) / float(d)
            else:
                value = float(tok)
        except (ValueError, ZeroDivisionError):
            break
        # float() also accepts the words "nan"/"inf"; those are not quantities
        if value != value or abs(value) == float("inf"):
            break
        total += value
        seen = True
    return total if seen else None

def clean_unit(u):
    """
    Turn a raw unit label into a lookup key.

    None and float NaN give "". Anything else is stringified, NFKC-normalized,
    lower-cased and stripped; then every '.' is removed and the fraction slash
    '⁄' (U+2044) is replaced by '/'.
    """
    is_missing = u is None or (isinstance(u, float) and pd.isna(u))
    if is_missing:
        return ""
    label = _nfkc(str(u)).lower().strip()
    for old, new in ((".", ""), ("⁄", "/")):
        label = label.replace(old, new)
    return label

def parse_ann(raw):
    """
    Coerce an annotation cell into a Python object.

    A dict is returned unchanged, a str is decoded with json.loads (which
    raises on malformed JSON and may yield any JSON value), and every other
    input gives None.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        return json.loads(raw)
    return None

def pick_amount(d):
    """
    Find the first parseable quantity field of an annotation dict.

    The fields are tried in the order gewicht, volumen, menge, anzahl; each
    value is passed through num(). Returns (field_name, value) for the first
    field whose value parses, or (None, None) when none does.
    """
    fields = ("gewicht", "volumen", "menge", "anzahl")
    # lazy generator: num() is only called until the first hit, as in a loop
    parsed = ((field, num(d.get(field))) for field in fields)
    hits = (pair for pair in parsed if pair[1] is not None)
    return next(hits, (None, None))

# ------------------ canonical + unit tables ------------------
# Canonical target units: grams, liters, pieces.
CANON_G = "g"
CANON_L = "l"
CANON_P = "stk"

# Unit spelling -> (multiplier into the canonical unit, canonical unit).
BASE = {
    # weight -> g
    **dict.fromkeys(("g", "gramm", "gram", "gr"), (1.0, CANON_G)),
    **dict.fromkeys(("kg", "kilogramm"), (1000.0, CANON_G)),
    "mg": (0.001, CANON_G),
    # volume -> l
    **dict.fromkeys(("l", "liter"), (1.0, CANON_L)),
    # '㎖' is the compatibility symbol; NFKC maps it to 'ml' but it is kept for safety
    **dict.fromkeys(("ml", "milliliter", "mililiter", "㎖"), (0.001, CANON_L)),
    # pieces -> stk
    **dict.fromkeys(("stk", "stück", "stueck", "piece", "anzahl"), (1.0, CANON_P)),
}

# Ambiguous cooking measures — treated as volume (liters) via ml-per-unit.
AMB = {
    "el", "esslöffel", "essloeffel",
    "tl", "teelöffel", "teeloeffel",
    "prise", "msp", "messerspitze",
    "glas", "tasse", "becher",
}

# Fallback ml-per-unit for each ambiguous measure.
DEFAULT_ML = {
    **dict.fromkeys(("el", "esslöffel", "essloeffel"), 15),
    **dict.fromkeys(("tl", "teelöffel", "teeloeffel"), 5),
    **dict.fromkeys(("prise", "msp", "messerspitze"), 0.5),
    "glas": 200,
    "tasse": 250,
    "becher": 250,
}


# Look for co-mentions in the free-text column 'amount' such as "2 EL (30 ml)"
# or "1 Tasse = 250 ml" and record one ml-per-unit sample per match.
learned_lists = {k: [] for k in AMB}
if "amount" in annotated_data.columns:
    # mixed number ("1 1/2"), plain fraction ("1/2") or decimal ("1,5")
    num_pat = r"\d+\s+\d+/\d+|\d+/\d+|\d+[.,]?\d*"
    pat = re.compile(
        r"(" + num_pat + r")\s*"
        # optional plural 'n' (Tassen, Prisen); \b rejects longer words
        # that merely start with a measure name
        r"(el|esslöffel|essloeffel|tl|teelöffel|teeloeffel|tasse|becher|glas|msp|messerspitze|prise)n?\b"
        # \b after the volume unit keeps 'l' from matching the first letter
        # of an arbitrary word
        r".*?(" + num_pat + r")\s*(milliliter|mililiter|ml|liter|l)\b",
        re.IGNORECASE,
    )
    # dropna() first: astype(str) would turn missing cells into the text "nan"
    for raw in annotated_data["amount"].dropna().astype(str):
        # NFKC rewrites '½' as '1⁄2' (fraction slash). Detach vulgar fractions
        # first so "1½" does not fuse into "11⁄2", then use an ascii slash so
        # the number pattern and num() see "1 1/2".
        spaced = re.sub(r"([½¼¾⅓⅔⅛⅜⅝⅞])", r" \1 ", raw)
        txt = _nfkc(spaced).replace("⁄", "/")
        for m in pat.finditer(txt):
            cnt = num(m.group(1))
            vol = num(m.group(3))
            if cnt is None or vol is None or cnt <= 0 or vol <= 0:
                continue
            u = clean_unit(m.group(2))
            if clean_unit(m.group(4)) in ("l", "liter"):
                vol *= 1000.0  # liters -> milliliters
            if u in learned_lists:
                learned_lists[u].append(vol / cnt)

# Trust the learned median only with at least 3 samples; otherwise use the default.
ML_PER = {k: (median(v) if len(v) >= 3 else DEFAULT_ML.get(k)) for k, v in learned_lists.items()}

# ------------------ build unit map ------------------
# Start from the fixed conversions, then add every ambiguous measure that has
# an ml-per-unit value, expressed as liters per unit.
unit_map = {
    **BASE,
    **{
        measure: (per_unit_ml / 1000.0, CANON_L)
        for measure, per_unit_ml in ML_PER.items()
        if per_unit_ml is not None
    },
}

# ------------------ normalize ------------------
vals = []
units = []
for _, record in annotated_data.iterrows():
    try:
        annotation = parse_ann(record.get("ingr_annotation"))
        if not annotation:
            raise ValueError("no_json")

        field, quantity = pick_amount(annotation)
        # no numeric field in the annotation: try the free-text amount, then 1.0
        if quantity is None:
            quantity = num(record.get("amount"))
        if quantity is None:
            quantity = 1.0

        unit_key = clean_unit(annotation.get("einheit"))
        if unit_key in unit_map:
            factor, canonical = unit_map[unit_key]
            normalized = float(quantity) * factor
        else:
            # unknown unit: keep the number and derive the canonical unit from
            # the annotation field that supplied it (pieces by default)
            canonical = {"gewicht": CANON_G, "volumen": CANON_L}.get(field, CANON_P)
            normalized = float(quantity)
    except Exception:
        # best effort: any failure falls back to the free-text amount as pieces
        fallback = num(record.get("amount"))
        normalized = 1.0 if fallback is None else fallback
        canonical = CANON_P

    vals.append(normalized)
    units.append(canonical)

annotated_data["norm_value"] = vals
annotated_data["norm_unit"] = units

# Quick preview of the first 80 rows with the normalized columns.
display(annotated_data[["ingredient","amount","ingr_annotation","norm_value","norm_unit"]].head(80))


Unnamed: 0,ingredient,amount,ingr_annotation,norm_value,norm_unit
0,zucchini,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0000,stk
1,paprika rot,2,"{""anzahl"": 2, ""einheit"": ""St\u00fcck""}",2.0000,stk
2,feta,200 g,"{""gewicht"": 200, ""einheit"": ""g""}",200.0000,g
3,schinken,250 g,"{""gewicht"": 250, ""einheit"": ""g""}",250.0000,g
4,zwiebel frisch,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0000,stk
...,...,...,...,...,...
75,sonnenblumenkerne geröstet,1 el,"{""volumen"": 1, ""einheit"": ""el""}",0.0150,l
76,pfeffer aus der mühle,1 prise,"{""anzahl"": 1, ""einheit"": ""Prise""}",0.0005,l
77,salz aus der mühle,1 prise,"{""anzahl"": 1, ""einheit"": ""Prise"", ""zutat"": ""sa...",0.0005,l
78,spaghetti,170 g,"{""gewicht"": 170, ""einheit"": ""g""}",170.0000,g
