In [8]:
import json, re, unicodedata
import pandas as pd
from statistics import median

# Load the annotated ingredient table. The code below reads its `amount`,
# `ingr_annotation` and `ingredient` columns.
annotated_data = pd.read_csv(r"gemma_annotation_original.csv")

# helper functions for parsing
def _nfkc(s: str) -> str:
    return unicodedata.normalize("NFKC", s)

def num(x):
    if x is None or (isinstance(x, float) and pd.isna(x)): return None
    s = _nfkc(str(x)).strip().lower()
    s = s.replace(",", ".").replace("⁄", "/")
    s = re.sub(r"(\d)([½¼¾⅓⅔⅛⅜⅝⅞])", r"\1 \2", s)  # "1½" -> "1 ½"
    s = re.split(r"\s+[a-zäöüß]+", s, maxsplit=1)[0]  # keep leading numeric chunk
    UF = {"½":0.5,"¼":0.25,"¾":0.75,"⅓":1/3,"⅔":2/3,"⅛":0.125,"⅜":0.375,"⅝":0.625,"⅞":0.875}
    total, seen = 0.0, False
    for tok in s.split():
        if tok in UF: total += UF[tok]; seen = True; continue
        if re.fullmatch(r"\d+/\d+", tok):
            n,d = tok.split("/")
            try: total += float(n)/float(d); seen = True; continue
            except: break
        try: total += float(tok); seen = True; continue
        except: break
    return total if seen else None

def clean_unit(u):
    """Turn a raw unit label into a comparable lowercase key.

    ``None`` and float NaN map to the empty string. Any other value is
    stringified, NFKC-normalised, lower-cased and stripped; every "." is then
    removed and the fraction slash "⁄" is replaced by "/".
    """
    is_missing = u is None or (isinstance(u, float) and pd.isna(u))
    if is_missing:
        return ""
    label = _nfkc(str(u)).lower().strip()
    label = label.replace(".", "")
    return label.replace("⁄", "/")

def parse_ann(raw):
    """Return the annotation as a dict, or ``None`` when there is none.

    A dict is passed through unchanged. A string is decoded as JSON and
    accepted only if it decodes to a JSON object. Every other input
    (``None``, NaN, malformed JSON, JSON that is not an object) yields
    ``None`` instead of raising, so callers can test the result and then call
    ``.get`` on it safely.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return None
    try:
        parsed = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None

def pick_amount(d):
    """Return ``(key, value)`` for the first annotation field holding a number.

    The fields are tried in the order "gewicht", "volumen", "menge", "anzahl";
    each value is parsed with ``num``. ``(None, None)`` is returned when none
    of them parses.
    """
    parsed = (
        (field, num(d.get(field)))
        for field in ("gewicht", "volumen", "menge", "anzahl")
    )
    # The generator is lazy, so parsing stops at the first usable field.
    return next(((field, value) for field, value in parsed if value is not None),
                (None, None))

# ------------------ canonical targets ------------------
# Every quantity is normalised to one of: grams, litres, pieces.
CANON_G, CANON_L, CANON_P = "g", "l", "stk"

# Base map: cleaned unit -> (factor to the canonical unit, canonical unit).
# It is extended later from the tokens that are actually *present* in the data.
# "cl" and "dl" are listed explicitly: without a factor, a centilitre or
# decilitre volume would fall back to the annotation key and be stored as
# litres unscaled.
BASE = {
    "g": (1.0, CANON_G), "kg": (1000.0, CANON_G), "mg": (0.001, CANON_G),
    "l": (1.0, CANON_L), "dl": (0.1, CANON_L), "cl": (0.01, CANON_L), "ml": (0.001, CANON_L),
    "stk": (1.0, CANON_P), "stück": (1.0, CANON_P), "stueck": (1.0, CANON_P), "piece": (1.0, CANON_P)
}

# Regex building blocks used to mine unit-like tokens that follow a number
# in the `amount` column.
# Vulgar-fraction characters that count as a number on their own.
VULGAR = "½¼¾⅓⅔⅛⅜⅝⅞"
# One number: a decimal (dot or comma), a simple "a/b" fraction, or a single vulgar fraction.
NUM = rf"(?:\d+[.,]?\d*|\d+/\d+|[{VULGAR}])"
# Separator between the two ends of a range: hyphen, en/em dash, "to" or "bis".
RANGE_SEP = r"(?:\s*(?:-|–|—|to|bis)\s*)"
# A number, optionally followed by a range separator and a second number.
NUMRANGE = rf"{NUM}(?:{RANGE_SEP}{NUM})?"
# A run of ASCII/German letters with an optional trailing dot (abbreviations).
WORD = r"[a-zA-ZäöüÄÖÜß]+\.?"
# A run of "%" / "°" characters (the doubled "%" inside the class is redundant but harmless).
SYMBOL = r"[%%°]+"
# Number or range, optional whitespace, then the captured word or symbol (group 1).
pat_space = re.compile(rf"{NUMRANGE}\s*({WORD}|{SYMBOL})", re.IGNORECASE)
# Number (group 1) immediately followed by letters (group 2), no whitespace in between.
pat_glued = re.compile(rf"({NUM})([a-zA-ZäöüÄÖÜß]+\.?)", re.IGNORECASE)

def token_after_number(text: str):
    """Return the lower-cased word or symbol that follows the first number.

    The text is NFKC-normalised, decimal commas become dots and the fraction
    slash becomes "/". ``pat_space`` is searched anywhere in the text; if it
    finds nothing, ``pat_glued`` is tried at the start of the text. A trailing
    "." is stripped from the token. Returns ``None`` for non-strings, blank
    strings and texts without a match.
    """
    if not (isinstance(text, str) and text.strip()):
        return None
    cleaned = _nfkc(text).strip()
    cleaned = cleaned.replace(",", ".").replace("⁄", "/")
    cleaned = re.sub(rf"(\d)([{VULGAR}])", r"\1 \2", cleaned)
    # (matcher, index of the group holding the token), tried in this order.
    attempts = ((pat_space.search, 1), (pat_glued.match, 2))
    for find, group_no in attempts:
        hit = find(cleaned)
        if hit is not None:
            return hit.group(group_no).lower().rstrip(".")
    return None

# Collect the unit-like tokens that follow a number in the `amount` column.
# Missing cells are dropped first so NaN is not turned into the text "nan".
present_tokens_amount = set()
for raw in annotated_data.get("amount", pd.Series([], dtype=str)).dropna().astype(str):
    tok = token_after_number(raw)
    if tok: present_tokens_amount.add(tok)

# also collect units that appear inside annotations
present_units_ann = set()
for raw in annotated_data.get("ingr_annotation", pd.Series([], dtype=str)).dropna():
    try:
        ann = parse_ann(raw)
    except (ValueError, TypeError):
        # A single malformed annotation must not abort the whole scan.
        continue
    if not isinstance(ann, dict):
        continue
    unit = clean_unit(ann.get("einheit"))
    if unit:  # annotations without a unit contribute nothing
        present_units_ann.add(unit)

present_tokens_all = present_tokens_amount | present_units_ann

# Alias tables. An alias is only used if its token is actually present in the data.
# Mass / volume / piece spellings -> the BASE key they stand for.
MASS_ALIAS = dict.fromkeys(
    ("gram", "grams", "gramm", "gramme", "grammes", "gr", "g."),
    "g",
)
VOLUME_ALIAS = {
    **dict.fromkeys(("liter", "litre", "liters", "litres", "ltr", "lt", "l."), "l"),
    **dict.fromkeys(("milliliter", "millilitre", "㎖", "mℓ", "mililiter"), "ml"),
}
PIECE_ALIAS = dict.fromkeys(
    ("st", "st.", "pcs", "pc", "pieces", "stuck"),
    "stk",
)

# Ambiguous *volume* units: canonical key -> accepted spellings.
# Only those that are present and have a default below are kept later.
AMB_ALIASES = {
    "el": {"el", "esslöffel", "essloeffel"},
    "tl": {"tl", "teelöffel", "teeloeffel"},
    "tasse": {"tasse", "tassen"},
    "becher": {"becher"},
    "glas": {"glas", "gläser", "glaeser"},
    "prise": {"prise"},
    "msp": {"msp", "messerspitze"},
}
# Default millilitres per ambiguous unit.
DEFAULT_ML = {
    "el": 15,
    "tl": 5,
    "tasse": 250,
    "becher": 250,
    "glas": 200,
    "prise": 0.5,
    "msp": 0.5,
}

# Packaging / count words, treated as pieces.
# (Only tokens that actually occur will be added to BASE, so no overreach)
PIECEY_ALIASES = {
    "päckchen": {"päckchen", "packchen", "päck.", "päck", "packung", "pkg"},
    "dose": {"dose", "dosen"},
    "flasche": {"flasche", "flaschen"},
    "bund": {"bund", "bunde"},
    "kopf": {"kopf", "köpfe", "koepfe"},
    "scheibe": {"scheibe", "scheiben"},
    "zehe": {"zehe", "zehen"},
    "stange": {"stange", "stangen"},
    "beutel": {"beutel"},
    "tüte": {"tüte", "tuete"},
}

# extend BASE from tokens actually seen 
def extend_base_from_present(base: dict, tokens: set):
    """Add to *base*, in place, every known alias that occurs in *tokens*.

    Mass, volume and piece aliases copy the ``(factor, unit)`` entry of the
    key they stand for, falling back to a built-in default when *base* lacks
    that key. Packaging/count words found in *tokens* are mapped to one piece
    each. Nothing is returned.
    """
    # mass / volume / piece aliases
    for alias, target in MASS_ALIAS.items():
        if alias not in tokens:
            continue
        base[alias] = base.get(target, (1.0, CANON_G))
    for alias, target in VOLUME_ALIAS.items():
        if alias not in tokens:
            continue
        default = (1.0, CANON_L) if target == "l" else (0.001, CANON_L)
        base[alias] = base.get(target, default)
    for alias, target in PIECE_ALIAS.items():
        if alias not in tokens:
            continue
        base[alias] = base.get(target, (1.0, CANON_P))
    # packaging / count words -> pieces, only for the spellings that are present
    for canon, aliases in PIECEY_ALIASES.items():
        base.update({name: (1.0, CANON_P)
                     for name in aliases | {canon}
                     if name in tokens})

# Mutates BASE in place: only aliases found in the amount column or in the
# annotations (present_tokens_all) are added.
extend_base_from_present(BASE, present_tokens_all)

#  select ambiguous volume units to consider 
def canon_amb(u_raw: str) -> str:
    """Fold a unit onto its canonical ambiguous-volume key.

    The unit is cleaned with ``clean_unit``. If the cleaned unit is a key of
    ``AMB_ALIASES`` or one of its spellings, that key is returned; otherwise
    the cleaned unit itself is returned.
    """
    cleaned = clean_unit(u_raw)
    hits = (
        canon
        for canon, spellings in AMB_ALIASES.items()
        if cleaned == canon or cleaned in spellings
    )
    return next(hits, cleaned)

# Ambiguous units seen in the amount text: a canonical key counts as present
# when the key itself or any of its spellings is among the mined tokens.
amb_from_amount = set()
for c, aliases in AMB_ALIASES.items():
    if present_tokens_amount & (aliases | {c}):
        amb_from_amount.add(c)
# Ambiguous units seen in the annotations, folded to their canonical key.
amb_from_ann = {canon_amb(u) for u in present_units_ann}
# Keep only the units that have a default ml value, in sorted order.
AMB_USED = sorted(set(DEFAULT_ML).intersection(amb_from_amount | amb_from_ann))

# Learn ml-per-unit values from co-mentions in the `amount` text (only for
# AMB_USED): a count of an ambiguous unit next to an explicit ml / l volume.
EVIDENCE_CSV = "ambiguous_evidence.csv"
SUMMARY_CSV  = "ambiguous_summary.csv"
MIN_EVIDENCE = 3

learned_lists = {k: [] for k in AMB_USED}
evidence_rows = []

if "amount" in annotated_data.columns and AMB_USED:
    alias_pool = []
    for canon in AMB_USED:
        alias_pool += list(AMB_ALIASES.get(canon, {canon}))
    UNIT_PAT = "|".join(sorted(map(re.escape, alias_pool)))
    # Count: mixed number ("1 1/2"), simple fraction, or decimal. Vulgar
    # fractions are rewritten to "a/b" before matching (see below).
    COUNT = r"((?:\d+\s+)?\d+/\d+|\d+[.,]?\d*)"
    VOL   = r"(\d+[.,]?\d*)"
    # The word boundary keeps "l" from matching the first letter of another word.
    ML    = r"(ml|liter|l)\b"
    SEP   = r".{0,40}?"

    # fwd: "<count> <unit> ... <volume> ml";  rev: "<volume> ml ... <count> <unit>"
    pat_fwd = re.compile(fr"{COUNT}\s*({UNIT_PAT})\b{SEP}(?:≈|~|=|:|ca\.\s*)?\s*{VOL}\s*{ML}", re.IGNORECASE)
    pat_rev = re.compile(fr"{VOL}\s*{ML}{SEP}(?:≈|~|=|:|ca\.\s*)?\s*{COUNT}\s*({UNIT_PAT})\b", re.IGNORECASE)

    # (label, pattern, group numbers of: count, unit, volume, volume unit)
    directions = (
        ("fwd", pat_fwd, (1, 2, 3, 4)),
        ("rev", pat_rev, (3, 4, 1, 2)),
    )

    # Drop missing cells before stringifying so NaN never becomes "nan".
    for idx, raw in annotated_data["amount"].dropna().astype(str).items():
        # Separate "1½" into "1 ½" BEFORE NFKC (which rewrites "½" as "1⁄2"),
        # then turn the fraction slash into "/" so COUNT can match it.
        txt = re.sub(rf"(\d)([{VULGAR}])", r"\1 \2", raw)
        txt = _nfkc(txt).replace("⁄", "/")
        for label, pattern, (g_cnt, g_unit, g_vol, g_volunit) in directions:
            for m in pattern.finditer(txt):
                try:
                    ml = float(m.group(g_vol).replace(",", "."))
                except ValueError:
                    continue
                if clean_unit(m.group(g_volunit)) in ("l", "liter"):
                    ml *= 1000.0
                cnt = num(m.group(g_cnt)); u = canon_amb(m.group(g_unit))
                if cnt and cnt > 0 and u in learned_lists:
                    learned_lists[u].append(ml/cnt)
                    evidence_rows.append({"row_index":idx,"amount_text":raw,"unit":u,"pattern":label,
                                          "count":cnt,"explicit_volume_ml":ml,"ml_per_unit":ml/cnt})

# Persist the raw evidence. The columns are fixed so the CSV keeps its header
# even when no co-mention was found.
EVIDENCE_COLUMNS = ["row_index", "amount_text", "unit", "pattern",
                    "count", "explicit_volume_ml", "ml_per_unit"]
df_evidence = pd.DataFrame(evidence_rows, columns=EVIDENCE_COLUMNS)
df_evidence.to_csv(EVIDENCE_CSV, index=False, encoding="utf-8")

# Choose ml-per-unit for each ambiguous unit: the learned median when at least
# MIN_EVIDENCE observations exist, otherwise the built-in default.
summary_rows, ML_PER = [], {}
for u in AMB_USED:
    vals = learned_lists.get(u, [])
    n = len(vals)
    learned = median(vals) if n else None
    if n and n >= MIN_EVIDENCE:
        chosen = learned; source = "learned_median"
    else:
        chosen = DEFAULT_ML[u]; source = "default_fallback"
    ML_PER[u] = chosen
    row = {"unit": u, "evidence_n": n, "chosen_ml_per_unit": chosen, "source": source}
    if n:
        row.update({"min": min(vals), "median": learned, "max": max(vals), "mean": sum(vals)/n})
    summary_rows.append(row)

# With no ambiguous units the frame would have no "unit" column and
# sort_values would raise KeyError, so give the empty frame its base columns.
if summary_rows:
    df_summary = pd.DataFrame(summary_rows)
else:
    df_summary = pd.DataFrame(columns=["unit", "evidence_n", "chosen_ml_per_unit", "source"])
df_summary = df_summary.sort_values("unit")
df_summary.to_csv(SUMMARY_CSV, index=False, encoding="utf-8")

# Build the final unit map: BASE plus the ambiguous volume units, whose
# ml-per-unit value is converted into a factor to litres.
unit_map = {
    **BASE,
    **{k: (ml / 1000.0, CANON_L) for k, ml in ML_PER.items()},
}

# Normalise every row to (norm_value, norm_unit).
# Canonical unit implied by the annotation key when the unit itself is
# unknown; every key not listed here (including None) means pieces.
_fallback_canon = {"gewicht": CANON_G, "volumen": CANON_L}

vals, units = [], []
for _, row in annotated_data.iterrows():
    try:
        ann = parse_ann(row.get("ingr_annotation"))
        if not ann:
            raise ValueError("no_json")
        key, amt = pick_amount(ann)
        # No number in the annotation: try the raw amount text, then assume 1.
        if amt is None:
            amt = num(row.get("amount"))
        if amt is None:
            amt = 1.0
        # Clean the unit and fold ambiguous volume aliases to their canonical key.
        u = canon_amb(clean_unit(ann.get("einheit")))
        # Known unit: scale by its factor. Unknown unit: keep the number as is
        # (factor 1.0) and take the canonical unit from the annotation key.
        f, target = unit_map.get(u, (1.0, _fallback_canon.get(key, CANON_P)))
        val, unit = float(amt) * f, target
    except Exception:
        # Best effort: rows that cannot be processed count as pieces.
        v = num(row.get("amount"))
        val, unit = (1.0 if v is None else v), CANON_P
    vals.append(val)
    units.append(unit)

annotated_data["norm_value"] = vals
annotated_data["norm_unit"] = units

# ------------------ preview ------------------
# Show at most the first 20 BASE keys and how many more there are.
shown_keys = sorted(BASE)[:20]
hidden_count = max(len(BASE) - 20, 0)
preview_columns = ["ingredient", "amount", "ingr_annotation", "norm_value", "norm_unit"]

print("Present tokens from amount:", sorted(present_tokens_amount))
print("Present units from annotation:", sorted(present_units_ann))
print(f"BASE extended with present aliases. Current keys: {shown_keys} ... (+{hidden_count} more)")
print(f"Ambiguous VOLUME units considered (present & with defaults): {AMB_USED}")
print(f"Evidence → ambiguous_evidence.csv ({len(df_evidence)} rows)")
print("Summary  → ambiguous_summary.csv")
display(df_summary)
display(annotated_data[preview_columns].head(60))

# Save the full table, including the two new norm_* columns.
NORMALIZED_CSV = "normalized_units.csv"
annotated_data.to_csv(NORMALIZED_CSV, index=False, encoding="utf-8")
print(f"Saved full normalized table {NORMALIZED_CSV}")




Present tokens from amount: ['bd', 'becher', 'bestecklöffel', 'beutel', 'blatt', 'blätter', 'bund', 'bündel', 'cl', 'dose', 'dosen', 'el', 'esslöffel', 'frucht', 'g', 'glas', 'gr', 'gramm', 'hand', 'kg', 'kl', 'klecks', 'kleine', 'kleinere', 'knolle', 'kopf', 'l', 'lange', 'liter', 'mal', 'mg', 'mittelgross', 'mittelgrosse', 'ml', 'msp', 'pack', 'pck', 'prise', 'päckchen', 'scheibe', 'scheiben', 'schuss', 'schüssel', 'spr', 'spritzer', 'stange', 'stangen', 'stck', 'stiel', 'stk', 'stück', 'stücke', 'tasse', 'tassen', 'teelöffel', 'tl', 'tropfen', 'voll', 'würfel', 'zehe', 'zehen', 'zweig', 'zweige']
Present units from annotation: ['bd', 'becher', 'bestecklöffel', 'beutel', 'blatt', 'bund', 'bündel', 'cl', 'dose', 'dosen', 'el', 'esslöffel', 'frucht', 'g', 'glas', 'gr', 'gramm', 'hand', 'hand voll', 'kg', 'klecks', 'knolle', 'kopf', 'l', 'limette', 'liter', 'mal', 'mg', 'ml', 'msp', 'pack', 'pck', 'prise', 'päckchen', 'scheibe', 'scheiben', 'schuss', 'schüssel', 'spritzer', 'stange', 's

Unnamed: 0,unit,evidence_n,chosen_ml_per_unit,source
0,becher,0,250.0,default_fallback
1,el,0,15.0,default_fallback
2,glas,0,200.0,default_fallback
3,msp,0,0.5,default_fallback
4,prise,0,0.5,default_fallback
5,tasse,0,250.0,default_fallback
6,tl,0,5.0,default_fallback


Unnamed: 0,ingredient,amount,ingr_annotation,norm_value,norm_unit
0,zucchini,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0,stk
1,paprika rot,2,"{""anzahl"": 2, ""einheit"": ""St\u00fcck""}",2.0,stk
2,feta,200 g,"{""gewicht"": 200, ""einheit"": ""g""}",200.0,g
3,schinken,250 g,"{""gewicht"": 250, ""einheit"": ""g""}",250.0,g
4,zwiebel frisch,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0,stk
5,passierte tomaten,300 g,"{""gewicht"": 300, ""einheit"": ""g""}",300.0,g
6,hähnchenbrust frisch,500 g,"{""gewicht"": 500, ""einheit"": ""g""}",500.0,g
7,zwiebel frisch,120 g,"{""gewicht"": 120, ""einheit"": ""g""}",120.0,g
8,austernpilze,300 g,"{""gewicht"": 300, ""einheit"": ""g""}",300.0,g
9,schmand,1 becher,"{""volumen"": 1, ""einheit"": ""becher""}",0.25,l


Saved full normalized table normalized_units.csv
