In [4]:
# - Normalizes to canonical units: g, l, stk
# - Uses NFKC + tolerant numeric parsing (handles 1½, 1 ½, ½, 1/2, full-width digits, ㎖, etc.)
# - Learns ml-per-unit for EL/TL/Tasse/Becher/Glas/Prise/MSP from co-mentions in free-text `amount`
# - Additionally saves:
#     * ambiguous_evidence.csv  (row-level co-mentions with ml_per_unit)
#     * ambiguous_summary.csv (per-unit stats and chosen value)


import json, re, unicodedata
import pandas as pd
from statistics import median

# the data morty...always read the data
annotated_data = pd.read_csv(r"gemma_annotation_original.csv")

# our helper functions
def _nfkc(s: str) -> str:
    return unicodedata.normalize("NFKC", s)

def num(x):
    """
    Robust numeric parser:
      - NFKC normalization (full-width digits → ASCII, ㎖ → ml, etc.)
      - decimals with , or .
      - ascii fractions: 1/2 and mixed "1 1/2"
      - unicode fractions: ½ ¼ ¾ ⅓ ⅔ ⅛ ⅜ ⅝ ⅞ (also "1½" / "1 ½")
      - strips trailing words ("1 ½ liter" -> "1 ½")
      - converts Unicode fraction slash ⁄ to /
    """
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return None
    s = _nfkc(str(x)).strip().lower()
    s = s.replace(",", ".").replace("⁄", "/")
    s = re.sub(r"(\d)([½¼¾⅓⅔⅛⅜⅝⅞])", r"\1 \2", s)  # "1½" -> "1 ½"
    s = re.split(r"\s+[a-zäöüß]+", s, maxsplit=1)[0]  # cut trailing unit words

    UF = {"½":0.5,"¼":0.25,"¾":0.75,"⅓":1/3,"⅔":2/3,"⅛":0.125,"⅜":0.375,"⅝":0.625,"⅞":0.875}
    total, seen = 0.0, False
    for tok in s.split():
        if tok in UF:
            total += UF[tok]; seen = True; continue
        if re.fullmatch(r"\d+/\d+", tok):
            n,d = tok.split("/")
            try: total += float(n)/float(d); seen = True; continue
            except: break
        try:
            total += float(tok); seen = True; continue
        except:
            break
    return total if seen else None

def clean_unit(u):
    if u is None or (isinstance(u, float) and pd.isna(u)):
        return ""
    s = _nfkc(str(u)).lower().strip().replace(".","").replace("⁄","/")
    return s

def parse_ann(raw):
    return raw if isinstance(raw, dict) else (json.loads(raw) if isinstance(raw, str) else None)

def pick_amount(d):
    for k in ("gewicht","volumen","menge","anzahl"):
        v = num(d.get(k))
        if v is not None:
            return k, v
    return None, None

# ------------------ Canonical + unit tables ------------------
CANON_G, CANON_L, CANON_P = "g", "l", "stk"

BASE = {
    # weight -> g
    "g":(1.0,CANON_G), "gramm":(1.0,CANON_G), "gram":(1.0,CANON_G), "gr":(1.0,CANON_G),
    "kg":(1000.0,CANON_G), "kilogramm":(1000.0,CANON_G), "mg":(0.001,CANON_G),
    # volume -> l
    "l":(1.0,CANON_L), "liter":(1.0,CANON_L),
    "ml":(0.001,CANON_L), "milliliter":(0.001,CANON_L), "mililiter":(0.001,CANON_L),
    "㎖":(0.001,CANON_L),  # actually when I did NFKC it handles this parsing but kept for safety
    # pieces -> stk
    "stk":(1.0,CANON_P), "stück":(1.0,CANON_P), "stueck":(1.0,CANON_P),
    "piece":(1.0,CANON_P), "anzahl":(1.0,CANON_P)
}

# Ambiguous measures -> volume via ml-per-unit
AMB = {"el","esslöffel","essloeffel","tl","teelöffel","teeloeffel",
       "prise","msp","messerspitze","glas","tasse","becher"}

DEFAULT_ML = {
    "el":15,"esslöffel":15,"essloeffel":15,
    "tl":5,"teelöffel":5,"teeloeffel":5,
    "prise":0.5,"msp":0.5,"messerspitze":0.5,
    "glas":200,"tasse":250,"becher":250
}

# Learning from co-mentions in our free text amount column (and capture evidence out of...paranoia?) 
EVIDENCE_CSV = "ambiguous_evidence.csv"
SUMMARY_CSV  = "ambiguous_summary.csv"
SUMMARY_JSON = "ambiguous_summary.json"
MIN_EVIDENCE = 3  # we can always change this but for now checking for median among 3 found

learned_lists = {k: [] for k in AMB}
evidence_rows = []

if "amount" in annotated_data.columns:
    pat = re.compile(
        r"(\d+[.,]?\d*|\d+/\d+|[½¼¾⅓⅔⅛⅜⅝⅞])\s*"
        r"(el|esslöffel|essloeffel|tl|teelöffel|teeloeffel|tasse|becher|glas|msp|messerspitze|prise)"
        r".*?(\d+[.,]?\d*)\s*(ml|l|㎖)",
        re.IGNORECASE
    )
    for idx, raw in annotated_data["amount"].astype(str).fillna("").items():
        txt = _nfkc(raw)
        for m in pat.finditer(txt):
            cnt = num(m.group(1))
            u   = clean_unit(m.group(2))
            mlv = _nfkc(m.group(3)).replace(",", ".")
            try:
                ml = float(mlv)
            except:
                continue
            unit_found = clean_unit(m.group(4))
            if unit_found in ("l","liter"):
                ml *= 1000.0  # normalize to ml for evidence
            if cnt and cnt > 0 and u in AMB:
                ml_per_unit = ml / cnt
                learned_lists[u].append(ml_per_unit)
                evidence_rows.append({
                    "row_index": idx,
                    "amount_text": raw,
                    "count": cnt,
                    "ambiguous_unit": u,
                    "explicit_volume_ml": ml,
                    "explicit_volume_unit_raw": m.group(4),
                    "ml_per_unit": ml_per_unit
                })

# Persist evidence (even if empty) for awareness
df_evidence = pd.DataFrame(evidence_rows)
df_evidence.to_csv(EVIDENCE_CSV, index=False, encoding="utf-8")

# Aggregate per-unit stats and choose learned/default (did we learn from data or use our fallback values)
summary_rows, ML_PER = [], {}
for u in sorted(AMB):
    vals = learned_lists.get(u, [])
    n = len(vals)
    if n >= MIN_EVIDENCE:
        chosen = median(vals); source = "learned_median"
    else:
        chosen = DEFAULT_ML.get(u); source = "default_fallback"
    ML_PER[u] = chosen
    row = {"unit": u, "evidence_n": n, "chosen_ml_per_unit": chosen, "source": source}
    if n:
        v = sorted(vals)
        row.update({"min": v[0], "median": median(vals), "max": v[-1],
                    "mean": sum(vals)/n})
    summary_rows.append(row)

df_summary = pd.DataFrame(summary_rows)
df_summary.to_csv(SUMMARY_CSV, index=False, encoding="utf-8")


# building our unit map finally
unit_map = dict(BASE)
for k, ml in ML_PER.items():
    if ml is not None:
        unit_map[k] = (ml/1000.0, CANON_L)  

# just normalizing rows here
vals, units = [], []
for _, row in annotated_data.iterrows():
    try:
        ann = parse_ann(row.get("ingr_annotation"))
        if not ann:
            raise ValueError("no_json")
        key, amt = pick_amount(ann)
        if amt is None:
            amt = num(row.get("amount"))
            if amt is None:
                amt = 1.0
        u = clean_unit(ann.get("einheit"))
        if u in unit_map:
            f, target = unit_map[u]
            val, unit = float(amt)*f, target
        else:
            if key in ("menge","anzahl"):   val, unit = float(amt), CANON_P
            elif key == "gewicht":          val, unit = float(amt), CANON_G
            elif key == "volumen":          val, unit = float(amt), CANON_L
            else:                           val, unit = float(amt), CANON_P
    except Exception:
        v = num(row.get("amount"))
        val, unit = (v if v is not None else 1.0), CANON_P
    vals.append(val); units.append(unit)

annotated_data["norm_value"] = vals
annotated_data["norm_unit"]  = units

# 
print(f"Ambiguous evidence out of curiosity: {EVIDENCE_CSV} ({len(df_evidence)} rows)")
print(f"Just keeping same summary in 2 different formats: {SUMMARY_CSV}")
display(df_summary)
display(annotated_data[["ingredient","amount","ingr_annotation","norm_value","norm_unit"]].head(80))


Ambiguous evidence out of curiosity: ambiguous_evidence.csv (0 rows)
Just keeping same summary in 2 different formats: ambiguous_summary.csv


Unnamed: 0,unit,evidence_n,chosen_ml_per_unit,source
0,becher,0,250.0,default_fallback
1,el,0,15.0,default_fallback
2,essloeffel,0,15.0,default_fallback
3,esslöffel,0,15.0,default_fallback
4,glas,0,200.0,default_fallback
5,messerspitze,0,0.5,default_fallback
6,msp,0,0.5,default_fallback
7,prise,0,0.5,default_fallback
8,tasse,0,250.0,default_fallback
9,teeloeffel,0,5.0,default_fallback


Unnamed: 0,ingredient,amount,ingr_annotation,norm_value,norm_unit
0,zucchini,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0000,stk
1,paprika rot,2,"{""anzahl"": 2, ""einheit"": ""St\u00fcck""}",2.0000,stk
2,feta,200 g,"{""gewicht"": 200, ""einheit"": ""g""}",200.0000,g
3,schinken,250 g,"{""gewicht"": 250, ""einheit"": ""g""}",250.0000,g
4,zwiebel frisch,1,"{""anzahl"": 1, ""einheit"": ""St\u00fcck""}",1.0000,stk
...,...,...,...,...,...
75,sonnenblumenkerne geröstet,1 el,"{""volumen"": 1, ""einheit"": ""el""}",0.0150,l
76,pfeffer aus der mühle,1 prise,"{""anzahl"": 1, ""einheit"": ""Prise""}",0.0005,l
77,salz aus der mühle,1 prise,"{""anzahl"": 1, ""einheit"": ""Prise"", ""zutat"": ""sa...",0.0005,l
78,spaghetti,170 g,"{""gewicht"": 170, ""einheit"": ""g""}",170.0000,g
