In [1]:
import pandas as pd
import subprocess, re, time
from pathlib import Path
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Ollama model tag passed to `ollama run` for every classification request.
MODEL = "llama3.1:8b"
# CSV of incident reports to classify; main() reads its "title" and "text"
# columns and, when present, "hazard-category" / "product-category" as labels.
INPUT_CSV = "incidents_test.csv"  # <-- test

# Directory that receives the results CSV; created here if it does not exist.
OUT_DIR = Path("results")
OUT_DIR.mkdir(exist_ok=True)

# Results file; main() also reads it back at start-up to resume a previous run.
OUT_CSV = OUT_DIR / "zero_shot_test_all_strict.csv"  # <-- test output
# A checkpoint of the results is written whenever (row index + 1) is a
# multiple of this value.
SAVE_EVERY = 20

# Allowed hazard-category labels. Predictions are accepted only when they
# match one of these strings exactly (see parse_two_lines).
HAZ_LIST = [
    "allergens", "biological", "chemical", "food additives and flavourings",
    "foreign bodies", "fraud", "migration", "organoleptic aspects",
    "other hazard", "packaging defect"
]

# Allowed product-category labels, validated the same way as HAZ_LIST.
PROD_LIST = [
    "alcoholic beverages", "cereals and bakery products",
    "cocoa and cocoa preparations, coffee and tea", "confectionery",
    "dietetic foods, food supplements, fortified foods", "fats and oils",
    "feed materials", "food additives and flavourings", "food contact materials",
    "fruits and vegetables", "herbs and spices", "honey and royal jelly",
    "ices and desserts", "meat, egg and dairy products", "non-alcoholic beverages",
    "nuts, nut products and seeds", "other food product / mixed", "pet feed",
    "prepared dishes and snacks", "seafood",
    "soups, broths, sauces and condiments", "sugars and syrups"
]

# Zero-shot classification prompt. main() fills the four str.format
# placeholders: {haz_opts} and {prod_opts} with the "; "-joined allowed label
# lists, {title} and {text} with the incident report fields. The requested
# two-line answer format is what parse_two_lines() looks for. The trailing
# .strip() removes the whitespace around the triple-quoted literal.
PROMPT_TEMPLATE = """You are a food safety incident classifier.

Choose labels ONLY from the allowed lists.
Do NOT invent new labels. If unsure, choose the closest match based on definitions.

Hazard-category definitions (use these):
- allergens: undeclared allergens or allergen contamination.
- biological: pathogenic microorganisms (bacteria/viruses/parasites), spoilage microbes, mycotoxins of biological origin.
- chemical: chemicals, toxins (non-biological), residues, heavy metals, cleaning agents.
- food additives and flavourings: issues specifically about additives/flavourings (unauthorized, excessive, wrong additive).
- foreign bodies: physical objects (plastic, glass, metal, stones).
- fraud: intentional deception (mislabeling, substitution, counterfeit, origin fraud).
- migration: substances migrating from packaging/materials into food.
- organoleptic aspects: abnormal smell/taste/texture/appearance without a clear contaminant.
- other hazard: hazards that do not fit above categories.
- packaging defect: packaging integrity/closure/seal defects causing risk.

Product-category guidance:
Pick the category that best matches the recalled product itself (not the ingredient).
If the product is a meal/ready-to-eat item, prefer "prepared dishes and snacks".
If it is clearly a raw/processed meat/egg/dairy item, prefer "meat, egg and dairy products".

Before choosing labels, internally identify:
- the hazard evidence phrase (e.g., "Listeria", "plastic fragment", "undeclared milk")
- the recalled product phrase (e.g., "ham, sliced", "cookies", "baby formula")
Use those internal findings to select the closest allowed labels.

Output ONLY two lines, no extra text.
Line1 must be: hazard-category: <one of: {haz_opts}>
Line2 must be: product-category: <one of: {prod_opts}>

Report:
Title: {title}
Text: {text}
""".strip()


def run_ollama(prompt: str, timeout: "float | None" = 600.0) -> str:
    """Send one prompt to the Ollama CLI and return the model's reply.

    Runs ``ollama run MODEL <prompt>`` as a child process, decoding its output
    as UTF-8 (undecodable bytes are replaced rather than raising).

    Args:
        prompt: Full prompt text, passed as a single command-line argument.
        timeout: Maximum number of seconds to wait for the process. Pass
            ``None`` to wait indefinitely.

    Returns:
        The process's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the process exits with a non-zero status or does not
            finish within ``timeout`` seconds.
    """
    try:
        res = subprocess.run(
            ["ollama", "run", MODEL, prompt],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep the single documented failure type so callers need only one
        # except clause; subprocess.run has already killed the child here.
        raise RuntimeError(f"ollama timed out after {timeout} seconds") from exc

    if res.returncode != 0:
        # Always include the exit code: stderr can be empty, and an empty
        # message would be indistinguishable from "no error" once logged.
        detail = res.stderr.strip() or "no stderr output"
        raise RuntimeError(f"ollama exited with code {res.returncode}: {detail}")
    return res.stdout.strip()


def parse_two_lines(out: str):
    """Extract and validate the two predicted labels from raw model output.

    For each of ``hazard-category`` and ``product-category`` the first
    ``<key>: <value>`` occurrence is located (the key is matched
    case-insensitively), the rest of that line is cleaned up, and the result
    is accepted only if it equals an entry of HAZ_LIST / PROD_LIST exactly.

    Returns:
        A 5-tuple ``(haz, prod, parse_ok, haz_in_list, prod_in_list)`` where
        ``haz`` / ``prod`` are the accepted labels or ``None``, the two
        ``*_in_list`` flags say whether each label was accepted, and
        ``parse_ok`` is True only when both were.
    """

    def _tidy(raw):
        # Drop surrounding whitespace and quotes, collapse internal runs of
        # whitespace, then trim trailing punctuation.
        if raw is None:
            return None
        cleaned = raw.strip().strip('"').strip("'").strip()
        cleaned = re.sub(r"\s+", " ", cleaned)
        return cleaned.rstrip(" .,:;")

    def _pick(pattern, allowed):
        # Returns (label or None, whether the label was in the allowed list).
        found = re.search(pattern, out, flags=re.IGNORECASE)
        label = _tidy(found.group(1).strip()) if found else None
        accepted = (label in allowed) if label else False
        return (label if accepted else None), accepted

    haz, haz_in_list = _pick(r"hazard-category\s*:\s*(.+)", HAZ_LIST)
    prod, prod_in_list = _pick(r"product-category\s*:\s*(.+)", PROD_LIST)

    parse_ok = not (haz is None or prod is None)
    return haz, prod, parse_ok, haz_in_list, prod_in_list


def compute_metrics_if_labels_exist(df_res: pd.DataFrame):
    """Print accuracy, F1 scores and a classification report for both tasks.

    Metrics are computed only over rows whose ``parse_ok`` flag is True, and
    labels are compared as strings. Nothing is computed (a notice is printed
    instead) when the ground-truth columns are absent, when either of them is
    entirely empty, or when no row has ``parse_ok`` set.

    Args:
        df_res: Results frame with ``parse_ok``, ``haz_pred``, ``prod_pred``
            and, optionally, ``haz_true`` / ``prod_true`` columns.
    """
    truth_cols = ("haz_true", "prod_true")

    # A test set without ground truth is not an error: just skip the metrics.
    if not set(truth_cols).issubset(df_res.columns):
        print("\nNo ground-truth columns found. Skipping metrics.")
        return
    if any(df_res[col].isna().all() for col in truth_cols):
        print("\nGround-truth appears empty. Skipping metrics.")
        return

    parsed = df_res.loc[df_res["parse_ok"] == True].copy()  # noqa: E712
    kept, total = len(parsed), len(df_res)
    if not kept:
        print("\nNo parse_ok samples. Cannot compute metrics.")
        return

    # (header template, ground-truth column, prediction column) per task.
    tasks = (
        ("\n=== Hazard-category Metrics (on parse_ok samples: {}/{}) ===", "haz_true", "haz_pred"),
        ("\n=== Product-category Metrics (on parse_ok samples: {}/{}) ===", "prod_true", "prod_pred"),
    )
    f1_variants = (
        ("Macro-F1:", "macro"),
        ("Micro-F1:", "micro"),
        ("Weighted-F1:", "weighted"),
    )

    for header, true_col, pred_col in tasks:
        y_true = parsed[true_col].astype(str)
        y_pred = parsed[pred_col].astype(str)
        print(header.format(kept, total))
        print("Accuracy:", accuracy_score(y_true, y_pred))
        for caption, averaging in f1_variants:
            print(caption, f1_score(y_true, y_pred, average=averaging, zero_division=0))
        print("\nClassification Report:\n")
        print(classification_report(y_true, y_pred, zero_division=0))


def _field_as_text(record, column: str) -> str:
    """Return ``record[column]`` as prompt text, mapping missing values to ""."""
    value = record.get(column, "")
    if isinstance(value, str):
        return value
    # pandas reads empty CSV cells as NaN; str(NaN) would inject the literal
    # word "nan" into the prompt as if it were report content.
    if value is None or pd.isna(value):
        return ""
    return str(value)


def main():
    """Classify every row of INPUT_CSV with the model and save the results.

    For each input row a prompt is built from its ``title`` and ``text``
    columns, sent through run_ollama(), and the reply is parsed with
    parse_two_lines(). One result record per row (predictions, validity flags,
    optional ground truth, latency, error text and truncated raw output) is
    written to OUT_CSV, with a checkpoint every SAVE_EVERY rows. Rows whose
    index already appears in an existing OUT_CSV are reused instead of being
    re-run. A summary and, when labels are available, metrics are printed at
    the end.
    """
    df = pd.read_csv(INPUT_CSV)
    n = len(df)
    print(f"Loaded {n} rows from {INPUT_CSV}")
    if n == 0:
        # With no rows the results frame would have no columns and the
        # summary below would fail with KeyError.
        print("Nothing to classify.")
        return

    # resume support
    done = {}
    if OUT_CSV.exists():
        prev = pd.read_csv(OUT_CSV)
        for _, r in prev.iterrows():
            done[int(r["idx"])] = r.to_dict()
        print(f"Resuming: found {len(done)} already processed rows in {OUT_CSV}")

    haz_opts = "; ".join(HAZ_LIST)
    prod_opts = "; ".join(PROD_LIST)

    rows = []
    start_time = time.time()

    for i, r in df.iterrows():
        if int(i) in done:
            rows.append(done[int(i)])
            continue

        prompt = PROMPT_TEMPLATE.format(
            haz_opts=haz_opts,
            prod_opts=prod_opts,
            title=_field_as_text(r, "title"),
            text=_field_as_text(r, "text"),
        )

        t0 = time.time()
        try:
            out = run_ollama(prompt)
            haz_pred, prod_pred, parse_ok, haz_ok, prod_ok = parse_two_lines(out)
            err = ""
        except Exception as e:
            # Per-row boundary: one failed request must not abort the whole
            # batch, so the failure is recorded in the "error" column instead.
            out = ""
            haz_pred, prod_pred, parse_ok, haz_ok, prod_ok = None, None, False, False, False
            err = str(e)

        dt = time.time() - t0

        row = {
            "idx": int(i),
            "parse_ok": bool(parse_ok),
            "haz_pred": haz_pred,
            "prod_pred": prod_pred,
            "haz_in_list": bool(haz_ok),
            "prod_in_list": bool(prod_ok),
            # these may not exist in test; safe:
            "haz_true": r.get("hazard-category", None),
            "prod_true": r.get("product-category", None),
            "latency_sec": round(dt, 3),
            "error": err,
            # Truncated so a runaway generation cannot bloat the CSV.
            "raw_output": out[:1500],
        }
        rows.append(row)

        if (i + 1) % 10 == 0:
            elapsed = time.time() - start_time
            print(f"[{i+1}/{n}] parse_ok={parse_ok} haz={haz_pred} prod={prod_pred} (elapsed {elapsed/60:.1f} min)")

        if (i + 1) % SAVE_EVERY == 0:
            pd.DataFrame(rows).to_csv(OUT_CSV, index=False)

    out_df = pd.DataFrame(rows)
    out_df.to_csv(OUT_CSV, index=False)
    print(f"\nSaved final: {OUT_CSV}")

    print("\nParse OK rate:", out_df["parse_ok"].mean())
    print("Hazard in-list rate:", out_df["haz_in_list"].mean())
    print("Product in-list rate:", out_df["prod_in_list"].mean())
    print("Avg latency (sec):", out_df["latency_sec"].mean())

    compute_metrics_if_labels_exist(out_df)


# Run the classification pipeline only when this file is the entry point,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 997 rows from incidents_test.csv
[10/997] parse_ok=True haz=biological prod=meat, egg and dairy products (elapsed 0.3 min)
[20/997] parse_ok=True haz=biological prod=cereals and bakery products (elapsed 0.4 min)
[30/997] parse_ok=True haz=foreign bodies prod=non-alcoholic beverages (elapsed 0.5 min)
[40/997] parse_ok=True haz=allergens prod=prepared dishes and snacks (elapsed 0.6 min)
[50/997] parse_ok=True haz=chemical prod=meat, egg and dairy products (elapsed 0.7 min)
[60/997] parse_ok=True haz=foreign bodies prod=meat, egg and dairy products (elapsed 0.8 min)
[70/997] parse_ok=True haz=biological prod=prepared dishes and snacks (elapsed 0.9 min)
[80/997] parse_ok=True haz=allergens prod=prepared dishes and snacks (elapsed 1.0 min)
[90/997] parse_ok=True haz=biological prod=prepared dishes and snacks (elapsed 1.0 min)
[100/997] parse_ok=True haz=biological prod=nuts, nut products and seeds (elapsed 1.1 min)
[110/997] parse_ok=True haz=biological prod=prepared dishes and snack