In [1]:
import pandas as pd
import random
from collections import defaultdict

SEED = 42
random.seed(SEED)

TRAIN_CSV = "incidents_train.csv"
train_df = pd.read_csv(TRAIN_CSV).fillna("")

# label lists
haz_labels = sorted(train_df["hazard-category"].unique().tolist())
prod_labels = sorted(train_df["product-category"].unique().tolist())

def short_text(s, n=350):
    s = str(s).replace("\n", " ").strip()
    return s[:n]


HAZ_DEF = {
    "allergens": "Allergen contamination or undeclared allergens (e.g., peanuts, milk, gluten).",
    "biological": "Microbiological hazards: bacteria, viruses, parasites (e.g., Salmonella, Listeria).",
    "chemical": "Chemical contamination: toxins, residues, cleaning agents, heavy metals.",
    "food additives and flavourings": "Issues with additives/flavourings: unauthorized, excessive, misused.",
    "foreign bodies": "Physical foreign objects in food: plastic, metal, glass, stones.",
    "fraud": "Food fraud/mislabelling/adulteration: substitution, counterfeit, incorrect origin.",
    "migration": "Migration of substances from packaging/contact materials into food.",
    "organoleptic aspects": "Quality/sensory issues: taste, smell, texture, spoilage not necessarily pathogens.",
    "other hazard": "Hazard type not covered by other categories (miscellaneous).",
    "packaging defect": "Defects in packaging integrity: seal failure, leakage, broken packaging."
}

PROD_DEF = {lab: f"Product category: {lab}." for lab in prod_labels}  

# --- Build documents
docs = []

# Hazard definition docs
for lab in haz_labels:
    definition = HAZ_DEF.get(lab, f"Hazard category: {lab}.")
    docs.append({
        "doc_id": f"haz_def::{lab}",
        "group": "hazard",
        "label": lab,
        "text": f"[HAZARD DEFINITION]\nLabel: {lab}\nDefinition: {definition}"
    })

# Product definition docs
for lab in prod_labels:
    definition = PROD_DEF.get(lab, f"Product category: {lab}.")
    docs.append({
        "doc_id": f"prod_def::{lab}",
        "group": "product",
        "label": lab,
        "text": f"[PRODUCT DEFINITION]\nLabel: {lab}\nDefinition: {definition}"
    })

# --- Example docs (few per label from TRAIN only)
EXAMPLES_PER_LABEL = 8 
def sample_examples(df, col, label, k):
    sub = df[df[col] == label]
    if len(sub) == 0:
        return []
    idxs = list(sub.index)
    random.shuffle(idxs)
    idxs = idxs[:min(k, len(idxs))]
    rows = []
    for i in idxs:
        r = df.loc[i]
        rows.append(
            f"Title: {short_text(r['title'],120)}\nText: {short_text(r['text'],350)}\n"
        )
    return rows

for lab in haz_labels:
    exs = sample_examples(train_df, "hazard-category", lab, EXAMPLES_PER_LABEL)
    for j, ex in enumerate(exs):
        docs.append({
            "doc_id": f"haz_ex::{lab}::{j}",
            "group": "hazard",
            "label": lab,
            "text": f"[HAZARD EXAMPLE]\nLabel: {lab}\n{ex}"
        })

for lab in prod_labels:
    exs = sample_examples(train_df, "product-category", lab, EXAMPLES_PER_LABEL)
    for j, ex in enumerate(exs):
        docs.append({
            "doc_id": f"prod_ex::{lab}::{j}",
            "group": "product",
            "label": lab,
            "text": f"[PRODUCT EXAMPLE]\nLabel: {lab}\n{ex}"
        })

print("KB docs:", len(docs))
print(docs[0]["text"][:300])

KB docs: 277
[HAZARD DEFINITION]
Label: allergens
Definition: Allergen contamination or undeclared allergens (e.g., peanuts, milk, gluten).


In [2]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL)

corpus_texts = [d["text"] for d in docs]

emb = embedder.encode(
    corpus_texts,
    convert_to_numpy=True,
    show_progress_bar=True
)

# cosine similarity via normalized inner product
faiss.normalize_L2(emb)
dim = emb.shape[1]

index = faiss.IndexFlatIP(dim)
index.add(emb)

print("Embedding shape:", emb.shape)
print("FAISS index size:", index.ntotal)



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Embedding shape: (277, 384)
FAISS index size: 277


In [3]:
import faiss

def retrieve(query, k=5, group=None):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    scores, ids = index.search(q_emb, k*6)  

    results = []
    for score, idx in zip(scores[0], ids[0]):
        d = docs[int(idx)]
        if group is None or d["group"] == group:
            results.append((float(score), d))
        if len(results) >= k:
            break
    return results


sample_query = """Recall Notification: FSIS-024-94
Product: SMOKED CHICKEN SAUSAGE
Problem: BACTERIA
Description: LISTERIA"""

haz_hits = retrieve(sample_query, k=5, group="hazard")
prod_hits = retrieve(sample_query, k=5, group="product")

print("\n--- HAZARD TOP-5 ---")
for s, d in haz_hits:
    print(f"\nscore={s:.3f} | {d['doc_id']} | label={d['label']}")
    print(d["text"][:220])

print("\n--- PRODUCT TOP-5 ---")
for s, d in prod_hits:
    print(f"\nscore={s:.3f} | {d['doc_id']} | label={d['label']}")
    print(d["text"][:220])


--- HAZARD TOP-5 ---

score=0.624 | haz_ex::biological::5 | label=biological
[HAZARD EXAMPLE]
Label: biological
Title: Recall of Additional Smoked Meat Products sold at Finns Butchers’ Shop, Co. Cork Due to Processing in an Unapproved Faci
Text: Recall of Additional Smoked Meat Products sold at F

score=0.621 | haz_ex::foreign bodies::5 | label=foreign bodies
[HAZARD EXAMPLE]
Label: foreign bodies
Title: Schneiders brand Mock Chicken Loaf recalled due to presence of pieces of rubber
Text: Notice This archive of previously issued food recalls and allergy alerts is provided for

score=0.613 | haz_ex::packaging defect::6 | label=packaging defect
[HAZARD EXAMPLE]
Label: packaging defect
Title: President's Choice brand Roasted Garlic Mayo Sandwich Spread recalled due to bursting bottles
Text: Notification - President's Choice brand Roasted Garlic Mayo Sandwich Spr

score=0.604 | haz_ex::biological::2 | label=biological
[HAZARD EXAMPLE]
Label: biological
Title: Pepperidge Farm brand Goldfis

In [4]:
import numpy as np
import faiss

# --- split docs
haz_docs = [d for d in docs if d["group"] == "hazard"]
prod_docs = [d for d in docs if d["group"] == "product"]

# --- embed separately
haz_texts = [d["text"] for d in haz_docs]
prod_texts = [d["text"] for d in prod_docs]

haz_emb = embedder.encode(haz_texts, convert_to_numpy=True, show_progress_bar=True)
prod_emb = embedder.encode(prod_texts, convert_to_numpy=True, show_progress_bar=True)

faiss.normalize_L2(haz_emb)
faiss.normalize_L2(prod_emb)

haz_index = faiss.IndexFlatIP(haz_emb.shape[1])
prod_index = faiss.IndexFlatIP(prod_emb.shape[1])

haz_index.add(haz_emb)
prod_index.add(prod_emb)

print("Haz docs:", len(haz_docs), " | Haz index:", haz_index.ntotal)
print("Prod docs:", len(prod_docs), " | Prod index:", prod_index.ntotal)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Haz docs: 85  | Haz index: 85
Prod docs: 192  | Prod index: 192


In [5]:
def retrieve_with_def(query, index, docs_list, k=5):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    scores, ids = index.search(q_emb, k*8)


    defs = []
    exs = []
    for s, idx in zip(scores[0], ids[0]):
        d = docs_list[int(idx)]
        if "::def::" in d["doc_id"] or d["doc_id"].startswith(("haz_def::", "prod_def::")):
            defs.append((float(s), d))
        else:
            exs.append((float(s), d))


    chosen = []
    if len(defs) > 0:
        chosen.append(defs[0]) 
    chosen.extend(exs[:max(0, k - len(chosen))])
    return chosen

def get_rag_context(title, text, k_h=5, k_p=5):
    query = (str(title) + " " + str(text)).strip()
    haz_hits = retrieve_with_def(query, haz_index, haz_docs, k=k_h)
    prod_hits = retrieve_with_def(query, prod_index, prod_docs, k=k_p)

    ctx_h = "\n\n".join([f"(score={s:.3f}) {d['doc_id']} | label={d['label']}\n{d['text']}" for s,d in haz_hits])
    ctx_p = "\n\n".join([f"(score={s:.3f}) {d['doc_id']} | label={d['label']}\n{d['text']}" for s,d in prod_hits])
    return ctx_h, ctx_p

In [6]:
sample_title = "Recall Notification: FSIS-024-94"
sample_text = """Product: SMOKED CHICKEN SAUSAGE
Problem: BACTERIA
Description: LISTERIA"""

ctx_h, ctx_p = get_rag_context(sample_title, sample_text, k_h=5, k_p=5)
print("\n--- HAZARD CONTEXT (preview) ---\n", ctx_h[:600])
print("\n--- PRODUCT CONTEXT (preview) ---\n", ctx_p[:600])


--- HAZARD CONTEXT (preview) ---
 (score=0.417) haz_def::biological | label=biological
[HAZARD DEFINITION]
Label: biological
Definition: Microbiological hazards: bacteria, viruses, parasites (e.g., Salmonella, Listeria).

(score=0.624) haz_ex::biological::5 | label=biological
[HAZARD EXAMPLE]
Label: biological
Title: Recall of Additional Smoked Meat Products sold at Finns Butchers’ Shop, Co. Cork Due to Processing in an Unapproved Faci
Text: Recall of Additional Smoked Meat Products sold at Finns Butchers’ Shop, Co. Cork Due to Processing in an Unapproved Facility Tweet Thursday, 16 August 2018 Summary Category 2: For Informat

--- PRODUCT CONTEXT (preview) ---
 (score=0.675) prod_ex::prepared dishes and snacks::7 | label=prepared dishes and snacks
[PRODUCT EXAMPLE]
Label: prepared dishes and snacks
Title: Culinary Creations Gourmet brand & Denny's Express brand roast beef-containing sandwiches recalled due to Listeria monoc


In [7]:
from collections import Counter
import re

def extract_keywords(text, topn=12):

    toks = re.findall(r"[a-zA-Z]{4,}", str(text).lower())
    stop = set(["this","that","with","from","have","were","will","been","case","date","closed","opened",
                "recall","notification","report","press","release","product","problem","description",
                "company","class","total","pounds","recovered","distribution","reason","risk","possible"])
    toks = [t for t in toks if t not in stop]
    return [w for w,_ in Counter(toks).most_common(topn)]


PROD_DEF_RICH = {}
SAMPLE_PER_PROD_DEF = 30  

for lab in prod_labels:
    sub = train_df[train_df["product-category"] == lab]
    if len(sub) == 0:
        PROD_DEF_RICH[lab] = f"Product category: {lab}."
        continue

    # sample rows
    rows = sub.sample(n=min(SAMPLE_PER_PROD_DEF, len(sub)), random_state=SEED)
    joined = " ".join((rows["title"].astype(str) + " " + rows["text"].astype(str)).tolist())

    kw = extract_keywords(joined, topn=14)


    ex_titles = rows["title"].astype(str).tolist()[:5]
    ex_titles = [short_text(t, 80) for t in ex_titles]

    PROD_DEF_RICH[lab] = (
        f"Definition: Product category '{lab}'.\n"
        f"Typical keywords/items: {', '.join(kw)}.\n"
        f"Typical recall titles: " + " | ".join(ex_titles)
    )


print(PROD_DEF_RICH["meat, egg and dairy products"])

Definition: Product category 'meat, egg and dairy products'.
Typical keywords/items: products, food, consumers, milk, fsis, beef, should, recalled, allergy, sold, available, usda, chicken, safety.
Typical recall titles: Thai Chicken Panang withdrawn (update) | Lion-Dairy & Drinks Pty Ltd—Masters Flavoured Milk | Mrs. Grissom’s Salads Issues a Voluntary Recall | Müller Kids Corner Butterflies Strawberry Yogurt recalled as a single pot from t | CFS urges public not to consume a kind of Italian sausage suspected to be contam


In [8]:
# 1) update product definition docs in-place
for d in docs:
    if d["doc_id"].startswith("prod_def::"):
        lab = d["label"]
        d["text"] = f"[PRODUCT DEFINITION]\nLabel: {lab}\n{PROD_DEF_RICH.get(lab, f'Product category: {lab}.')}"

# 2) rebuild prod_docs, embeddings, index
prod_docs = [d for d in docs if d["group"] == "product"]
prod_texts = [d["text"] for d in prod_docs]

prod_emb = embedder.encode(prod_texts, convert_to_numpy=True, show_progress_bar=True)
faiss.normalize_L2(prod_emb)

prod_index = faiss.IndexFlatIP(prod_emb.shape[1])
prod_index.add(prod_emb)

print("Prod docs:", len(prod_docs), "| Prod index:", prod_index.ntotal)

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Prod docs: 192 | Prod index: 192


In [9]:
ctx_h, ctx_p = get_rag_context(sample_title, sample_text, k_h=5, k_p=5)
print("\n--- PRODUCT CONTEXT (preview) ---\n", ctx_p[:600])


--- PRODUCT CONTEXT (preview) ---
 (score=0.630) prod_def::meat, egg and dairy products | label=meat, egg and dairy products
[PRODUCT DEFINITION]
Label: meat, egg and dairy products
Definition: Product category 'meat, egg and dairy products'.
Typical keywords/items: products, food, consumers, milk, fsis, beef, should, recalled, allergy, sold, available, usda, chicken, safety.
Typical recall titles: Thai Chicken Panang withdrawn (update) | Lion-Dairy & Drinks Pty Ltd—Masters Flavoured Milk | Mrs. Grissom’s Salads Issues a Voluntary Recall | Müller Kids Corner Butterflies Strawberry Yogurt recalled as a single pot from t | CFS ur


In [10]:
import re, json

HAZ_LABELS = haz_labels
PROD_LABELS = prod_labels

RAG_PROMPT = """
You must output EXACTLY one line of valid JSON and NOTHING else.
No explanations. No markdown. No extra text.

Allowed hazard-category labels (copy EXACTLY one):
{haz_list}

Allowed product-category labels (copy EXACTLY one):
{prod_list}

[HAZARD KNOWLEDGE]
{ctx_h}

[PRODUCT KNOWLEDGE]
{ctx_p}

[RECALL REPORT]
Title: {title}
Text: {text}

Return exactly this JSON schema:
{{"hazard-category":"<one label from allowed hazard list>","product-category":"<one label from allowed product list>"}}
""".strip()
def parse_json_one_line(s: str):
    m = re.search(r"\{.*\}", s, flags=re.S)
    if not m:
        return None
    try:
        obj = json.loads(m.group(0))
        return obj
    except:
        return None

In [11]:
import requests, json

OLLAMA_MODEL = "llama3.1:8b"

def llm_generate(prompt: str) -> str:
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": "You output JSON only. Never output explanations."},
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "options": {
            "temperature": 0,
            "num_predict": 120
        }
    }
    r = requests.post(url, json=payload, timeout=120)
    r.raise_for_status()
    data = r.json()
    return data["message"]["content"]

In [14]:
print(llm_generate('Return ONLY this JSON in one line: {"hazard-category":"biological","product-category":"meat, egg and dairy products"}'))

{"hazard-category":"biological","product-category":"meat, egg and dairy products"}


In [12]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

def run_rag(df, k_h=3, k_p=3, max_rows=None):
    y_h_true, y_h_pred = [], []
    y_p_true, y_p_pred = [], []
    parse_ok = 0
    total = 0

    rows = df if max_rows is None else df.head(max_rows)

    for _, r in tqdm(rows.iterrows(), total=len(rows)):
        title = str(r["title"])
        text  = str(r["text"])

        ctx_h, ctx_p = get_rag_context(title, text, k_h=k_h, k_p=k_p)

        prompt = RAG_PROMPT.format(
            haz_list=HAZ_LABELS,
            prod_list=PROD_LABELS,
            ctx_h=ctx_h,
            ctx_p=ctx_p,
            title=title,
            text=text
        )

        out = llm_generate(prompt)
        obj = parse_json_one_line(out)

        total += 1
        if obj and "hazard-category" in obj and "product-category" in obj:
            hz = obj["hazard-category"]
            pr = obj["product-category"]
            if hz in HAZ_LABELS and pr in PROD_LABELS:
                parse_ok += 1
                y_h_true.append(r["hazard-category"])
                y_h_pred.append(hz)
                y_p_true.append(r["product-category"])
                y_p_pred.append(pr)

    def metrics(y_true, y_pred):
        if len(y_true) == 0:
            return {"acc":0,"macro_f1":0,"micro_f1":0,"weighted_f1":0}
        return {
            "acc": accuracy_score(y_true, y_pred),
            "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
            "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
            "weighted_f1": f1_score(y_true, y_pred, average="weighted", zero_division=0)
        }

    return {
        "parse_ok_rate": parse_ok / max(total,1),
        "n_total": total,
        "n_parse_ok": parse_ok,
        "hazard": metrics(y_h_true, y_h_pred),
        "product": metrics(y_p_true, y_p_pred),
    }

## FIRST 30 ROW

In [22]:
import pandas as pd
valid_df = pd.read_csv("incidents_valid.csv").fillna("")

res_smoke = run_rag(valid_df, k_h=3, k_p=3, max_rows=30)
print(res_smoke)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:32<00:00,  3.09s/it]

{'parse_ok_rate': 1.0, 'n_total': 30, 'n_parse_ok': 30, 'hazard': {'acc': 0.3, 'macro_f1': 0.20455840455840454, 'micro_f1': 0.3, 'weighted_f1': 0.34923076923076923}, 'product': {'acc': 0.5666666666666667, 'macro_f1': 0.5470085470085471, 'micro_f1': 0.5666666666666667, 'weighted_f1': 0.5933333333333333}}





In [17]:
def run_rag_debug(df, k_h=3, k_p=3, max_rows=30, show_fail=5):
    parse_ok = 0
    total = 0
    shown = 0

    rows = df.head(max_rows)

    for i, r in rows.iterrows():
        title = str(r["title"])
        text  = str(r["text"])
        ctx_h, ctx_p = get_rag_context(title, text, k_h=k_h, k_p=k_p)

        prompt = RAG_PROMPT.format(
            haz_list=HAZ_LABELS,
            prod_list=PROD_LABELS,
            ctx_h=ctx_h,
            ctx_p=ctx_p,
            title=title,
            text=text
        )

        out = llm_generate(prompt)
        obj = parse_json_one_line(out)

        total += 1
        ok = True
        reason = ""

        if not obj:
            ok = False
            reason = "no_json_parsed"
        else:
            hz = obj.get("hazard-category", None)
            pr = obj.get("product-category", None)

            if hz not in HAZ_LABELS:
                ok = False
                reason = f"bad_hazard_label: {hz}"
            if pr not in PROD_LABELS:
                ok = False
                reason = f"bad_product_label: {pr}"

        if ok:
            parse_ok += 1
        else:
            if shown < show_fail:
                print("\n================ FAIL SAMPLE ================")
                print("Reason:", reason)
                print("GT hazard:", r["hazard-category"])
                print("GT product:", r["product-category"])
                print("\nModel output:\n", out[:800])
                shown += 1

    print("\nParse OK:", parse_ok, "/", total, "=", parse_ok/total)

In [18]:
run_rag_debug(valid_df, k_h=3, k_p=3, max_rows=30, show_fail=5)


Reason: no_json_parsed
GT hazard: biological
GT product: meat, egg and dairy products

Model output:
 Based on the provided recall report, I will make predictions for hazard-category and product-category.

**Hazard-Category:** fraud
The reason is that the title of the recall notification mentions "misbranded" products, which implies a fraudulent activity. The text also states that the products were produced without the benefit of federal inspection, further supporting this classification.

**Product-Category:** meat, egg and dairy products
This classification is based on the product name mentioned in the recall report: "DRY SALAMI". Dry salami is a type of cured meat product, which falls under the category of

Reason: no_json_parsed
GT hazard: allergens
GT product: ices and desserts

Model output:
 Based on the provided knowledge and constraints, I will make a prediction for the hazard-category and product-category.

**Hazard-Category:** allergens
**Product-Category:** ices and desser

## All VAL DF

In [23]:
import pandas as pd
valid_df = pd.read_csv("incidents_valid.csv").fillna("")

res_smoke = run_rag(valid_df, k_h=3, k_p=3)
print(res_smoke)

100%|████████████████████████████████████████████████████████████████████████████████| 565/565 [29:34<00:00,  3.14s/it]

{'parse_ok_rate': 0.9964601769911504, 'n_total': 565, 'n_parse_ok': 563, 'hazard': {'acc': 0.4955595026642984, 'macro_f1': 0.30624548848461136, 'micro_f1': 0.4955595026642984, 'weighted_f1': 0.5391691880819212}, 'product': {'acc': 0.42984014209591476, 'macro_f1': 0.3741908438772263, 'micro_f1': 0.42984014209591476, 'weighted_f1': 0.4389399618492287}}





In [24]:
for k in [1,5]:
    res = run_rag(valid_df, k_h=k, k_p=k)
    print("\n==== VALID RAG (k=%d) ====" % k)
    print("Hazard Macro-F1:", res["hazard"]["macro_f1"])
    print("Product Macro-F1:", res["product"]["macro_f1"])

100%|████████████████████████████████████████████████████████████████████████████████| 565/565 [27:54<00:00,  2.96s/it]



==== VALID RAG (k=1) ====
Hazard Macro-F1: 0.3018350904605418
Product Macro-F1: 0.3387472056752452


100%|████████████████████████████████████████████████████████████████████████████████| 565/565 [31:05<00:00,  3.30s/it]


==== VALID RAG (k=5) ====
Hazard Macro-F1: 0.30001552429629924
Product Macro-F1: 0.3919217574828468





## TEST

In [13]:
import pandas as pd

test_df = pd.read_csv("incidents_test.csv").fillna("")

print("Test size:", len(test_df))

Test size: 997


In [14]:
# Full RAG on TEST (best k from ablation)
res_test_rag_k5 = run_rag(test_df, k_h=5, k_p=5)
print("TEST RAG (k=5):", res_test_rag_k5)

100%|████████████████████████████████████████████████████████████████████████████████| 997/997 [54:37<00:00,  3.29s/it]

TEST RAG (k=5): {'parse_ok_rate': 0.9859578736208626, 'n_total': 997, 'n_parse_ok': 983, 'hazard': {'acc': 0.49847405900305186, 'macro_f1': 0.2613990546167623, 'micro_f1': 0.49847405900305186, 'weighted_f1': 0.5405143627642336}, 'product': {'acc': 0.4577822990844354, 'macro_f1': 0.4132363729529004, 'micro_f1': 0.4577822990844354, 'weighted_f1': 0.4676560806127195}}



