In [1]:
import os, re, json
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests

import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, f1_score

In [2]:
TRAIN_CSV = "incidents_train.csv"
VALID_CSV = "incidents_valid.csv"
TEST_CSV  = "incidents_test.csv"

train_df = pd.read_csv(TRAIN_CSV).fillna("")
valid_df = pd.read_csv(VALID_CSV).fillna("")
test_df  = pd.read_csv(TEST_CSV).fillna("")

haz_labels = sorted(train_df["hazard-category"].astype(str).unique().tolist())
prod_labels = sorted(train_df["product-category"].astype(str).unique().tolist())

HAZ_LABELS = haz_labels
PROD_LABELS = prod_labels

print("Train:", len(train_df), "Valid:", len(valid_df), "Test:", len(test_df))
print("Hazard classes:", len(HAZ_LABELS))
print("Product classes:", len(PROD_LABELS))

Train: 5082 Valid: 565 Test: 997
Hazard classes: 10
Product classes: 22


In [3]:
def build_kb_docs(train_df, haz_labels, prod_labels, n_ex_per_label=8):
    docs = []

    # --- Hazard definitions ---
    haz_def_map = {
        "allergens": "Allergen contamination or undeclared allergens (e.g., peanuts, milk, gluten).",
        "biological": "Microbiological hazards: bacteria, viruses, parasites (e.g., Salmonella, Listeria).",
        "chemical": "Chemical contamination (e.g., pesticides, cleaning agents, toxins, heavy metals).",
        "food additives and flavourings": "Issues related to additives, colorants, preservatives, or flavorings.",
        "foreign bodies": "Physical foreign objects in food (e.g., plastic, metal, glass, rubber).",
        "fraud": "Mislabeling, adulteration, counterfeit, incorrect date/label claims, or regulatory deception.",
        "migration": "Substances migrating from packaging/contact materials into food.",
        "organoleptic aspects": "Sensory/quality issues (taste, odor, texture) affecting acceptability.",
        "other hazard": "Hazard not covered by other categories.",
        "packaging defect": "Packaging integrity issues (broken seals, leaks, bursting bottles, damaged packaging).",
    }

    for lab in haz_labels:
        definition = haz_def_map.get(lab, f"Definition for hazard category '{lab}'.")
        docs.append({
            "id": f"haz_def::{lab}",
            "kind": "haz_def",
            "label": lab,
            "text": f"[HAZARD DEFINITION]\nLabel: {lab}\nDefinition: {definition}"
        })

    # --- Product definitions (lightweight) ---
    for lab in prod_labels:
        docs.append({
            "id": f"prod_def::{lab}",
            "kind": "prod_def",
            "label": lab,
            "text": f"[PRODUCT DEFINITION]\nLabel: {lab}\nDefinition: Product category '{lab}'."
        })

    # --- Examples from train (targeted) ---
    def add_examples(label_col, prefix, kind, labels):
        for lab in labels:
            sub = train_df[train_df[label_col].astype(str) == lab].head(n_ex_per_label)
            for j, row in enumerate(sub.itertuples(index=False), start=0):
                title = str(getattr(row, "title"))
                text  = str(getattr(row, "text"))
                docs.append({
                    "id": f"{prefix}::{lab}::{j}",
                    "kind": kind,
                    "label": lab,
                    "text": f"[{kind.upper().replace('_',' ')}]\nLabel: {lab}\nTitle: {title}\nText: {text}"
                })

    add_examples("hazard-category", "haz_ex", "haz_ex", haz_labels)
    add_examples("product-category", "prod_ex", "prod_ex", prod_labels)

    return docs

kb_docs = build_kb_docs(train_df, HAZ_LABELS, PROD_LABELS, n_ex_per_label=8)
print("KB docs:", len(kb_docs))
print(kb_docs[0]["text"][:120])

KB docs: 277
[HAZARD DEFINITION]
Label: allergens
Definition: Allergen contamination or undeclared allergens (e.g., peanuts, milk, gl


In [4]:
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL)

def embed_texts(texts, batch_size=32, show_bar=False):
    vecs = embedder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=show_bar,  
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    return vecs.astype("float32")

def build_faiss_index(docs):
    texts = [d["text"] for d in docs]
    X = embed_texts(texts)
    index = faiss.IndexFlatIP(X.shape[1])   # cosine sim via normalized vectors
    index.add(X)
    return index, X

# --- Hybrid split ---
haz_def_docs = [d for d in kb_docs if d["kind"] == "haz_def"]      # ✅ فقط definitions
prod_docs    = [d for d in kb_docs if d["kind"] in ("prod_def","prod_ex")]  # ✅ definitions + examples

haz_index, _ = build_faiss_index(haz_def_docs)
prod_index, _ = build_faiss_index(prod_docs)

print("Haz def docs:", len(haz_def_docs), "| Haz index:", haz_index.ntotal)
print("Prod docs:", len(prod_docs), "| Prod index:", prod_index.ntotal)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Haz def docs: 10 | Haz index: 10
Prod docs: 192 | Prod index: 192


In [5]:
def retrieve(index, docs, query_text, top_k=5):
    q = embed_texts([query_text], batch_size=1, show_bar=False) 
    scores, idxs = index.search(q, top_k)
    out = []
    for s, i in zip(scores[0].tolist(), idxs[0].tolist()):
        if i < 0:
            continue
        out.append((s, docs[i]))
    return out

def format_ctx(items, max_chars=1600):
    blocks = []
    total = 0
    for score, d in items:
        block = f"(score={score:.3f}) {d['id']} | label={d['label']}\n{d['text']}"
        if total + len(block) > max_chars:
            break
        blocks.append(block)
        total += len(block)
    return "\n\n".join(blocks)

def get_rag_context_hybrid(title, text, k_h=1, k_p=5):
    q = f"{title}\n{text}"
    haz_hits = retrieve(haz_index, haz_def_docs, q, top_k=k_h)   # ✅ hazard defs only
    prod_hits = retrieve(prod_index, prod_docs, q, top_k=k_p)    # ✅ product full
    return format_ctx(haz_hits), format_ctx(prod_hits)

In [6]:
RAG_PROMPT = """
You must output EXACTLY one line of valid JSON and NOTHING else.
No explanations. No markdown. No extra text.

Allowed hazard-category labels (copy EXACTLY one):
{haz_list}

Allowed product-category labels (copy EXACTLY one):
{prod_list}

[HAZARD KNOWLEDGE]
{ctx_h}

[PRODUCT KNOWLEDGE]
{ctx_p}

[RECALL REPORT]
Title: {title}
Text: {text}

Return exactly:
{{"hazard-category":"<one hazard label>","product-category":"<one product label>"}}
""".strip()

def parse_json_one_line(s: str):
    m = re.search(r"\{.*\}", s, flags=re.S)
    if not m:
        return None
    try:
        return json.loads(m.group(0))
    except:
        return None

In [7]:
OLLAMA_MODEL = "llama3.1:8b"

def llm_generate(prompt: str) -> str:
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": "You output JSON only. Never output explanations."},
            {"role": "user", "content": prompt}
        ],
        "stream": False,
        "options": {
            "temperature": 0,
            "num_predict": 120
        }
    }
    r = requests.post(url, json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["message"]["content"]

# quick sanity test
print(llm_generate('Return ONLY this JSON in one line: {"hazard-category":"biological","product-category":"meat, egg and dairy products"}'))

{"hazard-category":"biological","product-category":"meat, egg and dairy products"}


In [8]:
from tqdm.auto import tqdm

def run_hybrid_rag(df, k_h=1, k_p=5, max_rows=None):
    y_h_true, y_h_pred = [], []
    y_p_true, y_p_pred = [], []
    parse_ok = 0
    total = 0

    rows = df if max_rows is None else df.head(max_rows)

    for _, r in tqdm(rows.iterrows(), total=len(rows), desc="Hybrid RAG", leave=True):
        title = str(r["title"])
        text  = str(r["text"])

        ctx_h, ctx_p = get_rag_context_hybrid(title, text, k_h=k_h, k_p=k_p)

        prompt = RAG_PROMPT.format(
            haz_list=HAZ_LABELS,
            prod_list=PROD_LABELS,
            ctx_h=ctx_h,
            ctx_p=ctx_p,
            title=title,
            text=text
        )

        out = llm_generate(prompt)
        obj = parse_json_one_line(out)

        total += 1

        if obj and "hazard-category" in obj and "product-category" in obj:
            hz = obj["hazard-category"]
            pr = obj["product-category"]

            if hz in HAZ_LABELS and pr in PROD_LABELS:
                parse_ok += 1
                y_h_true.append(r["hazard-category"])
                y_h_pred.append(hz)
                y_p_true.append(r["product-category"])
                y_p_pred.append(pr)

    def metrics(y_true, y_pred):
        if len(y_true) == 0:
            return {"acc":0,"macro_f1":0,"micro_f1":0,"weighted_f1":0}
        return {
            "acc": accuracy_score(y_true, y_pred),
            "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
            "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
            "weighted_f1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
        }

    return {
        "parse_ok_rate": parse_ok / max(total,1),
        "n_total": total,
        "n_parse_ok": parse_ok,
        "hazard": metrics(y_h_true, y_h_pred),
        "product": metrics(y_p_true, y_p_pred),
    }

In [19]:
# smoke
smoke = run_hybrid_rag(valid_df, k_h=1, k_p=5, max_rows=30)
print("SMOKE:", smoke)



Hybrid RAG:   0%|          | 0/30 [00:00<?, ?it/s]

SMOKE: {'parse_ok_rate': 1.0, 'n_total': 30, 'n_parse_ok': 30, 'hazard': {'acc': 0.3333333333333333, 'macro_f1': 0.23820297349709116, 'micro_f1': 0.3333333333333333, 'weighted_f1': 0.3352187028657617}, 'product': {'acc': 0.5333333333333333, 'macro_f1': 0.42600732600732594, 'micro_f1': 0.5333333333333333, 'weighted_f1': 0.5326984126984127}}


In [20]:
# full valid
res_valid_hybrid = run_hybrid_rag(valid_df, k_h=1, k_p=5)
print("VALID HYBRID:", res_valid_hybrid)

Hybrid RAG:   0%|          | 0/565 [00:00<?, ?it/s]

VALID HYBRID: {'parse_ok_rate': 0.9964601769911504, 'n_total': 565, 'n_parse_ok': 563, 'hazard': {'acc': 0.36767317939609234, 'macro_f1': 0.1852865297189791, 'micro_f1': 0.36767317939609234, 'weighted_f1': 0.40747091540909086}, 'product': {'acc': 0.41207815275310833, 'macro_f1': 0.37913966695272266, 'micro_f1': 0.41207815275310833, 'weighted_f1': 0.42425313104242834}}


## TEST

In [9]:
res_test_hybrid = run_hybrid_rag(test_df, k_h=1, k_p=5)
print("TEST HYBRID (k_h=1, k_p=5):", res_test_hybrid)

Hybrid RAG:   0%|          | 0/997 [00:00<?, ?it/s]

TEST HYBRID (k_h=1, k_p=5): {'parse_ok_rate': 0.9899699097291875, 'n_total': 997, 'n_parse_ok': 987, 'hazard': {'acc': 0.3617021276595745, 'macro_f1': 0.1779860956140859, 'micro_f1': 0.3617021276595745, 'weighted_f1': 0.39705575042621366}, 'product': {'acc': 0.44782168186423504, 'macro_f1': 0.38151108644541865, 'micro_f1': 0.44782168186423504, 'weighted_f1': 0.4597753456295018}}
