
# 00 ‚Äì Data Collection & Preparation for DietCheck

**Course:** CS6120 ‚Äì Natural Language Processing  
**Project:** DietCheck ‚Äì NLP System for Dietary Claim Verification  
**Notebook:** `00` ‚Äì Core data preparation, numeric labels for Task 1, and claim subsets for Task 2.

This notebook does the following:

1. Loads the **core product table** (`products.csv`) for DietCheck.
2. Computes **per-serving nutrition features** and **Task 1 dietary labels**:
   - `keto_compliant`, `high_protein`, `low_sodium`, `low_fat`  
     (using the FDA-style thresholds in the research plan).
3. Creates **train/validation/test splits** with label-combination awareness.
4. Extracts a **small, high-precision set of claim-like strings** from `products.csv`
   for **Task 2 manual annotation** ‚Üí `candidate_claims_task2.csv`.
5. Builds a **claim-rich subset from OpenFoodFacts via HuggingFace** using
   `labels_tags` ‚Üí `openfoodfacts_claims_subset.csv` for additional Task 2 data.

You should run this notebook top-to-bottom in a Colab or local environment with internet access
(for the HuggingFace step).


In [33]:
# ======================================================================
# Cell 1: Imports, paths, and logging
# ======================================================================

import os
import math
from pathlib import Path

import numpy as np
import pandas as pd

import logging

# Configure basic logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Data directory
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

print(f"DATA_DIR set to: {DATA_DIR.resolve()}")
print(f"Random seed set to: {RANDOM_SEED}")

DATA_DIR set to: /content/data
Random seed set to: 42


In [34]:
# ======================================================================
# Cell 2: Load products.csv
# ======================================================================

products_path = DATA_DIR / "products.csv"

if not products_path.exists():
    raise FileNotFoundError(
        f"Expected {products_path} to exist.\n"
        "Please copy your DietCheck products table to data/products.csv and re-run."
    )

df = pd.read_csv(products_path)
print(f"‚úì Loaded products.csv with shape: {df.shape}")
print(f"‚úì Columns: {list(df.columns)}")
print(f"‚úì Product count: {len(df)}")

‚úì Loaded products.csv with shape: (279, 29)
‚úì Columns: ['product_id', 'name', 'brand', 'category', 'ingredients', 'serving_size_g', 'energy_100g', 'fat_100g', 'saturated_fat_100g', 'carbs_100g', 'fiber_100g', 'sugars_100g', 'protein_100g', 'sodium_100g', 'net_carbs_100g', 'energy_per_serving', 'fat_per_serving', 'saturated_fat_per_serving', 'carbs_per_serving', 'fiber_per_serving', 'sugars_per_serving', 'protein_per_serving', 'sodium_per_serving', 'net_carbs_per_serving', 'keto_compliant', 'high_protein', 'low_sodium', 'low_fat', 'label_combination']
‚úì Product count: 279


In [35]:
# ======================================================================
# Cell 3: Compute per-serving nutrition and Task 1 labels
# ======================================================================

print("\n‚û§ Computing per-serving nutrition and Task 1 dietary labels...\n")

# Check if per-serving columns already exist
serving_cols = ["energy_per_serving", "fat_per_serving", "protein_per_serving",
                "sodium_per_serving", "carbs_per_serving", "fiber_per_serving",
                "saturated_fat_per_serving", "sugars_per_serving", "net_carbs_per_serving"]

if all(col in df.columns for col in serving_cols):
    print("‚úì Per-serving columns already exist, skipping computation")
else:
    print("Computing per-serving values from 100g data...")

    # Compute per-serving values
    df["energy_per_serving"] = df["energy_100g"] * df["serving_size_g"] / 100.0
    df["fat_per_serving"] = df["fat_100g"] * df["serving_size_g"] / 100.0
    df["saturated_fat_per_serving"] = df["saturated_fat_100g"] * df["serving_size_g"] / 100.0
    df["carbs_per_serving"] = df["carbs_100g"] * df["serving_size_g"] / 100.0
    df["fiber_per_serving"] = df["fiber_100g"] * df["serving_size_g"] / 100.0
    df["sugars_per_serving"] = df["sugars_100g"] * df["serving_size_g"] / 100.0
    df["protein_per_serving"] = df["protein_100g"] * df["serving_size_g"] / 100.0
    df["sodium_per_serving"] = df["sodium_100g"] * df["serving_size_g"] / 100.0

    # Net carbs
    df["net_carbs_per_serving"] = (
        df["carbs_per_serving"] - df["fiber_per_serving"]
    ).fillna(df["carbs_per_serving"])

    print("‚úì Per-serving values computed")

# Check if Task 1 labels already exist
label_cols = ["keto_compliant", "high_protein", "low_sodium", "low_fat"]

if all(col in df.columns for col in label_cols):
    print("‚úì Task 1 labels already exist, skipping computation")
else:
    print("Computing Task 1 dietary classification labels...")

    # FDA thresholds (adjusted for keto as per research plan)
    KETO_NET_CARBS_THRESHOLD = 40.0  # Adjusted from 5g as documented
    HIGH_PROTEIN_THRESHOLD = 10.0     # grams (20% DV)
    LOW_SODIUM_THRESHOLD = 140.0      # mg
    LOW_FAT_THRESHOLD = 3.0           # grams

    df["keto_compliant"] = (df["net_carbs_per_serving"] <= KETO_NET_CARBS_THRESHOLD).fillna(False)
    df["high_protein"] = (df["protein_per_serving"] >= HIGH_PROTEIN_THRESHOLD).fillna(False)
    df["low_sodium"] = (df["sodium_per_serving"] <= LOW_SODIUM_THRESHOLD).fillna(False)
    df["low_fat"] = (df["fat_per_serving"] <= LOW_FAT_THRESHOLD).fillna(False)

    print("‚úì Task 1 labels computed")

# Create label combination for stratification
df["label_combination"] = (
    df["keto_compliant"].astype(str) + "_" +
    df["high_protein"].astype(str) + "_" +
    df["low_sodium"].astype(str) + "_" +
    df["low_fat"].astype(str)
)

print(f"\n‚úì Total products: {len(df)}")
print(f"‚úì Unique label combinations: {df['label_combination'].nunique()}")


‚û§ Computing per-serving nutrition and Task 1 dietary labels...

‚úì Per-serving columns already exist, skipping computation
‚úì Task 1 labels already exist, skipping computation

‚úì Total products: 279
‚úì Unique label combinations: 15


In [36]:
# ======================================================================
# Cell 4: Data quality and stratification report
# ======================================================================

print("\n" + "="*70)
print("DATA QUALITY REPORT")
print("="*70)

# Overall counts
print(f"\nüìä OVERALL STATISTICS")
print(f"  ‚Ä¢ Total products: {len(df)}")
print(f"  ‚Ä¢ Products with ingredients: {df['ingredients'].notna().sum()}")
print(f"  ‚Ä¢ Products with complete nutrition: {df[['fat_per_serving', 'protein_per_serving', 'sodium_per_serving', 'carbs_per_serving']].notna().all(axis=1).sum()}")

# Task 1 label distribution
print(f"\nüè∑Ô∏è  TASK 1 LABEL DISTRIBUTION")
for label in ["keto_compliant", "high_protein", "low_sodium", "low_fat"]:
    count = df[label].sum()
    pct = (count / len(df) * 100) if len(df) > 0 else 0
    print(f"  ‚Ä¢ {label}: {count} ({pct:.1f}%)")

# Category distribution (if exists)
if "category" in df.columns:
    print(f"\nüì¶ CATEGORY DISTRIBUTION (Top 8)")
    print(df["category"].value_counts().head(8))

# Label combinations
print(f"\nüîÄ LABEL COMBINATIONS (Top 10)")
print(df["label_combination"].value_counts().head(10))

# Nutrient completeness
print(f"\nüß™ NUTRIENT DATA COMPLETENESS")
nutrient_cols = ["fat_per_serving", "protein_per_serving", "sodium_per_serving",
                 "carbs_per_serving", "fiber_per_serving", "sugars_per_serving"]
for col in nutrient_cols:
    if col in df.columns:
        count = df[col].notna().sum()
        pct = (count / len(df) * 100) if len(df) > 0 else 0
        print(f"  ‚Ä¢ {col}: {count}/{len(df)} ({pct:.1f}%)")

print("\n" + "="*70)


DATA QUALITY REPORT

üìä OVERALL STATISTICS
  ‚Ä¢ Total products: 279
  ‚Ä¢ Products with ingredients: 279
  ‚Ä¢ Products with complete nutrition: 279

üè∑Ô∏è  TASK 1 LABEL DISTRIBUTION
  ‚Ä¢ keto_compliant: 90 (32.3%)
  ‚Ä¢ high_protein: 105 (37.6%)
  ‚Ä¢ low_sodium: 124 (44.4%)
  ‚Ä¢ low_fat: 103 (36.9%)

üì¶ CATEGORY DISTRIBUTION (Top 8)
category
en:plant-based-foods-and-beverages         130
en:dairies                                  47
en:beverages-and-beverages-preparations     29
en:condiments                               23
en:snacks                                   18
en:seafood                                  10
en:meals                                     6
en:meats-and-their-products                  5
Name: count, dtype: int64

üîÄ LABEL COMBINATIONS (Top 10)
label_combination
0_1_0_0    68
1_0_1_1    40
0_0_0_0    30
0_0_1_0    29
0_0_1_1    21
1_0_0_0    15
0_0_0_1    14
1_0_1_0    13
0_1_1_0    12
1_0_0_1    12
Name: count, dtype: int64

üß™ NUTRIENT DATA COMP

In [37]:
# ======================================================================
# Cell 5: Create train/validation/test splits
# ======================================================================

from sklearn.model_selection import train_test_split

print("\n‚û§ Creating train/validation/test splits...\n")

# First split: train vs (val+test)
df_train, df_temp = train_test_split(
    df,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=df["label_combination"]
)

# Second split: val vs test
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=RANDOM_SEED,
    stratify=df_temp["label_combination"]
)

print(f"‚úì Train set: {len(df_train)} products ({len(df_train)/len(df)*100:.1f}%)")
print(f"‚úì Validation set: {len(df_val)} products ({len(df_val)/len(df)*100:.1f}%)")
print(f"‚úì Test set: {len(df_test)} products ({len(df_test)/len(df)*100:.1f}%)")

# Save splits
df_train.to_csv(DATA_DIR / "train.csv", index=False)
df_val.to_csv(DATA_DIR / "val.csv", index=False)
df_test.to_csv(DATA_DIR / "test.csv", index=False)

print(f"\n‚úì Saved splits to:")
print(f"  ‚Ä¢ {DATA_DIR / 'train.csv'}")
print(f"  ‚Ä¢ {DATA_DIR / 'val.csv'}")
print(f"  ‚Ä¢ {DATA_DIR / 'test.csv'}")


‚û§ Creating train/validation/test splits...



ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# ======================================================================
# Cell 6: Candidate claim extraction (HELPER for manual Task 2 annotation)
# ======================================================================
# NOTE: This cell ONLY extracts candidate text snippets.
# All Task 2 labels (claim_verifiable, claim_conflict, explanations)
# must be manually annotated by humans.
# ======================================================================

import re

print("\n‚û§ Extracting claim-like strings for MANUAL Task 2 annotation\n")
print("‚ö†Ô∏è  This is a HELPER step - no automatic labels are created.")
print("‚ö†Ô∏è  All claim_verifiable and claim_conflict labels must be done manually.\n")

source_df = df_train.copy()
print(f"  ‚Æï Using TRAINING split: {len(source_df)} products")

TEXT_FIELDS = [f for f in ["name", "category", "brand"] if f in source_df.columns]
print(f"  ‚Æï Scanning text fields: {TEXT_FIELDS}\n")

CLAIM_PATTERNS = {
    "low_sugar": [
        r"\bno\s+added\s+sugar\b",
        r"\bwithout\s+added\s+sugar\b",
        r"\bsugar[-\s]?free\b",
    ],
    "low_fat": [
        r"\blow[-\s]?fat\b",
        r"\b0\s*%\s*fat\b",
        r"\bfat[-\s]?free\b",
    ],
    "high_protein": [
        r"\b(high|rich)\s+in\s+protein\b",
        r"\bprotein[-\s]?rich\b",
        r"\bsource\s+of\s+protein\b",
    ],
    "high_fiber": [
        r"\b(high|rich)\s+in\s+fib(re|er)s?\b",
        r"\bsource\s+of\s+fib(re|er)s?\b",
    ],
    "low_sodium": [
        r"\blow\s+(salt|sodium)\b",
        r"\breduced\s+salt\b",
        r"\breduced\s+sodium\b",
        r"\bno\s+added\s+salt\b",
    ],
    "gluten_free": [
        r"\bgluten[-\s]?free\b",
    ],
    "lactose_free": [
        r"\blactose[-\s]?free\b",
    ],
    "keto": [
        r"\bketo(?:genic)?\b",
        r"\bketo[-\s]?friendly\b",
    ],
    "light": [
        r"\blight\b",
        r"\blightly\s+salted\b",
    ],
}

compiled_patterns = {
    k: [re.compile(p, flags=re.IGNORECASE) for p in v]
    for k, v in CLAIM_PATTERNS.items()
}

def extract_claims_from_text(pid, field_name, text, context_window=25):
    if not isinstance(text, str) or not text.strip():
        return []
    candidates = []
    for claim_type, regex_list in compiled_patterns.items():
        for regex in regex_list:
            for match in regex.finditer(text):
                start, end = match.span()
                left = max(0, start - context_window)
                right = min(len(text), end + context_window)
                snippet = text[left:right].strip()
                candidates.append(
                    {
                        "product_id": pid,
                        "claim_text": snippet,
                        "claim_type_hint": claim_type,
                        "source_field": field_name,
                        "full_text": text,
                    }
                )
    return candidates

all_candidates = []
for _, row in source_df.iterrows():
    pid = row.get("product_id", None)
    for field in TEXT_FIELDS:
        text = row.get(field, None)
        all_candidates.extend(
            extract_claims_from_text(pid, field, text, context_window=25)
        )

if not all_candidates:
    print("‚ö†Ô∏è  No candidate claims found with current patterns.")
    candidates_df = pd.DataFrame(
        columns=["product_id", "claim_text", "claim_type_hint", "source_field", "full_text"]
    )
else:
    candidates_df = pd.DataFrame(all_candidates)
    candidates_df = candidates_df.drop_duplicates(
        subset=["product_id", "claim_text", "claim_type_hint", "source_field"]
    ).reset_index(drop=True)

print(f"  ‚Æï Extracted {len(candidates_df)} claim-like strings")

if not candidates_df.empty:
    print("\n  ‚Æï Claim type hints (for manual review):")
    print(candidates_df["claim_type_hint"].value_counts())
    print(f"\n  ‚Æï Products with ‚â•1 claim snippet: {candidates_df['product_id'].nunique()}")

claims_path = DATA_DIR / "candidate_claims_task2.csv"
candidates_df.to_csv(claims_path, index=False)
print(f"\n‚úì Saved candidate claim strings to: {claims_path}")
print("  ‚Üí Use this CSV as a STARTING POINT for manual Task 2 annotation")

In [None]:
# ======================================================================
# Cell 7: Load claim-rich OFF subset from HuggingFace
# ======================================================================

from datasets import load_dataset

print("\n‚û§ Loading OpenFoodFacts claim-rich subset...\n")

# Target claim tags from OFF
TARGET_TAGS = [
    "en:no-gluten",
    "en:gluten-free",
    "en:vegan",
    "en:vegetarian",
    "en:organic",
    "en:no-lactose",
    "en:lactose-free",
    "en:no-palm-oil",
    "en:palm-oil-free",
    "en:low-fat",
    "en:low-sugar",
    "en:no-added-sugar",
    "en:high-protein",
    "en:high-fiber",
]

# Map OFF tags to our claim types
CLAIM_LABEL_MAP = {
    "en:no-gluten": "gluten_free",
    "en:gluten-free": "gluten_free",
    "en:vegan": "vegan",
    "en:vegetarian": "vegetarian",
    "en:organic": "organic",
    "en:no-lactose": "lactose_free",
    "en:lactose-free": "lactose_free",
    "en:no-palm-oil": "palm_oil_free",
    "en:palm-oil-free": "palm_oil_free",
    "en:low-fat": "low_fat",
    "en:low-sugar": "low_sugar",
    "en:no-added-sugar": "no_added_sugar",
    "en:high-protein": "high_protein",
    "en:high-fiber": "high_fiber",
}

print(f"Mapped labels: {len(CLAIM_LABEL_MAP)} unique tags\n")

def extract_main_text(val):
    """Extract main text from multilingual dict or return string."""
    if isinstance(val, dict):
        return val.get("en", "") or val.get("en-us", "") or ""
    return str(val) if val else ""

def extract_ingredients_text(val):
    """Extract ingredient text from multilingual dict."""
    if isinstance(val, dict):
        return val.get("en", "") or val.get("en-us", "") or ""
    return str(val) if val else ""

def get_float(d, key):
    """Extract float from nutriments dict, return None if invalid."""
    if not isinstance(d, dict):
        return None
    val = d.get(key)
    if val is None:
        return None
    try:
        f = float(val)
        return f if not (math.isnan(f) or math.isinf(f)) else None
    except (ValueError, TypeError):
        return None

def extract_nutriments(nutriments):
    """Extract key nutrients as floats (or None)."""
    if not isinstance(nutriments, dict):
        return {
            "energy_100g": None,
            "fat_100g": None,
            "saturated_fat_100g": None,
            "carbs_100g": None,
            "fiber_100g": None,
            "sugars_100g": None,
            "protein_100g": None,
            "sodium_100g": None,
        }

    return {
        "energy_100g": get_float(nutriments, "energy-kcal_100g"),
        "fat_100g": get_float(nutriments, "fat_100g"),
        "saturated_fat_100g": get_float(nutriments, "saturated-fat_100g"),
        "carbs_100g": get_float(nutriments, "carbohydrates_100g"),
        "fiber_100g": get_float(nutriments, "fiber_100g"),
        "sugars_100g": get_float(nutriments, "sugars_100g"),
        "protein_100g": get_float(nutriments, "proteins_100g"),
        "sodium_100g": get_float(nutriments, "sodium_100g"),
    }

MAX_ROWS = 2000

print("Loading OpenFoodFacts dataset (streaming from HuggingFace)...")
ds = load_dataset("openfoodfacts/product-database", split="food", streaming=True)

rows = []
seen_codes = set()
n_scanned = 0

for example in ds:
    n_scanned += 1
    labels_tags = example.get("labels_tags") or []
    labels_tags = [t for t in labels_tags if isinstance(t, str)]
    matching_tags = [t for t in labels_tags if t in TARGET_TAGS]
    if not matching_tags:
        continue

    code = example.get("code")
    if not code or code in seen_codes:
        continue
    seen_codes.add(code)

    claim_types = []
    for t in matching_tags:
        mapped = CLAIM_LABEL_MAP.get(t)
        if mapped:
            claim_types.append(mapped)
    if not claim_types:
        continue

    product_name = extract_main_text(example.get("product_name"))
    brand = (example.get("brands") or "").strip()
    categories = (example.get("categories") or "").strip()
    ingredients_text = extract_ingredients_text(example.get("ingredients_text"))
    labels_str = (example.get("labels") or "").strip()
    nutriments = extract_nutriments(example.get("nutriments"))

    row = {
        "product_id": code,
        "name": product_name,
        "brand": brand,
        "category": categories,
        "ingredients_text": ingredients_text,
        "labels": labels_str,
        "labels_tags": "|".join(labels_tags),
        "claim_type_hint": ";".join(sorted(set(claim_types))),
        "source_field": "labels/labels_tags",
        "full_text": " | ".join(
            [x for x in [product_name, brand, categories, ingredients_text, labels_str] if x]
        ),
        **nutriments,
    }
    rows.append(row)

    if len(rows) % 200 == 0:
        print(f"  ‚Æï Collected {len(rows)} claim-rich products (scanned {n_scanned})...")

    if len(rows) >= MAX_ROWS:
        break

print(f"\n‚úì Finished. Collected {len(rows)} claim-rich products (scanned {n_scanned} total rows).")

df_claims = pd.DataFrame(rows)
hf_output_path = DATA_DIR / "openfoodfacts_claims_subset.csv"
df_claims.to_csv(hf_output_path, index=False)
print(f"‚úì Saved claim-rich subset to: {hf_output_path}")

# Show summary
print(f"\nüìä OFF Claim Subset Summary:")
print(f"  ‚Ä¢ Total products: {len(df_claims)}")
print(f"  ‚Ä¢ With nutrition data: {df_claims[['fat_100g', 'protein_100g', 'sodium_100g', 'carbs_100g']].notna().any(axis=1).sum()}")
print(f"  ‚Ä¢ With ingredients: {df_claims['ingredients_text'].notna().sum()}")

In [None]:
# ======================================================================
# Cell 7: Load claim-rich OFF subset from HuggingFace
# ======================================================================

from datasets import load_dataset

print("\n‚û§ Loading OpenFoodFacts claim-rich subset...\n")

# Target claim tags from OFF
TARGET_TAGS = [
    "en:no-gluten",
    "en:gluten-free",
    "en:vegan",
    "en:vegetarian",
    "en:organic",
    "en:no-lactose",
    "en:lactose-free",
    "en:no-palm-oil",
    "en:palm-oil-free",
    "en:low-fat",
    "en:low-sugar",
    "en:no-added-sugar",
    "en:high-protein",
    "en:high-fiber",
]

# Map OFF tags to our claim types
CLAIM_LABEL_MAP = {
    "en:no-gluten": "gluten_free",
    "en:gluten-free": "gluten_free",
    "en:vegan": "vegan",
    "en:vegetarian": "vegetarian",
    "en:organic": "organic",
    "en:no-lactose": "lactose_free",
    "en:lactose-free": "lactose_free",
    "en:no-palm-oil": "palm_oil_free",
    "en:palm-oil-free": "palm_oil_free",
    "en:low-fat": "low_fat",
    "en:low-sugar": "low_sugar",
    "en:no-added-sugar": "no_added_sugar",
    "en:high-protein": "high_protein",
    "en:high-fiber": "high_fiber",
}

print(f"Mapped labels: {len(CLAIM_LABEL_MAP)} unique tags\n")

def extract_main_text(val):
    """Extract main text from multilingual dict or return string."""
    if isinstance(val, dict):
        return val.get("en", "") or val.get("en-us", "") or ""
    return str(val) if val else ""

def extract_ingredients_text(val):
    """Extract ingredient text from multilingual dict."""
    if isinstance(val, dict):
        return val.get("en", "") or val.get("en-us", "") or ""
    return str(val) if val else ""

def get_float(d, key):
    """Extract float from nutriments dict, return None if invalid."""
    if not isinstance(d, dict):
        return None
    val = d.get(key)
    if val is None:
        return None
    try:
        f = float(val)
        return f if not (math.isnan(f) or math.isinf(f)) else None
    except (ValueError, TypeError):
        return None

def extract_nutriments(nutriments):
    """Extract key nutrients as floats (or None)."""
    if not isinstance(nutriments, dict):
        return {
            "energy_100g": None,
            "fat_100g": None,
            "saturated_fat_100g": None,
            "carbs_100g": None,
            "fiber_100g": None,
            "sugars_100g": None,
            "protein_100g": None,
            "sodium_100g": None,
        }

    return {
        "energy_100g": get_float(nutriments, "energy-kcal_100g"),
        "fat_100g": get_float(nutriments, "fat_100g"),
        "saturated_fat_100g": get_float(nutriments, "saturated-fat_100g"),
        "carbs_100g": get_float(nutriments, "carbohydrates_100g"),
        "fiber_100g": get_float(nutriments, "fiber_100g"),
        "sugars_100g": get_float(nutriments, "sugars_100g"),
        "protein_100g": get_float(nutriments, "proteins_100g"),
        "sodium_100g": get_float(nutriments, "sodium_100g"),
    }

MAX_ROWS = 2000

print("Loading OpenFoodFacts dataset (streaming from HuggingFace)...")
ds = load_dataset("openfoodfacts/product-database", split="food", streaming=True)

rows = []
seen_codes = set()
n_scanned = 0

for example in ds:
    n_scanned += 1
    labels_tags = example.get("labels_tags") or []
    labels_tags = [t for t in labels_tags if isinstance(t, str)]
    matching_tags = [t for t in labels_tags if t in TARGET_TAGS]
    if not matching_tags:
        continue

    code = example.get("code")
    if not code or code in seen_codes:
        continue
    seen_codes.add(code)

    claim_types = []
    for t in matching_tags:
        mapped = CLAIM_LABEL_MAP.get(t)
        if mapped:
            claim_types.append(mapped)
    if not claim_types:
        continue

    product_name = extract_main_text(example.get("product_name"))
    brand = (example.get("brands") or "").strip()
    categories = (example.get("categories") or "").strip()
    ingredients_text = extract_ingredients_text(example.get("ingredients_text"))
    labels_str = (example.get("labels") or "").strip()
    nutriments = extract_nutriments(example.get("nutriments"))

    row = {
        "product_id": code,
        "name": product_name,
        "brand": brand,
        "category": categories,
        "ingredients_text": ingredients_text,
        "labels": labels_str,
        "labels_tags": "|".join(labels_tags),
        "claim_type_hint": ";".join(sorted(set(claim_types))),
        "source_field": "labels/labels_tags",
        "full_text": " | ".join(
            [x for x in [product_name, brand, categories, ingredients_text, labels_str] if x]
        ),
        **nutriments,
    }
    rows.append(row)

    if len(rows) % 200 == 0:
        print(f"  ‚Æï Collected {len(rows)} claim-rich products (scanned {n_scanned})...")

    if len(rows) >= MAX_ROWS:
        break

print(f"\n‚úì Finished. Collected {len(rows)} claim-rich products (scanned {n_scanned} total rows).")

df_claims = pd.DataFrame(rows)
hf_output_path = DATA_DIR / "openfoodfacts_claims_subset.csv"
df_claims.to_csv(hf_output_path, index=False)
print(f"‚úì Saved claim-rich subset to: {hf_output_path}")

# Show summary
print(f"\nüìä OFF Claim Subset Summary:")
print(f"  ‚Ä¢ Total products: {len(df_claims)}")
print(f"  ‚Ä¢ With nutrition data: {df_claims[['fat_100g', 'protein_100g', 'sodium_100g', 'carbs_100g']].notna().any(axis=1).sum()}")
print(f"  ‚Ä¢ With ingredients: {df_claims['ingredients_text'].notna().sum()}")

In [None]:
# ======================================================================
# Cell 9: Sample OFF products for manual annotation (FIXED)
# ======================================================================

print("\n‚û§ Creating stratified OFF sample for manual annotation...\n")

if len(df_filtered) == 0:
    print("‚ö†Ô∏è  No products available for sampling.")
    print("   Skipping OFF sampling step.")
    sampled = pd.DataFrame()
else:
    # Create primary_claim from claim_type_hint
    df_filtered["primary_claim"] = (
        df_filtered["claim_type_hint"]
        .fillna("")
        .str.split(";")
        .str[0]
        .str.strip()
    )

    # Remove rows where primary_claim is empty
    df_filtered = df_filtered[df_filtered["primary_claim"].ne("")].copy()

    print(f"  ‚Æï Unique primary claims: {df_filtered['primary_claim'].nunique()}")

    # Target number of OFF products
    TARGET_OFF = 120

    per_label_target = max(1, TARGET_OFF // max(1, df_filtered["primary_claim"].nunique()))

    def _sample_group(g):
        n = min(len(g), per_label_target)
        return g.sample(n=n, random_state=RANDOM_SEED)

    sampled = (
        df_filtered
        .groupby("primary_claim", group_keys=False)
        .apply(_sample_group)
    )

    # Top up if under target
    if len(sampled) < TARGET_OFF:
        remaining = df_filtered.drop(sampled.index)
        extra_needed = TARGET_OFF - len(sampled)
        if extra_needed > 0 and len(remaining) > 0:
            extra = remaining.sample(
                n=min(extra_needed, len(remaining)),
                random_state=RANDOM_SEED,
            )
            sampled = pd.concat([sampled, extra], ignore_index=True)

    sampled = sampled.reset_index(drop=True)

    print(f"\n‚úì Final OFF sample size: {len(sampled)}")
    print("\n  Claim distribution:")
    print(sampled["primary_claim"].value_counts())

    # Save sampled subset
    off_sampled_path = DATA_DIR / "openfoodfacts_off_sample_for_manual.csv"
    sampled.to_csv(off_sampled_path, index=False)
    print(f"\n‚úì Saved OFF sample to: {off_sampled_path}")

In [None]:
# ======================================================================
# Cell 10: Final summary and reproducibility check
# ======================================================================

print("\n" + "="*70)
print("NOTEBOOK 00 COMPLETION SUMMARY")
print("="*70)

print("\n‚úÖ COMPLETED TASKS:")
print("  1. Loaded base products.csv")
print("  2. Computed per-serving nutrition and Task 1 labels")
print("  3. Created stratified train/val/test splits")
print("  4. Generated candidate claims for manual Task 2 annotation")
print("  5. Loaded and filtered OpenFoodFacts claim-rich subset")
print("  6. Created stratified OFF sample for manual annotation")

print("\nüìÅ OUTPUT FILES:")
files_created = [
    "train.csv",
    "val.csv",
    "test.csv",
    "candidate_claims_task2.csv",
    "openfoodfacts_claims_subset.csv",
    "openfoodfacts_claims_filtered.csv",
    "openfoodfacts_off_sample_for_manual.csv"
]

for fname in files_created:
    path = DATA_DIR / fname
    if path.exists():
        size = path.stat().st_size / 1024
        print(f"  ‚úì {fname} ({size:.1f} KB)")
    else:
        print(f"  ‚ö†Ô∏è  {fname} (NOT FOUND)")

print("\nüìä CURRENT DATA STATUS:")
print(f"  ‚Ä¢ Products in train/val/test: {len(df)}")
print(f"  ‚Ä¢ Task 1 labels: ‚úì Complete")
print(f"  ‚Ä¢ Task 2 claim candidates: {len(candidates_df)} snippets")
print(f"  ‚Ä¢ Task 3 ready: ‚úì Ingredients available")

print("\n‚è≠Ô∏è  NEXT STEPS FOR GRADE CONTRACT:")
print("\n  FOR B GRADE (minimum 120 products):")
print("    ‚Ä¢ Current products: {len(df)}")
if len(df) >= 120:
    print("    ‚úì You have enough products!")
else:
    print(f"    ‚ö†Ô∏è  Need {120 - len(df)} more products")
print("    ‚Ä¢ Manually annotate Task 1 on all products")
print("    ‚Ä¢ Manually annotate Task 2 on ‚â•50 claims")
print("    ‚Ä¢ Implement rule-based + TF-IDF baselines")

print("\n  FOR B+ GRADE (minimum 180 products):")
if len(df) >= 180:
    print("    ‚úì You have enough products!")
else:
    print(f"    ‚ö†Ô∏è  Need {180 - len(df)} more products")
print("    ‚Ä¢ Expand Task 2 to ‚â•120 claims with explanations")
print("    ‚Ä¢ Double-annotate ‚â•25 products (Cohen's kappa)")
print("    ‚Ä¢ Implement BERT baseline")

print("\n  FOR A- GRADE (minimum 200 products):")
if len(df) >= 200:
    print("    ‚úì You have enough products!")
else:
    print(f"    ‚ö†Ô∏è  Need {200 - len(df)} more products")
print("    ‚Ä¢ Expand Task 2 to ‚â•160 claims")
print("    ‚Ä¢ Start Task 3 BIO tagging on ‚â•80 products")
print("    ‚Ä¢ Implement multimodal model")

print("\n  FOR A GRADE (minimum 200 products):")
if len(df) >= 200:
    print("    ‚úì You have enough products!")
else:
    print(f"    ‚ö†Ô∏è  Need {200 - len(df)} more products")
print("    ‚Ä¢ Complete Task 3 on ‚â•120 products")
print("    ‚Ä¢ Implement claim-table model")
print("    ‚Ä¢ Perform slice-based analysis")

print("\nüîí REPRODUCIBILITY:")
print(f"  ‚Ä¢ Random seed: {RANDOM_SEED}")
print("  ‚Ä¢ All splits saved with stratification")
print("  ‚Ä¢ All intermediate files saved")

print("\n‚ö†Ô∏è  IMPORTANT REMINDERS:")
print("  ‚Ä¢ candidate_claims_task2.csv is ONLY a helper")
print("  ‚Ä¢ ALL Task 2 labels must be manually annotated")
print("  ‚Ä¢ Task 3 BIO tagging must be manual")
print("  ‚Ä¢ Follow FDA thresholds for Task 1 verification")

print("\n" + "="*70)
print("Notebook 00 complete! Ready for manual annotation.")
print("="*70)