<a href="https://colab.research.google.com/github/ashp902/blind-label/blob/main/Regex_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Contribution 3: Regex-based Information Extraction v1

This notebook demonstrates my first version (**v1**) of the regex-based information
extraction pipeline for food labels.

Given OCR text from front/back images, the code extracts:

- Product name
- Ingredients (full list + major ingredients)
- Nutrition facts (calories, protein, fats, sugars, etc.)
- Allergen information (including simple matching against a user profile)
- Expiry / use-by date (several formats)
- Usage / storage instructions

This is the basis for the structured information that will be read out by TTS
and used by the allergy alert system.


In [1]:
import re
from typing import Dict, Any, List, Optional
import pprint

pp = pprint.PrettyPrinter(indent=2)

sample_labels: Dict[str, str] = {
    "corn_flakes": """
Kellogg's® Corn Flakes

INGREDIENTS: Milled Corn, Sugar, Barley Malt Extract, Salt, Vitamins
(Niacinamide, Iron, Vitamin B6, Vitamin B2, Vitamin B1, Folic Acid, Vitamin D, Vitamin B12).
CONTAINS: Corn, Barley.

NUTRITION FACTS
Serving Size 1 cup (28 g)
Calories 110
Total Fat 0.5 g
Saturated Fat 0 g
Trans Fat 0 g
Cholesterol 0 mg
Sodium 200 mg
Total Carbohydrate 24 g
Total Sugars 2 g
Protein 2 g

Best Before: 15/09/2026

USAGE INSTRUCTIONS: Store in a cool, dry place. Consume within 30 days after opening.
""",
    "chocolate_bar": """
Dark Choco Bar 70%

INGREDIENTS: Cocoa Mass, Sugar, Cocoa Butter, Emulsifier (Soy Lecithin),
Natural Vanilla Flavour.
ALLERGENS: Contains Soy. May contain Milk and Tree Nuts.

NUTRITION FACTS (Per 25 g)
Energy 140 kcal
Fat 9 g
of which Saturates 5 g
Carbohydrate 13 g
of which Sugars 12 g
Protein 2 g
Salt 0.03 g

USE BY 03-2025

Storage: Store in a cool, dry place away from direct sunlight.
""",
    "almond_milk": """
Almond Dream Unsweetened

Ingredients: Water, Almonds (2.3%), Calcium Carbonate, Sea Salt, Stabiliser (Gellan Gum).

Nutrition Information (per 100 ml)
Energy 13 kcal
Fat 1.1 g
of which Saturates 0.1 g
Carbohydrate 0.1 g
of which Sugars 0.1 g
Protein 0.4 g
Salt 0.13 g

Allergen information: Contains Almonds. May contain other Tree Nuts.

Best Before JAN 2026

Directions: Shake well before use. Refrigerate after opening and use within 5 days.
""",
}


In [2]:
SECTION_PATTERNS = {
    "ingredients": r"\bingredients?\b\s*[:\-]",
    "nutrition": r"\b(nutrition facts?|nutritional information|nutrition information)\b",
    "expiry": r"\b(best before|use by|use-by|use before|expiry|exp\.)\b",
    "allergens": r"\b(allergen information|allergens?)\b|\bmay contain\b",
    "usage": r"\b(usage instructions|directions|storage)\b\s*[:\-]?",
}

ALLERGEN_KEYWORDS = [
    "milk",
    "egg",
    "eggs",
    "peanut",
    "peanuts",
    "tree nut",
    "tree nuts",
    "almond",
    "hazelnut",
    "walnut",
    "cashew",
    "pistachio",
    "soy",
    "soya",
    "wheat",
    "gluten",
    "fish",
    "shellfish",
    "sesame",
]


def find_sections(text: str) -> Dict[str, str]:
    """
    Use regexes to find approximate section boundaries and slice the full text
    into named sections.
    """
    lowered = text.lower()
    matches = []

    for name, pattern in SECTION_PATTERNS.items():
        m = re.search(pattern, lowered, flags=re.IGNORECASE)
        if m:
            matches.append((name, m.start()))

    if not matches:
        return {}

    matches.sort(key=lambda x: x[1])

    sections: Dict[str, str] = {}
    for i, (name, start) in enumerate(matches):
        end = matches[i + 1][1] if i + 1 < len(matches) else len(text)
        sections[name] = text[start:end].strip()

    return sections


In [3]:
def extract_ingredients(section_text: str) -> Dict[str, List[str]]:
    if not section_text:
        return {"full": [], "major": []}

    m = re.search(r"ingredients?\s*[:\-]\s*(.*)", section_text, flags=re.IGNORECASE | re.DOTALL)
    if m:
        list_text = m.group(1)
    else:
        list_text = section_text

    # Stop at next obvious header if present
    list_text = re.split(r"\n\s*(nutrition|allergen|storage|usage|directions)\b", list_text, flags=re.IGNORECASE)[0]

    items = re.split(r"[;,]", list_text)
    items = [it.strip(" .\n\t") for it in items if it.strip(" .\n\t")]

    major = items[:5]
    return {"full": items, "major": major}


def extract_nutrients(section_text: str) -> Dict[str, str]:
    """
    Extract basic nutrients like calories, protein, fat, sugar, salt.
    """
    if not section_text:
        return {}

    pattern = re.compile(
        r"(calories|energy|protein|total fat|fat|saturated fat|"
        r"total carbohydrate|carbohydrate|total sugars|sugars?|salt|sodium)"
        r"\s*[:\-]?\s*([\d\.]+)\s*([a-zA-Z%]+)?",
        flags=re.IGNORECASE,
    )

    nutrients: Dict[str, str] = {}
    for m in pattern.finditer(section_text):
        name = m.group(1).lower()
        value = m.group(2)
        unit = m.group(3) or ""
        nutrients[name] = f"{value} {unit}".strip()

    return nutrients


In [4]:
def extract_ingredients(section_text: str) -> Dict[str, List[str]]:
    if not section_text:
        return {"full": [], "major": []}

    m = re.search(r"ingredients?\s*[:\-]\s*(.*)", section_text, flags=re.IGNORECASE | re.DOTALL)
    if m:
        list_text = m.group(1)
    else:
        list_text = section_text

    # Stop at next obvious header if present
    list_text = re.split(r"\n\s*(nutrition|allergen|storage|usage|directions)\b", list_text, flags=re.IGNORECASE)[0]

    items = re.split(r"[;,]", list_text)
    items = [it.strip(" .\n\t") for it in items if it.strip(" .\n\t")]

    major = items[:5]
    return {"full": items, "major": major}


def extract_nutrients(section_text: str) -> Dict[str, str]:
    """
    Extract basic nutrients like calories, protein, fat, sugar, salt.
    """
    if not section_text:
        return {}

    pattern = re.compile(
        r"(calories|energy|protein|total fat|fat|saturated fat|"
        r"total carbohydrate|carbohydrate|total sugars|sugars?|salt|sodium)"
        r"\s*[:\-]?\s*([\d\.]+)\s*([a-zA-Z%]+)?",
        flags=re.IGNORECASE,
    )

    nutrients: Dict[str, str] = {}
    for m in pattern.finditer(section_text):
        name = m.group(1).lower()
        value = m.group(2)
        unit = m.group(3) or ""
        nutrients[name] = f"{value} {unit}".strip()

    return nutrients


In [5]:
def extract_expiry(text: str) -> Optional[str]:
    if not text:
        return None

    patterns = [
        r"\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}\b",                           # 15/09/2026, 03-2025
        r"\b(best before|use by|use-before|use before|expiry|exp\.)\s*[:\-]?\s*[A-Z]{3,9}\s+\d{4}\b",  # Best Before JAN 2026
    ]

    for pat in patterns:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            return m.group(0).strip()

    return None


def detect_allergens(text: str) -> List[str]:
    lowered = text.lower()
    found = set()
    for allergen in ALLERGEN_KEYWORDS:
        if allergen in lowered:
            found.add(allergen)
    return sorted(found)


In [6]:
def parse_label(text: str, user_allergens: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    High-level v1 parser:
    - Product name = first non-empty line
    - Sections using regex boundaries
    - Regex-based ingredient and nutrition parsing
    - Simple expiry and allergen extraction
    """
    if user_allergens is None:
        user_allergens = []

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    product_name = lines[0] if lines else "Unknown Product"

    sections = find_sections(text)
    ingredients_info = extract_ingredients(sections.get("ingredients", ""))
    nutrition_info = extract_nutrients(sections.get("nutrition", ""))
    expiry = extract_expiry(sections.get("expiry", text))
    allergens_detected = detect_allergens(text)

    user_allergens_norm = [a.lower() for a in user_allergens]
    user_alert = sorted(a for a in allergens_detected if a in user_allergens_norm)

    return {
        "product_name": product_name,
        "ingredients_full": ingredients_info["full"],
        "ingredients_major": ingredients_info["major"],
        "nutrition": nutrition_info,
        "expiry_raw": expiry,
        "allergens_detected": allergens_detected,
        "user_allergen_alert": user_alert,
        "usage_section": sections.get("usage", "").strip(),
    }


In [7]:
user_profile = ["Milk", "Soy", "Gluten", "Almond"]

for name, txt in sample_labels.items():
    print("=" * 80)
    print("LABEL:", name)
    result = parse_label(txt, user_allergens=user_profile)
    pp.pprint(result)


LABEL: corn_flakes
{ 'allergens_detected': [],
  'expiry_raw': '15/09/2026',
  'ingredients_full': [ 'Milled Corn',
                        'Sugar',
                        'Barley Malt Extract',
                        'Salt',
                        'Vitamins\n(Niacinamide',
                        'Iron',
                        'Vitamin B6',
                        'Vitamin B2',
                        'Vitamin B1',
                        'Folic Acid',
                        'Vitamin D',
                        'Vitamin B12).\nCONTAINS: Corn',
                        'Barley'],
  'ingredients_major': [ 'Milled Corn',
                         'Sugar',
                         'Barley Malt Extract',
                         'Salt',
                         'Vitamins\n(Niacinamide'],
  'nutrition': { 'calories': '110 Total',
                 'fat': '0 g',
                 'protein': '2 g',
                 'saturated fat': '0 g',
                 'sodium': '200 mg',
                

In [8]:
r1 = parse_label(sample_labels["corn_flakes"], user_allergens=["milk", "soy"])
assert "calories" in r1["nutrition"]
assert r1["expiry_raw"] is not None

r2 = parse_label(sample_labels["chocolate_bar"], user_allergens=["milk", "soy"])
assert any("soy" in a for a in r2["allergens_detected"])

r3 = parse_label(sample_labels["almond_milk"], user_allergens=["almond"])
assert "almond" in r3["allergens_detected"]
assert "almond" in r3["user_allergen_alert"]

print("Basic regex extraction v1 tests passed ✅")


Basic regex extraction v1 tests passed ✅
