<a href="https://colab.research.google.com/github/aayushis1203/dietcheck/blob/main/00_data_collection_and_automatic_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Install Required Packages

Installs all Python dependencies needed for data collection and processing.

In [51]:
# Install dependencies
!pip install openfoodfacts pandas numpy scikit-learn matplotlib seaborn -q

print("All packages installed successfully")

All packages installed successfully


## 2. Import Core Libraries

Imports all necessary Python libraries and configures visualization settings for consistent styling.

In [52]:
# Import core libraries
import os
import json
import subprocess
import re  # For robust serving size parsing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Configure visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully")


Libraries imported successfully


## 3. Setup Workspace and Directory Structure

Handles environment detection (Colab vs local) and sets up the project directory structure.
- **In Colab:** Clones GitHub repo if not already present
- **Locally:** Finds repository root automatically
- **Creates:** `data/` and `results/` directories


In [53]:
# ============================================================================
# CONFIGURATION - Update this with your GitHub repository URL
# ============================================================================
GITHUB_REPO = "https://github.com/aayushis1203/dietcheck.git"

# Directory name that `git clone` creates: the last URL path component with a
# single trailing ".git" removed. Only the suffix is stripped (a plain
# str.replace would also mangle names such as "my.github.git"), and a trailing
# slash on the URL is tolerated.
_repo_tail = GITHUB_REPO.rstrip('/').split('/')[-1]
REPO_NAME = _repo_tail[:-len('.git')] if _repo_tail.endswith('.git') else _repo_tail

# ============================================================================
# Helper Functions
# ============================================================================

def find_repo_root():
    """
    Locate the enclosing git repository, starting from the current directory.

    Checks the working directory and then its ancestors for a `.git` entry,
    examining at most five directories in total.

    Returns:
        Absolute path of the first directory containing `.git`, or None when
        none is found within the search limit or the filesystem root is hit.
    """
    candidate = os.path.abspath(os.getcwd())
    levels_left = 5

    while levels_left > 0:
        if os.path.exists(os.path.join(candidate, '.git')):
            return candidate

        one_up = os.path.dirname(candidate)
        if one_up == candidate:
            # dirname() of the filesystem root is the root itself: nowhere left to climb.
            return None

        candidate = one_up
        levels_left -= 1

    return None

def setup_workspace():
    """
    Prepare the working directory for Colab or a local checkout.

    In Colab: change into the enclosing repo if there is one; otherwise clone
    GITHUB_REPO (unless a REPO_NAME directory already exists) and change into it.
    Locally: change into the enclosing repo if one is found, else stay put.
    In both cases `data/` and `results/` are created under the resulting
    working directory.

    Returns:
        (repo_root, data_dir, results_dir) as absolute paths.

    Raises:
        RuntimeError: if `git clone` exits with a non-zero status.
    """
    # Environment probe: importing google.colab is used as the "running in Colab" signal.
    try:
        import google.colab
        in_colab = True
    except ImportError:
        in_colab = False

    if in_colab:
        print("üîß Running in Google Colab")

        # Already inside a checkout? Then reuse it instead of cloning a nested copy.
        repo_root = find_repo_root()
        if repo_root:
            print(f"‚úÖ Already inside repo at: {repo_root}")
            os.chdir(repo_root)
        else:
            checkout_missing = not os.path.exists(REPO_NAME)
            if checkout_missing:
                print(f"üì• Cloning {GITHUB_REPO}...")
                result = subprocess.run(
                    ['git', 'clone', GITHUB_REPO],
                    capture_output=True,
                    text=True
                )
                if result.returncode != 0:
                    raise RuntimeError(f"Git clone failed: {result.stderr}")

            os.chdir(REPO_NAME)
    else:
        print("üîß Running locally")

        repo_root = find_repo_root()
        if repo_root:
            os.chdir(repo_root)
        else:
            print("‚ö†Ô∏è  Warning: Not in a git repository, using current directory")

    # Whatever branch ran, the current directory is now treated as the project root.
    repo_root = os.path.abspath(os.getcwd())
    data_dir = os.path.join(repo_root, 'data')
    results_dir = os.path.join(repo_root, 'results')

    for directory in (data_dir, results_dir):
        os.makedirs(directory, exist_ok=True)

    print(f"‚úÖ Repo root: {repo_root}")
    print(f"üìÅ Data: {data_dir}")
    print(f"üìÅ Results: {results_dir}")

    return repo_root, data_dir, results_dir

# Run the workspace setup once and keep the returned absolute paths as
# notebook-level constants; later cells write the CSVs/metadata under DATA_DIR
# and the figure under RESULTS_DIR.
REPO_ROOT, DATA_DIR, RESULTS_DIR = setup_workspace()

üîß Running in Google Colab
‚úÖ Already inside repo at: /content/dietcheck
‚úÖ Repo root: /content/dietcheck
üìÅ Data: /content/dietcheck/data
üìÅ Results: /content/dietcheck/results


## 4. Data Collection from Open Food Facts API

Fetches products from multiple categories using requests with timeout handling.

In [54]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def create_robust_session():
    """
    Build a `requests.Session` that retries failed requests automatically.

    The same adapter is mounted for both http:// and https:// and is configured
    with up to 3 retries, a backoff factor of 2, and forced retries on HTTP
    429/500/502/503/504 responses.

    Returns:
        The configured `requests.Session`.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=3,
            backoff_factor=2,
            status_forcelist=[429, 500, 502, 503, 504],
        )
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    return http

def fetch_products_from_category(category, page_size=15, max_products=25, max_timeout_retries=3):
    """
    Download products for one Open Food Facts category, page by page.

    Args:
        category: Category tag passed to the search endpoint as `tag_0`.
        page_size: Number of products requested per page.
        max_products: Stop once this many products were collected; the result
            is truncated to this length.
        max_timeout_retries: How many consecutive timeouts on the same page are
            retried before giving up. Without this cap a server that keeps
            timing out would keep the loop running forever.

    Returns:
        List of raw product dicts (at most `max_products`). Collection is
        best-effort: on an HTTP error status, a request failure, an unparsable
        body, or exhausted timeout retries, whatever was collected so far is
        returned.
    """
    base_url = "https://world.openfoodfacts.org/cgi/search.pl"
    products = []
    page = 1
    consecutive_timeouts = 0

    session = create_robust_session()
    try:
        while len(products) < max_products:
            params = {
                'action': 'process',
                'tagtype_0': 'categories',
                'tag_contains_0': 'contains',
                'tag_0': category,
                'page_size': page_size,
                'page': page,
                'json': 1
            }

            try:
                print(f"   Page {page}...", end=" ")
                response = session.get(
                    base_url,
                    params=params,
                    timeout=60,
                    headers={'User-Agent': 'DietCheck-Research/1.0'}
                )

                if response.status_code != 200:
                    print(f"‚úó HTTP {response.status_code}")
                    break

                page_products = response.json().get('products', [])
            except requests.exceptions.Timeout:
                consecutive_timeouts += 1
                print(f"‚úó timeout")
                if consecutive_timeouts > max_timeout_retries:
                    # Give up on this category instead of retrying the same page forever.
                    break
                time.sleep(5)
                continue
            except (requests.exceptions.RequestException, ValueError) as exc:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException.
                print(f"‚úó error ({type(exc).__name__}: {exc})")
                break

            consecutive_timeouts = 0

            if not page_products:
                print("done")
                break

            products.extend(page_products)
            print(f"‚úì ({len(products)} total)")
            if len(products) >= max_products:
                break

            page += 1
            time.sleep(2)  # pause between pages to stay polite to the API
    finally:
        # Release pooled connections; a new session is created on every call.
        session.close()

    return products[:max_products]

print("‚úÖ Collection functions ready")

‚úÖ Collection functions ready


## 5. Product Collection

Fetches products from 16 categories targeting 300+ total products.

In [56]:
# Category tag -> number of products to request for that category.
# Insertion order is the collection order.
CATEGORIES = {
    'breakfast-cereals': 25,
    'soups': 25,
    'protein-products': 25,
    'snacks': 25,
    'beverages': 25,
    'frozen-meals': 25,
    'dairy-alternatives': 15,
    'condiments': 15,
    'yogurts': 25,
    'cheeses': 25,
    'breads': 25,
    'pasta': 20,
    'plant-based-foods': 25,
    'canned-foods': 20,
    'sauces': 20,
    'spreads': 15
}

# Raw product dicts from every category, accumulated in collection order.
all_products = []
print(f" Starting collection from {len(CATEGORIES)} categories\n")

for category in CATEGORIES:
    target = CATEGORIES[category]
    print(f"üì¶ {category} (target: {target})")

    products = fetch_products_from_category(category, 15, target)
    all_products += products

    print(f"    {len(products)} collected | Total: {len(all_products)}\n")
    time.sleep(3)  # pause between categories

print(f" Collection complete: {len(all_products)} products")

 Starting collection from 16 categories

üì¶ breakfast-cereals (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 25

üì¶ soups (target: 25)
   Page 1... 



‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 50

üì¶ protein-products (target: 25)
   Page 1... ‚úó error
    0 collected | Total: 50

üì¶ snacks (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 75

üì¶ beverages (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... 



‚úì (30 total)
    25 collected | Total: 100

üì¶ frozen-meals (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 125

üì¶ dairy-alternatives (target: 15)
   Page 1... 



‚úó error
    0 collected | Total: 125

üì¶ condiments (target: 15)
   Page 1... ‚úì (15 total)
    15 collected | Total: 140

üì¶ yogurts (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 165

üì¶ cheeses (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 190

üì¶ breads (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 215

üì¶ pasta (target: 20)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    20 collected | Total: 235

üì¶ plant-based-foods (target: 25)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    25 collected | Total: 260

üì¶ canned-foods (target: 20)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    20 collected | Total: 280

üì¶ sauces (target: 20)
   Page 1... ‚úì (15 total)
   Page 2... ‚úì (30 total)
    20 collected | Total: 300

üì¶ spreads (target: 15)
   Page 1... ‚úì (15 total)
    15 collected

## 6. Deduplicate Products

Removes duplicate products by barcode.

In [None]:
# Keep the first occurrence of each barcode. A product's key is its 'code',
# or its '_id' when the 'code' key is absent; products with an empty key are dropped.
first_by_code = {}

for product in all_products:
    code = product.get('code', product.get('_id', ''))
    if code:
        # setdefault leaves an existing entry alone, so the earliest product wins.
        first_by_code.setdefault(code, product)

seen_codes = set(first_by_code)
unique_products = list(first_by_code.values())  # dicts preserve insertion order

all_products = unique_products

print(f"üìä After deduplication: {len(all_products)} unique products")

## 7. Extract and Clean Product Data

Extracts nutritional features, calculates derived metrics, and validates data quality.

In [None]:
# Conversion factors from a recognised serving-size unit to grams.
# Volumes assume a density of ~1 g/mL.
_SERVING_UNIT_TO_GRAMS = {
    'g': 1.0, 'gr': 1.0, 'gram': 1.0, 'grams': 1.0,
    'mg': 0.001, 'kg': 1000.0,
    'ml': 1.0, 'cl': 10.0, 'dl': 100.0, 'l': 1000.0,
    'oz': 28.3495, 'lb': 453.592, 'lbs': 453.592,
}

# A number immediately followed (optional whitespace) by a recognised unit.
# The trailing \b stops "1 large egg" from being read as "1 l".
_SERVING_QUANTITY_RE = re.compile(
    r'(\d+(?:\.\d+)?)\s*(kg|mg|ml|cl|dl|oz|lbs?|grams?|gr|g|l)\b'
)


def parse_serving_size(serving_size_raw):
    """
    Parse a free-text serving size string into approximate grams.

    Strategy:
    - Prefer the first quantity that carries a recognised mass/volume unit and
      convert it to grams, e.g. "1 cup (240 ml)" -> 240, "500 mg" -> 0.5,
      "30 g (1/2 cup)" -> 30.
    - A decimal comma between digits is read as a decimal point ("12,5 g" -> 12.5).
    - If no quantity has a recognised unit, fall back to the first number in the
      string and treat it as grams ("50" -> 50).
    - If nothing can be parsed, or the result is <= 0 or > 1000 g, return 100 g.

    Args:
        serving_size_raw: Serving size as found on the product (any type; it is
            converted with str()). Falsy values yield the default.

    Returns:
        Serving size in grams as a float.
    """
    if not serving_size_raw:
        return 100.0

    # Lower-case for unit matching and normalise decimal commas ("12,5" -> "12.5").
    text = re.sub(r'(?<=\d),(?=\d)', '.', str(serving_size_raw).lower())

    with_unit = _SERVING_QUANTITY_RE.search(text)
    if with_unit:
        quantity, unit = with_unit.groups()
        serving_g = float(quantity) * _SERVING_UNIT_TO_GRAMS[unit]
    else:
        # Unknown or missing unit: treat the first number as grams.
        bare_number = re.search(r'\d+(?:\.\d+)?', text)
        if not bare_number:
            return 100.0
        serving_g = float(bare_number.group(0))

    # Guardrails against pathological values (zero, or implausibly large servings).
    if serving_g <= 0 or serving_g > 1000:
        return 100.0

    return serving_g


def _nutriment_value(nutriments, key, default=0.0):
    """Return nutriments[key] as a float, or `default` if absent, blank, or non-numeric."""
    raw = nutriments.get(key)
    if raw is None or raw == '':
        return default
    try:
        return float(raw)
    except (TypeError, ValueError):
        return default


def extract_product_features(product):
    """
    Extract nutritional features from one API product dict.

    Calculations:
    - Net carbs = total carbs - fiber - sugar alcohols (polyols), floored at 0
      (for ketogenic classification).
    - Per-serving values = per-100 g values scaled by serving size / 100.
    - Sodium is converted from g to mg.

    Missing data:
    - The nutrients that drive the dietary labels (carbohydrates, fat, protein,
      sodium) become NaN when absent, and so do their derived per-serving and
      net-carb values. Defaulting them to 0 would make a product with no
      nutrition data look keto, low-sodium and low-fat, and would hide it from
      any NaN-based completeness check.
    - Fiber, polyols, energy, saturated fat and sugars default to 0 when absent.
    - Numeric strings are accepted and converted to float.

    Args:
        product: Raw product dict from the API.

    Returns:
        Dict of features, or None when the product record is malformed
        (the reason is printed).
    """
    try:
        nutriments = product.get('nutriments', {}) or {}
        missing = float('nan')

        # Label-driving nutrients per 100 g: NaN when the API has no value.
        carbs_100g = _nutriment_value(nutriments, 'carbohydrates_100g', missing)
        fat_100g = _nutriment_value(nutriments, 'fat_100g', missing)
        protein_100g = _nutriment_value(nutriments, 'proteins_100g', missing)
        sodium_mg_100g = _nutriment_value(nutriments, 'sodium_100g', missing) * 1000  # g -> mg

        # Optional nutrients per 100 g: absent is treated as 0.
        fiber_100g = _nutriment_value(nutriments, 'fiber_100g')
        polyols_100g = _nutriment_value(nutriments, 'polyols_100g')
        energy_100g = _nutriment_value(nutriments, 'energy-kcal_100g')
        saturated_fat_100g = _nutriment_value(nutriments, 'saturated-fat_100g')
        sugars_100g = _nutriment_value(nutriments, 'sugars_100g')

        net_carbs_100g = carbs_100g - fiber_100g - polyols_100g
        if not pd.isna(net_carbs_100g):
            # Explicit NaN check: max(0, nan) would return 0 and hide the missing value.
            net_carbs_100g = max(0, net_carbs_100g)

        # Get serving size (default 100 g if not specified or messy)
        serving_g = parse_serving_size(product.get('serving_size', '100g'))
        multiplier = serving_g / 100.0

        category_tags = product.get('categories_tags')

        return {
            'product_id': product.get('code', ''),
            'name': product.get('product_name', ''),
            'brand': product.get('brands', ''),
            'category': category_tags[0] if category_tags else '',
            'ingredients': product.get('ingredients_text', ''),
            'serving_size_g': serving_g,

            # Per 100 g values
            'energy_100g': energy_100g,
            'fat_100g': fat_100g,
            'saturated_fat_100g': saturated_fat_100g,
            'carbs_100g': carbs_100g,
            'fiber_100g': fiber_100g,
            'sugars_100g': sugars_100g,
            'protein_100g': protein_100g,
            'sodium_100g': sodium_mg_100g,  # mg
            'net_carbs_100g': net_carbs_100g,
            'polyols_100g': polyols_100g,

            # Per serving values
            'energy_per_serving': energy_100g * multiplier,
            'fat_per_serving': fat_100g * multiplier,
            'saturated_fat_per_serving': saturated_fat_100g * multiplier,
            'carbs_per_serving': carbs_100g * multiplier,
            'fiber_per_serving': fiber_100g * multiplier,
            'sugars_per_serving': sugars_100g * multiplier,
            'protein_per_serving': protein_100g * multiplier,
            'sodium_per_serving': sodium_mg_100g * multiplier,  # mg
            'net_carbs_per_serving': net_carbs_100g * multiplier,
            'polyols_per_serving': polyols_100g * multiplier,
        }
    except (AttributeError, TypeError, ValueError) as exc:
        # Malformed record (e.g. not a dict): skip it, but say why instead of
        # dropping it silently. Programming errors are not caught here.
        print(f"   Skipping malformed product ({type(exc).__name__}: {exc})")
        return None


# Run the extractor over every collected product; products for which it
# returned None (or an empty result) are left out.
products_data = [
    features
    for features in map(extract_product_features, all_products)
    if features
]

# One row per product, one column per extracted feature.
df = pd.DataFrame(products_data)

print(f"‚úÖ Extracted features from {len(df)} products")
print(f"üìä Features per product: {len(df.columns)}")


## 8. Data Quality Validation

Validates completeness and removes products with missing critical fields.

In [None]:
# Required fields for dietary classification
REQUIRED_FIELDS = [
    'ingredients',
    'protein_per_serving',
    'sodium_per_serving',
    'fat_per_serving',
    'net_carbs_per_serving'
]
# Everything except the free-text 'ingredients' column is a numeric nutrition field.
NUTRITION_FIELDS = REQUIRED_FIELDS[1:]

# Fail early with a clear message: an empty frame has none of the columns used
# below, and the retention rate would divide by zero.
if df.empty:
    raise RuntimeError(
        "No products available for validation - check the collection and extraction steps."
    )

# Count missing values before cleaning
print("üìä Data Quality Report (Before Cleaning):")
print(f"   Total products: {len(df)}")
print(f"   Missing ingredients: {df['ingredients'].isna().sum()}")
print(f"   Empty ingredients: {(df['ingredients'] == '').sum()}")
print(f"   Missing nutrition data: {df[NUTRITION_FIELDS].isna().any(axis=1).sum()}")

# Remove products with missing critical data
initial_count = len(df)

has_ingredients = df['ingredients'].notna() & (df['ingredients'] != '')
has_nutrition = df[NUTRITION_FIELDS].notna().all(axis=1)
has_id = df['product_id'] != ''

df = (
    df[has_ingredients & has_nutrition & has_id]
    .drop_duplicates(subset=['product_id'])  # one row per barcode
    .reset_index(drop=True)
)

print(f"\nüìä Data Quality Report (After Cleaning):")
print(f"   Products retained: {len(df)}")
print(f"   Products removed: {initial_count - len(df)}")
print(f"   Retention rate: {len(df)/initial_count*100:.1f}%")

## 9. Apply FDA Dietary Labels

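Applies threshold-based dietary labels: FDA nutrient-content-claim thresholds for protein, sodium, and fat, plus a common ketogenic-diet cutoff for net carbs (the keto cutoff is not an FDA rule). Values that meet a threshold but lie within 10% of it are labeled non-compliant (conservative margin).

**Thresholds:**
- Keto: ≤5g net carbs/serving
- High Protein: ≥10g protein/serving (20% DV)
- Low Sodium: ≤140mg sodium/serving
- Low Fat: ≤3g fat/serving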

In [None]:
# Label name -> rule used to derive that binary label from one per-serving feature.
#   'feature'   : DataFrame column holding the per-serving value
#   'threshold' : cutoff value (g for carbs/protein/fat; sodium_per_serving is in mg)
#   'operator'  : '<=' (compliant at or below the cutoff) or '>=' (at or above)
#   'source'    : citation string, stored verbatim in the dataset metadata
# Note: the keto rule cites a diet convention rather than an FDA regulation.
FDA_THRESHOLDS = {
    'keto_compliant': {
        'feature': 'net_carbs_per_serving',
        'threshold': 5.0,
        'operator': '<=',
        'source': 'Ketogenic diet standard'
    },
    'high_protein': {
        'feature': 'protein_per_serving',
        'threshold': 10.0,
        'operator': '>=',
        'source': 'FDA 21 CFR ¬ß101.54(b)'
    },
    'low_sodium': {
        'feature': 'sodium_per_serving',
        'threshold': 140.0,
        'operator': '<=',
        'source': 'FDA 21 CFR ¬ß101.61(b)(4)'
    },
    'low_fat': {
        'feature': 'fat_per_serving',
        'threshold': 3.0,
        'operator': '<=',
        'source': 'FDA 21 CFR ¬ß101.62(b)(2)'
    }
}


def apply_dietary_labels(row, thresholds, conservative_margin=0.10):
    """
    Apply threshold-based classification with a conservative band.

    Rule:
    - Start from the usual rule:
        - '<=' labels compliant if value <= threshold.
        - '>=' labels compliant if value >= threshold.
    - Then, if a compliant value lies within conservative_margin (relative
      difference) of a positive threshold, force label = 0 (non-compliant) to
      avoid optimistic labeling around the boundary.
    - A missing (NaN/absent) value is always labeled 0.

    Args:
        row: Mapping-like object with a `.get` method (DataFrame row or dict)
            holding the feature values.
        thresholds: Dict of label name -> config with keys 'feature',
            'threshold' and 'operator'.
        conservative_margin: Relative width of the band next to the threshold
            in which compliant values are downgraded to 0.

    Returns:
        pd.Series of 0/1 ints indexed by label name.

    Raises:
        ValueError: if a config uses an operator other than '<=' or '>='.
            A typo would otherwise be silently evaluated as '>='.
    """
    labels = {}
    for label, config in thresholds.items():
        feature = config['feature']
        threshold = config['threshold']
        operator = config['operator']

        if operator not in ('<=', '>='):
            raise ValueError(
                f"Unsupported operator {operator!r} for label {label!r}; expected '<=' or '>='"
            )

        value = row.get(feature, np.nan)

        # Missing -> non-compliant
        if pd.isna(value):
            labels[label] = 0
            continue

        # Base decision
        if operator == '<=':
            compliant = value <= threshold
        else:
            compliant = value >= threshold

        # Conservative band: only compliant values can be downgraded, and the
        # relative difference is undefined for a zero threshold.
        if compliant and threshold > 0:
            rel_diff = abs(value - threshold) / threshold
            if rel_diff <= conservative_margin:
                compliant = False

        labels[label] = int(compliant)

    return pd.Series(labels)


# Apply labels
label_df = df.apply(lambda row: apply_dietary_labels(row, FDA_THRESHOLDS), axis=1)

# Drop label columns left over from a previous run of this cell before attaching
# the fresh ones; otherwise re-running would create duplicate column names and
# df[label] below would return a DataFrame instead of a Series.
df = pd.concat(
    [df.drop(columns=list(FDA_THRESHOLDS), errors='ignore'), label_df],
    axis=1
)

print("‚úÖ FDA labels applied")
print("\nüìä Label Distribution:")
for label in FDA_THRESHOLDS.keys():
    count = df[label].sum()
    pct = (count / len(df)) * 100
    print(f"   {label}: {count}/{len(df)} ({pct:.1f}%)")


## 10. Dataset Statistics and Visualization

Analyzes label distribution and nutritional feature ranges.

In [None]:
# 2x2 figure: label counts in the top-left panel, one histogram in each other panel.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Label distribution
label_ax = axes[0, 0]
label_names = list(FDA_THRESHOLDS.keys())
label_counts = [df[name].sum() for name in label_names]
label_ax.bar(label_names, label_counts, color='steelblue')
label_ax.set_title('FDA Label Distribution')
label_ax.set_ylabel('Count')
label_ax.tick_params(axis='x', rotation=45)

# Nutritional features distribution
# NOTE(review): 'protein_per_serving' is listed but skipped by the [1:] slice,
# since only three panels remain after the bar chart - confirm this is intended.
nutrients = ['protein_per_serving', 'sodium_per_serving', 'fat_per_serving', 'net_carbs_per_serving']
positions = [(0, 1), (1, 0), (1, 1)]

for nutrient, (row, col) in zip(nutrients[1:], positions):
    hist_ax = axes[row, col]
    hist_ax.hist(df[nutrient], bins=30, color='coral', edgecolor='black')
    hist_ax.set_title(f'{nutrient.replace("_", " ").title()}')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

plt.tight_layout()
figure_path = os.path.join(RESULTS_DIR, 'dataset_statistics.png')
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ Visualization saved to {RESULTS_DIR}/dataset_statistics.png")

## 11. Train-Test Split

Creates stratified split attempting to balance label distribution.
Falls back to random split if stratification fails due to rare label combinations.

In [None]:
LABEL_COLS = ['keto_compliant', 'high_protein', 'low_sodium', 'low_fat']


def _print_label_distribution(title, frame):
    """Print, for each label column, the positive count and its share of `frame`."""
    print(title)
    for label in LABEL_COLS:
        count = frame[label].sum()
        pct = (count / len(frame)) * 100
        print(f"   {label}: {count}/{len(frame)} ({pct:.1f}%)")


# Multi-label key for stratification (e.g. "1_0_1_0"). It is kept as a separate
# Series rather than a column of df, so the helper value never leaks into the
# dataset or into the feature counts reported later.
label_combination = df[LABEL_COLS].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# Check combination frequencies
combination_counts = label_combination.value_counts()
print("üìä Label Combination Frequencies:")
print(f"   Unique combinations: {len(combination_counts)}")
print(f"   Singleton combinations: {(combination_counts == 1).sum()}")

# Attempt stratified split
try:
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=label_combination
    )
    print("\n‚úÖ Stratified split successful")
except ValueError:
    print("\n‚ö†Ô∏è  Stratification failed (rare label combinations)")
    print("   Using random split instead")
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42
    )

# Defensive: remove the helper column if df still carries one from an earlier run.
train_df = train_df.drop(columns='label_combination', errors='ignore')
test_df = test_df.drop(columns='label_combination', errors='ignore')

print(f"\nüìä Split Summary:")
print(f"   Train set: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
print(f"   Test set: {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")

_print_label_distribution("\nüìä Train Set Label Distribution:", train_df)
_print_label_distribution("\nüìä Test Set Label Distribution:", test_df)

## 12. Save Datasets

Saves full dataset and train/test splits to CSV files.

In [None]:
# Save datasets
products_path = os.path.join(DATA_DIR, 'products.csv')
train_path = os.path.join(DATA_DIR, 'train.csv')
test_path = os.path.join(DATA_DIR, 'test.csv')

# The frame that is actually written: df without the temporary stratification
# column (if present). The summary below reports on this frame, so the printed
# feature count matches the columns of the saved CSV.
products_df = df.drop('label_combination', axis=1, errors='ignore')

products_df.to_csv(products_path, index=False)
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("‚úÖ Datasets saved:")
print(f"   {products_path}")
print(f"   {train_path}")
print(f"   {test_path}")

print(f"\nüìä Final Dataset Summary:")
print(f"   Total products: {len(products_df)}")
print(f"   Features: {len(products_df.columns)}")
print(f"   Train samples: {len(train_df)}")
print(f"   Test samples: {len(test_df)}")

## 13. Dataset Metadata and Reproducibility

Documents dataset characteristics for reproducibility and reporting.

In [None]:
# Summary of the dataset, written next to the CSVs for reproducibility and reporting.
metadata = {
    'dataset_size': len(df),
    'train_size': len(train_df),
    'test_size': len(test_df),
    # Column count of the dataset, excluding the temporary stratification
    # column if df still carries it.
    'num_features': len(df.columns.drop('label_combination', errors='ignore')),
    'label_distribution': {
        label: {
            'total': int(df[label].sum()),
            'percentage': float((df[label].sum() / len(df)) * 100),
            'train': int(train_df[label].sum()),
            'test': int(test_df[label].sum())
        }
        for label in LABEL_COLS
    },
    'fda_thresholds': FDA_THRESHOLDS,
    'collection_date': pd.Timestamp.now().strftime('%Y-%m-%d'),
    'categories_collected': list(CATEGORIES.keys()),
    'random_seed': 42  # random_state passed to train_test_split
}

metadata_path = os.path.join(DATA_DIR, 'dataset_metadata.json')
# Explicit encoding so the file does not depend on the platform default.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

print("‚úÖ Metadata saved")
print(f"\nüìÑ Dataset Metadata:")
print(json.dumps(metadata, indent=2))