In [5]:
# =========================================================
# üìò 03_merge_for_optimization.ipynb
# Purpose: Merge nutritional + environmental + cost metrics
#          for Safety Layer & NSGA-II Optimization
# =========================================================

import pandas as pd
from pathlib import Path

# ---------------------------------------------------------
# 1Ô∏è‚É£ Define paths
# ---------------------------------------------------------
BASE_DIR = Path("D:/Complete_Data/ml_part_nutrition_project")
PROCESSED_DIR = BASE_DIR / "processed_data"

recipes_nutr_path = PROCESSED_DIR / "recipes_enriched.csv"
recipes_env_path = PROCESSED_DIR / "recipes_with_env_metrics.csv"
final_save_path = PROCESSED_DIR / "recipes_final_for_optimization.csv"

# ---------------------------------------------------------
# 2Ô∏è‚É£ Load datasets
# ---------------------------------------------------------
print("üì• Loading processed datasets...")
recipes_nutr = pd.read_csv(recipes_nutr_path)
recipes_env = pd.read_csv(recipes_env_path)

print(f"‚úÖ Loaded nutrition dataset: {recipes_nutr.shape}")
print(f"‚úÖ Loaded environment dataset: {recipes_env.shape}")

# ---------------------------------------------------------
# 3Ô∏è‚É£ Force recipe_title to string before merging
# ---------------------------------------------------------
recipes_nutr["recipe_title"] = recipes_nutr["recipe_title"].astype(str).str.strip()
recipes_env["recipe_title"] = recipes_env["recipe_title"].astype(str).str.strip()

# ---------------------------------------------------------
# 4Ô∏è‚É£ Merge on recipe_title
# ---------------------------------------------------------
recipes_merged = pd.merge(recipes_nutr, recipes_env, on="recipe_title", how="left")

# ---------------------------------------------------------
# 5Ô∏è‚É£ Clean column names
# ---------------------------------------------------------
recipes_merged.columns = recipes_merged.columns.str.replace(" ", "_").str.strip()

print("‚úÖ Columns after merge:")
print(list(recipes_merged.columns))

# ---------------------------------------------------------
# 6Ô∏è‚É£ Select key columns for optimization
# ---------------------------------------------------------
expected_cols = [
    "recipe_title",
    "energy_kcal_mean",
    "protein_g_mean",
    "fat_g_mean",
    "carbs_g_mean",
    "price_mean",
    "Total_emissions",
    "Land_use_change"
]

available_cols = [c for c in expected_cols if c in recipes_merged.columns]
recipes_final = recipes_merged[available_cols].dropna().reset_index(drop=True)

if len(available_cols) < 6:
    print("‚ö†Ô∏è Warning: Some expected columns are missing. Available columns:", available_cols)
else:
    print("‚úÖ All required columns found.")

print(f"‚úÖ Final dataset shape: {recipes_final.shape}")
print(f"‚úÖ Columns used: {available_cols}")

# ---------------------------------------------------------
# 7Ô∏è‚É£ Save merged dataset
# ---------------------------------------------------------
recipes_final.to_csv(final_save_path, index=False)
print(f"üíæ Saved final merged dataset ‚Üí {final_save_path}")


üì• Loading processed datasets...
‚úÖ Loaded nutrition dataset: (20130, 9)
‚úÖ Loaded environment dataset: (20130, 3)
‚úÖ Columns after merge:
['recipe_id', 'recipe_title', 'ingredient_text', 'energy_kcal_mean', 'protein_g_mean', 'fat_g_mean', 'carbs_g_mean', 'price_mean', 'Total_emissions_mean', 'Total_emissions', 'Land_use_change']
‚úÖ All required columns found.
‚úÖ Final dataset shape: (0, 8)
‚úÖ Columns used: ['recipe_title', 'energy_kcal_mean', 'protein_g_mean', 'fat_g_mean', 'carbs_g_mean', 'price_mean', 'Total_emissions', 'Land_use_change']
üíæ Saved final merged dataset ‚Üí D:\Complete_Data\ml_part_nutrition_project\processed_data\recipes_final_for_optimization.csv


In [1]:
# =========================================================
# 03_safety_layer_nsga2_prep.py
# Dynamic Safety Layer ‚Üí rule-based + small ML fallback + saving
# Author: Generated for Apoorva Sharma
# =========================================================

import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
import joblib
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

warnings.filterwarnings("ignore")

# ----------------------------
# 0Ô∏è‚É£ Paths & Setup
# ----------------------------
BASE_DIR = Path("D:/Complete_Data/ml_part_nutrition_project")   # <-- update if necessary
PROCESSED_DIR = BASE_DIR / "processed_data"
RESULTS_DIR = BASE_DIR / "results"
MODELS_DIR = BASE_DIR / "models"

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Input file(s) expected
RECIPES_ENRICHED = PROCESSED_DIR / "recipes_enriched.csv"         # preferred
RECIPES_FALLBACK = PROCESSED_DIR / "recipes_master_with_ingredients.csv"  # fallback

# Output files
GLOBAL_SAFE_PATH = PROCESSED_DIR / "recipes_safety_filtered.csv"
RULES_JSON_PATH = MODELS_DIR / "safety_rules.json"
MODEL_PATH = MODELS_DIR / "safety_layer_rf.joblib"
VECT_PATH = MODELS_DIR / "safety_tfidf_vectorizer.joblib"
SUMMARY_JSON = RESULTS_DIR / "safety_layer_summary.json"
PER_USER_DIR = RESULTS_DIR / "safety_by_user"
PER_USER_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 1Ô∏è‚É£ Load dataset (robust)
# ----------------------------
def load_recipes():
    if RECIPES_ENRICHED.exists():
        df = pd.read_csv(RECIPES_ENRICHED)
        print(f"Loaded recipes_enriched: {df.shape} -> {RECIPES_ENRICHED}")
        return df
    elif RECIPES_FALLBACK.exists():
        df = pd.read_csv(RECIPES_FALLBACK)
        print(f"Loaded fallback recipes: {df.shape} -> {RECIPES_FALLBACK}")
        return df
    else:
        raise FileNotFoundError(f"Neither {RECIPES_ENRICHED} nor {RECIPES_FALLBACK} found. Place processed recipes CSV in {PROCESSED_DIR}")

recipes = load_recipes()

# Ensure necessary columns exist; create safe defaults where needed
def normalize_text(s):
    if pd.isna(s):
        return ""
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Find ingredient text column or construct
if "ingredient_text" not in recipes.columns:
    # try common variants
    candidates = [c for c in recipes.columns if "ingredient" in c.lower() or "ing" in c.lower()]
    if candidates:
        recipes["ingredient_text"] = recipes[candidates[0]].astype(str)
        print(f"Using column '{candidates[0]}' as ingredient_text")
    else:
        # attempt to combine any 'ingredient' lists in other formats
        print("No ingredient text column found ‚Äî filling with empty strings (you should create recipe->ingredient mapping earlier).")
        recipes["ingredient_text"] = ""

recipes["ingredient_text_clean"] = recipes["ingredient_text"].astype(str).apply(normalize_text)
recipes["title_clean"] = recipes.get("title", recipes.get("recipe_title", "")).astype(str).apply(normalize_text)

# ----------------------------
# 2Ô∏è‚É£ Base knowledge dictionaries (you can extend)
# ----------------------------
ALLERGENS = {
    "nuts": ["almond", "walnut", "cashew", "peanut", "hazelnut", "pecan", "pistachio", "macadamia"],
    "dairy": ["milk", "butter", "cheese", "cream", "yogurt", "ghee", "buttermilk"],
    "gluten": ["wheat", "barley", "rye", "flour", "bread", "pasta", "semolina"],
    "soy": ["soy", "soybean", "tofu", "soy sauce", "miso"],
    "egg": ["egg", "mayonnaise", "meringue"],
    "fish": ["fish", "salmon", "tuna", "cod", "anchovy"],
    "shellfish": ["shrimp", "prawn", "crab", "lobster", "clam", "oyster"],
    "sesame": ["sesame", "tahini", "sesame oil"],
    "mustard": ["mustard", "mustard seed", "mustard powder"]
}

DIETS = {
    "vegan": ["milk", "cheese", "butter", "meat", "egg", "fish", "honey", "yogurt", "gelatin"],
    "vegetarian": ["meat", "pork", "beef", "chicken"],
    "keto": ["rice", "bread", "sugar", "pasta", "potato", "banana", "corn"],
    "paleo": ["processed", "bread", "pasta", "cereal", "legumes"],
    "low_fodmap": ["garlic", "onion", "beans", "lentils", "apple", "pear", "mushroom"]
}

HEALTH_RESTRICTIONS = {
    "diabetes": ["sugar", "honey", "syrup", "soda", "sweet", "candy", "cake", "cookie", "pastry", "white rice", "white bread", "refined flour", "white pasta", "dessert", "jam", "jelly"],
    "hypertension": ["salt", "sodium", "soy sauce", "pickles", "canned", "processed meat", "ham", "bacon", "salami", "sausage", "instant noodles", "bouillon"],
    "celiac": ["wheat", "barley", "rye", "malt", "triticale", "bulgur", "semolina", "farina", "graham", "spelt", "kamut", "durum", "flour"],
    "pcos": ["sugar", "white bread", "fried", "processed snack", "soda", "refined carbs", "trans fat", "fast food"],
    "kidney_disease": ["potassium", "banana", "avocado", "potato", "tomato", "spinach", "orange", "orange juice", "dried fruit"],
    "gout": ["red meat", "organ meat", "liver", "kidney", "anchovy", "sardine", "mackerel", "tuna", "beer"],
    "thyroid": ["soy", "broccoli", "cabbage", "cauliflower", "kale", "turnip", "cassava"],
    "gerd": ["chocolate", "mint", "fried", "spicy", "tomato", "citrus", "onion", "garlic", "coffee", "alcohol"]
}

MEDICATION_RESTRICTIONS = {
    "warfarin": ["kale", "spinach", "broccoli", "collard", "parsley", "cabbage"],  # vitamin K
    "metformin": ["alcohol"],  # simplified example
    # add others as required
}

# Save rules JSON (human-readable)
rules_to_save = {
    "ALLERGENS": ALLERGENS,
    "DIETS": DIETS,
    "HEALTH_RESTRICTIONS": HEALTH_RESTRICTIONS,
    "MEDICATION_RESTRICTIONS": MEDICATION_RESTRICTIONS
}
with open(RULES_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(rules_to_save, f, indent=2)
print(f"Saved safety rules -> {RULES_JSON_PATH}")

# ----------------------------
# 3Ô∏è‚É£ Utility functions for checking
# ----------------------------
def text_contains_any(text: str, keywords: list) -> bool:
    if not keywords or not isinstance(text, str):
        return False
    t = normalize_text(text)
    for kw in keywords:
        k = kw.replace("_", " ").lower()
        if k in t:
            return True
    return False

def violates_allergen(text: str, user_allergies: list) -> bool:
    for allergen in (user_allergies or []):
        words = ALLERGENS.get(allergen, [])
        if text_contains_any(text, words):
            return True
    return False

def violates_diet(text: str, user_diet: str) -> bool:
    if not user_diet:
        return False
    return text_contains_any(text, DIETS.get(user_diet, []))

def violates_health(text: str, conditions: list) -> dict:
    flags = {}
    for cond in (conditions or []):
        flags[cond] = text_contains_any(text, HEALTH_RESTRICTIONS.get(cond, []))
    return flags

def violates_medications(text: str, medications: list) -> dict:
    flags = {}
    for med in (medications or []):
        flags[med] = text_contains_any(text, MEDICATION_RESTRICTIONS.get(med.lower(), []))
    return flags

# ----------------------------
# 4Ô∏è‚É£ Main filtering function
# ----------------------------
def filter_recipes_for_user(recipes_df: pd.DataFrame,
                            user_allergies=None,
                            user_diet=None,
                            user_conditions=None,
                            user_medications=None,
                            save_personalized=True,
                            user_id="user_default",
                            verbose=True):
    user_allergies = user_allergies or []
    user_conditions = user_conditions or []
    user_medications = user_medications or []
    df = recipes_df.copy()

    if "ingredient_text_clean" not in df.columns:
        df["ingredient_text_clean"] = df["ingredient_text"].fillna("").astype(str).apply(normalize_text)

    # Compute flags
    df["contains_allergen"] = df["ingredient_text_clean"].apply(lambda x: violates_allergen(x, user_allergies))
    df["violates_diet"] = df["ingredient_text_clean"].apply(lambda x: violates_diet(x, user_diet))

    # conditions
    for cond in user_conditions:
        col = f"violates_{cond}"
        kws = HEALTH_RESTRICTIONS.get(cond, [])
        df[col] = df["ingredient_text_clean"].apply(lambda x, kws=kws: text_contains_any(x, kws))

    # medications
    for med in user_medications:
        col = f"violates_med_{med.lower()}"
        kws = MEDICATION_RESTRICTIONS.get(med.lower(), [])
        df[col] = df["ingredient_text_clean"].apply(lambda x, kws=kws: text_contains_any(x, kws))

    # combined flag list
    extra_flags = [f"violates_{c}" for c in user_conditions] + [f"violates_med_{m.lower()}" for m in user_medications]
    flag_cols = ["contains_allergen", "violates_diet"] + extra_flags
    for c in flag_cols:
        if c not in df.columns:
            df[c] = False

    df["is_safe_for_user"] = ~df[flag_cols].any(axis=1)
    safe_df = df[df["is_safe_for_user"]].reset_index(drop=True)
    unsafe_df = df[~df["is_safe_for_user"]].reset_index(drop=True)

    if save_personalized:
        out_path = PER_USER_DIR / f"{user_id}_recipes_safe.csv"
        safe_df.to_csv(out_path, index=False)
        # also save full flagged dataset
        full_path = PER_USER_DIR / f"{user_id}_recipes_flagged.csv"
        df.to_csv(full_path, index=False)
        if verbose:
            print(f"Saved safe recipes -> {out_path}  (count: {len(safe_df)})")
            print(f"Saved flagged recipes -> {full_path} (count: {len(df)})")

    return safe_df, df

# ----------------------------
# 5Ô∏è‚É£ Example user tests (and save global default)
# ----------------------------
if __name__ == "__main__":
    # Example users (change as needed)
    users = [
        {
            "user_id": "user_vegan_nuts_diabetes",
            "allergies": ["nuts", "gluten"],
            "diet": "vegan",
            "conditions": ["diabetes"],
            "medications": []
        },
        {
            "user_id": "user_hypertension_warfarin",
            "allergies": [],
            "diet": None,
            "conditions": ["hypertension"],
            "medications": ["warfarin"]
        }
    ]

    summary = {"global": {}}
    for u in users:
        safe, full = filter_recipes_for_user(
            recipes,
            user_allergies=u["allergies"],
            user_diet=u["diet"],
            user_conditions=u["conditions"],
            user_medications=u["medications"],
            save_personalized=True,
            user_id=u["user_id"],
            verbose=True
        )
        summary[u["user_id"]] = {
            "safe_count": int(len(safe)),
            "total_recipes": int(len(recipes))
        }

    # Save a global default (no restrictions)
    global_safe, global_full = filter_recipes_for_user(recipes,
                                                       user_allergies=[],
                                                       user_diet=None,
                                                       user_conditions=[],
                                                       user_medications=[],
                                                       save_personalized=True,
                                                       user_id="global_default",
                                                       verbose=True)
    global_safe.to_csv(GLOBAL_SAFE_PATH, index=False)
    print(f"Saved global safe dataset -> {GLOBAL_SAFE_PATH} (count: {len(global_safe)})")
    summary["global"]["safe_count"] = int(len(global_safe))
    summary["global"]["total_recipes"] = int(len(recipes))

    # ----------------------------
    # 6Ô∏è‚É£ Train a small ML model to predict is_safe_for_user (optional)
    #    ‚Äî useful to speed up runtime checks (approximate rules)
    # ----------------------------
    # Build training dataset using the "global" rule labels (no user filters) vs an example user profile
    # We'll create labels using the earlier function for one simulated user (e.g. user_vegan_nuts_diabetes)
    train_user = users[0]
    _, flagged = filter_recipes_for_user(recipes,
                                        user_allergies=train_user["allergies"],
                                        user_diet=train_user["diet"],
                                        user_conditions=train_user["conditions"],
                                        user_medications=train_user["medications"],
                                        save_personalized=False,
                                        user_id="train_user",
                                        verbose=False)
    if flagged["is_safe_for_user"].nunique() < 2:
        print("Not enough label variety to train ML model (all safe or all unsafe). Skipping training.")
        summary["ml_model"] = {"trained": False, "reason": "not_enough_label_variability"}
    else:
        # Vectorize ingredient text
        vec = TfidfVectorizer(max_features=2000, stop_words="english")
        X = vec.fit_transform(flagged["ingredient_text_clean"].astype(str)).toarray()
        y = flagged["is_safe_for_user"].astype(int).values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        # Evaluate
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

        # Save model + vectorizer
        joblib.dump(clf, MODEL_PATH)
        joblib.dump(vec, VECT_PATH)

        # Save test results
        ml_summary = {
            "trained": True,
            "model_path": str(MODEL_PATH),
            "vectorizer_path": str(VECT_PATH),
            "accuracy": float(acc),
            "precision": float(prec),
            "recall": float(rec),
            "classification_report": report
        }
        summary["ml_model"] = ml_summary
        print(f"Trained ML model saved -> {MODEL_PATH}")
        print(f"Vectorizer saved -> {VECT_PATH}")
        print("ML eval:", {"accuracy": acc, "precision": prec, "recall": rec})

    # Save summary
    with open(SUMMARY_JSON, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"Saved summary -> {SUMMARY_JSON}")

    print("‚úÖ Safety layer pipeline finished.")


Loaded recipes_enriched: (20130, 9) -> D:\Complete_Data\ml_part_nutrition_project\processed_data\recipes_enriched.csv
Saved safety rules -> D:\Complete_Data\ml_part_nutrition_project\models\safety_rules.json
Saved safe recipes -> D:\Complete_Data\ml_part_nutrition_project\results\safety_by_user\user_vegan_nuts_diabetes_recipes_safe.csv  (count: 4183)
Saved flagged recipes -> D:\Complete_Data\ml_part_nutrition_project\results\safety_by_user\user_vegan_nuts_diabetes_recipes_flagged.csv (count: 20130)
Saved safe recipes -> D:\Complete_Data\ml_part_nutrition_project\results\safety_by_user\user_hypertension_warfarin_recipes_safe.csv  (count: 5836)
Saved flagged recipes -> D:\Complete_Data\ml_part_nutrition_project\results\safety_by_user\user_hypertension_warfarin_recipes_flagged.csv (count: 20130)
Saved safe recipes -> D:\Complete_Data\ml_part_nutrition_project\results\safety_by_user\global_default_recipes_safe.csv  (count: 20130)
Saved flagged recipes -> D:\Complete_Data\ml_part_nutrition_