In [1]:
# =========================================================
# üìò 04_rebuild_final_optimization_dataset.py
# Purpose: Rebuild recipes_final_for_optimization.csv cleanly
# =========================================================

import pandas as pd
from pathlib import Path
import numpy as np

# ---------------------------------------------------------
# 1Ô∏è‚É£ Paths
# ---------------------------------------------------------
BASE_DIR = Path("D:/Complete_Data/ml_part_nutrition_project")
PROCESSED_DIR = BASE_DIR / "processed_data"

# Input files
NUTR_FILE = PROCESSED_DIR / "recipes_enriched.csv"
ENV_FILE = PROCESSED_DIR / "recipes_with_env_metrics.csv"
OUT_FILE = PROCESSED_DIR / "recipes_final_for_optimization.csv"

print(f"üì• Loading nutrition file: {NUTR_FILE}")
recipes_nutr = pd.read_csv(NUTR_FILE)

print(f"üì• Loading environment file: {ENV_FILE}")
recipes_env = pd.read_csv(ENV_FILE)

print(f"‚úÖ Shapes ‚Üí nutrition: {recipes_nutr.shape}, environment: {recipes_env.shape}")

# ---------------------------------------------------------
# 2Ô∏è‚É£ Clean up recipe_title and ensure same dtype
# ---------------------------------------------------------
recipes_nutr["recipe_title"] = recipes_nutr["recipe_title"].astype(str).str.strip().str.lower()
recipes_env["recipe_title"] = recipes_env["recipe_title"].astype(str).str.strip().str.lower()

# ---------------------------------------------------------
# 3Ô∏è‚É£ Merge on recipe_title
# ---------------------------------------------------------
recipes_final = pd.merge(
    recipes_nutr,
    recipes_env,
    on="recipe_title",
    how="left"
)
print(f"‚úÖ Merged recipes_final shape: {recipes_final.shape}")

# ---------------------------------------------------------
# 4Ô∏è‚É£ Fill missing numeric values
# ---------------------------------------------------------
num_cols = recipes_final.select_dtypes(include=[np.number]).columns
recipes_final[num_cols] = recipes_final[num_cols].fillna(recipes_final[num_cols].mean())

# ---------------------------------------------------------
# 5Ô∏è‚É£ Basic column cleanup
# ---------------------------------------------------------
recipes_final.rename(columns={
    "Total_emissions": "Total_emissions_mean",
    "Land_use_change": "Land_use_change_mean"
}, inplace=True)

recipes_final = recipes_final[
    [
        "recipe_id", "recipe_title", "ingredient_text",
        "energy_kcal_mean", "protein_g_mean", "fat_g_mean",
        "carbs_g_mean", "price_mean", "Total_emissions_mean",
        "Land_use_change_mean"
    ]
]

print(f"‚úÖ Final dataset prepared: {recipes_final.shape}")
print("Columns:", recipes_final.columns.tolist())

# ---------------------------------------------------------
# 6Ô∏è‚É£ Save the rebuilt dataset
# ---------------------------------------------------------
recipes_final.to_csv(OUT_FILE, index=False)
print(f"üíæ Saved rebuilt file ‚Üí {OUT_FILE}")


üì• Loading nutrition file: D:\Complete_Data\ml_part_nutrition_project\processed_data\recipes_enriched.csv
üì• Loading environment file: D:\Complete_Data\ml_part_nutrition_project\processed_data\recipes_with_env_metrics.csv
‚úÖ Shapes ‚Üí nutrition: (20130, 9), environment: (20130, 3)
‚úÖ Merged recipes_final shape: (20130, 11)
‚úÖ Final dataset prepared: (20130, 11)
Columns: ['recipe_id', 'recipe_title', 'ingredient_text', 'energy_kcal_mean', 'protein_g_mean', 'fat_g_mean', 'carbs_g_mean', 'price_mean', 'Total_emissions_mean', 'Total_emissions_mean', 'Land_use_change_mean']
üíæ Saved rebuilt file ‚Üí D:\Complete_Data\ml_part_nutrition_project\processed_data\recipes_final_for_optimization.csv
