In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

In [2]:
def clean_nutrient_data(input_file='nutrients_csvfile.csv', output_file='data/nutrients_readable_cleaned.csv'):
    """
    Load the raw nutrient CSV, coerce the nutrient columns to numbers,
    and save the result as a new CSV. Non-nutrient columns are left untouched.

    Cleaning rules applied to each nutrient column that is not already numeric:
      * surrounding whitespace is stripped;
      * thousands separators are removed (e.g. "1,419" -> 1419);
      * a cell that is exactly 't' (trace amount) becomes 0;
      * anything still unparseable becomes 0.

    Args:
        input_file: Path of the raw CSV to read.
        output_file: Path of the cleaned CSV to write. Its parent directory
            is created when the path has one; a bare filename is written to
            the current working directory.

    Returns:
        The cleaned DataFrame, or None when ``input_file`` does not exist.

    Raises:
        KeyError: If one of the expected nutrient columns is missing.
    """
    print(f"Loading {input_file}...")
    try:
        nutrients_df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"ERROR: {input_file} not found.")
        return

    # Nutrient columns that must end up numeric
    columns_to_clean = ['Grams', 'Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

    for col in columns_to_clean:
        values = nutrients_df[col]
        # Test "not numeric" rather than "dtype == object" so text columns are
        # still cleaned when pandas infers a dedicated string dtype.
        if not pd.api.types.is_numeric_dtype(values):
            values = values.astype(str).str.strip().str.replace(',', '', regex=False)
            # Only a whole-cell 't' means "trace". A substring replace would
            # turn a malformed cell such as "2t" into the made-up number 20.
            values = values.mask(values == 't', '0')
        nutrients_df[col] = pd.to_numeric(values, errors='coerce')

    # Anything that could not be parsed is treated as 0
    nutrients_df[columns_to_clean] = nutrients_df[columns_to_clean].fillna(0)

    # Create the output directory if needed. dirname is '' for a bare
    # filename, and os.makedirs('') raises FileNotFoundError.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned dataframe
    nutrients_df.to_csv(output_file, index=False)
    print(f"Successfully cleaned nutrient data and saved to {output_file}")
    return nutrients_df

In [3]:
def perform_eda(user_data_file='Personalized_Diet_Recommendations.csv', cleaned_nutrients_df=None):
    """
    Produce the exploratory plots for the user and nutrient datasets and
    write each one as a PNG into the 'eda_plots' directory.

    Args:
        user_data_file: CSV with the user records. The 'Age', 'BMI' and
            'Dietary_Habits' columns are plotted. If the file is missing,
            the user plots are skipped with a message.
        cleaned_nutrients_df: DataFrame with 'Protein' and 'Calories'
            columns. When None, the nutrient plots are skipped.

    Returns:
        None. The plots are written to disk and progress is printed.
    """
    print("Starting Exploratory Data Analysis (EDA)...")

    # Every figure is written into this folder
    plot_dir = 'eda_plots'
    os.makedirs(plot_dir, exist_ok=True)

    def _label_save_close(title, x_label, y_label, file_name):
        # Decorate the current figure, write it to disk, then release it.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.savefig(os.path.join(plot_dir, file_name))
        plt.close()

    def _histogram(frame, column, title, x_label, file_name, nonzero_only=False):
        # 30-bin histogram with a KDE overlay for one column of `frame`.
        plt.figure(figsize=(10, 6))
        values = frame[column]
        if nonzero_only:
            # Zeros are turned into NaN and dropped so they do not swamp the plot.
            values = values.replace(0, np.nan).dropna()
        sns.histplot(values, kde=True, bins=30)
        _label_save_close(title, x_label, 'Frequency', file_name)

    # (column, title, x-axis label, output file)
    user_histograms = [
        ('Age', 'Distribution of User Age', 'Age', 'user_age_distribution.png'),
        ('BMI', 'Distribution of User BMI', 'BMI', 'user_bmi_distribution.png'),
    ]
    nutrient_histograms = [
        ('Protein', 'Distribution of Protein in Foods (non-zero)', 'Protein (g)',
         'nutrient_protein_distribution.png'),
        ('Calories', 'Distribution of Calories in Foods (non-zero)', 'Calories',
         'nutrient_calories_distribution.png'),
    ]

    # --- User Data EDA ---
    try:
        user_df = pd.read_csv(user_data_file)

        # Plots 1-2: numeric distributions
        for column, title, x_label, file_name in user_histograms:
            _histogram(user_df, column, title, x_label, file_name)

        # Plot 3: horizontal bar count of dietary habits
        plt.figure(figsize=(10, 6))
        sns.countplot(y=user_df['Dietary_Habits'])
        _label_save_close('Count of User Dietary Habits', 'Count', 'Dietary Habit',
                          'user_dietary_habits.png')

        print(f"Saved user EDA plots to '{plot_dir}' folder.")

    except FileNotFoundError:
        print(f"ERROR: {user_data_file} not found. Skipping user EDA.")

    # --- Nutrient Data EDA ---
    if cleaned_nutrients_df is not None:
        # Plots 4-5: nutrient distributions, zeros excluded
        for column, title, x_label, file_name in nutrient_histograms:
            _histogram(cleaned_nutrients_df, column, title, x_label, file_name,
                       nonzero_only=True)

        print(f"Saved nutrient EDA plots to '{plot_dir}' folder.")

    print("EDA complete.")

In [4]:
if __name__ == "__main__":
    # Install a blanket "ignore" filter so no warnings are shown while the
    # pipeline runs.
    warnings.simplefilter('ignore')

    # Stage 1: build the cleaned nutrient table (None if the raw CSV is missing)
    cleaned_nutrients = clean_nutrient_data()

    # Stage 2: generate the EDA plots; a None nutrient table skips the nutrient plots
    perform_eda(cleaned_nutrients_df=cleaned_nutrients)

Loading nutrients_csvfile.csv...
Successfully cleaned nutrient data and saved to data/nutrients_readable_cleaned.csv
Starting Exploratory Data Analysis (EDA)...
Saved user EDA plots to 'eda_plots' folder.
Saved nutrient EDA plots to 'eda_plots' folder.
EDA complete.
