## Data Processing

In [1]:
import re
import pandas as pd
from typing import Optional, Union

class RecipeDataProcessor:
    """
    Load, clean and enrich recipe data read from a parquet file and a CSV file.

    The individual steps are meant to run in the order used by
    :meth:`process_data`; every step reads and updates ``self.df_merged``.
    """

    # Hour/minute parts of an ISO 8601 duration. The pattern is applied with
    # ``match`` (anchored at the start), so values that do not begin with
    # "PT" produce no match, and a seconds component is simply not captured.
    _ISO_DURATION_RE = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?')
    # Patterns for reading back the readable "<n> Hour(s) <m> Minute(s)" form.
    _HOURS_RE = re.compile(r'(\d+) Hour')
    _MINUTES_RE = re.compile(r'(\d+) Minute')

    # Recipe categories that are labelled 'Lunch' by categorize_meal_type.
    _LUNCH_CATEGORIES = ('Lunch/Snacks', 'One Dish Meal', 'Vegetable')

    def __init__(self, recipe_file: str, ingredient_file: str) -> None:
        """
        Initialize the RecipeDataProcessor with file paths.

        :param recipe_file: Path to the recipe file (read with ``pd.read_parquet``).
        :param ingredient_file: Path to the ingredient file (read with ``pd.read_csv``).
        """
        self.recipe_file = recipe_file
        self.ingredient_file = ingredient_file
        # Populated by load_data(); stays None until then.
        self.df_merged: Optional[pd.DataFrame] = None

    def load_data(self) -> None:
        """Load the recipe and ingredient files and inner-join them.

        The ingredient frame's ``id`` column is matched against the recipe
        frame's ``RecipeId`` column; the result is stored in ``self.df_merged``.
        """
        df_recipes = pd.read_parquet(self.recipe_file)
        df_ingredients = pd.read_csv(self.ingredient_file)
        self.df_merged = df_ingredients.merge(
            df_recipes, how="inner", left_on="id", right_on="RecipeId"
        )

    def select_columns(self) -> None:
        """Keep only the columns used downstream (raises ``KeyError`` if one is missing)."""
        selected_columns = [
            'id', 'name', 'ingredients_raw', 'steps', 'servings', 'serving_size', 'CookTime',
            'PrepTime', 'TotalTime', 'RecipeCategory', 'Calories', 'FatContent', 'SaturatedFatContent',
            'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent',
            'ProteinContent'
        ]
        # Copy so that later column assignments work on an independent frame.
        self.df_merged = self.df_merged.loc[:, selected_columns].copy()
        # Note: all nutritional amounts are on a per serving basis

    def rename_columns(self) -> None:
        """Rename the selected columns to camelCase.

        Columns that already have their final name ('id', 'name', 'steps',
        'servings') are left untouched.
        """
        rename_dict = {
            'ingredients_raw': 'ingredientsRaw',
            'serving_size': 'servingSize',
            'CookTime': 'cookTime',
            'PrepTime': 'prepTime',
            'TotalTime': 'totalTime',
            'RecipeCategory': 'recipeCategory',
            'Calories': 'calories',
            'FatContent': 'fatContent',
            'SaturatedFatContent': 'saturatedFatContent',
            'CholesterolContent': 'cholesterolContent',
            'SodiumContent': 'sodiumContent',
            'CarbohydrateContent': 'carbohydrateContent',
            'FiberContent': 'fiberContent',
            'SugarContent': 'sugarContent',
            'ProteinContent': 'proteinContent',
        }
        self.df_merged = self.df_merged.rename(columns=rename_dict)
        # Note: cholesterolContent and sodiumContent are in milligrams, the rest are in grams

    def fill_missing_values(self) -> None:
        """Fill missing cook times with 0, then drop rows that still contain a missing value."""
        self.df_merged["cookTime"] = self.df_merged["cookTime"].fillna(0)
        self.df_merged = self.df_merged.dropna()

    def convert_columns_to_string(self) -> None:
        """Convert columns of type 'object' to strings."""
        for col in self.df_merged.columns:
            if self.df_merged[col].dtype == 'object':
                self.df_merged[col] = self.df_merged[col].astype(str)

    @staticmethod
    def convert_iso_duration_to_readable(duration: Optional[str]) -> str:
        """
        Convert an ISO 8601 duration such as ``PT1H30M`` to a readable string.

        Only the hour and minute components are rendered, e.g.
        ``"1 Hour 30 Minutes"``.

        :param duration: The ISO 8601 duration string.
        :return: A human-readable duration string; ``''`` when the input is
            empty, not a string (e.g. ``None`` or NaN), or has no hour/minute
            component that the pattern recognises.
        """
        # Non-string values (NaN is truthy!) would make the regex raise
        # TypeError, so they are treated like an empty duration.
        if not isinstance(duration, str) or not duration:
            return ''

        match = RecipeDataProcessor._ISO_DURATION_RE.match(duration)
        if match is None:
            return ''

        parts = []
        for amount, unit in zip(match.groups(), ('Hour', 'Minute')):
            if amount is None:
                continue
            count = int(amount)
            parts.append(f"{count} {unit}{'s' if count > 1 else ''}")
        return ' '.join(parts)

    def convert_time_columns(self) -> None:
        """Convert the cookTime, prepTime and totalTime columns to the readable format."""
        time_cols = ['cookTime', 'prepTime', 'totalTime']
        for col in time_cols:
            self.df_merged[col] = self.df_merged[col].apply(self.convert_iso_duration_to_readable)

    @staticmethod
    def convert_time_to_minutes(time_str: Optional[str]) -> int:
        """
        Convert a readable time string (``"1 Hour 15 Minutes"``) to minutes.

        :param time_str: Text in the format produced by
            :meth:`convert_iso_duration_to_readable`.
        :return: Total minutes; 0 for a missing value or when neither an hour
            nor a minute amount is found.
        """
        if pd.isna(time_str):
            return 0
        hours_match = RecipeDataProcessor._HOURS_RE.search(time_str)
        minutes_match = RecipeDataProcessor._MINUTES_RE.search(time_str)
        hours = int(hours_match.group(1)) if hours_match else 0
        minutes = int(minutes_match.group(1)) if minutes_match else 0
        return hours * 60 + minutes

    def categorize_meal_type(self) -> None:
        """
        Label each recipe as 'Breakfast', 'Snack', 'Lunch' or 'Dinner'.

        Adds two columns to ``self.df_merged``: ``totalTimeMinutes`` (parsed
        from ``totalTime``) and ``mealType``. Rules, in priority order:

        1. recipeCategory is 'Breakfast'            -> 'Breakfast'
        2. total time is under 20 minutes           -> 'Snack'
        3. recipeCategory is one of the lunch ones  -> 'Lunch'
        4. everything else                          -> 'Dinner'
        """
        df = self.df_merged
        df['totalTimeMinutes'] = df['totalTime'].apply(self.convert_time_to_minutes)

        category = df['recipeCategory']
        # Start from the fallback label and overwrite in reverse priority so
        # that the highest-priority rule is applied last and wins.
        meal_type = pd.Series('Dinner', index=df.index, dtype=object)
        meal_type = meal_type.mask(category.isin(list(self._LUNCH_CATEGORIES)), 'Lunch')
        meal_type = meal_type.mask(df['totalTimeMinutes'] < 20, 'Snack')
        meal_type = meal_type.mask(category == 'Breakfast', 'Breakfast')
        df['mealType'] = meal_type

    def process_data(self) -> pd.DataFrame:
        """
        Run the full pipeline: load, select, rename, clean, convert, categorize.

        :return: The processed DataFrame (also kept in ``self.df_merged``).
        """
        self.load_data()
        self.select_columns()
        self.rename_columns()
        self.fill_missing_values()
        self.convert_columns_to_string()
        self.convert_time_columns()
        self.categorize_meal_type()
        return self.df_merged

# if __name__ == "__main__":
#     processor = RecipeDataProcessor("./data/recipes.parquet", "./data/recipes_ingredients.csv")
#     processed_data = processor.process_data()
#     print(processed_data.head())

In [2]:
# Build the processor from the parquet recipe file and the CSV ingredient file.
processor = RecipeDataProcessor(
    recipe_file="./data/recipes.parquet",
    ingredient_file="./data/recipes_ingredients.csv",
)
# Run the whole cleaning/enrichment pipeline once and keep the result for the modeling cells.
processed_data = processor.process_data()
# Last expression of the cell: shows the first rows of the processed frame.
processed_data.head()

Unnamed: 0,id,name,ingredientsRaw,steps,servings,servingSize,cookTime,prepTime,totalTime,recipeCategory,...,fatContent,saturatedFatContent,cholesterolContent,sodiumContent,carbohydrateContent,fiberContent,sugarContent,proteinContent,totalTimeMinutes,mealType
0,71247,Cherry Streusel Cobbler,"[""2 (21 ounce) cans cherry pie filling"",""2...","[""Preheat oven to 375°F."", ""Spread cherry pie ...",6.0,1 (347 g),50 Minutes,10 Minutes,1 Hour,Dessert,...,29.1,7.7,93.0,536.5,125.0,3.3,54.4,12.3,60,Dinner
1,76133,Reuben and Swiss Casserole Bake,"[""1/2-1 lb corned beef, cooked and choppe...","[""Set oven to 350 degrees F."", ""Butter a 9 x 1...",4.0,1 (207 g),25 Minutes,15 Minutes,40 Minutes,Cheese,...,45.3,22.1,142.3,2074.2,33.9,5.9,7.1,31.0,40,Dinner
2,503816,Yam-Pecan Recipe,"[""3/4 cup unsalted butter, at room tempera...","[""Preheat oven to 350°F In a mixing bowl, usi...",8.0,1 (198 g),1 Hour,15 Minutes,1 Hour 15 Minutes,Quick Breads,...,53.2,21.7,192.5,664.9,112.8,4.1,63.3,10.7,75,Dinner
3,418749,Tropical Orange Layer Cake,"[""1 (18 ounce) pkge.orange cake mix"",""1 (3...","[""In a large mixing bowl, combine the first 6 ...",16.0,1 (191 g),30 Minutes,10 Minutes,40 Minutes,Dessert,...,30.5,17.5,62.4,361.5,74.6,3.5,61.8,6.2,40,Dinner
4,392934,Safe to Eat Raw Chocolate Chip Oreo Cookie &qu...,"[""1/2 cup butter, room temperature "",""1/2 ...","[""Cream butter and sugars together."", ""Blend i...",24.0,1 (26 g),,15 Minutes,15 Minutes,Dessert,...,5.9,3.3,10.5,77.8,16.8,0.5,10.3,1.1,15,Snack


## Modeling

In [3]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import pandas as pd

def estimate_daily_nutritional_needs(age, sex, weight, height, activity_level, goal):
    """
    Estimate daily nutritional needs (energy, macronutrients, fiber, sugar,
    cholesterol and sodium) from simple body metrics.

    ``sex``, ``activity_level`` and ``goal`` are matched case-insensitively.

    :param age: Age in years.
    :param sex: 'male' or 'female' (any value other than 'male' uses the
        female constants).
    :param weight: Weight in kilograms.
    :param height: Height in centimeters.
    :param activity_level: One of 'sedentary', 'lightly active',
        'moderately active', 'very active', 'extra active'.
    :param goal: One of 'light/moderate/extreme weight loss',
        'light/moderate/extreme muscle gain' or 'maintain weight'.
    :return: Dictionary of estimated daily needs. Keys, in insertion order:
        calories, proteinContent, fatContent, saturatedFatContent,
        carbohydrateContent, fiberContent, sugarContent (all grams except
        calories), cholesterolContent and sodiumContent (milligrams).
    :raises KeyError: If ``activity_level`` or ``goal`` is not a known option.
    """
    # Normalize the free-text options once so that every lookup and check
    # below sees the same (lower-case) value.
    is_male = sex.lower() == 'male'
    activity_key = activity_level.lower()
    goal_key = goal.lower()

    # Mifflin St Jeor Equation for BMR
    bmr = 10 * weight + 6.25 * height - 5 * age + (5 if is_male else -161)
    activity_factors = {
        "sedentary": 1.2, "lightly active": 1.375, "moderately active": 1.55,
        "very active": 1.725, "extra active": 1.9
    }
    maintenance_calories = bmr * activity_factors[activity_key]

    # Goal adjustment
    goal_adjustments = {
        "light weight loss": 0.9, "moderate weight loss": 0.8, "extreme weight loss": 0.75,
        "light muscle gain": 1.1, "moderate muscle gain": 1.2, "extreme muscle gain": 1.25,
        "maintain weight": 1.0
    }
    calories = maintenance_calories * goal_adjustments[goal_key]

    # Macronutrient distribution (shares of total calories). The muscle-gain
    # check uses the normalized goal so 'Light Muscle Gain' is handled the
    # same way as 'light muscle gain'.
    protein_pct = 0.3 if 'muscle gain' in goal_key else 0.25
    fat_pct = 0.25
    carbs_pct = 1 - protein_pct - fat_pct

    # 4 kcal per gram of protein/carbohydrate, 9 kcal per gram of fat
    protein = (calories * protein_pct) / 4
    fat = (calories * fat_pct) / 9
    carbs = (calories * carbs_pct) / 4

    # Saturated fat: 5% of total calories (lower end of the 5-6% range)
    saturated_fat_pct = 0.05
    saturated_fat = (calories * saturated_fat_pct) / 9  # 9 calories per gram

    # Fiber intake (grams per day) based on age and sex
    if age <= 50:
        fiber = 38 if is_male else 25
    else:
        fiber = 30 if is_male else 21

    # General daily limits
    sugar = calories * 0.10 / 4  # 10% of calories as sugar, in grams
    cholesterol = 300            # milligrams per day (general guideline)
    sodium = 2300                # milligrams per day (general guideline)

    return {
        "calories": calories,
        "proteinContent": protein,
        "fatContent": fat,
        "saturatedFatContent": saturated_fat,
        "carbohydrateContent": carbs,
        "fiberContent": fiber,
        "sugarContent": sugar,
        "cholesterolContent": cholesterol,
        "sodiumContent": sodium
    }

def recommend_meals(processed_data, user_nutrients, meal_type, nutritional_columns, k=5):
    """
    Recommend meals of one meal type that are closest to the user's needs.

    Recipes whose ``mealType`` equals ``meal_type`` are standardized on
    ``nutritional_columns`` and the nearest neighbours (Euclidean distance)
    to the equally scaled ``user_nutrients`` are returned.

    :param processed_data: DataFrame with a 'mealType' column and the
        nutritional columns.
    :param user_nutrients: A single row of target values (DataFrame or 2-D
        array-like) for ``nutritional_columns``.
    :param meal_type: Meal type to filter on, e.g. 'Breakfast'.
    :param nutritional_columns: Names of the feature columns.
    :param k: Maximum number of recipes to return.
    :return: Rows of ``processed_data``, nearest first. Fewer than ``k`` rows
        are returned when the meal type has fewer recipes, and an empty frame
        when it has none.
    """
    feature_columns = list(nutritional_columns)
    meal_data = processed_data[processed_data['mealType'] == meal_type]

    # Nothing to fit on: return the (empty) selection instead of letting the
    # scaler fail on zero samples.
    if meal_data.empty:
        return meal_data

    # Make sure the user's features are in the same column order that the
    # scaler is fitted on.
    if isinstance(user_nutrients, pd.DataFrame):
        user_nutrients = user_nutrients[feature_columns]

    # Fit the scaler on this meal type's features
    scaler = StandardScaler()
    scaled_meal_features = scaler.fit_transform(meal_data[feature_columns])

    # Scale user's nutritional needs using the same scaler
    user_features_scaled = scaler.transform(user_nutrients)

    # k-NN model; a query cannot ask for more neighbours than fitted samples
    n_neighbors = min(k, len(meal_data))
    knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='euclidean')
    knn.fit(scaled_meal_features)

    # Positions (within meal_data) of the nearest recipes for the single query row
    indices = knn.kneighbors(user_features_scaled, return_distance=False)
    return meal_data.iloc[indices[0]]

In [4]:
# Example usage
nutrients = estimate_daily_nutritional_needs(age=58, sex='male', weight=73, height=182,
                                             activity_level='sedentary', goal='light weight loss')

# Number of meals per day
meals_per_day = 3  # Example
nutritional_columns = ['calories', 'proteinContent', 'fatContent', 'saturatedFatContent',
                       'carbohydrateContent', 'fiberContent', 'sugarContent',
                       'cholesterolContent', 'sodiumContent']

# User's per-meal nutritional needs: the daily amount split evenly across meals.
# Values are looked up by column name so they always line up with
# nutritional_columns, regardless of the order of keys in `nutrients`.
user_per_meal_needs = [nutrients[column] / meals_per_day for column in nutritional_columns]
user_per_meal_needs_df = pd.DataFrame([user_per_meal_needs], columns=nutritional_columns)

# Meal plan configuration; any value other than 3 or 4 uses the five-slot plan.
meal_plans = {
    3: ['Breakfast', 'Lunch', 'Dinner'],
    4: ['Breakfast', 'Snack', 'Lunch', 'Dinner'],
}
five_slot_plan = ['Breakfast', 'Snack', 'Lunch', 'Snack', 'Dinner']
meal_types = meal_plans.get(meals_per_day, five_slot_plan)

# Get recommendations for each meal type.
# Results are keyed by meal type, so a type that appears twice in the plan
# (e.g. 'Snack') is stored once.
meal_recommendations = {}
for meal in meal_types:
    meal_recommendations[meal] = recommend_meals(processed_data, user_per_meal_needs_df, meal, nutritional_columns)

# Print or process the recommendations
for meal, recommendations in meal_recommendations.items():
    print(f"Recommendations for {meal}:")
    print(recommendations[['name', 'mealType']])
    print()


Recommendations for Breakfast:
                              name   mealType
265931           Arbroath Toasties  Breakfast
91841                Pizza Oatmeal  Breakfast
166397    Savoury Protein Pancakes  Breakfast
233253  Egg White Protein Pancakes  Breakfast
267879         Zone Easy Breakfast  Breakfast

Recommendations for Lunch:
                                      name mealType
400841                  Busy Girl's Supper    Lunch
341961  Hajar's Lamby Friday Food Couscous    Lunch
373124             Greek Chicken Casserole    Lunch
283524                 Mushy Pea Fishcakes    Lunch
70408                  Mas Macho Meatballs    Lunch

Recommendations for Dinner:
                                              name mealType
232826  Horseradish Spiced Fishcakes With Coleslaw   Dinner
146690                  Mom's Tangy Beef Casserole   Dinner
97597               Chicken Kabobs With Fettuccini   Dinner
167908                 Italian Chicken and Peppers   Dinner
143917     Chicken Stew 