In [1]:
import pandas as pd
import numpy as np
import requests
import time
import pickle
import os
import re
import json
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# API Configuration
API_KEY = "3f0fd610d9c74241b8bb44ceb0332e7e"  # Replace with your actual API key
BASE_URL = "https://api.spoonacular.com"

def fetch_recipes(n_recipes=10000):
    """
    Fetch recipe data from Spoonacular API

    Args:
        n_recipes: Number of recipes to fetch (minimum 10000)

    Returns:
        DataFrame with recipe data
    """
    print(f"Fetching {n_recipes} recipes from Spoonacular API...")

    all_recipes = []
    offset = 0
    batch_size = 100  # Maximum allowed by Spoonacular in one request

    # Calculate how many API calls we need to make
    n_calls = (n_recipes + batch_size - 1) // batch_size

    for i in range(n_calls):
        if i % 10 == 0:
            print(f"Progress: {i * batch_size}/{n_recipes} recipes fetched")

        # Endpoint to search recipes
        endpoint = f"/recipes/complexSearch"
        params = {
            "apiKey": API_KEY,
            "number": min(batch_size, n_recipes - len(all_recipes)),
            "offset": offset,
            "addRecipeNutrition": True,  # Get nutrition info
            "fillIngredients": True,     # Get detailed ingredient info
            "instructionsRequired": True # Ensure we get recipes with instructions
        }

        try:
            response = requests.get(BASE_URL + endpoint, params=params)
            response.raise_for_status()  # Raise exception for 4XX/5XX responses
            data = response.json()

            # Extract the recipes from the response
            recipes = data.get('results', [])

            if not recipes:
                print(f"No more recipes returned at offset {offset}")
                break

            all_recipes.extend(recipes)
            offset += len(recipes)

            # Rate limiting - Spoonacular has strict rate limits
            # Basic plan: 150 requests per day, ~1 request per minute
            time.sleep(1.5)  # Sleep to avoid hitting rate limits

        except Exception as e:
            print(f"Error fetching recipes: {e}")
            # Wait longer if we hit an error (could be rate limiting)
            time.sleep(5)

    print(f"Successfully fetched {len(all_recipes)} recipes")

    # Convert to DataFrame
    if not all_recipes:
        raise ValueError("No recipes were fetched. Check your API key and rate limits.")

    return process_recipe_data(all_recipes)

def process_recipe_data(recipes):
    """
    Process the raw recipe data from Spoonacular API

    Args:
        recipes: List of recipe dictionaries from API

    Returns:
        Processed DataFrame
    """
    # Initialize lists to store the extracted data
    processed_data = []

    for recipe in recipes:
        # Extract basic recipe info
        recipe_id = recipe.get('id')
        title = recipe.get('title')

        # Extract nutrition info
        nutrition = recipe.get('nutrition', {})
        nutrients = {n['name']: n['amount'] for n in nutrition.get('nutrients', [])}

        # Create a record with all the data we need
        record = {
            'RecipeId': recipe_id,
            'Name': title,
            'SourceUrl': recipe.get('sourceUrl', ''),
            'Image': recipe.get('image', ''),
            'ReadyInMinutes': recipe.get('readyInMinutes', 0),
            'Servings': recipe.get('servings', 0),
            'Vegetarian': recipe.get('vegetarian', False),
            'Vegan': recipe.get('vegan', False),
            'GlutenFree': recipe.get('glutenFree', False),
            'DairyFree': recipe.get('dairyFree', False),
            'Healthy': recipe.get('veryHealthy', False),
            'Cheap': recipe.get('cheap', False),
            'Popular': recipe.get('veryPopular', False),
            'CookingMinutes': recipe.get('cookingMinutes', 0),
            'PreparationMinutes': recipe.get('preparationMinutes', 0),
            'Cuisines': ','.join(recipe.get('cuisines', [])),
            'DishTypes': ','.join(recipe.get('dishTypes', [])),
            'Diets': ','.join(recipe.get('diets', [])),
            'Occasions': ','.join(recipe.get('occasions', [])),
            'IngredientCount': len(recipe.get('extendedIngredients', [])),
            'Instructions': recipe.get('instructions', ''),
            'HealthScore': recipe.get('healthScore', 0),
            'PricePerServing': recipe.get('pricePerServing', 0),

            # Nutrition data - our target variables
            'Calories': nutrients.get('Calories', 0),
            'FatContent': nutrients.get('Fat', 0),
            'SaturatedFatContent': nutrients.get('Saturated Fat', 0),
            'CarbohydrateContent': nutrients.get('Carbohydrates', 0),
            'SugarContent': nutrients.get('Sugar', 0),
            'FiberContent': nutrients.get('Fiber', 0),
            'ProteinContent': nutrients.get('Protein', 0),
            'SodiumContent': nutrients.get('Sodium', 0),
            'CholesterolContent': nutrients.get('Cholesterol', 0)
        }

        # Add ingredient information
        ingredients = recipe.get('extendedIngredients', [])
        ingredient_names = []
        ingredient_amounts = []
        ingredient_units = []

        for ing in ingredients:
            ingredient_names.append(ing.get('name', ''))
            ingredient_amounts.append(str(ing.get('amount', '')))
            ingredient_units.append(ing.get('unit', ''))

        record['RecipeIngredientParts'] = ','.join(ingredient_names)
        record['RecipeIngredientQuantities'] = ','.join(ingredient_amounts)
        record['RecipeIngredientUnits'] = ','.join(ingredient_units)

        processed_data.append(record)

    # Convert to DataFrame
    df = pd.DataFrame(processed_data)

    # Save raw data for reference
    with open('raw_recipe_data.pkl', 'wb') as f:
        pickle.dump(recipes, f)

    return df

def preprocess_data(df):
    """
    Preprocess the recipe data for model training

    Args:
        df: DataFrame with recipe data from Spoonacular

    Returns:
        Preprocessed DataFrame
    """
    print("Preprocessing recipe data...")

    # Create a copy of display data
    display_df = df[['RecipeId', 'Name', 'Image', 'SourceUrl', 'Instructions', 'Servings']].copy()

    # Save display data for later use
    with open('display_data.pkl', 'wb') as f:
        pickle.dump(display_df, f)

    # Handle missing values
    numeric_cols = ['ReadyInMinutes', 'Servings', 'CookingMinutes', 'PreparationMinutes',
                    'IngredientCount', 'HealthScore', 'PricePerServing', 'Calories',
                    'FatContent', 'SaturatedFatContent', 'CarbohydrateContent',
                    'SugarContent', 'FiberContent', 'ProteinContent',
                    'SodiumContent', 'CholesterolContent']

    # Impute missing values for numeric columns
    imputer = SimpleImputer(strategy='median')
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    # Fill missing values for categorical columns
    categorical_cols = ['Cuisines', 'DishTypes', 'Diets', 'Occasions',
                        'RecipeIngredientParts', 'RecipeIngredientQuantities',
                        'RecipeIngredientUnits']

    for col in categorical_cols:
        df[col] = df[col].fillna('')

    # Feature engineering
    df = feature_engineering(df)

    return df

def feature_engineering(df):
    """
    Create new features from the recipe data

    Args:
        df: DataFrame with recipe data

    Returns:
        DataFrame with engineered features
    """
    print("Performing feature engineering...")

    # Create binary features from text columns
    # Extract cuisine types
    cuisines = set()
    for cuisine_list in df['Cuisines'].str.split(',').dropna():
        cuisines.update([c.strip() for c in cuisine_list if c.strip()])

    # Add cuisine binary features (for top 10 most common cuisines)
    if cuisines:
        # Count occurrences of each cuisine
        cuisine_counts = {}
        for cuisine_list in df['Cuisines'].str.split(',').dropna():
            for cuisine in cuisine_list:
                cuisine = cuisine.strip()
                if cuisine:
                    cuisine_counts[cuisine] = cuisine_counts.get(cuisine, 0) + 1

        # Get top 10 cuisines
        top_cuisines = sorted(cuisine_counts.items(), key=lambda x: x[1], reverse=True)[:10]
        top_cuisines = [c[0] for c in top_cuisines]

        # Create binary features
        for cuisine in top_cuisines:
            df[f'Cuisine_{cuisine}'] = df['Cuisines'].str.contains(cuisine, case=False, regex=False).astype(int)

    # Similar approach for dish types
    dish_types = set()
    for dish_list in df['DishTypes'].str.split(',').dropna():
        dish_types.update([d.strip() for d in dish_list if d.strip()])

    if dish_types:
        # Count occurrences of each dish type
        dish_counts = {}
        for dish_list in df['DishTypes'].str.split(',').dropna():
            for dish in dish_list:
                dish = dish.strip()
                if dish:
                    dish_counts[dish] = dish_counts.get(dish, 0) + 1

        # Get top 10 dish types
        top_dishes = sorted(dish_counts.items(), key=lambda x: x[1], reverse=True)[:10]
        top_dishes = [d[0] for d in top_dishes]

        # Create binary features
        for dish in top_dishes:
            df[f'DishType_{dish}'] = df['DishTypes'].str.contains(dish, case=False, regex=False).astype(int)

    # Simplify dietary restrictions to binary features
    dietary_features = ['Vegetarian', 'Vegan', 'GlutenFree', 'DairyFree', 'Healthy', 'Cheap', 'Popular']
    for col in dietary_features:
        df[col] = df[col].astype(int)

    # Create time-related features
    df['TotalTime'] = df['CookingMinutes'] + df['PreparationMinutes']
    df['PrepCookRatio'] = df['PreparationMinutes'] / (df['CookingMinutes'] + 1)  # Add 1 to avoid division by zero

    # Create nutrition-related features
    df['CaloriesPerServing'] = df['Calories'] / df['Servings']
    df['ProteinRatio'] = df['ProteinContent'] / (df['Calories'] + 1) * 100
    df['FatRatio'] = df['FatContent'] / (df['Calories'] + 1) * 100
    df['CarbRatio'] = df['CarbohydrateContent'] / (df['Calories'] + 1) * 100

    # Create ingredient complexity features
    df['IngredientPerMinute'] = df['IngredientCount'] / (df['TotalTime'] + 1)

    # Text length features
    df['InstructionLength'] = df['Instructions'].str.len()
    df['IngredientLength'] = df['RecipeIngredientParts'].str.len()

    # Ingredient diversity (number of unique ingredients)
    def count_unique_ingredients(ingredient_str):
        if not ingredient_str:
            return 0
        return len(set(ingredient_str.split(',')))

    df['UniqueIngredientCount'] = df['RecipeIngredientParts'].apply(count_unique_ingredients)

    # Extract top ingredients and create binary features
    all_ingredients = []
    for ingredients in df['RecipeIngredientParts'].str.split(',').dropna():
        all_ingredients.extend([ing.strip() for ing in ingredients if ing.strip()])

    from collections import Counter
    ingredient_counter = Counter(all_ingredients)
    top_ingredients = [item[0] for item in ingredient_counter.most_common(20)]

    for ingredient in top_ingredients:
        df[f'Has_{ingredient}'] = df['RecipeIngredientParts'].str.contains(ingredient, case=False, regex=False).astype(int)

    # Convert remaining text columns to numeric using label encoding
    text_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in ['RecipeId', 'Name', 'Instructions', 'Image', 'SourceUrl']]

    for col in text_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

    # Remove unnecessary columns for modeling
    columns_to_drop = ['Name', 'Image', 'SourceUrl', 'Instructions', 'Cuisines', 'DishTypes',
                       'Diets', 'Occasions', 'RecipeIngredientParts', 'RecipeIngredientQuantities',
                       'RecipeIngredientUnits']

    modeling_df = df.drop(columns_to_drop, axis=1, errors='ignore')

    return modeling_df

def train_recipe_recommendation_model(df):
    """
    Train a LightGBM model to recommend recipes based on nutritional constraints

    Args:
        df: Preprocessed DataFrame with recipe data

    Returns:
        Trained models and evaluation metrics
    """
    print("Training models...")

    # Create nutrition targets for the model (multi-target regression)
    targets = ['ProteinContent', 'CarbohydrateContent', 'FiberContent', 'FatContent']

    # Ensure all targets are available
    for target in targets:
        if target not in df.columns:
            print(f"Target column {target} not found in dataset!")
            return

    # Keep recipe IDs for later retrieval
    recipe_ids = df['RecipeId'].values

    # Remove ID and target columns from features
    features = df.drop(['RecipeId'] + targets, axis=1)

    # Fill any remaining NaN values with 0 for stability
    features = features.fillna(0)

    # Make sure target values are numeric
    for target in targets:
        df[target] = pd.to_numeric(df[target], errors='coerce')

    # Fill NaN values in target columns after conversion
    df[targets] = df[targets].fillna(df[targets].mean())

    # Scale the nutritional targets to improve model performance
    scaler = MinMaxScaler()
    targets_scaled = scaler.fit_transform(df[targets])

    # Save the scaler for prediction
    with open('nutrition_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    # Train models for each nutritional target
    models = {}
    metrics = {target: {} for target in targets}

    # Create bins for classification metrics
    target_bins = {}
    for i, target in enumerate(targets):
        target_vals = df[target].values
        bins = np.linspace(target_vals.min(), target_vals.max(), 6)  # 5 bins
        target_bins[target] = bins

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets_scaled, test_size=0.2, random_state=42
    )

    # Save feature columns for prediction
    with open('feature_columns.pkl', 'wb') as f:
        pickle.dump(features.columns.tolist(), f)

    # Train a separate model for each target
    for i, target in enumerate(targets):
        print(f"Training model for {target}...")

        # Get the target values
        y_train_target = y_train[:, i]
        y_test_target = y_test[:, i]

        # Define the model parameters
        # Using a more robust parameter set for better performance
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 64,  # Higher for better accuracy
            'learning_rate': 0.05,  # Lower for better accuracy
            'feature_fraction': 0.9,  # Higher for better generalization
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'lambda_l1': 0.1,  # L1 regularization
            'lambda_l2': 0.1,  # L2 regularization
            'min_child_samples': 20  # Prevent overfitting
        }

        # Create dataset for LightGBM
        train_data = lgb.Dataset(X_train, label=y_train_target)
        valid_data = lgb.Dataset(X_test, label=y_test_target, reference=train_data)

        # Train the model with early stopping
        model = lgb.train(
            lgb_params,
            train_data,
            num_boost_round=1000,  # More rounds for better learning
            valid_sets=[valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),  # More patient early stopping
                lgb.log_evaluation(period=100)
            ]
        )

        # Save the model
        models[target] = model
        with open(f'model_{target}.pkl', 'wb') as f:
            pickle.dump(model, f)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate regression metrics
        rmse = np.sqrt(mean_squared_error(y_test_target, y_pred))
        mae = mean_absolute_error(y_test_target, y_pred)
        r2 = r2_score(y_test_target, y_pred)

        print(f"{target} model evaluation:")
        print(f"  RMSE: {rmse:.5f}, MAE: {mae:.5f}, R² Score: {r2:.5f}")

        # For evaluation metrics, convert regression values to classes using bins
        y_test_target_reshaped = y_test_target.reshape(-1, 1)
        y_test_bins = np.digitize(scaler.inverse_transform(np.concatenate([y_test_target_reshaped] * 4, axis=1))[:, i],
                             target_bins[target]) - 1

        y_pred_reshaped = y_pred.reshape(-1, 1)
        temp = np.zeros((len(y_pred), 4))
        temp[:, i] = y_pred
        y_pred_bins = np.digitize(scaler.inverse_transform(temp)[:, i], target_bins[target]) - 1

        # Calculate classification metrics
        metrics[target]['accuracy'] = accuracy_score(y_test_bins, y_pred_bins)
        metrics[target]['precision'] = precision_score(y_test_bins, y_pred_bins, average='weighted')
        metrics[target]['recall'] = recall_score(y_test_bins, y_pred_bins, average='weighted')
        metrics[target]['f1'] = f1_score(y_test_bins, y_pred_bins, average='weighted')

        # Print classification scores
        print(f"{target} model evaluation:")
        for metric, score in metrics[target].items():
            print(f"  {metric}: {score:.4f}")

    # Save recipe IDs and target bins
    with open('recipe_ids.pkl', 'wb') as f:
        pickle.dump(recipe_ids, f)

    with open('target_bins.pkl', 'wb') as f:
        pickle.dump(target_bins, f)

    with open('target_columns.pkl', 'wb') as f:
        pickle.dump(targets, f)

    with open('model_metrics.pkl', 'wb') as f:
        pickle.dump(metrics, f)

    print("Model training complete. Models saved to disk.")

    return models, metrics

def predict_recipes(protein_constraint, carbs_constraint, fiber_constraint, fat_constraint, limit=10):
    """
    Predict recipes that match the given nutritional constraints

    Args:
        protein_constraint: Target protein content (g)
        carbs_constraint: Target carbohydrate content (g)
        fiber_constraint: Target fiber content (g)
        fat_constraint: Target fat content (g)
        limit: Maximum number of recipes to return

    Returns:
        DataFrame with recommended recipes
    """
    # Load the models and other necessary data
    models = {}
    with open('target_columns.pkl', 'rb') as f:
        targets = pickle.load(f)

    for target in targets:
        with open(f'model_{target}.pkl', 'rb') as f:
            models[target] = pickle.load(f)

    with open('recipe_ids.pkl', 'rb') as f:
        recipe_ids = pickle.load(f)

    with open('nutrition_scaler.pkl', 'rb') as f:
        nutrition_scaler = pickle.load(f)

    with open('feature_columns.pkl', 'rb') as f:
        feature_columns = pickle.load(f)

    # Load the preprocessed data
    df = pd.read_csv("preprocessed_recipes.csv")

    # Extract features
    features = df[feature_columns].fillna(0)

    # Make predictions for each recipe
    predictions = np.zeros((len(recipe_ids), len(targets)))
    for i, target in enumerate(targets):
        preds = models[target].predict(features)
        predictions[:, i] = preds

    # Convert predictions back to original scale
    predictions_original = nutrition_scaler.inverse_transform(predictions)

    # Create a DataFrame with recipe IDs and predictions
    results_df = pd.DataFrame({
        'RecipeId': recipe_ids,
        'Predicted_Protein': predictions_original[:, 0],
        'Predicted_Carbs': predictions_original[:, 1],
        'Predicted_Fiber': predictions_original[:, 2],
        'Predicted_Fat': predictions_original[:, 3]
    })

    # Initial filtering with wider ranges (±20%)
    filtered_recipes = results_df[
        (results_df['Predicted_Protein'] >= protein_constraint * 0.8) &
        (results_df['Predicted_Protein'] <= protein_constraint * 1.2) &
        (results_df['Predicted_Carbs'] >= carbs_constraint * 0.8) &
        (results_df['Predicted_Carbs'] <= carbs_constraint * 1.2) &
        (results_df['Predicted_Fiber'] >= fiber_constraint * 0.8) &
        (results_df['Predicted_Fiber'] <= fiber_constraint * 1.2) &
        (results_df['Predicted_Fat'] >= fat_constraint * 0.8) &
        (results_df['Predicted_Fat'] <= fat_constraint * 1.2)
    ]

    # Calculate distance
    filtered_recipes['Distance'] = (
        ((filtered_recipes['Predicted_Protein'] - protein_constraint) / max(protein_constraint, 1)) ** 2 +
        ((filtered_recipes['Predicted_Carbs'] - carbs_constraint) / max(carbs_constraint, 1)) ** 2 +
        ((filtered_recipes['Predicted_Fiber'] - fiber_constraint) / max(fiber_constraint, 1)) ** 2 +
        ((filtered_recipes['Predicted_Fat'] - fat_constraint) / max(fat_constraint, 1)) ** 2
    )

    # Sort by distance
    filtered_recipes = filtered_recipes.sort_values('Distance')

    # If we don't have enough recipes, relax constraints
    if len(filtered_recipes) < 5:
        filtered_recipes = results_df
        filtered_recipes['Distance'] = (
            ((filtered_recipes['Predicted_Protein'] - protein_constraint) / max(protein_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Carbs'] - carbs_constraint) / max(carbs_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Fiber'] - fiber_constraint) / max(fiber_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Fat'] - fat_constraint) / max(fat_constraint, 1)) ** 2
        )
        filtered_recipes = filtered_recipes.sort_values('Distance')

    # Get the top N recipes
    top_recipes = filtered_recipes.head(limit)

    # Load display data to get recipe details
    with open('display_data.pkl', 'rb') as f:
        display_data = pickle.load(f)

    # Get recipe details
    recipe_details = display_data[display_data['RecipeId'].isin(top_recipes['RecipeId'].tolist())]
    final_results = pd.merge(recipe_details, top_recipes, on='RecipeId', how='inner')

    return final_results

def main():
    """
    Main function to execute the entire process
    """
    # Check if we already have data from Spoonacular
    if os.path.exists("preprocessed_recipes.csv"):
        print("Found existing preprocessed data. Loading...")
        df = pd.read_csv("preprocessed_recipes.csv")
    else:
        # Fetch recipes from Spoonacular API
        raw_df = fetch_recipes(n_recipes=10000)

        # Preprocess the data
        df = preprocess_data(raw_df)

        # Save the preprocessed data
        df.to_csv("preprocessed_recipes.csv", index=False)
        print("Preprocessing complete. Data saved to preprocessed_recipes.csv")

    # Train the models
    models, metrics = train_recipe_recommendation_model(df)

    # Example of using the trained model
    # Get recipes with approximately: 30g protein, 50g carbs, 8g fiber, 15g fat
    print("\nExample recommendations:")
    recommendations = predict_recipes(30, 50, 8, 15, limit=5)

    for idx, row in recommendations.iterrows():
        print(f"Recipe: {row['Name']}")
        print(f"Nutritional Content (predicted):")
        print(f"  Protein: {row['Predicted_Protein']:.1f}g")
        print(f"  Carbs: {row['Predicted_Carbs']:.1f}g")
        print(f"  Fiber: {row['Predicted_Fiber']:.1f}g")
        print(f"  Fat: {row['Predicted_Fat']:.1f}g")
        print(f"Source: {row['SourceUrl']}")
        print("-" * 50)

if __name__ == "__main__":
    main()


Fetching 10000 recipes from Spoonacular API...
Progress: 0/10000 recipes fetched
Error fetching recipes: Response ended prematurely
Progress: 1000/10000 recipes fetched
Error fetching recipes: 402 Client Error: Payment Required for url: https://api.spoonacular.com/recipes/complexSearch?apiKey=3f0fd610d9c74241b8bb44ceb0332e7e&number=100&offset=1200&addRecipeNutrition=True&fillIngredients=True&instructionsRequired=True
Error fetching recipes: 402 Client Error: Payment Required for url: https://api.spoonacular.com/recipes/complexSearch?apiKey=3f0fd610d9c74241b8bb44ceb0332e7e&number=100&offset=1200&addRecipeNutrition=True&fillIngredients=True&instructionsRequired=True
Error fetching recipes: 402 Client Error: Payment Required for url: https://api.spoonacular.com/recipes/complexSearch?apiKey=3f0fd610d9c74241b8bb44ceb0332e7e&number=100&offset=1200&addRecipeNutrition=True&fillIngredients=True&instructionsRequired=True
Error fetching recipes: 402 Client Error: Payment Required for url: https:/