In [10]:
import pandas as pd
import csv
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re
import pickle
import os
from multiprocessing import Pool, cpu_count
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def load_and_preprocess_data(data_path):
    """
    Load and preprocess the recipe dataset with optimizations
    """
    print("Loading data...")
    # Use low_memory=False and only parse necessary columns to speed up loading
    column_dtypes = {
        'RecipeId': 'str',
        'Name': 'str',
        'AuthorName': 'str',
        'Description': 'str',
        'Images': 'str',
        'RecipeYield': 'str',
        'RecipeInstructions': 'str',
    }

    # First, read just the columns we need for display to save memory
    display_columns = ['RecipeId', 'Name', 'AuthorName', 'Description', 'Images',
                      'RecipeYield', 'RecipeInstructions']

    display_df = pd.read_csv(data_path, usecols=display_columns, dtype=column_dtypes,
                            low_memory=False)

    # Save the display dataframe for later use
    pickle.dump(display_df, open('display_data.pkl', 'wb'))
    del display_df  # Free up memory

    # Now read only the model columns
    model_columns = ['RecipeId', 'CookTime', 'PrepTime', 'TotalTime', 'DatePublished',
                    'RecipeCategory', 'Keywords', 'RecipeIngredientQuantities',
                    'RecipeIngredientParts', 'AggregatedRating', 'ReviewCount',
                    'Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent',
                    'SodiumContent', 'CarbohydrateContent', 'FiberContent',
                    'SugarContent', 'ProteinContent', 'RecipeServings']

    # Use chunks to process large files more efficiently
    chunk_size = 10000
    chunks = []

    for chunk in pd.read_csv(data_path, usecols=model_columns, chunksize=chunk_size, low_memory=False, on_bad_lines="skip", quoting=csv.QUOTE_NONE):
        # Process each chunk
        chunk = preprocess_chunk(chunk)
        chunks.append(chunk)

    # Combine chunks
    df = pd.concat(chunks, ignore_index=True)

    # Feature engineering - only do this once on the combined data
    df = feature_engineering(df)

    return df

def preprocess_chunk(df):
    """
    Process a chunk of the dataframe
    """
    # Convert time columns efficiently
    for col in ['CookTime', 'PrepTime', 'TotalTime']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: convert_time_to_minutes(x) if isinstance(x, str) else x)

    # Handle missing values more efficiently using SimpleImputer instead of KNN for speed
    df = handle_missing_values(df)

    return df

def handle_missing_values(df):
    """
    Handle missing values efficiently
    """
    # Numerical columns to be imputed
    num_cols = ['CookTime', 'PrepTime', 'TotalTime', 'AggregatedRating',
                'ReviewCount', 'Calories', 'FatContent', 'SaturatedFatContent',
                'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                'FiberContent', 'SugarContent', 'ProteinContent', 'RecipeServings']

    # Check which columns exist in the dataframe
    num_cols = [col for col in num_cols if col in df.columns and pd.api.types.is_numeric_dtype(df[col])]

    # Use median imputation instead of KNN for speed
    if num_cols:
        imputer = SimpleImputer(strategy='median')
        df_num = df[num_cols].copy()

        # Only apply imputation if there are missing values
        if df_num.isna().any().any():
            df_num_imputed = pd.DataFrame(imputer.fit_transform(df_num), columns=num_cols)
            df[num_cols] = df_num_imputed

    # Handle categorical columns with faster operations
    for col, default in [
        ('RecipeCategory', 'Unknown'),
        ('Keywords', ''),
        ('RecipeIngredientQuantities', ''),
        ('RecipeIngredientParts', '')
    ]:
        if col in df.columns and df[col].isna().any():
            df[col] = df[col].fillna(default)

    # Handle DatePublished more efficiently
    if 'DatePublished' in df.columns and df['DatePublished'].isna().any():
        # Use a default date instead of calculating mode for speed
        df['DatePublished'] = df['DatePublished'].fillna('2000-01-01')

        # Only process if column exists and extract only the year and month
        # This is faster than full datetime parsing
        try:
            df['Year'] = pd.to_datetime(df['DatePublished'], errors='coerce').dt.year
            df['Month'] = pd.to_datetime(df['DatePublished'], errors='coerce').dt.month

            # Fill NA values with medians
            df['Year'] = df['Year'].fillna(df['Year'].median() if not df['Year'].empty else 2000)
            df['Month'] = df['Month'].fillna(df['Month'].median() if not df['Month'].empty else 6)
        except:
            # If datetime conversion fails, use default values
            df['Year'] = 2000
            df['Month'] = 6

    return df

def convert_time_to_minutes(time_str):
    """
    Optimized function to convert ISO 8601 duration format to minutes
    """
    if not isinstance(time_str, str):
        return np.nan

    # Fast pattern matching
    total_minutes = 0

    # Extract hours (look for pattern like "1H")
    h_match = re.search(r'(\d+)H', time_str)
    if h_match:
        total_minutes += int(h_match.group(1)) * 60

    # Extract minutes (look for pattern like "30M")
    m_match = re.search(r'(\d+)M', time_str)
    if m_match:
        total_minutes += int(m_match.group(1))

    return total_minutes

# Modify your feature_engineering function to preserve the target columns
def feature_engineering(df):
    """
    Create new features efficiently
    """
    print("Performing feature engineering...")

    # ... rest of your function stays the same ...

    # Apply PCA only if we really need to reduce dimensionality
    if df.shape[1] > 25:  # Only apply if we have many features
        print("Applying PCA...")

        # Define target columns that should be preserved
        target_cols = ['ProteinContent', 'CarbohydrateContent', 'FiberContent', 'FatContent']

        # Get numeric columns that aren't target columns
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
        feature_cols = [col for col in numeric_cols if col not in target_cols and col != 'RecipeId']

        if len(feature_cols) > 10:
            # Standardize the data
            scaler = StandardScaler()
            df_numeric = df[feature_cols]
            df_numeric_scaled = scaler.fit_transform(df_numeric)

            imputer = SimpleImputer(strategy='median')
            df_numeric_scaled = imputer.fit_transform(df_numeric_scaled)

            # Apply PCA with fewer components for speed
            pca = PCA(n_components=min(5, len(feature_cols)))
            pca_result = pca.fit_transform(df_numeric_scaled)

            # Create PCA dataframe
            pca_df = pd.DataFrame(
                data=pca_result,
                columns=[f'PCA_{i+1}' for i in range(pca_result.shape[1])]
            )

            # Save the PCA object and scaler for later use
            pickle.dump(pca, open('pca_model.pkl', 'wb'))
            pickle.dump(scaler, open('scaler_model.pkl', 'wb'))
            pickle.dump(feature_cols, open('numeric_cols.pkl', 'wb'))

            # Replace original numeric data with PCA components but KEEP target columns
            df = df.drop(feature_cols, axis=1)
            df = pd.concat([df, pca_df], axis=1)

    return df
if __name__ == "__main__":
    data_path = "recipes.csv"  # Update with your actual file path
    processed_df = load_and_preprocess_data(data_path)

    # Save the preprocessed data for model training
    processed_df.to_csv("preprocessed_recipes.csv", index=False)
    print(f"Preprocessing complete. Data saved to preprocessed_recipes.csv")

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

def train_recipe_recommendation_model():
    """
    Train a LightGBM model to recommend recipes based on nutritional constraints
    with optimizations for speed and performance
    """
    print("Loading preprocessed data...")
    # Load data in chunks if it's large
    try:
        df = pd.read_csv("preprocessed_recipes.csv", low_memory=False)
    except:
        print("Error: Could not find preprocessed data. Please run data preprocessing first.")
        return

    # Create nutrition targets for the model (multi-target regression)
    targets = ['ProteinContent', 'CarbohydrateContent', 'FiberContent', 'FatContent']

    # Ensure all targets are available
    for target in targets:
        if target not in df.columns:
            print(f"Target column {target} not found in dataset!")
            return

    # Keep recipe IDs for later retrieval
    recipe_ids = df['RecipeId'].values

    # Remove ID and target columns from features
    features = df.drop(['RecipeId'] + targets, axis=1)

    for col in features.select_dtypes(include=['object']).columns:
          le = LabelEncoder()
          features[col] = le.fit_transform(features[col].astype(str)) # Convert to string to handle mixed types

    # Fill any remaining NaN values with 0 for stability
    features = features.fillna(0)
    for target in targets:
        df[target] = pd.to_numeric(df[target], errors='coerce')

    # Fill NaN values in target columns after conversion
    df[targets] = df[targets].fillna(df[targets].mean())
    # Scale the nutritional targets to improve model performance
    scaler = MinMaxScaler()
    targets_scaled = scaler.fit_transform(df[targets])

    # Save the scaler for prediction
    pickle.dump(scaler, open('nutrition_scaler.pkl', 'wb'))

    # Train models for each nutritional target
    print("Training models...")
    models = {}
    metrics = {target: {} for target in targets}

    # Convert regression to classification for evaluation (simplified binning)
    # Create only 5 bins instead of 10 for faster processing
    target_bins = {}
    for i, target in enumerate(targets):
        target_vals = df[target].values
        bins = np.linspace(target_vals.min(), target_vals.max(), 6)  # 5 bins instead of 10
        target_bins[target] = bins

    # Single train/test split instead of full cross-validation for speed
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets_scaled, test_size=0.2, random_state=42
    )

    # Train a separate model for each target
    for i, target in enumerate(targets):
        print(f"Training model for {target}...")

        # Get the target values
        y_train_target = y_train[:, i]
        y_test_target = y_test[:, i]

        # Define the model with target-specific parameters
        # Use faster training settings
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.1,  # Higher learning rate for faster training
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1
        }

        # Create dataset for LightGBM
        train_data = lgb.Dataset(X_train, label=y_train_target)
        valid_data = lgb.Dataset(X_test, label=y_test_target, reference=train_data)

        # Train the model with fewer rounds and early stopping
        model = lgb.train(
            lgb_params,
            train_data,
            num_boost_round=500,  # Reduced from 1000
            valid_sets=[valid_data],
             callbacks=[lgb.early_stopping(stopping_rounds=30),
               lgb.log_evaluation(period=100)]
        )

        # Save the model
        models[target] = model

        # Make predictions
        y_pred = model.predict(X_test)

        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
        rmse = np.sqrt(mean_squared_error(y_test_target, y_pred))
        mae = mean_absolute_error(y_test_target, y_pred)
        r2 = r2_score(y_test_target, y_pred)

        print(f"{target} model evaluation:")
        print(f"  RMSE: {rmse:.5f}, MAE: {mae:.5f}, R² Score: {r2:.5f}")

        # For evaluation metrics, convert regression values to classes using bins
        y_test_target_reshaped = y_test_target.reshape(-1, 1)
        y_test_bins = np.digitize(scaler.inverse_transform(np.repeat(y_test_target_reshaped, 4, axis=1))[:, 0],
                                target_bins[target]) - 1

        # Similarly, reshape y_pred before inverse_transform
        y_pred_reshaped = y_pred.reshape(-1, 1)
        y_pred_bins = np.digitize(scaler.inverse_transform(np.repeat(y_pred_reshaped, 4, axis=1))[:, 0],
                                target_bins[target]) - 1

        # Calculate metrics (using binned values for classification metrics)
        metrics[target]['accuracy'] = accuracy_score(y_test_bins, y_pred_bins)
        metrics[target]['precision'] = precision_score(y_test_bins, y_pred_bins, average='weighted')
        metrics[target]['recall'] = recall_score(y_test_bins, y_pred_bins, average='weighted')
        metrics[target]['f1'] = f1_score(y_test_bins, y_pred_bins, average='weighted')

        # Print scores
        print(f"{target} model evaluation:")
        for metric, score in metrics[target].items():
            print(f"  {metric}: {score:.4f}")

    # Save the models and recipe IDs
    for target, model in models.items():
        pickle.dump(model, open(f'model_{target}.pkl', 'wb'))

    # Save recipe IDs and target bins
    pickle.dump(recipe_ids, open('recipe_ids.pkl', 'wb'))
    pickle.dump(target_bins, open('target_bins.pkl', 'wb'))
    pickle.dump(targets, open('target_columns.pkl', 'wb'))
    pickle.dump(metrics, open('model_metrics.pkl', 'wb'))

    print("Model training complete. Models saved to disk.")

    # Save feature list for prediction
    pickle.dump(features.columns.tolist(), open('feature_columns.pkl', 'wb'))

def predict_recipes(protein_constraint, carbs_constraint, fiber_constraint, fat_constraint, limit=10):
    """
    Predict recipes that match the given nutritional constraints
    Optimized for speed
    """
    # Load the models and other necessary data
    models = {}
    targets = pickle.load(open('target_columns.pkl', 'rb'))
    for target in targets:
        models[target] = pickle.load(open(f'model_{target}.pkl', 'rb'))

    recipe_ids = pickle.load(open('recipe_ids.pkl', 'rb'))
    nutrition_scaler = pickle.load(open('nutrition_scaler.pkl', 'rb'))

    # Load the preprocessed data to get the features
    df = pd.read_csv("preprocessed_recipes.csv")

    # Extract the features - fill NaN with 0 for prediction stability
    features = df.drop(['RecipeId'] + targets, axis=1).fillna(0)

    # Get the feature columns from saved list if available
    try:
        feature_columns = pickle.load(open('feature_columns.pkl', 'rb'))
        features = features[feature_columns]
    except:
        # If feature columns aren't available, use what we have
        pass

    # Make predictions for each recipe - vectorized approach
    predictions = np.zeros((len(recipe_ids), len(targets)))
    for i, target in enumerate(targets):
        preds = models[target].predict(features)
        predictions[:, i] = preds

    # Convert predictions back to original scale
    predictions_original = nutrition_scaler.inverse_transform(predictions)

    # Create a DataFrame with recipe IDs and predictions - optimized creation
    results_df = pd.DataFrame({
        'RecipeId': recipe_ids,
        'Predicted_Protein': predictions_original[:, 0],
        'Predicted_Carbs': predictions_original[:, 1],
        'Predicted_Fiber': predictions_original[:, 2],
        'Predicted_Fat': predictions_original[:, 3]
    })

    # Make filtering more efficient by:
    # 1. Setting wider ranges for the first filter to get enough recipes
    # 2. Then sorting by distance and taking the top N

    # Initial filtering with wider ranges (±20%)
    filtered_recipes = results_df[
        (results_df['Predicted_Protein'] >= protein_constraint * 0.8) &
        (results_df['Predicted_Protein'] <= protein_constraint * 1.2) &
        (results_df['Predicted_Carbs'] >= carbs_constraint * 0.8) &
        (results_df['Predicted_Carbs'] <= carbs_constraint * 1.2) &
        (results_df['Predicted_Fiber'] >= fiber_constraint * 0.8) &
        (results_df['Predicted_Fiber'] <= fiber_constraint * 1.2) &
        (results_df['Predicted_Fat'] >= fat_constraint * 0.8) &
        (results_df['Predicted_Fat'] <= fat_constraint * 1.2)
    ]

    # Calculate distance using vectorized operations
    filtered_recipes['Distance'] = (
        ((filtered_recipes['Predicted_Protein'] - protein_constraint) / protein_constraint) ** 2 +
        ((filtered_recipes['Predicted_Carbs'] - carbs_constraint) / carbs_constraint) ** 2 +
        ((filtered_recipes['Predicted_Fiber'] - fiber_constraint) / fiber_constraint) ** 2 +
        ((filtered_recipes['Predicted_Fat'] - fat_constraint) / fat_constraint) ** 2
    )

    # Sort by distance
    filtered_recipes = filtered_recipes.sort_values('Distance')

    # If we still don't have enough recipes, relax constraints further
    if len(filtered_recipes) < 5:
        filtered_recipes = results_df
        filtered_recipes['Distance'] = (
            ((filtered_recipes['Predicted_Protein'] - protein_constraint) / max(protein_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Carbs'] - carbs_constraint) / max(carbs_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Fiber'] - fiber_constraint) / max(fiber_constraint, 1)) ** 2 +
            ((filtered_recipes['Predicted_Fat'] - fat_constraint) / max(fat_constraint, 1)) ** 2
        )
        filtered_recipes = filtered_recipes.sort_values('Distance')

    # Get the top N recipes
    top_recipes = filtered_recipes.head(limit)

    # Load display data to get recipe details
    display_data = pickle.load(open('display_data.pkl', 'rb'))

    # Merge with display data to get recipe details - optimize the merge
    recipe_details = display_data[display_data['RecipeId'].isin(top_recipes['RecipeId'].tolist())]
    final_results = pd.merge(recipe_details, top_recipes, on='RecipeId', how='inner')

    return final_results

if __name__ == "__main__":
    train_recipe_recommendation_model()







Loading data...
Performing feature engineering...
Preprocessing complete. Data saved to preprocessed_recipes.csv
Loading preprocessed data...
Training models...
Training model for ProteinContent...
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[4]	valid_0's rmse: 0.00115498
ProteinContent model evaluation:
  RMSE: 0.00115, MAE: 0.00004, R² Score: 0.00281
ProteinContent model evaluation:
  accuracy: 1.0000
  precision: 1.0000
  recall: 1.0000
  f1: 1.0000
Training model for CarbohydrateContent...
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[15]	valid_0's rmse: 0.00251646
CarbohydrateContent model evaluation:
  RMSE: 0.00252, MAE: 0.00003, R² Score: 0.00489
CarbohydrateContent model evaluation:
  accuracy: 0.9999
  precision: 1.0000
  recall: 0.9999
  f1: 0.9999
Training model for FiberContent...
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration 