In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file found under /kaggle/input, one full path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Imports

In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

# Read the datasets

In [None]:
# Both parquet files are read from the same dataset folder; keep that folder
# in one constant so the notebook can be pointed at another copy of the data
# by editing a single line.
DATA_DIR = "/kaggle/input/foodcom-recipes-and-reviews"

recipes_df = pd.read_parquet(DATA_DIR + "/recipes.parquet")
reviews_df = pd.read_parquet(DATA_DIR + "/reviews.parquet")

# Display

In [None]:
# Preview the first five rows of the recipes table.
recipes_df.head(n=5)

In [None]:
# Preview the first five rows of the reviews table.
reviews_df.head(n=5)

# Pre-processing

In [None]:
# Columns kept for the recommender: recipe identifiers, the nutrient
# measurements, and the serving count.
nutritional_cols = [
    "RecipeId",
    "Name",
    "Calories",
    "FatContent",
    "SaturatedFatContent",
    "CholesterolContent",
    "SodiumContent",
    "CarbohydrateContent",
    "FiberContent",
    "SugarContent",
    "ProteinContent",
    "RecipeServings",
]
nutritional_df = recipes_df[nutritional_cols]

In [None]:
# Preview the first five rows of the selected nutritional columns.
nutritional_df.head(n=5)

In [None]:
# Discard every row that has a missing value in any of the selected columns.
nutritional_df = nutritional_df.dropna(axis="index", how="any")

Divide each nutrient column in nutritional_df by RecipeServings to obtain per-serving values

In [None]:
# Nutrient columns that are converted to per-serving values and later used
# as the features of the recommender.
cols_to_divide = [
    "Calories",
    "FatContent",
    "SaturatedFatContent",
    "CholesterolContent",
    "SodiumContent",
    "CarbohydrateContent",
    "FiberContent",
    "SugarContent",
    "ProteinContent",
]

# The guard keeps this cell safe to re-run: once "RecipeServings" has been
# dropped the values are already per-serving and must not be divided again.
if "RecipeServings" in nutritional_df.columns:
    # Work on an explicit copy so the assignment below never writes into a
    # possible slice of another frame (avoids pandas' SettingWithCopyWarning).
    nutritional_df = nutritional_df.copy()
    nutritional_df[cols_to_divide] = nutritional_df[cols_to_divide].div(
        nutritional_df["RecipeServings"], axis=0
    )
    # DataFrame.drop returns a new frame; re-assign so the column is really
    # removed instead of only being hidden in the cell output.
    nutritional_df = nutritional_df.drop(columns="RecipeServings")

nutritional_df

# Train a kNN model to identify the 50 most similar recipes to a given recipe

In [None]:
class RecipeRecommender:
    """Recommend recipes whose nutrient values are closest to a given recipe.

    The feature columns are min-max scaled and indexed with a Euclidean
    nearest-neighbour model. Look-ups return rows of the unscaled frame that
    was passed to the constructor.
    """

    def __init__(self, nutritional_df, cols_to_divide):
        """Scale the feature columns and fit the nearest-neighbour index.

        Args:
            nutritional_df: DataFrame containing a "RecipeId" column and every
                column named in ``cols_to_divide``. The frame itself is not
                modified; a reference to it is kept and rows are later picked
                from it by position, so it must not be reordered or filtered
                after the recommender has been built.
            cols_to_divide: Names of the numeric columns used as features.
        """
        self.original_nutritional_df = nutritional_df
        self.nutritional_df = nutritional_df.copy()
        self.cols_to_divide = cols_to_divide

        # Normalize the attributes so every feature contributes on the same
        # [0, 1] scale to the Euclidean distance.
        self.scaler = MinMaxScaler()
        self.nutritional_df[self.cols_to_divide] = self.scaler.fit_transform(self.nutritional_df[self.cols_to_divide])

        self.knn = NearestNeighbors(metric='euclidean')
        self.knn.fit(self.nutritional_df[self.cols_to_divide])

    def find_closest_recipes(self, recipe_id, k=50):
        """Return up to ``k`` recipes most similar to ``recipe_id``.

        Args:
            recipe_id: Value to look up in the "RecipeId" column. If several
                rows share it, the first one is used as the query.
            k: Maximum number of neighbours to return. Fewer rows come back
                when the fitted data holds fewer than ``k + 1`` recipes.

        Returns:
            Rows of the original (unscaled) DataFrame in the order reported
            by the nearest-neighbour model, never including the query row.

        Raises:
            ValueError: If ``recipe_id`` does not occur in "RecipeId".
        """
        id_matches = (self.nutritional_df["RecipeId"] == recipe_id).to_numpy().nonzero()[0]
        if id_matches.size == 0:
            raise ValueError("RecipeId {!r} not found in nutritional_df".format(recipe_id))
        query_pos = int(id_matches[0])
        input_recipe = self.nutritional_df.iloc[[query_pos]][self.cols_to_divide]

        # Request one extra neighbour because the query row is part of the
        # fitted data, but never more neighbours than there are rows.
        n_neighbors = min(k + 1, len(self.nutritional_df))
        _, indices = self.knn.kneighbors(input_recipe, n_neighbors=n_neighbors)

        # Remove the query by position rather than assuming it is returned
        # first: recipes with identical nutrient values tie at distance 0 and
        # may be listed ahead of it.
        closest_indices = [int(pos) for pos in indices[0] if pos != query_pos][:k]
        return self.original_nutritional_df.iloc[closest_indices]

    def get_trained_model(self):
        """Return the fitted nearest-neighbour model."""
        return self.knn

In [None]:
# Build the recommender from the prepared nutritional_df and its feature columns.
recommender = RecipeRecommender(
    nutritional_df=nutritional_df,
    cols_to_divide=cols_to_divide,
)

In [None]:
# Look up the nearest neighbours of a single recipe.
# Change the ID below to query a different recipe.
input_recipe_id = 41.0
result = recommender.find_closest_recipes(recipe_id=input_recipe_id)
print(
    "Closest recipes for RecipeId",
    input_recipe_id,
    "from the original nutritional_df:",
)
result