# **Recipes Recommendation System Using Pyspark**

In [7]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=9be466d17454486b12e5c8ccadcc80d7fb3ded3dbef1043e26fcef771b50ec2d
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


# **Raw Data**

In [33]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import col, abs, regexp_extract
import numpy as np
import re

df = pd.read_csv('recipes.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,recipe_name,prep_time,cook_time,total_time,servings,yield,ingredients,directions,rating,url,cuisine_path,nutrition,timing,img_src
0,0,Apple-Cranberry Crostada,,,,8,6 to 8 - servings,"3 tablespoons butter, 2 pounds Granny Smith ap...",Heat butter in a large skillet over medium-hig...,4.4,https://www.allrecipes.com/recipe/76931/apple-...,/Desserts/Fruit Desserts/Apple Dessert Recipes/,"Total Fat 18g 23%, Saturated Fat 7g 34%, Chole...","Servings: 8, Yield: 6 to 8 - servings",https://www.allrecipes.com/thmb/Tf1wH73bfH6Oql...
1,1,Apple Pie by Grandma Ople,30 mins,1 hrs,1 hrs 30 mins,8,1 9-inch pie,"8 small Granny Smith apples, or as needed, ½ c...","Peel and core apples, then thinly slice. Set a...",4.8,https://www.allrecipes.com/recipe/12682/apple-...,/Desserts/Pies/Apple Pie Recipes/,"Total Fat 19g 24%, Saturated Fat 9g 46%, Chole...","Prep Time: 30 mins, Cook Time: 1 hrs, Total Ti...",https://www.allrecipes.com/thmb/1I95oiTGz6aEpu...
2,2,Sarah's Homemade Applesauce,10 mins,15 mins,25 mins,4,,"4 apples - peeled, cored and chopped, ¾ cup w...","Combine apples, water, sugar, and cinnamon in ...",4.8,https://www.allrecipes.com/recipe/51301/sarahs...,/Side Dish/Applesauce Recipes/,"Total Fat 0g 0%, Sodium 3mg 0%, Total Carbohyd...","Prep Time: 10 mins, Cook Time: 15 mins, Total ...",https://www.allrecipes.com/thmb/VY5d0tZHB8xz6y...
3,3,Apple Crisp,30 mins,45 mins,1 hrs 15 mins,12,1 9x13-inch pan,"10 cups all-purpose apples, peeled, cored and ...",Preheat the oven to 350 degrees F (175 degrees...,4.7,https://www.allrecipes.com/recipe/12409/apple-...,/Desserts/Crisps and Crumbles Recipes/Apple Cr...,"Total Fat 8g 11%, Saturated Fat 5g 25%, Choles...","Prep Time: 30 mins, Cook Time: 45 mins, Total ...",https://www.allrecipes.com/thmb/uAzhPOh86PfR-N...
4,4,Apple Pie Filling,20 mins,20 mins,2 hrs 40 mins,40,5 9-inch pies,"18 cups thinly sliced apples, 3 tablespoons le...",Toss apples with lemon juice in a large bowl a...,4.7,https://www.allrecipes.com/recipe/12681/apple-...,/Desserts/Pies/Apple Pie Recipes/,"Total Fat 0g 0%, Sodium 61mg 3%, Total Carbohy...","Prep Time: 20 mins, Cook Time: 20 mins, Additi...",https://www.allrecipes.com/thmb/c0bbYaS1V_mTt_...


# **Cleaned Data**

In [34]:

df_cleaned = df.drop(columns=['url', 'img_src', 'rating', 'yield','prep_time',	'cook_time',	'total_time','cuisine_path'])
# Dropping rows with missing values in important columns
df_cleaned = df_cleaned.dropna(subset=['recipe_name', 'ingredients'])

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(clean_text)
df_cleaned['recipe_name'] = df_cleaned['recipe_name'].apply(clean_text)
if 'directions' in df_cleaned.columns:
    df_cleaned['directions'] = df_cleaned['directions'].apply(clean_text)

# Tokenizing ingredients by splitting them into lists
df_cleaned['ingredients_list'] = df_cleaned['ingredients'].apply(lambda x: x.split(', '))

df_cleaned.head()
df.cleaned.to_csv("df_cleaned.csv")


Unnamed: 0.1,Unnamed: 0,recipe_name,servings,ingredients,directions,nutrition,timing,ingredients_list
0,0,applecranberry crostada,8,3 tablespoons butter 2 pounds granny smith app...,heat butter in a large skillet over mediumhigh...,"Total Fat 18g 23%, Saturated Fat 7g 34%, Chole...","Servings: 8, Yield: 6 to 8 - servings",[3 tablespoons butter 2 pounds granny smith ap...
1,1,apple pie by grandma ople,8,8 small granny smith apples or as needed ½ cup...,peel and core apples then thinly slice set asi...,"Total Fat 19g 24%, Saturated Fat 9g 46%, Chole...","Prep Time: 30 mins, Cook Time: 1 hrs, Total Ti...",[8 small granny smith apples or as needed ½ cu...
2,2,sarahs homemade applesauce,4,4 apples peeled cored and chopped ¾ cup water...,combine apples water sugar and cinnamon in a s...,"Total Fat 0g 0%, Sodium 3mg 0%, Total Carbohyd...","Prep Time: 10 mins, Cook Time: 15 mins, Total ...",[4 apples peeled cored and chopped ¾ cup wate...
3,3,apple crisp,12,10 cups allpurpose apples peeled cored and sli...,preheat the oven to 350 degrees f 175 degrees ...,"Total Fat 8g 11%, Saturated Fat 5g 25%, Choles...","Prep Time: 30 mins, Cook Time: 45 mins, Total ...",[10 cups allpurpose apples peeled cored and sl...
4,4,apple pie filling,40,18 cups thinly sliced apples 3 tablespoons lem...,toss apples with lemon juice in a large bowl a...,"Total Fat 0g 0%, Sodium 61mg 3%, Total Carbohy...","Prep Time: 20 mins, Cook Time: 20 mins, Additi...",[18 cups thinly sliced apples 3 tablespoons le...


# **TF-IDF Transformations**
When a user inputs ingredients, the TF-IDF transformation helps represent those ingredients numerically.
By calculating the TF-IDF vectors for all recipes, we can easily compare these vectors to find recipes that are similar to the user's input.

In [None]:
spark = SparkSession.builder.appName("RecipeRecommendation").getOrCreate()
data = spark.read.csv("df_cleaned.csv", header=True, inferSchema=True)

data = data.withColumn("TotalFat", regexp_extract(col("nutrition"), r'Total Fat (\d+)g', 1).cast(FloatType())) \
           .withColumn("Protein", regexp_extract(col("nutrition"), r'Protein (\d+)g', 1).cast(FloatType())) \
           .withColumn("Carbohydrates", regexp_extract(col("nutrition"), r'Carbohydrates (\d+)g', 1).cast(FloatType()))

# Tokenize ingredients for Transformation into list of lists
tokenizer = Tokenizer(inputCol="ingredients", outputCol="TokenizedWords")
tokenized_data = tokenizer.transform(data)

# Applying TF transformation to get important words
TF = HashingTF(inputCol="TokenizedWords", outputCol="rawFeatures", numFeatures=20)
featurized_data = TF.transform(tokenized_data)

# Apply IDF transformation
idf = IDF(inputCol="rawFeatures", outputCol="IDF_Output")
idf_model = idf.fit(featurized_data)
TFIDF = idf_model.transform(featurized_data)

#**Centered Cosine Similarity**
The mean of each vector is subtracted from the respective vector before computing the similarity. This centering removes the influence of the magnitude of the vectors, focusing instead on their relative distributions.

centered_cosine_similarity(A,B)=
(A−
A
ˉ
 )(B−
B
ˉ
 )/∥
(A−
A
ˉ
 )∥⋅∥(B−
B
ˉ)∥

# **Why Not Simple Cosine Similarity?**

Centered cosine similarity is preferred over traditional cosine similarity in recommendation systems because it normalizes the impact of vector magnitude by focusing on deviations from the mean. This approach reduces bias from common ingredients, emphasizes unique features, and better captures user preferences by evaluating similarity based on relative differences rather than absolute counts. As a result, it yields more meaningful and relevant recommendations.

In [None]:
def centered_cosine_similarity(a, b):
    a_bar = np.mean(a)
    b_bar = np.mean(b)
    a_centered = a - a_bar
    b_centered = b - b_bar
    dot_product = np.dot(a_centered, b_centered)
    norm_a = np.linalg.norm(a_centered)
    norm_b = np.linalg.norm(b_centered)

    if norm_a == 0 or norm_b == 0:
        return 0.0
    else:
        return float(dot_product / (norm_a * norm_b))

# **Recommendation by Nutrition Value**
The user specifies their dietary preferences, such as target values for fat, protein, and carbohydrates.Recipes that fit the user’s nutritional requirements are recommended, providing options that match their dietary goals.

In [None]:
def recommend_by_nutrition(fat=None, protein=None, carbs=None):
    conditions = []
    if fat:
        conditions.append((abs(col("TotalFat") - fat) <= 5))
    if protein:
        conditions.append((abs(col("Protein") - protein) <= 5))
    if carbs:
        conditions.append((abs(col("Carbohydrates") - carbs) <= 5))

    if conditions:
        condition = conditions[0]
        for cond in conditions[1:]:
            condition = condition & cond

        recommended_nutrition = data.filter(condition)
        return recommended_nutrition.select("recipe_name", "ingredients", "nutrition", "timing", "directions")
    else:
        return None

# **Recommendation By Recipe Name**
The user enters the name of a recipe they like.Recommendations are generated based on the highest similarity scores, showcasing recipes that the user might also enjoy based on their stated preference.

In [None]:
def recommend_by_recipe_name(recipe_name):
    recipe = data.filter(col("recipe_name").contains(recipe_name)).first()

    if recipe:
        input_ingredients = recipe['ingredients']

        input_df = spark.createDataFrame([("User Recipe", input_ingredients)], ["recipeNames", "ingredients"])
        input_wordsData = tokenizer.transform(input_df)
        input_featurizedData = TF.transform(input_wordsData)
        input_rescaledData = idf_model.transform(input_featurizedData)
        input_features = input_rescaledData.select("IDF_Output").first()[0].toArray()

        # Calculate similarities using centered cosine similarity
        similarity_data = TFIDF.rdd.map(
            lambda row: (
                row["recipe_name"],
                float(centered_cosine_similarity(row["IDF_Output"].toArray(), input_features))
            )
        )


        schema = StructType([
            StructField("recipeNames", StringType(), True),
            StructField("similarity", FloatType(), True)
        ])

        similarities_df = spark.createDataFrame(similarity_data, schema)

        # Join the similarities with recipe details to display the full recipe
        recommendations = similarities_df.filter(col("similarity") > 0.2).join(data, similarities_df.recipeNames == data.recipe_name)

        return recommendations.orderBy(col("similarity").desc()).select("recipe_name", "ingredients", "nutrition", "timing", "directions", "similarity")

    else:
        print(f"No recipe found with the name {recipe_name}")
        return None


# **Recommendation by Ingredients**
The user provides a list of ingredients they have on hand or want to use.Recipes with the highest similarity scores are recommended to the user, allowing them to discover dishes they can make with the ingredients they have.

In [None]:
def recommend_by_ingredients(user_input):
    input_df = spark.createDataFrame([("User Recipe", user_input)], ["recipeNames", "ingredients"])

    # Tokenize and process user input
    input_wordsData = tokenizer.transform(input_df)
    input_featurizedData = TF.transform(input_wordsData)
    input_rescaledData = idf_model.transform(input_featurizedData)
    input_features = input_rescaledData.select("IDF_Output").first()[0].toArray()

    # Calculate similarities using centered cosine similarity
    similarity_data = TFIDF.rdd.map(
        lambda row: (
            row["recipe_name"],
            float(centered_cosine_similarity(row["IDF_Output"].toArray(), input_features))
        )
    )

    # Define schema for similarities
    schema = StructType([
        StructField("recipeNames", StringType(), True),
        StructField("similarity", FloatType(), True)
    ])

    similarities_df = spark.createDataFrame(similarity_data, schema)

    recommendations = similarities_df.filter(col("similarity") > 0.7).join(data, similarities_df.recipeNames == data.recipe_name)
    return recommendations.orderBy(col("similarity").desc()).select("recipe_name", "ingredients", "nutrition", "timing", "directions", "similarity")


In [30]:
while True:

    print("\nSelect an option:")
    print("1. Search by Ingredients")
    print("2. Search by Recipe Name")
    print("3. Search by Nutrition")
    print("4. Exit")


    try:
        choice = int(input("Enter your choice (1-4): ").strip())
    except ValueError:
        print("Invalid input. Please enter a number between 1 and 4.")
        continue

    if choice == 1:
        user_input = input("Enter the recipe ingredients (separated by commas): ")
        recommendations = recommend_by_ingredients(user_input)
        if recommendations:
            recommendations.show(truncate=False)

    elif choice == 2:

        recipe_name = input("Enter the recipe name: ")
        recipe_recommendations = recommend_by_recipe_name(recipe_name)
        if recipe_recommendations:
            recipe_recommendations.show(truncate=False)

    elif choice == 3:

        fat = float(input("Enter the target fat content (in grams): "))
        protein = float(input("Enter the target protein content (in grams): "))
        carbs = float(input("Enter the target carbohydrates content (in grams): "))

        nutrition_recommendations = recommend_by_nutrition(fat=fat, protein=protein, carbs=carbs)
        if nutrition_recommendations:
            nutrition_recommendations.show(truncate=False)

    elif choice == 4:
        print("Exiting the program. Goodbye!")
        break

    else:
        print("Invalid choice. Please select a number between 1 and 4.")


    continue_choice = input("Do you want to continue? (yes/no): ").strip().lower()
    if continue_choice != 'yes':
        print("Thank you for using the recipe recommendation system!")
        break



Select an option:
1. Search by Ingredients
2. Search by Recipe Name
3. Search by Nutrition
4. Exit
Enter your choice (1-4): 1
Enter the recipe ingredients (separated by commas): apple
+----------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------