## Recipe Recommendation System

In [13]:
!pip install pyspark
!pip install nltk



In [15]:
from itertools import combinations, islice

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('wordnet')

# Create a Spark session
spark = SparkSession.builder \
    .appName("Recipe Vectorize") \
    .getOrCreate()

# Dimensionality of the hashed term-frequency vectors produced by HashingTF
NUM_FEATURES = 4000

# Load the CSV file into a DataFrame
df = spark.read.csv("recipes_combined_dataset.csv", header=True, inferSchema=True)

# NOTE(review): rows without an ingredients value are dropped here because the
# tokenizer stage is expected to fail on null input -- confirm this matches the
# dataset (it is a no-op when the column has no nulls).
df = df.na.drop(subset=["ingredients"])

# Split the ingredients text on non-word characters *before* any further
# processing, so punctuation never sticks to a token ("tomatoes," -> "tomatoes")
# and both the stop-word list and the lemmatizer see clean words.
tokenizer = RegexTokenizer(inputCol="ingredients", outputCol="words", pattern="\\W")
df = tokenizer.transform(df)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)


# Apply lemmatization
def lemmatize(words):
    """Return the WordNet lemma of every token in ``words``.

    Args:
        words: list of string tokens, or None.

    Returns:
        A list with one lemmatized string per input token, in the same order;
        None when ``words`` is None.
    """
    if words is None:
        return None
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]


# Declare the return type explicitly: without it the UDF result is coerced to a
# single string and the token list would have to be re-parsed afterwards.
lemmatize_udf = udf(lemmatize, ArrayType(StringType()))
df = df.withColumn("lemmatized_words", lemmatize_udf(col("filtered_words")))

# Apply HashingTF to convert the lemmatized tokens to raw term frequency vectors
hashingTF = HashingTF(inputCol="lemmatized_words", outputCol="raw_features", numFeatures=NUM_FEATURES)
df = hashingTF.transform(df)

# Apply IDF to compute the TF-IDF vectors
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(df)
df = idfModel.transform(df)

# Select the relevant columns
result_df = df.select("recipeNames", "features")

# Show the first few rows of the resulting DataFrame; cap the cell width so the
# sparse vectors do not produce rows that are thousands of characters wide.
result_df.show(truncate=120)

# Number of recipe pairs to report for each similarity measure
MAX_PAIRS = 10


def first_pair_similarities(rows, max_pairs=MAX_PAIRS, centered=False):
    """Yield cosine similarities for the first ``max_pairs`` pairs of rows.

    Pairs are produced in the order (0, 1), (0, 2), ..., (0, n-1), (1, 2), ...,
    i.e. every pair (i, j) with i < j in lexicographic order.

    Args:
        rows: sequence of rows whose ``'features'`` entry exposes ``toArray()``.
        max_pairs: maximum number of pairs to evaluate.
        centered: when True, subtract each vector's own mean before computing
            the cosine similarity.

    Yields:
        Tuples ``(i, j, similarity)`` with zero-based row indices and the
        similarity as a scalar.
    """
    indexed_pairs = combinations(enumerate(rows), 2)
    for (i, row_a), (j, row_b) in islice(indexed_pairs, max_pairs):
        vec_a = row_a['features'].toArray()
        vec_b = row_b['features'].toArray()
        if centered:
            vec_a = vec_a - np.mean(vec_a)
            vec_b = vec_b - np.mean(vec_b)
        # cosine_similarity returns a 1x1 matrix for a single pair; unwrap it
        yield i, j, cosine_similarity([vec_a], [vec_b])[0, 0]


# Collect features into a list
# NOTE(review): this brings every feature vector to the driver although only
# the first few rows are needed for the report below -- consider limiting the
# collect if nothing else relies on the full list.
features_list = result_df.select("features").collect()

# Calculate and display the first 10 cosine similarities for each pair of recipes
print("Cosine Similarities:")
for i, j, similarity in first_pair_similarities(features_list):
    print(f"Cosine similarity between recipe {i+1} and recipe {j+1}: {similarity}")

# Calculate and display the first 10 centered cosine similarities for each pair of recipes
print("\nCentered Cosine Similarities:")
for i, j, similarity in first_pair_similarities(features_list, centered=True):
    print(f"Centered cosine similarity between recipe {i+1} and recipe {j+1}: {similarity}")

# Stop the Spark session
spark.stop()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|recipeNames                        |features                                                                                                                                                                   