In [4]:
import pandas as pd




   recipeId               Category                                      Title  \
0         1            Main Course                   Swedish Meatballs Recipe   
1         2  Appetizer,Main Course  Baked Crispy Buffalo Chicken Wings Recipe   
2         3              Appetizer                           Guacamole Recipe   
3         4              Appetizer                       Perfect Queso Recipe   
4         5              Appetizer                 Buffalo Chicken Dip Recipe   

   Total Time                                       Instructions  \
0          30  For the meatballs:\r\nMix together all ingredi...   
1          60  Preheat the oven to 425º F. Line a baking shee...   
2           5  Mash avocado with a fork or potato masher in a...   
3           5  Stovetop Queso:\r\nAdd cheeses and half of the...   
4          25  Preheat oven to 350º F.\r\nMix together all in...   

                                           Nutrition   Cuisine       Yields  \
0  {'calories': '220 kcal

In [3]:
# filtered_df = recipes_df.rename(columns={"IngredientsRemovedAdj": "Ingredients","ID":"recipeId"})

# # Save the filtered DataFrame to a new CSV file
# filtered_df.to_csv("recipes_dataset.csv", index=False)

In [7]:
import heapq
import math
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import AlgoBase, PredictionImpossible
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

# Load datasets.
# The data directory can be overridden through the HEALTHYLICIOUS_DATA_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original local path is used unchanged.
DATA_DIR = Path(os.environ.get('HEALTHYLICIOUS_DATA_DIR',
                               'C:/Users/arsen/Healthylicious/data/cleaned/csv'))
recipes_df = pd.read_csv(DATA_DIR / 'recipes_dataset.csv')
ratings_df = pd.read_csv(DATA_DIR / 'ratings_dataset.csv')

# Filter out unnecessary columns if needed (assuming this is not needed here)

# Normalise the id columns to int so that ids coming back from the Surprise
# trainset (converted with int(...) in ContentKNNAlgorithm.fit) match the
# recipeId values used as dictionary keys.
recipes_df['recipeId'] = recipes_df['recipeId'].astype(int)
ratings_df['recipeId'] = ratings_df['recipeId'].astype(int)
ratings_df['userId'] = ratings_df['userId'].astype(int)

# Initialize TF-IDF Vectorizer and compute TF-IDF matrix.
# Each comma-separated chunk of the 'Ingredients' string is one token.
# token_pattern=None only silences scikit-learn's "token_pattern will not be
# used" warning; the custom tokenizer already takes precedence over it.
# Row r of tfidf_matrix corresponds to row position r of recipes_df.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','), token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(recipes_df['Ingredients'])

class ContentKNNAlgorithm(AlgoBase):
    """Content-based k-nearest-neighbour rating predictor.

    Item-to-item similarity is derived from recipe content (ingredients,
    category and total time) instead of from co-ratings. A rating for
    (user, item) is predicted as the similarity-weighted average of the
    ratings that user gave to the ``k`` most similar items in the trainset.

    The class reads two module-level objects at fit time:

    * ``recipes_df`` -- must provide the columns ``recipeId``, ``Category``
      and ``Total Time``.
    * ``tfidf_matrix`` -- one row per row of ``recipes_df``, in the same
      row order.
    """

    def __init__(self, k=40, sim_options=None):
        """Create the algorithm.

        Args:
            k: Maximum number of neighbours used for one prediction.
            sim_options: Accepted for signature compatibility only; this
                class does not read it. The default is ``None`` rather than
                a shared mutable ``{}``.
        """
        AlgoBase.__init__(self)
        self.k = k

    def fit(self, trainset):
        """Build the dense item-item content similarity matrix.

        Args:
            trainset: Surprise trainset; its raw item ids must be
                convertible with ``int`` and present in ``recipes_df``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            KeyError: if a trainset item has no row in ``recipes_df``.
        """
        AlgoBase.fit(self, trainset)

        # Load up content attributes for every recipe
        self.recipes = {row['recipeId']: row for _, row in recipes_df.iterrows()}
        # Map recipeId -> *positional* row number. tfidf_matrix is indexed by
        # position, so DataFrame index labels must not be used here: they
        # only coincide with positions for an untouched default RangeIndex.
        self.recipes_index = {
            recipe_id: position
            for position, recipe_id in enumerate(recipes_df['recipeId'])
        }

        print("Computing content-based similarity matrix...")

        n_items = self.trainset.n_items
        # Resolve inner id -> raw recipe id once instead of once per pair.
        raw_recipe_ids = [
            int(self.trainset.to_raw_iid(inner_id)) for inner_id in range(n_items)
        ]

        # Compute content similarity for every recipe combination.
        # Only the upper triangle is computed and then mirrored; the diagonal
        # is left at 0, so an item is never its own neighbour.
        self.similarities = np.zeros((n_items, n_items))
        for thisRating in range(n_items):
            if thisRating % 100 == 0:
                print(thisRating, " of ", n_items)
            thisRecipeID = raw_recipe_ids[thisRating]
            for otherRating in range(thisRating + 1, n_items):
                otherRecipeID = raw_recipe_ids[otherRating]
                similarity = self.computeContentSimilarity(thisRecipeID, otherRecipeID)
                self.similarities[thisRating, otherRating] = similarity
                self.similarities[otherRating, thisRating] = similarity

        print("...done.")

        return self

    def computeContentSimilarity(self, recipe1_id, recipe2_id):
        """Return the blended content similarity of two recipes.

        The score is ``0.2 * ingredient + 0.6 * category + 0.2 * time``:

        * ingredient -- cosine similarity of the two TF-IDF rows;
        * category   -- 1 when the ``Category`` values are equal, else 0
          (whole-value comparison, so 'Appetizer,Main Course' does not match
          'Appetizer');
        * time       -- ``exp(-|t1 - t2| / 10)`` on ``Total Time``.

        Args:
            recipe1_id: recipeId of the first recipe.
            recipe2_id: recipeId of the second recipe.

        Raises:
            KeyError: if either id is missing from ``self.recipes``.
        """
        recipe1 = self.recipes[recipe1_id]
        recipe2 = self.recipes[recipe2_id]

        # Positional rows of the two recipes inside tfidf_matrix
        recipe1_index = self.recipes_index[recipe1_id]
        recipe2_index = self.recipes_index[recipe2_id]

        # Compute ingredient similarity
        ingredient_similarity = cosine_similarity(
            tfidf_matrix[recipe1_index], tfidf_matrix[recipe2_index]
        ).flatten()[0]

        # Compute similarity based on category
        category_similarity = 1 if recipe1['Category'] == recipe2['Category'] else 0

        # Compute similarity based on total time: identical times give 1 and
        # the score decays exponentially with the absolute difference.
        time_diff = abs(recipe1['Total Time'] - recipe2['Total Time'])
        time_similarity = math.exp(-time_diff / 10.0)

        # Combine similarities (adjust weights as necessary)
        combined_similarity = (
            0.2 * ingredient_similarity
            + 0.6 * category_similarity
            + 0.2 * time_similarity
        )

        return combined_similarity

    def estimate(self, u, i):
        """Predict the rating of trainset user ``u`` for trainset item ``i``.

        Args:
            u: User id as used by ``self.trainset`` (key into ``trainset.ur``).
            i: Item id as used by ``self.trainset`` (row of ``self.similarities``).

        Returns:
            Similarity-weighted mean of the user's ratings over the ``k``
            most similar items they rated.

        Raises:
            PredictionImpossible: if the user or item is unknown to the
                trainset, the user has no ratings, or no neighbour has a
                positive similarity.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        # Build up similarity scores between this item and everything the user rated
        neighbors = [
            (self.similarities[i, rated_item], user_rating)
            for rated_item, user_rating in self.trainset.ur[u]
        ]

        # Extract the top-K most-similar ratings
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        if not k_neighbors:
            raise PredictionImpossible('No neighbors')

        # Compute average sim score of K neighbors weighted by user ratings
        simTotal = weightedSum = 0
        for (simScore, rating) in k_neighbors:
            if simScore > 0:
                simTotal += simScore
                weightedSum += simScore * rating

        if simTotal == 0:
            raise PredictionImpossible('No neighbors')

        predictedRating = weightedSum / simTotal

        return predictedRating

# Convert ratings DataFrame to Surprise dataset.
# The three columns are passed in the order userId, recipeId, rating, and the
# reader is told that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'recipeId', 'rating']], reader)

# Split data into training and testing sets (25% held out for testing;
# random_state is fixed so the split is the same on every run).
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

# Initialize and train the content-based recommender.
# fit() builds the full item-item similarity matrix and prints its progress.
contentKNN = ContentKNNAlgorithm(k=10)  # Adjust k as necessary
contentKNN.fit(trainset)

# Make predictions on the test set; `predictions` is reused by the metric and
# top-N cells further down.
predictions = contentKNN.test(testset)

# Evaluate the model
def rmse(predictions):
    """Return the root-mean-squared error of a collection of predictions.

    Each prediction is unpacked as a 5-tuple whose third and fourth fields
    are the true rating and the estimated rating; the other fields are ignored.
    """
    squared_errors = []
    for _uid, _iid, actual, estimated, _details in predictions:
        residual = actual - estimated
        squared_errors.append(residual ** 2)
    return np.sqrt(np.mean(squared_errors))

def mae(predictions):
    """Return the mean absolute error of a collection of predictions.

    Each prediction is unpacked as a 5-tuple whose third and fourth fields
    are the true rating and the estimated rating; the other fields are ignored.
    """
    absolute_errors = []
    for _uid, _iid, actual, estimated, _details in predictions:
        absolute_errors.append(abs(actual - estimated))
    return np.mean(absolute_errors)

# Report both error metrics for the held-out predictions, RMSE first.
for metric_label, metric_fn in (('RMSE', rmse), ('MAE', mae)):
    print(f'{metric_label}: {metric_fn(predictions)}')

def get_top_n_recommendations(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: Iterable of 5-tuples unpacked as
            (user id, item id, true rating, estimated rating, details).
        n: Maximum number of (item id, estimated rating) pairs kept per user.

    Returns:
        A dict mapping each user id to a list of (item id, estimated rating)
        pairs, sorted by estimated rating in descending order and truncated
        to ``n`` entries. Users appear in first-seen order.
    """
    per_user = {}
    for uid, iid, _true_r, est_r, _details in predictions:
        per_user.setdefault(uid, []).append((iid, est_r))

    return {
        uid: sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, candidates in per_user.items()
    }

# Rank the held-out predictions per user (at most 10 kept for each).
top_n_recommendations = get_top_n_recommendations(predictions, n=10)

# Display top-10 recommendations for a specific user.
# Only users that appear in `predictions` have an entry.
user_id = 1
if user_id not in top_n_recommendations:
    print(f"No recommendations for user {user_id}")
else:
    print(f"Top-10 recommendations for user {user_id}:")
    for recipe_id, estimated_rating in top_n_recommendations[user_id]:
        # Look the title up by recipeId; the first matching row is used.
        matching_titles = recipes_df.loc[recipes_df['recipeId'] == recipe_id, 'Title']
        recipe_title = matching_titles.values[0]
        print(f"{recipe_title} (estimated rating: {estimated_rating})")




Computing content-based similarity matrix...
0  of  1097
100  of  1097
200  of  1097
300  of  1097
400  of  1097
500  of  1097
600  of  1097
700  of  1097
800  of  1097
900  of  1097
1000  of  1097
...done.
RMSE: 1.0190964729625442
MAE: 0.7930028209915896
Top-10 recommendations for user 1:
Creamed Spinach Recipe (estimated rating: 3.2745073219538097)
Balsamic Baked Pork Chops Recipe (estimated rating: 2.9697924340503588)
Chocolate Haystacks Recipe (estimated rating: 2.9461831860215697)
Blackberry Tartlets Recipe (estimated rating: 2.940363477618144)
Roast Chicken Pasta with Sundried Tomatoes Recipe (estimated rating: 2.927727611831583)
Simple Caprese Salad Skewers (estimated rating: 2.9174677948741894)
Zucchini Chips Recipe (estimated rating: 2.9029987516434663)


In [38]:
import pickle

# Save the trained model to a file.
# The fitted object carries the per-recipe rows (self.recipes) and the dense
# item-item matrix (self.similarities), so the file grows with the square of
# the number of trainset items.
# NOTE(review): pickle records classes by reference, so loading this file
# presumably requires ContentKNNAlgorithm to be defined/importable in the
# loading session -- confirm before shipping the artifact.
with open('content_knn_model.pkl', 'wb') as f:
    pickle.dump(contentKNN, f)


In [8]:
import joblib

# Save the trained model to a file (joblib format).
# As the last expression of the cell, the value returned by joblib.dump is
# what the notebook displays below.
# NOTE(review): loading presumably needs ContentKNNAlgorithm to be
# defined/importable in the loading session, as with pickle -- confirm.
joblib.dump(contentKNN, 'content_knn_model.joblib')


['content_knn_model.joblib']

In [29]:
# # Check the ratings for user 1 in the training set
# user_1_ratings_train = [rating for rating in trainset.ur[trainset.to_inner_uid(1)]]
# print(f"User 1 ratings in training set: {user_1_ratings_train}")


User 1 ratings in training set: [(370, 2.0), (42, 4.0), (436, 3.5), (504, 3.0), (414, 2.5), (1042, 4.0), (721, 2.5), (938, 3.0), (974, 2.0), (823, 3.0), (127, 4.0), (276, 1.0), (243, 3.0)]


In [30]:
# # Make predictions for user 1 on all items
# testset_user_1 = [[1, item, 0] for item in trainset.all_items()]
# predictions_user_1 = contentKNN.test(testset_user_1)

# # Print the predictions for user 1
# for prediction in predictions_user_1:
#     print(prediction)


user: 1          item: 0          r_ui = 0.00   est = 3.55   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1          item: 1          r_ui = 0.00   est = 2.97   {'was_impossible': False}
user: 1          item: 2          r_ui = 0.00   est = 1.92   {'was_impossible': False}
user: 1          item: 3          r_ui = 0.00   est = 2.76   {'was_impossible': False}
user: 1          item: 4          r_ui = 0.00   est = 2.74   {'was_impossible': False}
user: 1          item: 5          r_ui = 0.00   est = 2.90   {'was_impossible': False}
user: 1          item: 6          r_ui = 0.00   est = 2.92   {'was_impossible': False}
user: 1          item: 7          r_ui = 0.00   est = 2.91   {'was_impossible': False}
user: 1          item: 8          r_ui = 0.00   est = 2.96   {'was_impossible': False}
user: 1          item: 9          r_ui = 0.00   est = 2.50   {'was_impossible': False}
user: 1          item: 10         r_ui = 0.00   est = 2.75   {'was_impossible': False}
us

In [36]:
# # Display top-10 recommendations for another user
# user_id = 2 # Change this to a different user ID
# top_n_recommendations = get_top_n_recommendations(predictions, n=10)
# if user_id in top_n_recommendations:
#     print(f"Top-10 recommendations for user {user_id}:")
#     for recipe_id, estimated_rating in top_n_recommendations[user_id]:
#         recipe_title = recipes_df[recipes_df['recipeId'] == recipe_id]['Title'].values[0]
#         print(f"{recipe_title} (estimated rating: {estimated_rating})")
# else:
#     print(f"No recommendations for user {user_id}")


Top-10 recommendations for user 2:
Apple Cider Waffles Recipe (estimated rating: 3.9982385037231993)
Lemon Blueberry Cornbread Muffins (estimated rating: 3.9034955203975787)
Huevos Rancheros Recipe (estimated rating: 3.8886936974486654)
Classic Caesar Dressing Recipe (estimated rating: 3.7604355888155045)
Chocolate Chip Muffins Recipe (estimated rating: 3.7274027244643255)
Corn Casserole (estimated rating: 3.7101108111263974)
Blackberry Syrup Recipe (estimated rating: 3.695200152493039)
Salted Caramel Blondies Recipe (estimated rating: 3.47744503195459)
Fresh Pineapple Sorbet Recipe (estimated rating: 3.4003186681069937)
Chili Pasta Skillet Recipe (estimated rating: 3.394149395939436)


In [14]:
# # Check the number of ratings for user 1
# user_1_ratings = ratings_df[ratings_df['userId'] == 1]
# print(f"Number of ratings for user 1: {len(user_1_ratings)}")
# print(user_1_ratings)


Number of ratings for user 1: 3
   userId  recipeId  rating   timestamp
0       1        31     2.5  1260759144
1       1      1029     3.0  1260759179
2       1      1061     3.0  1260759182
