In [2]:
from surprise import AlgoBase, PredictionImpossible
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
import numpy as np
import math
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [3]:
# Load datasets
# Both CSVs live in the same directory; keeping that directory in one constant
# means the notebook can be pointed at another checkout by editing a single line.
DATA_DIR = 'C:/Users/arsen/Healthylicious/data/cleaned/csv'

# The id columns are coerced to int at load time: the recommender below looks
# recipes up by int(raw item id), so both frames must use integer ids.
recipes_df = pd.read_csv(f'{DATA_DIR}/recipes_dataset.csv').astype({'recipeId': int})
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings_dataset.csv').astype(
    {'recipeId': int, 'userId': int}
)

In [4]:
# Initialize TF-IDF Vectorizer and compute TF-IDF matrix
def split_ingredients(text):
    """Split a comma-separated ingredient string into clean tokens.

    Whitespace around each piece is stripped so that "salt" and " salt" map to
    the same vocabulary term, and empty pieces (e.g. from a trailing comma)
    are dropped instead of becoming a term of their own.
    """
    stripped = (piece.strip() for piece in text.split(','))
    return [token for token in stripped if token]


# token_pattern=None: the default regex is unused once a tokenizer is supplied,
# and leaving it set makes scikit-learn emit a warning about exactly that.
vectorizer = TfidfVectorizer(tokenizer=split_ingredients, token_pattern=None)
# One TF-IDF row per row of recipes_df, in the same order.
tfidf_matrix = vectorizer.fit_transform(recipes_df['Ingredients'])



In [5]:
class ContentKNNAlgorithm(AlgoBase):
    """Content-based k-nearest-neighbour rating predictor for recipes.

    Item-item similarity is a weighted blend of three signals: cosine
    similarity of the TF-IDF ingredient vectors, equality of ``Category`` and
    closeness of ``Total Time``. A rating is predicted as the
    similarity-weighted mean of the user's training ratings on the ``k`` most
    similar recipes.

    The class reads the module-level ``recipes_df`` (columns ``recipeId``,
    ``Category``, ``Total Time``) and ``tfidf_matrix`` (one row per
    ``recipes_df`` row, in the same order).
    """

    # Weights of the three signals in the combined similarity.
    INGREDIENT_WEIGHT = 0.2
    CATEGORY_WEIGHT = 0.6
    TIME_WEIGHT = 0.2
    # Scale of the exponential decay applied to the total-time difference.
    TIME_DECAY = 10.0

    def __init__(self, k=40, sim_options=None):
        """Create the algorithm.

        Args:
            k: maximum number of neighbours used for one prediction.
            sim_options: accepted for signature compatibility only; it is not
                used. The default is ``None`` rather than a shared mutable dict.
        """
        AlgoBase.__init__(self)
        self.k = k

    def fit(self, trainset):
        """Index the recipe metadata and build the item-item similarity matrix.

        Args:
            trainset: Surprise trainset whose raw item ids are recipe ids.

        Returns:
            self, so the call can be chained.

        Raises:
            KeyError: if a rated recipe has no row in ``recipes_df``.
        """
        AlgoBase.fit(self, trainset)

        # recipes_index must hold *row positions* into tfidf_matrix, so it is
        # built from enumerate() rather than from the DataFrame index labels,
        # which only coincide with positions for a default RangeIndex.
        self.recipes = {}
        self.recipes_index = {}
        for position, (_, row) in enumerate(recipes_df.iterrows()):
            recipe_id = row['recipeId']
            self.recipes[recipe_id] = row
            self.recipes_index[recipe_id] = position

        n_items = self.trainset.n_items
        # Inner id -> recipe id, resolved once per item instead of once per pair.
        raw_ids = [int(self.trainset.to_raw_iid(inner_id)) for inner_id in range(n_items)]
        missing = [recipe_id for recipe_id in raw_ids if recipe_id not in self.recipes]
        if missing:
            raise KeyError(
                f'{len(missing)} rated recipe(s) have no row in recipes_df, '
                f'e.g. recipeId {missing[0]}'
            )

        print("Computing content-based similarity matrix...")
        self.similarities = np.zeros((n_items, n_items))
        if n_items > 1:
            # One batched call yields every pairwise ingredient similarity,
            # replacing a separate cosine_similarity call for each pair.
            rows = [self.recipes_index[recipe_id] for recipe_id in raw_ids]
            ingredient_sims = cosine_similarity(tfidf_matrix[rows])
        for thisRating in range(n_items):
            if (thisRating % 100 == 0):
                print(thisRating, " of ", n_items)
            thisRecipe = self.recipes[raw_ids[thisRating]]
            # Only the upper triangle is computed and then mirrored; the
            # diagonal stays 0, so an item is never its own neighbour.
            for otherRating in range(thisRating + 1, n_items):
                similarity = self._combineSimilarities(
                    ingredient_sims[thisRating, otherRating],
                    thisRecipe,
                    self.recipes[raw_ids[otherRating]],
                )
                self.similarities[thisRating, otherRating] = similarity
                self.similarities[otherRating, thisRating] = similarity
        print("...done.")
        return self

    def _combineSimilarities(self, ingredient_similarity, recipe1, recipe2):
        """Blend ingredient, category and total-time similarity into one score."""
        category_similarity = 1 if recipe1['Category'] == recipe2['Category'] else 0
        time_diff = abs(recipe1['Total Time'] - recipe2['Total Time'])
        # Identical times give 1; the score decays exponentially with the gap.
        time_similarity = np.exp(-time_diff / self.TIME_DECAY)
        return (self.INGREDIENT_WEIGHT * ingredient_similarity
                + self.CATEGORY_WEIGHT * category_similarity
                + self.TIME_WEIGHT * time_similarity)

    def computeContentSimilarity(self, recipe1_id, recipe2_id):
        """Return the combined content similarity of two recipes.

        Args:
            recipe1_id, recipe2_id: recipe ids present in ``recipes_df``.
                ``fit`` must have been called so the lookup tables exist.

        Raises:
            KeyError: if either id is not in ``recipes_df``.
        """
        recipe1 = self.recipes[recipe1_id]
        recipe2 = self.recipes[recipe2_id]
        recipe1_index = self.recipes_index[recipe1_id]
        recipe2_index = self.recipes_index[recipe2_id]
        ingredient_similarity = cosine_similarity(
            tfidf_matrix[recipe1_index], tfidf_matrix[recipe2_index]
        ).flatten()[0]
        return self._combineSimilarities(ingredient_similarity, recipe1, recipe2)

    def estimate(self, u, i):
        """Predict the rating of inner user ``u`` for inner item ``i``.

        Returns:
            The similarity-weighted mean of the user's ratings over the ``k``
            most similar items they rated in the training set.

        Raises:
            PredictionImpossible: if the user or item is unknown to the
                trainset, the user has no ratings, or none of the selected
                neighbours has a positive similarity.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        # Pair each of the user's ratings with that item's similarity to i.
        neighbors = [
            (self.similarities[i, rated_item], rating)
            for rated_item, rating in self.trainset.ur[u]
        ]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
        if not k_neighbors:
            raise PredictionImpossible('No neighbors')
        simTotal = weightedSum = 0
        for (simScore, rating) in k_neighbors:
            # Neighbours with zero similarity carry no information; skip them.
            if simScore > 0:
                simTotal += simScore
                weightedSum += simScore * rating
        if simTotal == 0:
            raise PredictionImpossible('No neighbors')
        predictedRating = weightedSum / simTotal
        return predictedRating


In [6]:
# Wrap the ratings frame in a Surprise dataset; ratings are declared to be on a 1-5 scale
rating_columns = ['userId', 'recipeId', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[rating_columns], reader)

# Hold out a quarter of the ratings for evaluation; the fixed seed keeps the split repeatable
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)

# Create the content-based recommender with 10 neighbours and fit it on the training split
contentKNN = ContentKNNAlgorithm(k=10)
contentKNN.fit(trainset)


Computing content-based similarity matrix...
0  of  1097
100  of  1097
200  of  1097
300  of  1097
400  of  1097
500  of  1097
600  of  1097
700  of  1097
800  of  1097
900  of  1097
1000  of  1097
...done.


<__main__.ContentKNNAlgorithm at 0x292b4b7d040>

In [7]:
# Make predictions on the test set
# The evaluation cells below unpack each prediction as a 5-tuple whose third
# and fourth fields are the true rating and the estimated rating.
predictions = contentKNN.test(testset)

In [8]:
# Evaluate the model
def rmse(predictions):
    """Return the root-mean-square error of the estimated ratings.

    Each prediction is a 5-tuple whose third and fourth fields are the true
    and the estimated rating; the other fields are ignored.
    """
    squared_errors = []
    for _uid, _iid, actual, estimate, _details in predictions:
        squared_errors.append((actual - estimate) ** 2)
    mean_squared_error = np.mean(squared_errors)
    return np.sqrt(mean_squared_error)

def mae(predictions):
    """Return the mean absolute error of the estimated ratings.

    Each prediction is a 5-tuple whose third and fourth fields are the true
    and the estimated rating; the other fields are ignored.
    """
    absolute_errors = []
    for _uid, _iid, actual, estimate, _details in predictions:
        absolute_errors.append(abs(actual - estimate))
    return np.mean(absolute_errors)

# Report both error metrics for the test-set predictions, RMSE first
for metric_name, metric_fn in (('RMSE', rmse), ('MAE', mae)):
    print(f'{metric_name}: {metric_fn(predictions)}')

RMSE: 1.0190964729625442
MAE: 0.7930028209915896


In [9]:
def get_top_n_recommendations(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: iterable of 5-tuples
            ``(user id, item id, true rating, estimated rating, details)``.
        n: number of entries kept per user.

    Returns:
        dict mapping each user id to a list of ``(item id, estimated rating)``
        pairs, highest estimate first. Ties keep their input order.
    """
    grouped = {}
    for uid, iid, _true_r, est_r, _details in predictions:
        grouped.setdefault(uid, []).append((iid, est_r))

    return {
        uid: sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, candidates in grouped.items()
    }

In [10]:
def get_popular_recipes(n=10):
    """Return the ``n`` most-rated recipes together with their metadata.

    Ratings in the module-level ``ratings_df`` are aggregated per recipe into
    ``mean_rating`` and ``rating_count``, joined to ``recipes_df`` on
    ``recipeId``, and ordered by rating count, then mean rating, both
    descending.
    """
    per_recipe = ratings_df.groupby('recipeId')['rating'].agg(['mean', 'count'])
    recipe_stats = per_recipe.rename(
        columns={'mean': 'mean_rating', 'count': 'rating_count'}
    ).reset_index()
    ranked = recipe_stats.merge(recipes_df, on='recipeId').sort_values(
        ['rating_count', 'mean_rating'], ascending=False
    )
    return ranked.head(n)

In [11]:
# Group the test-set predictions by user, keeping each user's 10 highest estimates.
top_n_recommendations = get_top_n_recommendations(predictions, n=10)

In [13]:
# Display recommendations for a specific user
user_id = 900
user_recommendations = top_n_recommendations.get(user_id)
if user_recommendations is None:
    # The user has no entry in the top-n mapping: fall back to the most-rated recipes.
    print(f"No recommendations for user {user_id}. Showing popular recipes:")
    popular_recipes = get_popular_recipes(n=10)
    for _, row in popular_recipes.iterrows():
        print(f"{row['Title']} (average rating: {row['mean_rating']}, count: {row['rating_count']})")
else:
    print(f"Top-10 recommendations for user {user_id}:")
    for recipe_id, estimated_rating in user_recommendations:
        # Resolve the recipe id to its title; the first matching row is used.
        matching_titles = recipes_df.loc[recipes_df['recipeId'] == recipe_id, 'Title']
        recipe_title = matching_titles.values[0]
        print(f"{recipe_title} (estimated rating: {estimated_rating})")

No recommendations for user 900. Showing popular recipes:
Pineapple Upside Down Cake Recipe (average rating: 3.9204819277108434, count: 415)
Tomato Pie Recipe (average rating: 4.137592137592137, count: 407)
Ham Frittata Recipe (average rating: 4.3138297872340425, count: 376)
Lemon Blueberry Pancakes Recipe (average rating: 4.107734806629834, count: 362)
Over the Top Hot Chocolate Recipe (average rating: 4.034722222222222, count: 360)
Peanut Butter Chocolate Chip Cookies Recipe (average rating: 3.6714697406340058, count: 347)
Swedish Meatballs Recipe (average rating: 3.780952380952381, count: 315)
Divinity Recipe (average rating: 3.888888888888889, count: 306)
Apple Breakfast Cake Recipe (average rating: 3.59672131147541, count: 305)
Shortbread Crust Recipe (average rating: 4.152960526315789, count: 304)


In [14]:
# Save the trained model to a file using pickle
# The file is written relative to the current working directory.
# NOTE(review): the pickled object is an instance of the notebook-defined
# ContentKNNAlgorithm, so whatever loads this file presumably needs that class
# available under the same name — confirm before using the file elsewhere.
import pickle
with open('content_knn_model.pkl', 'wb') as f:
    pickle.dump(contentKNN, f)


In [15]:
# Save the trained model to a file using joblib (as an alternative)
# Writes the same fitted contentKNN object as the pickle cell, to a second
# file in the current working directory.
import joblib
joblib.dump(contentKNN, 'content_knn_model.joblib')

['content_knn_model.joblib']

In [6]:
# # Check the ratings for user 1 in the training set
# user_1_ratings_train = [rating for rating in trainset.ur[trainset.to_inner_uid(1)]]
# print(f"User 1 ratings in training set: {user_1_ratings_train}")


In [7]:
# # Make predictions for user 1 on all items
# testset_user_1 = [[1, item, 0] for item in trainset.all_items()]
# predictions_user_1 = contentKNN.test(testset_user_1)

# # Print the predictions for user 1
# for prediction in predictions_user_1:
#     print(prediction)


In [8]:
# # Display top-10 recommendations for another user
# user_id = 2 # Change this to a different user ID
# top_n_recommendations = get_top_n_recommendations(predictions, n=10)
# if user_id in top_n_recommendations:
#     print(f"Top-10 recommendations for user {user_id}:")
#     for recipe_id, estimated_rating in top_n_recommendations[user_id]:
#         recipe_title = recipes_df[recipes_df['recipeId'] == recipe_id]['Title'].values[0]
#         print(f"{recipe_title} (estimated rating: {estimated_rating})")
# else:
#     print(f"No recommendations for user {user_id}")


In [9]:
# # Check the number of ratings for user 1
# user_1_ratings = ratings_df[ratings_df['userId'] == 1]
# print(f"Number of ratings for user 1: {len(user_1_ratings)}")
# print(user_1_ratings)
