In [20]:
import numpy as np
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from surprise import AlgoBase, PredictionImpossible
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
import logging

In [21]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [22]:
# Load datasets
user_recipe_ratings_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/ratings_dataset.csv')
recipe_details_df = pd.read_csv('C:/Users/arsen/Healthylicious/data/cleaned/csv/recipes_cleaned_with_ids.csv')

recipe_details_df['recipeId'] = recipe_details_df['recipeId'].astype(int)
user_recipe_ratings_df['recipeId'] = user_recipe_ratings_df['recipeId'].astype(int)
user_recipe_ratings_df['userId'] = user_recipe_ratings_df['userId'].astype(int)

In [23]:
# Initialize TF-IDF Vectorizer and compute TF-IDF matrix
ingredient_vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
ingredient_tfidf_matrix = ingredient_vectorizer.fit_transform(recipe_details_df['Ingredients'])



In [24]:
class RecipeRecommender(AlgoBase):
    def __init__(self, num_neighbors=40, sim_options={}):
        AlgoBase.__init__(self)
        self.num_neighbors = num_neighbors

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.recipe_data = {row['recipeId']: row for _, row in recipe_details_df.iterrows()}
        self.recipe_index_map = {row['recipeId']: i for i, row in recipe_details_df.iterrows()}
        
        logging.info("Calculating similarity matrix...")
        self.similarity_matrix = np.zeros((self.trainset.n_items, self.trainset.n_items))
        for i in range(self.trainset.n_items):
            if (i % 50 == 0):
                logging.info(f"Processing {i} of {self.trainset.n_items}")
            for j in range(i + 1, self.trainset.n_items):
                recipe1_id = int(self.trainset.to_raw_iid(i))
                recipe2_id = int(self.trainset.to_raw_iid(j))
                similarity_score = self.compute_similarity(recipe1_id, recipe2_id)
                self.similarity_matrix[i, j] = similarity_score
                self.similarity_matrix[j, i] = similarity_score
        logging.info("---- calculating --- is --- done.")
        return self
    
    def compute_similarity(self, recipe1_id, recipe2_id):
        recipe1 = self.recipe_data[recipe1_id]
        recipe2 = self.recipe_data[recipe2_id]
        recipe1_index = self.recipe_index_map[recipe1_id]
        recipe2_index = self.recipe_index_map[recipe2_id]
        ingredient_similarity_score = cosine_similarity(ingredient_tfidf_matrix[recipe1_index], ingredient_tfidf_matrix[recipe2_index]).flatten()[0]
        category_similarity_score = 1 if recipe1['Category'] == recipe2['Category'] else 0
        time_difference = abs(recipe1['Total Time'] - recipe2['Total Time'])
        time_similarity_score = np.exp(-time_difference / 10.0)
        total_similarity_score = 0.2 * ingredient_similarity_score + 0.6 * category_similarity_score + 0.2 * time_similarity_score
        return total_similarity_score
    
    def estimate(self, user, item):
        if not (self.trainset.knows_user(user) and self.trainset.knows_item(item)):
            raise PredictionImpossible('User and/or item is unknown.')
        neighbors = []
        for rating in self.trainset.ur[user]:
            similarity_score = self.similarity_matrix[item, rating[0]]
            neighbors.append((similarity_score, rating[1]))
        top_k_neighbors = heapq.nlargest(self.num_neighbors, neighbors, key=lambda t: t[0])
        if not top_k_neighbors:
            raise PredictionImpossible('No similar neighbors.')
        sim_total = weighted_sum = 0
        for (sim_score, rating) in top_k_neighbors:
            if sim_score > 0:
                sim_total += sim_score
                weighted_sum += sim_score * rating
        if sim_total == 0:
            raise PredictionImpossible('No similar neighbors.')
        predicted_rating = weighted_sum / sim_total
        return predicted_rating


In [25]:
# Convert ratings DataFrame to Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(user_recipe_ratings_df[['userId', 'recipeId', 'rating']], reader)


In [26]:
# Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Initialize and train the content-based recommender
recipe_recommender = RecipeRecommender(num_neighbors=10)
recipe_recommender.fit(trainset)


2024-06-21 14:34:31,212 - INFO - Calculating similarity matrix...
2024-06-21 14:34:31,214 - INFO - Processing 0 of 1097
2024-06-21 14:34:58,572 - INFO - Processing 50 of 1097
2024-06-21 14:35:28,505 - INFO - Processing 100 of 1097
2024-06-21 14:35:54,158 - INFO - Processing 150 of 1097
2024-06-21 14:36:20,074 - INFO - Processing 200 of 1097
2024-06-21 14:36:46,700 - INFO - Processing 250 of 1097
2024-06-21 14:37:08,762 - INFO - Processing 300 of 1097
2024-06-21 14:37:29,537 - INFO - Processing 350 of 1097
2024-06-21 14:37:47,521 - INFO - Processing 400 of 1097
2024-06-21 14:38:05,150 - INFO - Processing 450 of 1097
2024-06-21 14:38:20,267 - INFO - Processing 500 of 1097
2024-06-21 14:38:35,784 - INFO - Processing 550 of 1097
2024-06-21 14:38:49,481 - INFO - Processing 600 of 1097
2024-06-21 14:39:04,979 - INFO - Processing 650 of 1097
2024-06-21 14:39:15,888 - INFO - Processing 700 of 1097
2024-06-21 14:39:25,040 - INFO - Processing 750 of 1097
2024-06-21 14:39:33,195 - INFO - Processi

<__main__.RecipeRecommender at 0x16a8d3442e0>

In [27]:
# Make predictions on the test set
predictions = recipe_recommender.test(testset)

In [28]:
# Evaluate the model
def compute_rmse(predictions):
    return np.sqrt(np.mean([(true_r - est_r)**2 for (_, _, true_r, est_r, _) in predictions]))

def compute_mae(predictions):
    return np.mean([abs(true_r - est_r) for (_, _, true_r, est_r, _) in predictions])

logging.info(f'RMSE: {compute_rmse(predictions)}')
logging.info(f'MAE: {compute_mae(predictions)}')

2024-06-21 14:40:03,493 - INFO - RMSE: 1.0200559984180124
2024-06-21 14:40:03,506 - INFO - MAE: 0.7951441249508928


In [29]:
def generate_top_n_recommendations(predictions, n=10):
    top_n_recommendations = {}
    for user_id, item_id, true_rating, est_rating, _ in predictions:
        if user_id not in top_n_recommendations:
            top_n_recommendations[user_id] = []
        top_n_recommendations[user_id].append((item_id, est_rating))

    for user_id, user_ratings in top_n_recommendations.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n_recommendations[user_id] = user_ratings[:n]

    return top_n_recommendations

In [30]:
def get_top_popular_recipes(n=10):
    recipe_statistics = user_recipe_ratings_df.groupby('recipeId').agg({'rating': ['mean', 'count']}).reset_index()
    recipe_statistics.columns = ['recipeId', 'mean_rating', 'rating_count']
    popular_recipes = pd.merge(recipe_statistics, recipe_details_df, on='recipeId')
    popular_recipes = popular_recipes.sort_values(['rating_count', 'mean_rating'], ascending=False)
    return popular_recipes.head(n)

In [31]:
top_n_recommendations = generate_top_n_recommendations(predictions, n=10)

In [32]:
# Display recommendations for a specific user
specific_user_id = 2
if specific_user_id in top_n_recommendations:
    logging.info(f"Top-10 recommendations for user {specific_user_id}:")
    for recipe_id, estimated_rating in top_n_recommendations[specific_user_id]:
        recipe_title = recipe_details_df[recipe_details_df['recipeId'] == recipe_id]['Title'].values[0]
        logging.info(f"{recipe_title} (estimated rating: {estimated_rating})")
else:
    logging.info(f"No recommendations for user {specific_user_id}. Showing popular recipes:")
    popular_recipes = get_top_popular_recipes(n=10)
    for _, row in popular_recipes.iterrows():
        logging.info(f"{row['Title']} (average rating: {row['mean_rating']}, count: {row['rating_count']})")

2024-06-21 14:40:03,629 - INFO - Top-10 recommendations for user 2:
2024-06-21 14:40:03,669 - INFO - Classic Caesar Dressing Recipe (estimated rating: 3.9913576177216714)
2024-06-21 14:40:03,670 - INFO - Roasted Sweet Potato Wedges Recipe (estimated rating: 3.6837830326182326)
2024-06-21 14:40:03,672 - INFO - Skinny No-Bake Chocolate Cookie Recipe (estimated rating: 3.6387948073964926)
2024-06-21 14:40:03,673 - INFO - Fresh Pineapple Sorbet Recipe (estimated rating: 3.605132845294333)
2024-06-21 14:40:03,674 - INFO - Chocolate Espresso Ganache Pie Recipe (estimated rating: 3.478857015263494)
2024-06-21 14:40:03,675 - INFO - Perfect Whipped Cream Recipe (estimated rating: 3.4481130386189)
2024-06-21 14:40:03,678 - INFO - Lemon Buttermilk Waffles Recipe (estimated rating: 3.4312400119345443)
2024-06-21 14:40:03,679 - INFO - Cranberry Salsa over Cream Cheese Recipe (estimated rating: 3.3568863750981897)
2024-06-21 14:40:03,679 - INFO - Peanut Butter Sheet Cake Recipe (estimated rating: 3.

In [33]:
# Save the trained model to a file using pickle
import pickle
with open('recipe_recommender_KNN.pkl', 'wb') as f:
    pickle.dump(recipe_recommender, f)


In [34]:
# Save the trained model to a file using joblib (as an alternative)
import joblib
joblib.dump(recipe_recommender, 'recipe_recommender_KNN.joblib')

['recipe_recommender_KNN.joblib']

## Debugging

In [35]:
# # Check the ratings for user 1 in the training set
user_1_ratings_train = [rating for rating in trainset.ur[trainset.to_inner_uid(1)]]
print(f"User 1 ratings in training set: {user_1_ratings_train}")


User 1 ratings in training set: [(620, 3.5), (881, 4.0), (102, 2.0), (989, 2.0), (511, 4.0), (1017, 2.5), (764, 1.0), (136, 3.0), (461, 2.0), (1042, 2.0), (952, 2.5), (1000, 3.0), (532, 4.0), (973, 2.5)]


In [36]:
# Make predictions for user 1 on all items
testset_user_1 = [[1, item, 0] for item in trainset.all_items()]
predictions_user_1 = recipe_recommender.test(testset_user_1)

# Print the predictions for user 1
for prediction in predictions_user_1:
    print(prediction)


user: 1          item: 0          r_ui = 0.00   est = 3.55   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 1          item: 1          r_ui = 0.00   est = 2.88   {'was_impossible': False}
user: 1          item: 2          r_ui = 0.00   est = 1.91   {'was_impossible': False}
user: 1          item: 3          r_ui = 0.00   est = 2.47   {'was_impossible': False}
user: 1          item: 4          r_ui = 0.00   est = 2.45   {'was_impossible': False}
user: 1          item: 5          r_ui = 0.00   est = 2.54   {'was_impossible': False}
user: 1          item: 6          r_ui = 0.00   est = 2.86   {'was_impossible': False}
user: 1          item: 7          r_ui = 0.00   est = 2.53   {'was_impossible': False}
user: 1          item: 8          r_ui = 0.00   est = 2.89   {'was_impossible': False}
user: 1          item: 9          r_ui = 0.00   est = 2.33   {'was_impossible': False}
user: 1          item: 10         r_ui = 0.00   est = 2.45   {'was_impossible': False}
us

In [37]:
# Display top-10 recommendations for another user
user_id = 2 # Change this to a different user ID
top_n_recommendations = generate_top_n_recommendations(predictions, n=10)
if user_id in top_n_recommendations:
    print(f"Top-10 recommendations for user {user_id}:")
    for recipe_id, estimated_rating in top_n_recommendations[user_id]:
        recipe_title = recipe_details_df [recipe_details_df ['recipeId'] == recipe_id]['Title'].values[0]
        print(f"{recipe_title} (estimated rating: {estimated_rating})")
else:
    print(f"No recommendations for user {user_id}")


Top-10 recommendations for user 2:
Classic Caesar Dressing Recipe (estimated rating: 3.9913576177216714)
Roasted Sweet Potato Wedges Recipe (estimated rating: 3.6837830326182326)
Skinny No-Bake Chocolate Cookie Recipe (estimated rating: 3.6387948073964926)
Fresh Pineapple Sorbet Recipe (estimated rating: 3.605132845294333)
Chocolate Espresso Ganache Pie Recipe (estimated rating: 3.478857015263494)
Perfect Whipped Cream Recipe (estimated rating: 3.4481130386189)
Lemon Buttermilk Waffles Recipe (estimated rating: 3.4312400119345443)
Cranberry Salsa over Cream Cheese Recipe (estimated rating: 3.3568863750981897)
Peanut Butter Sheet Cake Recipe (estimated rating: 3.309578833390095)
Toffee Apple Pie Cookies Recipe (estimated rating: 3.3024148959730297)


In [38]:
# Check the number of ratings for user 2
user_2_ratings = user_recipe_ratings_df[user_recipe_ratings_df['userId'] == 2]
print(f"Number of ratings for user 1: {len(user_1_ratings)}")
print(user_2_ratings)


Number of ratings for user 1: 20
    userId  recipeId  rating  timestamp
20       2        10     4.0  835355493
21       2        17     5.0  835355681
22       2        39     5.0  835355604
23       2        47     4.0  835355552
24       2        50     4.0  835355586
..     ...       ...     ...        ...
91       2       592     5.0  835355395
92       2       593     3.0  835355511
93       2       616     3.0  835355932
94       2       661     4.0  835356141
95       2       720     4.0  835355978

[76 rows x 4 columns]
