# Dependencies

In [1]:
import pandas as pd
import numpy as np
import warnings

# Disable the warning
warnings.filterwarnings('ignore')

# Part 1 : Simple recommendations

## Popularity based recommendation

In [2]:
from config import raw_interactions_path

raw_interactions = pd.read_csv(raw_interactions_path)
raw_interactions.head(2)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."


In [3]:
def get_review_count(df):
    # Create a dictionary with the count of each type of review per recipe_id
    # Example: {recipe_id: {review: count, review: count, ...}, ...}
    # Example: {240488: {'0': 1, '2': 6, '3': 0, '4': 3, '5': 0}, ...}
    ratings = df.groupby('recipe_id')['rating'].value_counts().unstack(fill_value=0).to_dict(orient = 'index')

    # Create a dictionary with the count of reviews per recipe_id
    # Example: {recipe_id: reviews_count, ...}
    # Example: {240488: 10, ...}
    ratings_count = df.groupby('recipe_id')['rating'].count().to_dict()

    # Create a dataframe with the recipe_id and the ratings and ratings_count dictionaries
    recipe_ids = df['recipe_id'].unique()
    new_df = pd.DataFrame({'recipe_id': recipe_ids})
    new_df['ratings'] = new_df['recipe_id'].map(ratings)
    new_df['ratings_count'] = new_df['recipe_id'].map(ratings_count)
    return new_df


ratings_df = get_review_count(raw_interactions)
ratings_df.head(2)

Unnamed: 0,recipe_id,ratings,ratings_count
0,40893,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1}",2
1,44394,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 0}",1


In [4]:
# Compute the global average rating
def compute_global_average_rating(df):
    # total number of ratings for all recipes
    total_count = sum([sum(rating_dict.values()) for rating_dict in ratings_df['ratings']])
    # total weighted sum of ratings for all recipes
    total_weighted_sum = sum([int(rating) * count for rating_dict in df['ratings'] for rating, count in rating_dict.items()])
    # compute the global average rating
    global_average = total_weighted_sum / total_count if total_count != 0 else 0
    return global_average


global_average_rating = compute_global_average_rating(ratings_df)

In [5]:
# Compute the weighted average of ratings
def weighted_rating(ratings_dict):
    # minimum number of ratings required to be listed in the chart
    min_number_ratings = 5
    # total number of ratings
    number_ratings = sum(ratings_dict.values())
    # the average rating
    average_rating = sum(int(rating) * count for rating, count in ratings_dict.items()) / number_ratings
    # compute the weighted rating (popularity score)
    weighted_rating = (number_ratings / (number_ratings + min_number_ratings)) * average_rating + (min_number_ratings / (number_ratings + min_number_ratings)) * global_average_rating
    return weighted_rating

def compute_popularity_score(df):
    # Create a dictionary with the popularity score per recipe_id
    # Example: {recipe_id: popularity_score, ...}
    # Example: {240488: 0.5, ...}
    # Note: we will use a weighted average similar to that used by IMDB
    df['popularity'] = df['ratings'].apply(weighted_rating)
    return df

In [6]:
# Compute the popularity score for each recipe
ratings_df = compute_popularity_score(ratings_df)

# Sort the recipes by popularity score
ratings_df = ratings_df.sort_values(by='popularity', ascending=False)
ratings_df.head(10)

Unnamed: 0,recipe_id,ratings,ratings_count,popularity
39572,486261,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 2, 5: 215}",217,4.977726
40002,55309,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 52}",52,4.948335
7176,24768,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 37}",37,4.929883
56166,121941,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 36}",36,4.928173
221730,166669,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 36}",36,4.928173
164386,63621,"{0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 72}",74,4.924748
199988,62754,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 32}",32,4.920408
19449,42038,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 2, 5: 55}",57,4.920243
178184,269136,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 31}",31,4.918197
213836,199171,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 31}",31,4.918197


In [7]:
from config import raw_recipes_path

# Get the top 10 most popular recipes
top_10 = ratings_df.head(10)

# Create a dict with the top 10 most popular recipes (keys are the recipe ids and values are the popularity scores)
top_10_popularity_dict = top_10.set_index('recipe_id')['popularity'].to_dict()
top_10_ratings_dict = top_10.set_index('recipe_id')['ratings'].to_dict()

# Load the raw recipes dataset to get recipe information (name, ingredients, ...)
raw_recipes = pd.read_csv(raw_recipes_path)

# Get the names of the top 10 most popular recipes
top_10_popular_recipes = raw_recipes[raw_recipes['id'].isin(top_10['recipe_id'])][["id", "name"]]

# Add the popularity score as a column
top_10_popular_recipes["popularity"] = top_10_popular_recipes["id"].map(top_10_popularity_dict)

# Add the ratings as a column
top_10_popular_recipes["ratings"] = top_10_popular_recipes["id"].map(top_10_ratings_dict)

# Display the top 10 most popular recipes
top_10_popular_recipes = top_10_popular_recipes.sort_values(by='popularity', ascending=False)
top_10_popular_recipes

Unnamed: 0,id,name,popularity,ratings
134684,486261,mexican stack up rsc,4.977726,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 2, 5: 215}"
35255,55309,caprese salad tomatoes italian marinated toma...,4.948335,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 52}"
21941,24768,berry cream cheese coffee cake,4.929883,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 37}"
118005,166669,kittencal s caesar tortellini salad,4.928173,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 36}"
118290,121941,kittencal s soft white baguette style bread,4.928173,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 36}"
129662,63621,mango salsa 1,4.924748,"{0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 72}"
124551,62754,linda s special potato salad,4.920408,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 32}"
207459,42038,syrup for blueberry pancakes,4.920243,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 2, 5: 55}"
29819,199171,broiled cinnamon toast,4.918197,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 31}"
74262,269136,earth bread,4.918197,"{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 31}"


## Recipe steps based recommendation

In [8]:
# Get steps of recipes
steps_df = raw_recipes[["id", "name", "steps"]]

# Change name of column id to recipe_id
steps_df = steps_df.rename(columns={"id": "recipe_id"})

steps_df.head(2)

Unnamed: 0,recipe_id,name,steps
0,137739,arriba baked winter squash mexican style,"['make a choice and proceed with recipe', 'dep..."
1,31490,a bit different breakfast pizza,"['preheat oven to 425 degrees f', 'press dough..."


In [9]:
import ast

steps_df.loc[:, "steps"] = steps_df["steps"].apply(lambda x: ast.literal_eval(x))
steps_df["joined_steps"] = steps_df["steps"].apply(lambda x: ". ".join(x))

steps_df.head(2)

Unnamed: 0,recipe_id,name,steps,joined_steps
0,137739,arriba baked winter squash mexican style,"[make a choice and proceed with recipe, depend...",make a choice and proceed with recipe. dependi...
1,31490,a bit different breakfast pizza,"[preheat oven to 425 degrees f, press dough in...",preheat oven to 425 degrees f. press dough int...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Here I will limit my self to 10000 recipes, 
# otherwise it will take too much time to compute the cosine similarity matrix
steps_df = steps_df.head(10000)

# Instantiate the vectorizer object to transform the text into a vector of numbers
# We will use the TF-IDF vectorizer
# We will limit the number of features to 4000 (Same as IMDB dataset)
tfidf = TfidfVectorizer(stop_words='english', max_features=4000)
tfidf_matrix = tfidf.fit_transform(steps_df['joined_steps'])

# Compute the cosine similarity matrix
cosine_sim = cosine_distances(tfidf_matrix, tfidf_matrix)

In [11]:
def get_steps_based_recommendations(recipe_id, steps_df, similarity_matrix):
    # The recipe_id could go way beyond 10000
    # So we need to get the index of the recipe_id
    # We will use the index to get the similarity scores
    idx = steps_df[steps_df['recipe_id'] == recipe_id].index[0]
    print(f'original recipe: {steps_df.iloc[idx]["name"]}')
    
    # Get the indices of the top 5 similar recipes
    recommendations_indices = similarity_matrix[idx].argsort()[1:6]
    recommendations_indices = recommendations_indices[::-1]
    recommendations_scores = similarity_matrix[idx][recommendations_indices]
    
    # Get the names of the top 5 similar recipes using list of recommended indices
    recommendations = steps_df[steps_df.index.isin(recommendations_indices)][["recipe_id", "name"]]
    recommendations["similarity_score"] = recommendations_scores

    return recommendations


recipe_id = 137739
get_steps_based_recommendations(recipe_id, steps_df, cosine_sim)

original recipe: arriba   baked winter squash mexican style


Unnamed: 0,recipe_id,name,similarity_score
2485,299091,acorn squash with bacon,0.450325
2491,330611,acorn squash with cranberry apple stuffing,0.447769
2496,102749,acorn squash with raisin sauce,0.439692
2497,117886,acorn squash with roasted garlic custard,0.421851
6779,3805,apple filled acorn squash,0.397336


All the recommended recipes seem to have "squash" in them. Seems to be good.

## Recipe description based recommendations

In [29]:
description_df = raw_recipes[["id", "name", "description"]]

# Change name of column id to recipe_id
description_df = description_df.rename(columns={"id": "recipe_id"})

# Here I will limit my self to 10000 recipes, 
# otherwise it will take too much time to compute the cosine similarity matrix
description_df = description_df.head(10000)


### We can either fill NaN values with empty string or with the name of the recipe
# Fill NaN values with empty string
description_df["description"] = description_df["description"].fillna("")

# Fill NaN description values with the name of the recipe
# description_df["description"] = description_df.apply(lambda x: x["name"] if x["description"] == "" else x["description"], axis=1)

# Visualize the first 3 rows
description_df.head(3)

Unnamed: 0,recipe_id,name,description
0,137739,arriba baked winter squash mexican style,autumn is my favorite time of year to cook! th...
1,31490,a bit different breakfast pizza,this recipe calls for the crust to be prebaked...
2,112140,all in the kitchen chili,this modified version of 'mom's' chili was a h...


In [30]:
# Instantiate the vectorizer object to transform the text into a vector of numbers
# We will use the TF-IDF vectorizer
# We will limit the number of features to 4000 (Same as IMDB dataset)
tfidf = TfidfVectorizer(stop_words='english', max_features=4000)
tfidf_matrix = tfidf.fit_transform(description_df['description'])

# Compute the cosine similarity matrix
cosine_sim = cosine_distances(tfidf_matrix, tfidf_matrix)

In [31]:
def get_description_based_recommendations(recipe_id, description_df, similarity_matrix):
    # The recipe_id could go way beyond 10000
    # So we need to get the index of the recipe_id
    # We will use the index to get the similarity scores
    idx = description_df[steps_df['recipe_id'] == recipe_id].index[0]
    print(f'original recipe: {description_df.iloc[idx]["name"]}')
    print(f'original recipe description: {description_df.iloc[idx]["description"]}')
    
    # Get the indices of the top 5 similar recipes
    recommendations_indices = similarity_matrix[idx].argsort()[1:6]
    recommendations_indices = recommendations_indices[::-1]
    recommendations_scores = similarity_matrix[idx][recommendations_indices]
    
    # Get the names of the top 5 similar recipes using list of recommended indices
    recommendations = description_df[description_df.index.isin(recommendations_indices)][["recipe_id", "name", "description"]]
    recommendations["similarity_score"] = recommendations_scores

    return recommendations


recipe_id = 137739
get_description_based_recommendations(recipe_id, description_df, cosine_sim)

original recipe: arriba   baked winter squash mexican style
original recipe description: autumn is my favorite time of year to cook! this recipe 
can be prepared either spicy or sweet, your choice!
two of my posted mexican-inspired seasoning mix recipes are offered as suggestions.


Unnamed: 0,recipe_id,name,description,similarity_score
2061,174190,a side of black beans and corn,i served this insanely easy side-dish with my ...,0.784475
4947,59394,amish chili,i found this recipe in 'favorite recipes from ...,0.780692
7528,130992,apple banana jam,"this one is really sweet, kids love it and it'...",0.776429
7750,266874,apples sweet potatoes with honey ginger,spicy & sweet.,0.773203
8684,204640,arroz de mexico mexican rice,"another favorite from my little ""low fat mexic...",0.718743
