In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt
import ast
from gensim.models import Word2Vec
import string
from nltk import WordNetLemmatizer
import re
import unidecode
from sklearn.metrics.pairwise import cosine_similarity
import warnings


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')

# Hybrid Recommendation Approach
Based on our findings from our collaborative and content-based recommender, we determined that our best plan going to use a hybrid model and combine both models. As discovered when preparing our content-based and collaborative model, our intital goal of recommending multiple recipes with the same ingredients to produce a groceyr list is not going to work. What we are going to do going forward will be to simply our plan. We will recommend one new recipe to our selected user based on their input of ingredients. <br>

The way we will do this is to generate a list of recipes first through the collaborative filtering system, which will refine the list of recipes with new recipes that user may like. With the refined list of recipes, we will then use the content-based recommender system to filter top N recipes that are most similar to what the user has inputted.

In [4]:
#Import recipes data
df = pd.read_csv('Cleaned_recipes.csv')

In [5]:
#Import review data
df_review = pd.read_csv('Cleaned_reviews.csv')

In [6]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [7]:
df_review.drop(columns = 'Unnamed: 0', inplace = True)

## Collaborative model FunkSVD - refining the initial list of recipes

In [8]:
#Initializing FunkSVD algorithm
my_reader = Reader(rating_scale = (1,5))

df_review_data = Dataset.load_from_df(df_review, my_reader)

#Build full trainset
full_trainset = df_review_data.build_full_trainset()

#Build Funk SVD algorithm with optimized parameters
review_algo = FunkSVD(n_factors = 10,
                     n_epochs = 100,
                     lr_all = 0.005,
                     biased = False,
                     verbose = 0)

#Fit with full trainset
review_algo.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d9b98d35e0>

In [9]:
#Getting Latent Variables
User = review_algo.pu
Item = review_algo.qi.T

In [12]:
#FunkSVD recommendation
def get_recommendations_1(input_user, N):
    """
    Function will provide top N recipes based on ratings for the input_user.
    
    input_user: User id
    N: parameters for top number of recipes to return
    
    Output: Dataframe with top N recipes sort by ratings
    
    """
    inner_user_id = full_trainset.to_inner_uid(input_user)
    user_profile = User[inner_user_id]

    rated_recipes = df_review[df_review['User'] == input_user]['RecipeId']

    unrated_recipes = df_review[~df_review['RecipeId'].isin(rated_recipes)]['RecipeId'].unique()

    rating_list = []
    for val in unrated_recipes:
        input_recipe = val

        inner_recipe_id = full_trainset.to_inner_iid(input_recipe)
        recipe_profile = Item[:, inner_recipe_id]

        rating = np.dot(user_profile, recipe_profile)
        rating_list.append(rating)

    min_rating = np.min(rating_list)
    max_rating = np.max(rating_list)

    mapped_rating_list = []
    for rating in rating_list:
        # Map the rating to a 1-5 scale
        mapped_rating = 1 + (rating - min_rating) / (max_rating - min_rating) * 4
        mapped_rating_list.append(mapped_rating)

    top_indices = np.argsort(mapped_rating_list)[::-1][:N]
    top_recipes = [unrated_recipes[i] for i in top_indices]
    top_ratings = [mapped_rating_list[i] for i in top_indices]
    top_recipes_names = [df.loc[df['RecipeId'] == i, 'RecipeName'].iloc[0] for i in top_recipes]
    ingredients = [df.loc[df['RecipeId'] == i, 'ingredients'].iloc[0] for i in top_recipes]
    
    recommendations_df = pd.DataFrame({'RecipeName': top_recipes_names, 'Rating': top_ratings, 'ingredients': ingredients})
    return recommendations_df

In [13]:
df_refined = get_recommendations_1(2312,100)

In [14]:
df_refined

Unnamed: 0,RecipeName,Rating,ingredients
0,Extremely Soft White Bread (Bread Machine),5.0,"['hot water', 'yeast', 'sugar', 'vegetable oil..."
1,Bahama Mama Banana Rum Cake,4.867263,"['unsalted butter', 'water', 'brown sugar', 'g..."
2,A1b Bacon Burger Bites on White Polenta Cakes #A1,4.853836,"['milk', 'water', 'cayenne pepper', 'salt', 'p..."
3,Copy-Cat Panera Cream of Chicken and Wild Rice...,4.823795,"['chicken broth', 'chicken breast halves', 'lo..."
4,Texas Chewy Pralines,4.761721,"['nonstick cooking spray', 'white sugar', 'bro..."
5,Elderberry Pie,4.749467,"['elderberry', 'sugar', 'flour', 'lemon juice'..."
6,Easy Cream of Potato Soup,4.73081,"['chicken broth', 'onion', 'potato', 'dried di..."
7,Old Fashioned Molasses &quot;caramel&quot; Corn,4.730153,"['popped popcorn', 'butter', 'molasses', 'brow..."
8,Donair Meat Sandwiches,4.716243,"['lean hamburger', 'breadcrumb', 'pepper', 're..."
9,Oven Baked Tacos,4.70342,"['ground beef', 'taco seasoning', 'tomato sauc..."


In [15]:
type(df_refined['ingredients'][0])

str

# Content-Based Recommender

In [16]:
#Change String to List
df_refined['ingredients'] = df_refined['ingredients'].apply(lambda s: list(ast.literal_eval(s)))

In [17]:
#Ingredient preprocess function
def ingredient_preprocess(ingredients):
        
    ingrd_list = []
    translator = str.maketrans('','', string.punctuation)
    num_pattern = r'[0-9]'
    non_alphabet = r'[\W_]'
    lemmatizer = WordNetLemmatizer()
    for i in ingredients:
        #remove punctuations
        items = i.translate(translator)
        
        #Making all characters lowercase
        items = i.lower()
        
        #remove any numbers
        items = re.sub(num_pattern, ' ', items)
        
        #remove accents
        items = unidecode.unidecode(items)
        
        #remove any non-alphabet characters
        items = re.sub(non_alphabet, ' ', items)
        
        #Lemmatize words
        items = lemmatizer.lemmatize(items)
        
        ingrd_list.append(items)
    return ingrd_list
   

In [18]:
#Clean up output of ingredients from recommendation
def ingredient_parser_final(ingredient):
    """
    cleanup ingredients output
    """
    if isinstance(ingredient, list):
        ingredients = ingredient
    else:
        ingredients = ast.literal_eval(ingredient)

    ingredients = ",".join(ingredients)
    ingredients = unidecode.unidecode(ingredients)
    return ingredients

In [19]:
#To sort ingredients list in alphabetical order
def get_and_sort_corpus(data):
    corpus_sorted = []
    for doc in data['ingredients'].values:
        doc.sort()
        corpus_sorted.append(doc)
    return corpus_sorted

In [20]:
type(df_refined['ingredients'][0])

list

In [21]:
ingredients_corpus = get_and_sort_corpus(df_refined)

In [22]:
#Building Word2Vec model with vec size of 150
total_lengths = [len(ingredients) for ingredients in df_refined['ingredients']]
avg_len = sum(total_lengths) / len(total_lengths)

model_Word2Vec = Word2Vec(ingredients_corpus, 
                          sg = 0, 
                          workers = 3, 
                          min_count = 1, 
                          window = avg_len, 
                          vector_size = 150)

w2v = {word: model_Word2Vec.wv[word] for word in model_Word2Vec.wv.key_to_index}

In [23]:
class MeanEmbeddingVectorizer(object):
    
    def __init__(self, model_Word2Vec):
        self.model_Word2Vec = model_Word2Vec
        self.vector_size = model_Word2Vec.wv.vector_size
    
    def transform(self, docs):
        doc_word_vector = self.doc_average_list(docs)
        return doc_word_vector
    
    def doc_average(self, doc):
        """
        Compute average word vector for a recipe's ingredient
        
        :param doc: list of ingredients
        :return
            mean: float of average word vectors
        
        """
        
        
        mean = []
        for word in doc:
            if word in self.model_Word2Vec.wv.index_to_key:
                mean.append(self.model_Word2Vec.wv.get_vector(word))
                
        if not mean: #empty words
            #If text empty, return vector of zeros
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis = 0)
            return mean
        
    def doc_average_list(self, docs):
        """
        Compute average word vector for multiple docs (doc has been tokenized)
        
        :param docs: list of recipes in list of tokens
        :return
            array of average word vector
        
        """
        return np.vstack([self.doc_average(doc) for doc in docs])

In [24]:
#Cosine Recommendation
def get_recommendations(N, scores):
    """
    Rank scores and output a pandas data frame containing all the details of the top N recipes.
    :param scores: list of cosine similarities
    """
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["recipe", "ingredients", "score","rating"])
    count = 0
    for i in top:
        recommendation.loc[count, "recipe"] = df_refined["RecipeName"][i]
        recommendation.loc[count, "ingredients"] = df_refined["ingredients"][i]
        recommendation.loc[count, "score"] = f"{scores[i]}"
        recommendation.loc[count, "rating"] = df_refined['Rating'][i]
        count += 1
    return recommendation

def get_recs_cosine(ingredients, N=5):
    """
    Get the top N recipe recomendations.
    :param ingredients: comma seperated string listing ingredients
    :param N: number of recommendations
    """
    # load in word2vec model
    model = model_Word2Vec
    # normalize embeddings
    model.init_sims(replace=True)
    # load in data
    data = df_refined
    # create corpus
    corpus = get_and_sort_corpus(data)

    # get average embdeddings for each document
    mean_vec_tr = MeanEmbeddingVectorizer(model_Word2Vec)
    doc_vec = mean_vec_tr.transform(corpus)
    doc_vec = [doc.reshape(1, -1) for doc in doc_vec]
    assert len(doc_vec) == len(corpus)
    

    # create embeddings for input text
    input = ingredients
    # create tokens with elements
    input = input.split(",")
    # parse ingredient list
    input = ingredient_preprocess(input)
    # get embeddings for ingredient doc
    input_embedding = mean_vec_tr.transform([input])[0].reshape(1, -1)
   
    # get cosine similarity between input embedding and all the document embeddings
    cos_sim = map(lambda x: cosine_similarity(input_embedding, x)[0][0], doc_vec)
    scores = list(cos_sim)
    # Filter top N recommendations
    recommendations = get_recommendations(N, scores)
    return recommendations

  

In [25]:
input_ingredient = "chicken, onion, spinach, garlic, pasta"
rec_cosine = get_recs_cosine(input_ingredient, N=10)
display(rec_cosine)

Unnamed: 0,recipe,ingredients,score,rating
0,Extremely Soft White Bread (Bread Machine),"[hot water, salt, sugar, vegetable oil, white ...",0.0,5.0
1,Bahama Mama Banana Rum Cake,"[all purpose flour, baking powder, baking soda...",0.0,4.867263
2,A1b Bacon Burger Bites on White Polenta Cakes #A1,"[a original sauce, bacon, brown sugar, caye...",0.0,4.853836
3,Copy-Cat Panera Cream of Chicken and Wild Rice...,"[all purpose flour, butter, carrot, celery, ch...",0.0,4.823795
4,Texas Chewy Pralines,"[brown sugar, butter, heavy cream, light corn ...",0.0,4.761721
5,Elderberry Pie,"[butter, cinnamon, elderberry, flour, lemon ju...",0.0,4.749467
6,Easy Cream of Potato Soup,"[butter, chicken broth, dried dill weed, flour...",0.0,4.73081
7,Old Fashioned Molasses &quot;caramel&quot; Corn,"[baking soda, brown sugar, butter, molasses, p...",0.0,4.730153
8,Donair Meat Sandwiches,"[breadcrumb, canned milk, garlic powder, garli...",0.0,4.716243
9,Oven Baked Tacos,"[diced onion, diced tomato, flour tortillas, g...",0.0,4.70342


# Testing hybrid recommender system

In [26]:
#List of unique user id
df_review['User'].value_counts()

424680        4536
37449         3439
383346        2654
128473        2418
169430        2409
89831         2207
58104         2130
133174        2053
199848        1966
4470          1856
305531        1829
226863        1738
369715        1686
95743         1642
176615        1618
498271        1511
39835         1498
286566        1422
80353         1400
1072593       1397
107583        1375
88099         1371
131126        1363
140132        1349
166642        1318
126440        1315
222564        1314
107135        1296
157425        1228
280271        1226
5060          1187
486725        1137
104295        1132
593927        1131
461834        1086
482376        1073
174096        1071
679953        1031
6357          1026
197023        1021
101823         994
428885         986
163112         986
8688           984
53932          976
136997         973
542159         960
47892          958
204024         949
17803          947
67656          938
29196          938
173579      

In [27]:
df_refined = get_recommendations_1(37449,20)
#Change String to List
df_refined['ingredients'] = df_refined['ingredients'].apply(lambda s: list(ast.literal_eval(s)))
#Sort data corpus
ingredients_corpus = get_and_sort_corpus(df_refined)


In [28]:
input_ingredient = "shrimp, brocolli, rice"
rec_cosine = get_recs_cosine(input_ingredient, N=10)
display(rec_cosine)

Unnamed: 0,recipe,ingredients,score,rating
0,"Milk-Free, Egg-Free Pancakes","[baking powder, egg, margarine, salt, water, w...",0.1238976418972015,4.743179
1,"Cheesy Chicken, Bacon &amp; Tater Tot Crock Po...","[bacon bits, boneless skinless chicken breasts...",0.0830330401659011,4.522532
2,Amazing Gluten-Free Buttermilk Donuts / Doughnuts,"[baking powder, baking soda, butter, buttermil...",0.0653682425618171,4.601734
3,Black Bean Chocolate Cake,"[baking powder, baking soda, black beans, coco...",0.0586577393114566,4.608158
4,Shelly's Baked Ricotta,"[egg, italian seasoning, marinara sauce, mozza...",0.0427568927407264,4.501546
5,Basic Vanilla Frosting,"[butter, milk, powdered sugar, vanilla]",0.0378900952637195,4.543732
6,Fluffy Eggless Sugar Cookies (Breathtaking Del...,"[all purpose flour, baking powder, butter, mil...",0.0374253131449222,4.504581
7,Pumpkin Cheesecake With Gluten Free Gingersnap...,"[cream cheese, egg, gingersnap crumbs, ground ...",0.0267467144876718,4.499891
8,Spinach and Cheese Stuffed Chicken Breast #RSC,"[black olives, boneless skinless chicken breas...",0.0254810005426406,4.612216
9,Corned Beef and Cabbage (Crock Pot),"[apple cider vinegar, cabbage, carrot, corned ...",0.0096791051328182,4.486461


In [29]:
input_ingredient = "beef, potato, rice, pepper"
rec_cosine = get_recs_cosine(input_ingredient, N=10)
display(rec_cosine)

Unnamed: 0,recipe,ingredients,score,rating
0,Cheesecake Factory Key Lime Cheesecake--My Ver...,"[butter, cream cheese, egg, flour, graham crac...",0.0,5.0
1,Nigella Lawson Brownies,"[caster sugar, dark chocolate, egg, plain flou...",0.0,4.821746
2,"Milk-Free, Egg-Free Pancakes","[baking powder, egg, margarine, salt, water, w...",0.0,4.743179
3,Caramel Pecan Pound Cake,"[all purpose flour, baking powder, brown sugar...",0.0,4.688012
4,Spinach and Cheese Stuffed Chicken Breast #RSC,"[black olives, boneless skinless chicken breas...",0.0,4.612216
5,Black Bean Chocolate Cake,"[baking powder, baking soda, black beans, coco...",0.0,4.608158
6,Amazing Gluten-Free Buttermilk Donuts / Doughnuts,"[baking powder, baking soda, butter, buttermil...",0.0,4.601734
7,Butter-Less Chocolate Chip Cookies,"[almond extract, baking powder, baking soda, b...",0.0,4.593264
8,Incredible Boneless Pork Roast With Vegetables,"[baby carrots, beef broth, black pepper, bonel...",0.0,4.572661
9,Malva Pudding,"[apricot jam, baking powder, bicarbonate of so...",0.0,4.545641


In [31]:
input_ingredient = "fish, carrot, rice, celery"
rec_cosine = get_recs_cosine(input_ingredient, N=10)
display(rec_cosine)

Unnamed: 0,recipe,ingredients,score,rating
0,Cheesecake Factory Key Lime Cheesecake--My Ver...,"[butter, cream cheese, egg, flour, graham crac...",0.0,5.0
1,Nigella Lawson Brownies,"[caster sugar, dark chocolate, egg, plain flou...",0.0,4.821746
2,"Milk-Free, Egg-Free Pancakes","[baking powder, egg, margarine, salt, water, w...",0.0,4.743179
3,Caramel Pecan Pound Cake,"[all purpose flour, baking powder, brown sugar...",0.0,4.688012
4,Spinach and Cheese Stuffed Chicken Breast #RSC,"[black olives, boneless skinless chicken breas...",0.0,4.612216
5,Black Bean Chocolate Cake,"[baking powder, baking soda, black beans, coco...",0.0,4.608158
6,Amazing Gluten-Free Buttermilk Donuts / Doughnuts,"[baking powder, baking soda, butter, buttermil...",0.0,4.601734
7,Butter-Less Chocolate Chip Cookies,"[almond extract, baking powder, baking soda, b...",0.0,4.593264
8,Incredible Boneless Pork Roast With Vegetables,"[baby carrots, beef broth, black pepper, bonel...",0.0,4.572661
9,Malva Pudding,"[apricot jam, baking powder, bicarbonate of so...",0.0,4.545641


# Conclusion
All scores that are attained through the hybrid model are very low. This is largely due to the fact that we are refining the dataset that the content-based recommender is running to under 100 rows. As such, from this list these recipes may not match well with the input ingredients. We need to determine if we what is the best threshold number of recipes for the collaborative filtering method to help improve prediction when running the content-based recommender.

When we run input ingredients with beef and fish, we receive a score of 0.

The recipes being recommended for the fish and beef follow what our EDA process tells us that most recipes are dessert recipes. It seems like when there is no match, the most recommended recipe would be a dessert recipe. This is a huge flaw. 

In future state, we will need to find non-dessert recipes.
