### 1. Content Based Recommender System 

We will try to build a system that recommends recipes that are similar to a particular recipe. We will use recipe ingredients and categories to determine similarity and then list the top 10 closest matches.

In [5]:
%load_ext Cython
%time
%cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


UsageError: Line magic function `%cython` not found (But cell magic `%%cython` exists, did you mean that instead?).


In [6]:
#import needed packages
from gensim.models import doc2vec

import re
import pandas as pd
import numpy as np
import copy

# nltk processing
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier

In [9]:
# read in fields title, ingredients and categories
data = pd.read_csv("dataRecTitleCatsIngred.csv") 

#data.duplicated()


In [10]:
data.drop_duplicates()

Unnamed: 0,title,categories,ingred
0,1-Dish Taco Bake,"['Trusted Brands: Recipes and Tips', ""ARGO&#17...","['Taco Meat Filling:', '1 pound ground beef', ..."
1,15-Minute Dinner Nachos Supreme,"['Trusted Brands: Recipes and Tips', ""Campbell...","['1 pound ground beef', '1 (1.12 ounce) packag..."
2,A Firefighter's Meatloaf,"['Main Dish', 'Meatloaf', 'Beef Meatloaf']","['2 slices whole wheat bread', '2 pounds groun..."
3,Aaron's Missouri Burger,"['Main Dish', 'Burgers', 'Hamburgers']","['1 pound lean ground beef', '2 teaspoons onio..."
4,Absolute Best Liver and Onions,['Main Dish'],"['2 pounds sliced beef liver', '1 1/2 cups mil..."
5,Adobo Sirloin,"['World Cuisine', 'Latin American', 'Mexican']","['1 lime, juiced', '1 tablespoon minced garlic..."
6,Albondigas,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 quart water', '4 carrots, sliced', '2 smal..."
7,Albondigas Soup I,"['Soups, Stews and Chili', 'Soup', 'Pork Soup']","['1 pound lean ground beef', '1/4 pound pork s..."
8,Albondigas Soup II,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 pound lean ground beef', '2 cubes beef bou..."
9,Alison's Slow Cooker Vegetable Beef Soup,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 1/2 pounds cubed beef stew meat', '2 cups ..."


We will define a food vocabulary. Non-food words, such as measurements, punctuation, and descriptions about the state of a food item should be removed.

In [11]:
### removed measurements, descriptive words and anything that's not an ingredient
def clean_data_ingred(ds):
    arr = ['tablespoons','tablespoon','teaspoons','teaspoon','cups','cup','ounces','ounce','pounds','pound','pinches','pinch','tastes','taste','dashes','dash','/',' or ',' as ',' and ',
          '-','[',']','(',')','.','slices','sliced','slice', '0','1','2','3','4','5','6','7','8','9','juiced','diced','minced','chopped',' cans',' canned ',' can ', ' inches ', ' inch ',
          'package', 'finely', 'crushed', 'shredded', 'divided', ' melted ', ' jar ','thawed', ' cut ', ' to ', ' thick ', ' across ', ' bones ', 'optional', ' split ', ' torn ',
          ' into ', ' strips ', 'peeled',' pieces', ' lightly ', ' beaten ','ground black', 'scrubbed', 'jars', 'beaten','pitted','halved', 'thinly','as needed','large','medium','small'
          'roughly','fluid','drained','crumbled','rinsed','cubed','seeded','fresh',' plus ', ' more ', 'deboned', 'prepared', 'cubes', 'uncooked', 'cooked', 'grated', ' lean ','into chunks',
          'coarsely','seeded','freshly',' freshly', 'fresh', ' with ', 'liquid','quartered','lengthwise','small',' into ',' bite ', ' sized ', ' ly ', ' ed ', ' s ','cut into', ' up ',
          'trimmed','frozen','chunks','segments','softened',' cut ','pieces','broken',' rings',' torn ', ' ',"'"]

    ### merge multiple words, for example 'ground beef' becomes 'groundbeef'
    for item in arr:
        ds = ds.str.replace(item,'')
     
    return ds

In [12]:
### Remove measurements and anything else that is not a food ingredient
data['ingred'] = clean_data_ingred(data['ingred'])

#data['title'] = clean_data(data['title'])
#dfIngred.to_csv('ingredClean.csv', sep=' ')

data['ingred'].head(10)

0    TacoMeatFilling:,groundbeef,tacoseasoning,Batt...
1    groundbeef,Pace®TacoSeasoningMix,"Campbells®Co...
2    wholewheatbread,groundbeef,greenonion,,onion,s...
3    groundbeef,onionpowder,honeymustard,garlicpowd...
4    beefliver,milk,,butter,,Vidaliaonions,,allpurp...
5    lime,,garlic,driedoregano,groundcumin,chipotle...
6    quartwater,carrots,,potatoes,,onion,,salsa,hot...
7    groundbeef,porksausage,onion,,egg,,salt,pepper...
8    groundbeef,beefbouilloncube,water,clovesgarlic...
9    beefstewmeat,water,onion,,tomatoes,mixedvegeta...
Name: ingred, dtype: object

In [13]:
### separate each term with a space
def create_soup(ds):
    ds = ds.str.replace(',',' ')
    return ds

In [14]:
### Separate each ingredient with a space
data['ingredSpaces'] = create_soup(data['ingred'])
data['ingredSpaces'].head(10)

0    TacoMeatFilling: groundbeef tacoseasoning Batt...
1    groundbeef Pace®TacoSeasoningMix "Campbells®Co...
2    wholewheatbread groundbeef greenonion  onion s...
3    groundbeef onionpowder honeymustard garlicpowd...
4    beefliver milk  butter  Vidaliaonions  allpurp...
5    lime  garlic driedoregano groundcumin chipotle...
6    quartwater carrots  potatoes  onion  salsa hot...
7    groundbeef porksausage onion  egg  salt pepper...
8    groundbeef beefbouilloncube water clovesgarlic...
9    beefstewmeat water onion  tomatoes mixedvegeta...
Name: ingredSpaces, dtype: object

#### Compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each recipe
This will give us a matrix where each column represents a word in the overview vocabulary 
(all the words that appear in at least one recipe) and each column represents a recipe

The TF-IDF score is the frequency of an ingredient occurring in a recipe, down-weighted 
by the number of recipes in which it occurs.


In [16]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['ingredSpaces'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(8110, 5459)

In [17]:
print("5,459 different words were used to describe 8,110 recipes in the dataset")

5,459 different words were used to describe 8,110 recipes in the dataset


Using this matrix we can compute a similarity score. We will use a cosine similarity to calculate a number that denotes the similarity between two recipes.

cosine(x,y0 = x.y^T / ||x||.||y||

Since we have used the TF-IDF vectorizer, calculating the dot product will give us the cosine similarity score. We will use sklearn's linear_kernel() because it is faster.


In [18]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

We're going to define a function that takes in a recipe title as an input and outputs a list of the 10 most similar recipes. 

To do this, we will need a reverse mapping of recipe titles and dataframe indices. This will give us a way to identify the index of a recipe, given its title.

In [19]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(data.index, index=data['ingredSpaces']).drop_duplicates()

indices.head()

ingredSpaces
TacoMeatFilling: groundbeef tacoseasoning Batter: MazolaPure®CookingSpray allpurposeflour masacornflourORcornmeal "envelopesFleischmanns®RapidRiseYeast" sugar salt verywarmmilkdegreesFdegreesF Mazola®CornOil egg Topping: chunkysalsa Mexicanstylecheese cornchips partially    0
groundbeef Pace®TacoSeasoningMix "Campbells®CondensedTomatoSoup" water instantwhiterice Pace®Thick&ChunkySalsa ShreddedCheddarcheese Shreddedlettuce Tortillachips                                                                                                                 1
wholewheatbread groundbeef greenonion  onion salsa wholegrainDijonmustard Worcestershiresauce garlic saltpepper barbequesauce                                                                                                                                                      2
groundbeef onionpowder honeymustard garlicpowder redpepper salt brownsugar oliveoil Swisscheese hamburgerbuns                                               

In [20]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['ingredSpaces'])


In [21]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [22]:
# Reset index of our main DataFrame and construct reverse mapping
data = data.reset_index()
indices = pd.Series(data.index, index=data['ingred'])

In [23]:
# Function that takes in ingredient as input and outputs most similar ingredients
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the recipe title that matches the title
    idx = indices[title]

    print("idx[title] = ",idx)
    
    # Get the pairwise similarity scores of all titles with that title
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
 
    # Get the movie indices
    recipe_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar recipes
    return data['title'].iloc[recipe_indices]

In [24]:
get_recommendations(20, cosine_sim2)  #Ann's Dirty Rice


idx[title] =  20


1257                 Ann's Dirty Rice
848                        Porcupines
2085                       Porcupines
236                 Cabbage Rolls III
557               Hamburger Rice Soup
1473                Cabbage Rolls III
1794              Hamburger Rice Soup
3489                   Zucchini Saute
728     Melinda's Porcupine Meatballs
1965    Melinda's Porcupine Meatballs
Name: title, dtype: object

In [25]:
get_recommendations(8083) #Shrimp and Quinoa

idx[title] =  8083


3166                      Grilled Asparagus
3170                     Asparagus Parmesan
2536    Cranberry and Cilantro Quinoa Salad
8030                          Shrimp Quinoa
4300                Spicy and Sweet Spinach
3151                    Pan-Fried Asparagus
7880                  Shrimp and Okra Gumbo
5972       Goat Cheese and Asparagus Quinoa
3520             The Best Steamed Asparagus
5988                 Spiced Quinoa Porridge
Name: title, dtype: object

In [26]:
get_recommendations(8100) #tuna salad with cranberries

idx[title] =  8100


7980                            Quick Tuna Salad
7750                      Avocado and Tuna Tapas
2568                                 Egg Salad I
2543                        Mom's Cucumber Salad
7958                              Halibut Weaver
3568    Asparagus with Cranberries and Pine Nuts
4324                   Creamy Buttered Cucumbers
2473                       Holiday Chicken Salad
2722                       Holiday Chicken Salad
8062                         Cheesy Baked Salmon
Name: title, dtype: object

In [27]:
data['title'].head()

0                   1-Dish Taco Bake
1    15-Minute Dinner Nachos Supreme
2           A Firefighter's Meatloaf
3            Aaron's Missouri Burger
4     Absolute Best Liver and Onions
Name: title, dtype: object

In [28]:
get_recommendations(14) #american lasagne

idx[title] =  14


1251          American Lasagna
6180          American Lasagna
669     Lasagna Stuffed Shells
1906    Lasagna Stuffed Shells
410             Easy Lasagna I
1647            Easy Lasagna I
1164         That's-a Meatloaf
2400         That's-a Meatloaf
1042          Spaghetti Pie II
2278          Spaghetti Pie II
Name: title, dtype: object

In [29]:
get_recommendations(8108) #Mainely Fish

idx[title] =  8108


3354             Summer Vegetable Ratatouille
4087                          Sauteed Cabbage
253                      Carne Asada Marinade
1490                     Carne Asada Marinade
3390                         Roasted Broccoli
3337    Olive Oil Roasted Eggplant with Lemon
7892       Pan-Poached Alaskan Salmon Piccata
7493         Grilled Tilapia with Mango Salsa
4924     Spring Fiddleheads and Sweet Peppers
7857                    Pasta With Tuna Sauce
Name: title, dtype: object

### Next, we'll add more features to help improve recommendations. We'll add Categories and Title.

In [30]:
def clean_categories(ds):
#    for item in ds:
    ds = ds.str.replace(' ','')
    ds = ds.str.lower()
    ds = ds.str.replace('[','')
    ds = ds.str.replace(']','')
    ds = ds.str.replace("''",' ')
    ds = ds.str.replace(',',' ')
    ds = ds.str.replace("'","")
    return ds



In [31]:
data['categories_clean'] = clean_categories(data['categories'])
data['categories_clean'].head()

0    trustedbrands:recipesandtips "argo&#174; karo&...
1      trustedbrands:recipesandtips "campbellskitchen"
2                       maindish meatloaf beefmeatloaf
3                          maindish burgers hamburgers
4                                             maindish
Name: categories_clean, dtype: object

We will use the CountVectorizer() instead of TF-IDF. This is because we do not want to down-weight the presence of an ingredient if it appears in relatively more recipes.

In [32]:
# Merge ingredients and categories into one column
data['ingred_cat'] = data['ingredSpaces'] + ' ' + data['categories_clean'] 


In [33]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['ingred_cat'])

In [34]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [35]:
# Reset index of the main DataFrame and construct reverse mapping as before
data = data.reset_index()
indices = pd.Series(data.index, index=data['ingred_cat'])

In [36]:
indices.head()

ingred_cat
TacoMeatFilling: groundbeef tacoseasoning Batter: MazolaPure®CookingSpray allpurposeflour masacornflourORcornmeal "envelopesFleischmanns®RapidRiseYeast" sugar salt verywarmmilkdegreesFdegreesF Mazola®CornOil egg Topping: chunkysalsa Mexicanstylecheese cornchips partially trustedbrands:recipesandtips "argo&#174; karo&#174; fleischmanns&#174;"    0
groundbeef Pace®TacoSeasoningMix "Campbells®CondensedTomatoSoup" water instantwhiterice Pace®Thick&ChunkySalsa ShreddedCheddarcheese Shreddedlettuce Tortillachips trustedbrands:recipesandtips "campbellskitchen"                                                                                                                                         1
wholewheatbread groundbeef greenonion  onion salsa wholegrainDijonmustard Worcestershiresauce garlic saltpepper barbequesauce maindish meatloaf beefmeatloaf                                                                                                                                       

In [37]:
get_recommendations(19, cosine_sim2)

idx[title] =  19


1256    Angel's Old Fashioned Beef Stew
827          Pauline Werner's Beef Stew
2064         Pauline Werner's Beef Stew
477                    French Beef Stew
1714                   French Beef Stew
806             Old-Fashioned Beef Stew
2043            Old-Fashioned Beef Stew
600                     Irish Beef Stew
1837                    Irish Beef Stew
979             Slow Cooker Beef Stew I
Name: title, dtype: object

In [38]:
get_recommendations(477, cosine_sim2)

idx[title] =  477


1714                   French Beef Stew
160                   Beer Beef Stew II
568                    Hearty Beef Stew
1397                  Beer Beef Stew II
1805                   Hearty Beef Stew
19      Angel's Old Fashioned Beef Stew
1256    Angel's Old Fashioned Beef Stew
979             Slow Cooker Beef Stew I
2215            Slow Cooker Beef Stew I
6060            Slow Cooker Beef Stew I
Name: title, dtype: object

In [39]:
get_recommendations(8108, cosine_sim2) #Mainely Fish

idx[title] =  8108


8090                                 Spicy Red Snapper
7493                  Grilled Tilapia with Mango Salsa
7791                             Red Snapper Livornese
8026                                 Easiest Spicy Cod
7784                                     Fish Roll-Ups
7857                             Pasta With Tuna Sauce
7745                               Haddock Bubbly Bake
7892                Pan-Poached Alaskan Salmon Piccata
3354                      Summer Vegetable Ratatouille
7781    Pan Fried Halibut Steak with Light Green Sauce
Name: title, dtype: object

In [40]:
get_recommendations(0, cosine_sim2) #1 dish taco bake

idx[title] =  0


1237                                     1-Dish Taco Bake
2766                                    Blackened Chicken
5875                              Mango- Coconut Smoothie
3099    Original Nestle® Toll House Chocolate Chip Coo...
4076                          Creamed Spinach from Oikos®
511                              Grands!® Easy Taco Melts
1748                             Grands!® Easy Taco Melts
5874                          Beet-Red Raspberry Smoothie
596                      Impossibly Easy Cheeseburger Pie
1833                     Impossibly Easy Cheeseburger Pie
Name: title, dtype: object

In [41]:
get_recommendations(8083, cosine_sim2) #Shrimp and Quinoa

idx[title] =  8083


8030                                     Shrimp Quinoa
8040                                      Shrimp Verde
7966              Shrimp with Tomatoes and Feta Cheese
8034                        Lemon Pepper Pasta Seafood
7853                     Lemony Shrimp over Brown Rice
7909               Camarones al Ajillo (Garlic Shrimp)
7833                                Sexy Shrimp Scampi
7681    Fire Roasted Tomato and Feta Pasta with Shrimp
7441                  Garlicky Appetizer Shrimp Scampi
7789                  Garlicky Appetizer Shrimp Scampi
Name: title, dtype: object

## 2 - Recommender System Considering Ratings and Number of Reviews

Since ratings can be different based on the number of reviews, for example should a recipe with only 3 ratings, where the average rating is 5.0, be evaluated as having a truly higher rating than a recipe with 500 ratings, where the average rating is about 4.0?<br>

In order to be fair, we are going to use a weighted ratings.<br>

v: number of reviews for the recipe<br>

m: minimum number of votes required to be listed<br>

R: average rating of the recipe<br>

C: mean vote across all recipes<br>

We will use 90th percentile as the cutoff in terms of number of reviews.<br>


In [42]:
# read in fields title, ingredients and categories
dataR = pd.read_csv("dataRecTitleCatsIngredRatRev.csv") 

dataR.duplicated()
dataR.drop_duplicates()

Unnamed: 0,title,categories,ingred,rating,review_cnt
0,1-Dish Taco Bake,"['Trusted Brands: Recipes and Tips', ""ARGO&#17...","['Taco Meat Filling:', '1 pound ground beef', ...",4.41,86
1,15-Minute Dinner Nachos Supreme,"['Trusted Brands: Recipes and Tips', ""Campbell...","['1 pound ground beef', '1 (1.12 ounce) packag...",4.00,1
2,A Firefighter's Meatloaf,"['Main Dish', 'Meatloaf', 'Beef Meatloaf']","['2 slices whole wheat bread', '2 pounds groun...",4.37,262
3,Aaron's Missouri Burger,"['Main Dish', 'Burgers', 'Hamburgers']","['1 pound lean ground beef', '2 teaspoons onio...",4.67,150
4,Absolute Best Liver and Onions,['Main Dish'],"['2 pounds sliced beef liver', '1 1/2 cups mil...",4.17,9
5,Adobo Sirloin,"['World Cuisine', 'Latin American', 'Mexican']","['1 lime, juiced', '1 tablespoon minced garlic...",4.00,4
6,Albondigas,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 quart water', '4 carrots, sliced', '2 smal...",4.25,1362
7,Albondigas Soup I,"['Soups, Stews and Chili', 'Soup', 'Pork Soup']","['1 pound lean ground beef', '1/4 pound pork s...",4.28,34
8,Albondigas Soup II,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 pound lean ground beef', '2 cubes beef bou...",4.00,2
9,Alison's Slow Cooker Vegetable Beef Soup,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 1/2 pounds cubed beef stew meat', '2 cups ...",4.71,6


In [43]:
C = dataR['rating'].mean()
print("average rating of all recipes: ",C)

average rating of all recipes:  4.158191122071517


In [44]:
m = dataR['review_cnt'].quantile(0.5)
print("50th percentile number of reviews: ",m)

50th percentile number of reviews:  74.0


In [45]:
# Filter out all qualified recipes into a new DataFrame
q_data = dataR.copy().loc[dataR['review_cnt'] >= m]
q_data.shape

(4068, 5)

In [46]:
q_data.duplicated()
q_data.drop_duplicates()

Unnamed: 0,title,categories,ingred,rating,review_cnt
0,1-Dish Taco Bake,"['Trusted Brands: Recipes and Tips', ""ARGO&#17...","['Taco Meat Filling:', '1 pound ground beef', ...",4.41,86
2,A Firefighter's Meatloaf,"['Main Dish', 'Meatloaf', 'Beef Meatloaf']","['2 slices whole wheat bread', '2 pounds groun...",4.37,262
3,Aaron's Missouri Burger,"['Main Dish', 'Burgers', 'Hamburgers']","['1 pound lean ground beef', '2 teaspoons onio...",4.67,150
6,Albondigas,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","['1 quart water', '4 carrots, sliced', '2 smal...",4.25,1362
23,Apple Flavored Pot Roast,"['Main Dish', 'Roasts']","['1 (3 pound) boneless beef chuck roast', '1 (...",4.39,137
27,Asian Beef Skewers,['Everyday Cooking'],"['3 tablespoons hoisin sauce', '3 tablespoons ...",4.65,3837
28,Asian Beef with Snow Peas,"['World Cuisine', 'Asian', 'Chinese']","['3 tablespoons soy sauce', '2 tablespoons ric...",4.65,3837
29,Asian Fire Meat,"['World Cuisine', 'Asian', 'Chinese']","['1/2 cup soy sauce', '1 tablespoon sesame oil...",4.64,3837
32,Asian Style Meatloaf,"['World Cuisine', 'Asian']","['1 1/2 pounds ground beef', '1/2 pound ground...",4.47,115
33,Atomic Canuck Chili,"['Soups, Stews and Chili', 'Chili', 'Beef Chili']","['2 pounds lean ground beef', '1/2 large onion...",4.61,325


There are 4068 recipes which qualify to be in this list. Now, we need to calculate our metric for each qualified recipe. To do this, we will define a function, weighted_rating() and define a new feature score, of which we'll calculate the value by applying this function to our DataFrame of qualified recipes:

In [47]:
# Function that computes the weighted rating of each recipe
def weighted_rating(x, m=m, C=C):
    v = x['review_cnt']
    R = x['rating']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)




In [48]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_data['score'] = q_data.apply(weighted_rating, axis=1)

In [49]:
#Sort movies based on score calculated above
q_data = q_data.sort_values('score', ascending=False)

#Print the top 15 movies
q_data[['title', 'review_cnt', 'rating', 'score']].head(25)

Unnamed: 0,title,review_cnt,rating,score
1947,Meatball Soup,1023,4.91,4.859285
1948,Meatball Spaghetti Sauce,1019,4.91,4.8591
3203,Dad's Pan-Fried Green Beans,9837,4.84,4.834909
3202,Cream Peas,9836,4.84,4.834909
7021,Pork Chops with Apple Curry Sauce,1390,4.86,4.824526
1232,Zippy Beef Casserole,1777,4.85,4.822343
5600,Coconut Raspberry Smoothie,9933,4.82,4.815106
5601,Chia Ginger Smoothie,9933,4.82,4.815106
5602,Paleo Apple Pie Smoothie,9933,4.82,4.815106
2495,Delicious Egg Salad for Sandwiches,544,4.9,4.811175


After evaluating the Ratings and number of reviews, it does not seem that these values are reliable indicators of how popular a recipe would actually be.  For example, it is highly unlikely that "Cream Peas" would be a family's fourth most favorite recipe.  The list above shows Meatball Soup, as the top rated recipe, with a 4.91 average rating. I find it hard to believe that a soup recipe is truly the most popular recipe.  Meatball Spaghetti Sauce, at number 2, possibly may be, but Dad's Pan-Fried Green Beans in third place and Cream Peas in fourth? Not likely. Therefore, we will not be using the average rating and number of reviews in our recommender system.

## Word2Vec, Doc2Vec

In [50]:
from string import punctuation
from os import listdir
from numpy import array
#from array import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from gensim.models import Word2Vec

data

Using TensorFlow backend.


Unnamed: 0,level_0,index,title,categories,ingred,ingredSpaces,categories_clean,ingred_cat
0,0,0,1-Dish Taco Bake,"['Trusted Brands: Recipes and Tips', ""ARGO&#17...","TacoMeatFilling:,groundbeef,tacoseasoning,Batt...",TacoMeatFilling: groundbeef tacoseasoning Batt...,"trustedbrands:recipesandtips ""argo&#174; karo&...",TacoMeatFilling: groundbeef tacoseasoning Batt...
1,1,1,15-Minute Dinner Nachos Supreme,"['Trusted Brands: Recipes and Tips', ""Campbell...","groundbeef,Pace®TacoSeasoningMix,""Campbells®Co...","groundbeef Pace®TacoSeasoningMix ""Campbells®Co...","trustedbrands:recipesandtips ""campbellskitchen""","groundbeef Pace®TacoSeasoningMix ""Campbells®Co..."
2,2,2,A Firefighter's Meatloaf,"['Main Dish', 'Meatloaf', 'Beef Meatloaf']","wholewheatbread,groundbeef,greenonion,,onion,s...",wholewheatbread groundbeef greenonion onion s...,maindish meatloaf beefmeatloaf,wholewheatbread groundbeef greenonion onion s...
3,3,3,Aaron's Missouri Burger,"['Main Dish', 'Burgers', 'Hamburgers']","groundbeef,onionpowder,honeymustard,garlicpowd...",groundbeef onionpowder honeymustard garlicpowd...,maindish burgers hamburgers,groundbeef onionpowder honeymustard garlicpowd...
4,4,4,Absolute Best Liver and Onions,['Main Dish'],"beefliver,milk,,butter,,Vidaliaonions,,allpurp...",beefliver milk butter Vidaliaonions allpurp...,maindish,beefliver milk butter Vidaliaonions allpurp...
5,5,5,Adobo Sirloin,"['World Cuisine', 'Latin American', 'Mexican']","lime,,garlic,driedoregano,groundcumin,chipotle...",lime garlic driedoregano groundcumin chipotle...,worldcuisine latinamerican mexican,lime garlic driedoregano groundcumin chipotle...
6,6,6,Albondigas,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","quartwater,carrots,,potatoes,,onion,,salsa,hot...",quartwater carrots potatoes onion salsa hot...,soups stewsandchili soup beefsoup,quartwater carrots potatoes onion salsa hot...
7,7,7,Albondigas Soup I,"['Soups, Stews and Chili', 'Soup', 'Pork Soup']","groundbeef,porksausage,onion,,egg,,salt,pepper...",groundbeef porksausage onion egg salt pepper...,soups stewsandchili soup porksoup,groundbeef porksausage onion egg salt pepper...
8,8,8,Albondigas Soup II,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","groundbeef,beefbouilloncube,water,clovesgarlic...",groundbeef beefbouilloncube water clovesgarlic...,soups stewsandchili soup beefsoup,groundbeef beefbouilloncube water clovesgarlic...
9,9,9,Alison's Slow Cooker Vegetable Beef Soup,"['Soups, Stews and Chili', 'Soup', 'Beef Soup']","beefstewmeat,water,onion,,tomatoes,mixedvegeta...",beefstewmeat water onion tomatoes mixedvegeta...,soups stewsandchili soup beefsoup,beefstewmeat water onion tomatoes mixedvegeta...


Word2Vec learns the context of a word by looking at the words which commonly occur around the target word. Words which have similar contexts share meaning under Word2Vec, and their reduced vector representations will be similar.  In the skip-gram model version of Word2Vec the goal is to take a target word and predict the surrounding context words. We will attempt to apply this technique to recipe ingredients,learning ingredients that commonly occur together. 

We will supply our input target words as one-hot vectors to the neural network.  Then, via a hidden layer, we want to train the neural network to increase the probability of valid context words, while decreasing the probability of invalid context words.

There are two variants of the Word2Vec paradigm – skip-gram and CBOW.  The skip-gram variant takes a target word and tries to predict the surrounding context words, while the CBOW (continuous bag of words) variant takes a set of context words and tries to predict a target word.  In this case, we will be considering the skip-gram variant.

We also need a way of ensuring that, as the network trains, words which are similar end up having similar embedding vectors.  Therefore, we want to ensure that the trained network will always output a 1 when it is supplied words which are in the same context, but 0 when it is supplied words which are never in the same context. Therefore, we need a vector similarity score supplied to the output sigmoid layer – with similar vectors outputting a high score and un-similar vectors outputting a low score.  The most typical similarity measure used between two vectors is the cosine similarity score:

In [51]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf


In [52]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data2 = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data2.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data2, count, dictionary, reversed_dictionary

In [53]:
def collect_data(vocabulary_size=10000):
    data2, count, dictionary, reverse_dictionary = build_dataset(data['ingred_cat'],vocab_size)
    #del vocabulary
    return data2, count, dictionary, reverse_dictionary


In [54]:
vocab_size = len(data)
data2, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

window_size = 10
vector_dim = 300
epochs = 10

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

   level_0  index                            title  \
0        0      0                 1-Dish Taco Bake   
1        1      1  15-Minute Dinner Nachos Supreme   
2        2      2         A Firefighter's Meatloaf   
3        3      3          Aaron's Missouri Burger   
4        4      4   Absolute Best Liver and Onions   
5        5      5                    Adobo Sirloin   
6        6      6                       Albondigas   

                                          categories  \
0  ['Trusted Brands: Recipes and Tips', "ARGO&#17...   
1  ['Trusted Brands: Recipes and Tips', "Campbell...   
2         ['Main Dish', 'Meatloaf', 'Beef Meatloaf']   
3             ['Main Dish', 'Burgers', 'Hamburgers']   
4                                      ['Main Dish']   
5     ['World Cuisine', 'Latin American', 'Mexican']   
6    ['Soups, Stews and Chili', 'Soup', 'Beef Soup']   

                                              ingred  \
0  TacoMeatFilling:,groundbeef,tacoseasoning,Batt...   
1  gro

In [55]:
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data2, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

[[6009, 6003], [4154, 1959], [5519, 5512], [3453, 367], [5586, 5590], [6052, 1522], [942, 941], [2852, 4290], [3032, 3024], [2143, 1044]] [1, 0, 1, 0, 1, 1, 1, 0, 1, 0]


In [56]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [57]:
from keras.layers import merge
# setup a cosine similarity operation which will be output in a secondary model
similarity = merge([target, context], mode='cos', dot_axes=0)


  This is separate from the ipykernel package so we can avoid doing imports until
  name=name)


In [58]:
# now perform the dot product operation to get a similarity measure
dot_product = merge([target, context], mode='dot', dot_axes=1)
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)
# create the primary training model
model = Model(input=[input_target, input_context], output=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

  
  name=name)
  import sys


In [59]:
# create a secondary validation model to run our similarity checks during training
validation_model = Model(input=[input_target, input_context], output=similarity)

  


In [60]:
class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 5  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                try:
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                except Exception:
                    continue
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sim_cb = SimilarityCallback()

arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    if cnt % 8110 == 0:
        sim_cb.run_sim()

Iteration 0, loss=0.6915212869644165
Nearest to longgrainwhiterice water Sauce: onion  oliveoil marinarasauce beefbroth balsamicvinegar redpepperflakes Peppers: groundbeef hotItalianporksausage casingremoved tomatoes Italianparsley clovesgarlic  salt pepper groundcayennepepper greenbellpeppers  ParmigianoReggianocheese morefortopping maindish stuffedmaindishes stuffedbellpeppers: Italiansausage into greenbellpeppers  farfallepasta beefbroth pepper pastaandnoodles pastabyshape, bacon butter  whitebread Americancheese tomato maindish sandwiches grilledcheese, almondmilk moreneeded pumpkinpuree banana vanillaproteinpowder pumpkinpiespice breakfastandbrunch drinks, groundbeef beefbroth Italianstyletomatoes farfallebowtiepasta zucchini thick Parmesancheese  pastaandnoodles pastabyshape, groundbeef breadcrumbs onion milk salt eggs bottlebarbecuesauce appetizersandsnacks meatandpoultry meatballappetizers,
Nearest to bulkhotItaliansausage groundbeef chilipowder groundcumin groundcoriander clov

Nearest to oliveoil groundbeef chorizosausage  onion  redbellpepper clovesgarlic  groundcumin chilipowder driedoregano paprika cayennepepper groundcinnamon tomatoes beefstock whitesugar raisins pimentostuffedgreenolives applecidervinegar capers  sliveredalmonds toasted limejuice maindish sandwiches sloppyjoes: sweetonion butter garlicsalt pepper sidedish vegetables onion, yellowcorn  maplesyrup eggs milk koshersaltinhalfifusingfinesalt cayennepepper allpurposeflour bakingpowder heavycream butter  sidedish vegetables corn cornpudding, wholetrout pandressed saltpepper allpurposeflour butter blanchedsliveredalmonds lemonjuice parsley forgarnish lemon forgarnish seafood fish, cookingspray headcauliflower intoflorets oliveoil  Cajunseasoning garlicsalt salt sidedish vegetables roastedvegetables,
Nearest to Cheddarcheese Parmesancheese onion  egg ketchup Worcestershiresauce salt pepper groundbeef bacon hamburgerbuns split maindish burgers hamburgers: yogurt lemonjuice groundcumin groundcinna

In [65]:
len(data)

8110

In [66]:
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [67]:
import gensim 
import keras.models as kmodels
import keras.layers as klayers
import keras.backend as K
import keras


In [68]:
# First, we take the movie and vectorize it.
# The embedding layer is normally used for sequences (think, sequences of words)
# so we need to flatten it out.
# The dropout layer is also important in preventing overfitting
# Count the movies and users
n_recipes = data.shape[0]

data_input = keras.layers.Input(shape=[1])
data_vec = keras.layers.Flatten()(keras.layers.Embedding(n_recipes + 1, 32)(data_input))
data_vec = keras.layers.Dropout(0.5)(data_vec)


# Next, we join them all together and put them
# through a pretty standard deep learning architecture
#input_vecs = keras.layers.merge([movie_vec, user_vec], mode='concat')

nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(data_vec))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dropout(0.5)(keras.layers.Dense(128, activation='relu')(nn))
nn = keras.layers.normalization.BatchNormalization()(nn)
nn = keras.layers.Dense(128, activation='relu')(nn)

# Finally, we pull out the result!
result = keras.layers.Dense(5, activation='softmax')(nn)

# And make a model from it that we can actually run.
model = kmodels.Model([data_input], result)
model.compile('adam', 'categorical_crossentropy')

# If we wanted to inspect part of the model, for example, to look
# at the movie vectors, here's how to do it. You don't need to 
# compile these models unless you're going to train them.
final_layer = kmodels.Model([data_input], nn)
data_vec = kmodels.Model(data_input, data_vec)

In [69]:
from sklearn import dummy, metrics, cross_validation, ensemble
# Split the data into train and test sets...
#a_title, b_title, a_ingred, b_ingred, a_y, b_y = cross_validation.train_test_split(title, ingred, y)






In [None]:
# And of _course_ we need to make sure we're improving, so we find the MAE before
# training at all.
#metrics.mean_absolute_error(np.argmax(b_y, 1)+1, np.argmax(model.predict([b_title, b_ingred]), 1)+1)


In [None]:
#siraj:

In [142]:
from __future__ import absolute_import, division, print_function
import codecs
import glob
import multiprocessing
import os
import pprint
import re
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#import seaborn as sns


In [73]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/claudiacassidy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/claudiacassidy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [143]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

tokenizer

<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x121fa09b0>

In [81]:
#def sentence_to_wordlist(raw):
#    clean = re.sub("[^a-zA-Z]"," ", raw)
#    words = clean.split()
#    return words

In [88]:
print(data['ingred_cat'][5])

lime  garlic driedoregano groundcumin chipotlepeppersinadobosauce adobosaucefromchipotlepeppers beefsirloinsteaks saltpepper worldcuisine latinamerican mexican


In [89]:
sentences = []
for raw_sentence in data['ingred_cat']:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [128]:
print(sentences[5])
#print(sentence_to_wordlist(sentences))
sentences.head()

['lime', 'garlic', 'driedoregano', 'groundcumin', 'chipotlepeppersinadobosauce', 'adobosaucefromchipotlepeppers', 'beefsirloinsteaks', 'saltpepper', 'worldcuisine', 'latinamerican', 'mexican']


AttributeError: 'list' object has no attribute 'head'

In [102]:
#vectors help with distance, similarity, ranking

#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 300

#minimum word count threshold
min_word_count = 3

num_workers = multiprocessing.cpu_count()

#context window length
context_size = 7

#downsample setting for frequent words
#0 - 1e-5 is good for this
downsampling = 1e-3

#random number generator
#deterministic, good for debugging
seed = 1


In [103]:
token_count = sum([len(sentence) for sentence in sentences])
token_count

101901

In [133]:
vocabulary = data['ingred_cat']
vocabulary_size = 50000

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [134]:
data4, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data4[:10], [reverse_dictionary[i] for i in data4[:10]])

data_index = 0

Most common words (+UNK) [['UNK', 0], ('groundbeef egg onion  milk driedbreadcrumbs saltpepper brownsugar mustard ketchup maindish meatloaf beefmeatloaf', 4), ('groundbeef ketchup chunkysalsa brownsugar Worcestershiresauce whitevinegar Dijonmustard hotsauce potatorolls maindish sandwiches sloppyjoes', 4), ('spaghetti groundbeef milk eggs  saltpepper pastasauce driedoregano garlicpowder driedonion pepperonisausage Cheddarcheese worldcuisine european italian', 4), ('eggplant soysauce chickenstock chilisauce whitesugar pepper oystersauce cornstarch water clovesgarlic  greenonions  gingerroot shrimp deveined  groundbeef sesameoil hotrice worldcuisine asian chinese', 4)]
Sample data [128, 129, 130, 131, 132, 133, 134, 9, 135, 136] ['TacoMeatFilling: groundbeef tacoseasoning Batter: MazolaPure®CookingSpray allpurposeflour masacornflourORcornmeal "envelopesFleischmanns®RapidRiseYeast" sugar salt verywarmmilkdegreesFdegreesF Mazola®CornOil egg Topping: chunkysalsa Mexicanstylecheese cornchips 

In [139]:
import random

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data4):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data4):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data4) - span) % len(data4)
    return batch, labels

In [140]:
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
        reverse_dictionary[labels[i, 0]])


ValueError: invalid literal for int() with base 10: 'categories_clean'

In [141]:
# Step 4: Build and train a skip-gram model.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

#### Train Word2Vec

In [156]:
model = w2v.Word2Vec(
                    sg=1,
                   seed=seed,
                   workers=num_workers,
                   size=num_features,
                   min_count=min_word_count,
                   window=context_size,
                   sample=downsampling)


In [157]:
# loads corpus into memory
model.build_vocab(sentences, update=False)

#print("vocab length: ",len(vec.vocab))



In [160]:
model.train(sentences,total_examples=model.corpus_count,epochs=10)

(710279, 1019010)

In [162]:
if not os.path.exists("trained"):
    os.makedirs("trained")

model.save(os.path.join("trained","model.w2v"))

In [163]:
ingred_cat_vec = w2v.Word2Vec.load(os.path.join("trained", "model.w2v"))

In [164]:
tsne = sklearn.manifold.TSNE(n_components=2,random_state=0)

In [165]:
all_word_vectors_matrix = ingred_cat_vec.syn0

AttributeError: 'Word2Vec' object has no attribute 'syn0'