A notebook for training a word2vec model on a corpus of recipes' ingredients.

# 0. Setup

In [1]:
import pymongo
from tqdm import tqdm
import logging
from gensim.models import Word2Vec
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
#import enchant
#from nltk.tag import pos_tag

In [2]:
# Connect to MongoDB with pymongo's default connection settings and keep
# handles to the recipe collections of the `chefs_db` database.
client = pymongo.MongoClient()
chefs = client.chefs_db
celebrity_recipes = chefs.celebrity_recipes
yummly_recipes = chefs.yummly_recipes
yummly_recipes2 = chefs.yummly_recipes2
yummly_recipes3 = chefs.yummly_recipes3

In [36]:
# Peek at a single document to see which fields a yummly recipe carries.
yummly_recipes2.find_one()

{u'_id': ObjectId('56f0a831fdb32c06bd913f81'),
 u'celebrity_recipe': u'Vanilla Frosting',
 u'chef': u'Giada De Laurentiis',
 u'flavors': {u'bitter': 0.16666666666666666,
  u'meaty': 0.8333333333333334,
  u'piquant': 0.0,
  u'salty': 0.16666666666666666,
  u'sour': 0.16666666666666666,
  u'sweet': 0.8333333333333334},
 u'yummly_ingredients': [u'vegetable shortening',
  u'powdered sugar',
  u'vanilla extract',
  u'milk'],
 u'yummly_recipe': u'Vanilla Frosting'}

In [3]:
# Peek at the `ingredients` field of a single celebrity recipe.
celebrity_recipes.find_one()['ingredients']

[u' butter', u'powdered sugar', u'milk', u'vanilla extract']

In [4]:
# Show INFO-level log records as "<time> : <level> : <message>"
# (presumably so gensim's training progress is visible -- confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 1. Corpus-building
#### Collecting ingredients from:
(a) celebrity recipes

(b) yummly recipes (1, 2, eventually 3)

In [15]:
# Domain-specific words (units, sizes, preparation verbs, stray names, ...)
# to treat as stopwords in addition to NLTK's English stopword list.
# Duplicate entries are harmless because the list is folded into a set below.
# The markdown notes in this notebook state that stopword removal was
# ultimately not applied to the final corpus.
specs = ['dash', 'pinch', 'teaspoons', 'tablespoons', 'cup', 'scoop', 'pound', 'ounce', 'oz',
         'quart', 'pint', 'gallon', 'milliliter', 'ml', 'liter', 'small', 'medium', 'large',
         'freshly', 'ground', 'piece', 'clove', 'boneless', 'cube', 'dice', 'finely',
         'grated', 'to', 'inch', 'each', 'whole', 'about', 'as', 'thawed', 'by', 'all', 'a',
         'chopped', 'crushed', 'plus', 'minus', 'such', 'the', 'an', 'slice', 'approximately',
         'and', 'or', 'weight', 'of', 'recipe', 'basic', 'slab', 'stick', 'pure', 'melt',
         'melted', 'minute', 'add', 'heat', 'place', 'set', 'top', 'bring', 'bowl', 'hour',
         'preheat', 'together', 'serve', 'stir', 'serving', 'side', 'valerie', 'bertinelli',
         'dish', 'time', 'italian', 'recipe', 'childhood', 'provided', 'reserved', 'right',
         'courtesy', 'american', 'degree', 'cook', 'pan', 'mix', 'mixture', 'season', 'food',
         'pour', 'use', 'water', 'high', 'using', 'remove', 'let', 'taste', 'duff', 'like',
         'oven', 'sheet', 'onto', 'also', 'boil', 'day', 'sit', 'room', 'cooled', 'cover',
         'jar', 'least', 'enough', 'jarcombine', 'special', 'pounds', 'head', 'extract',
         # The comma after 'ounces' was missing, which silently fused it with
         # 'package' into the single bogus entry 'ouncespackage'.
         'packed', 'dark', 'light', 'stick', 'sticks', '-inch-thick', 'sprig', 'ounces',
         'package', 'dried', 'dry', 'wet', 'coarse', 'fine', 'ground', 'precooked', 'perfect',
         'leaf', 'leaves', 'flat-leaf', 'seed', 'fillet', 'recommended', 'squeezed', 'juice',
         'powdered', 'condensed', 'concentrate', 'extra-large', 'thin', 'peeled', 'andor',
         'thigh', 'breast', 'dale', 'free', 'enhancer', 'coke', 'casserole', 'dale', 'hp',
         'pot', 'weed', 'santo', 'fresh', 'yardlong', 'inner', 'blade', 'outer', 'bucco',
         'low', 'bottom', 'top', 'navy', 'eye', 'umms', 'veri', 'flat', 'tenderized',
         'tenderizer', 'landis', 'marinating', 'reduced', 'open', 'close', 'excess', 'le',
         'isolate', 'lower', 'leftover', 'complete', 'pace', 'pit', 'stacys', 'refined', 'tip',
         'au', 'jus', 'ey', 'vay', 'oigatsuo', 'topside', 'hsing', 'pace', 'par', 'yet',
         'nielsen', 'flap', 'crosse', 'di', 'grilling', 'braising', 'vital', 'columbia',
         'left', 'right', 'bing', 'blade', 'crest', 'blue']

# Union of the custom words and NLTK's English stopwords, as a set for O(1) lookups.
stops = set(specs + stopwords.words('english'))

In [8]:
def clean_ingredients(ingredients):
    """Turn one recipe's ingredient strings into a flat list of lemmatized words.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, single-character tokens are dropped,
    and each remaining token is lemmatized with NLTK's WordNetLemmatizer.

    Two further filters were tried and abandoned: an English-dictionary check
    on every word (far too slow) and dropping tokens whose lemma is in `stops`
    (cut the corpus too much).

    Parameters
    ----------
    ingredients : iterable of str
        The ingredient strings of a single recipe.

    Returns
    -------
    list
        The lemmatized tokens, in the order they appear in `ingredients`.
    """
    # Build the pattern and the lemmatizer once per call instead of once per
    # ingredient / once per token.
    non_letters = re.compile('[^a-zA-Z]')
    lemmatize = WordNetLemmatizer().lemmatize

    text = ' '.join([non_letters.sub(' ', i).lower() for i in ingredients])
    return [lemmatize(word) for word in text.split() if len(word) > 1]

While the model's performance may have marginally improved without foreign words, checking whether every word was in English proved untenable given the time constraints of this project. This was all the more true of using NLTK's `pos_tag` method to gauge part of speech for every word, in order to, say, keep only nouns, proper nouns, and adjectives.

Additionally, removing stopwords worsened model performance regardless of how many stopwords were removed. Therefore, stopword removal was ultimately not applied when putting together the final corpus.

In [16]:
# Build the training corpus: one cleaned token list per recipe, taken from
# the celebrity collection first and then from each yummly collection.
corpus = []
errorcount = 0  # recipes skipped because their ingredient list was missing/unusable

# (collection, name of the field holding the ingredient list), in corpus order.
sources = [(celebrity_recipes, 'ingredients'),
           (yummly_recipes, 'yummly_ingredients'),
           (yummly_recipes2, 'yummly_ingredients'),
           (yummly_recipes3, 'yummly_ingredients')]

for position, (coll, field) in enumerate(sources):
    if position > 0:
        # Same progress marker as before: printed ahead of each yummly collection.
        print('next collection')
    for recipe in tqdm(coll.find()):
        try:
            corpus.append(clean_ingredients(recipe[field]))
        except (KeyError, TypeError):
            # KeyError: the document has no ingredient field.
            # TypeError: the field is not an iterable of strings (e.g. None).
            # Anything else (KeyboardInterrupt, missing NLTK data, ...) is a
            # real problem and is deliberately allowed to propagate instead
            # of being miscounted as a recipe without ingredients.
            errorcount += 1
            print('recipes w/o ingredients: %d' % errorcount)

                              

next collection
next collection

                              


next collection

                             






In [17]:
# Number of documents collected (one token list per successfully cleaned recipe).
len(corpus)

3022453

Each document in the corpus is a list of lemmatized word tokens drawn from the ingredients called for in a particular recipe.

# 2. Train w2v

In [18]:
# Train word2vec on the tokenized recipes. min_count=1 keeps every token in
# the vocabulary, however rarely it occurs.
model = Word2Vec(tqdm(corpus), min_count=1)

# Persist the trained model. A pickle is a byte stream, so the file is opened
# in binary mode ('wb') rather than text mode.
with open('word2vec_ingredients_take_4.pkl', 'wb') as picklefile:
    pickle.dump(model, picklefile)



Bonus: a preliminary example using the trained w2v:

In [34]:
# Load a previously pickled word2vec model.
# NOTE(review): this reads 'word2vec_ingredients.pkl', not the
# 'word2vec_ingredients_take_4.pkl' written by the training cell -- presumably
# a model saved by an earlier run; confirm which model is intended.
# NOTE(review): the file is opened in text mode ('r'). Pickle files should be
# read in the same mode they were written in, and binary mode ('rb') is
# required on Python 3 -- confirm how this file was written before changing it.
# Only unpickle files from a trusted source: pickle.load can run arbitrary code.
with open('word2vec_ingredients.pkl', 'r') as picklefile: 
    w2v_trained = pickle.load(picklefile)

In [37]:
# ws1 is the celebrity recipe's ingredient list and ws2 the matching yummly
# recipe's ingredient list, both as displayed in the setup section.
# NOTE(review): these are raw ingredient strings (multi-word, one with a
# leading space), not the single-word tokens clean_ingredients() produces --
# presumably the loaded model was trained on strings of this form; confirm
# before reusing this example with a model trained on the cleaned corpus.
ws1=[u' butter', u'powdered sugar', u'milk', u'vanilla extract']
ws2=[u'vegetable shortening', u'powdered sugar', u'vanilla extract', u'milk']
# Similarity score between the two ingredient sets according to the model.
w2v_trained.n_similarity(ws1, ws2)

0.96788931612928597