This notebook creates the custom word2vec model and the continuous bag-of-words dataframe for the recipes. First, we take the original recipe dataframe of strings, preprocess the text into monograms and bigrams, and check the bigrams against the Google W2V model's vocabulary to create a dictionary of words and two-word phrases for the custom W2V's corpus. That dictionary is used to train the MWETokenizer with acceptable bigrams that should always be treated as bigrams. Next, we tokenize the recipes again, into sentences and words, and then train the custom word2vec model. Lastly, we process the recipes one more time to create continuous bags of words for the title, ingredients, directions, and recipes as a whole. That final dataframe is saved to be used in the next notebook to create document vectors.

# Imports

In [179]:
# Basics
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import string
import csv
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

# MongoDB
from pymongo import MongoClient

# natural language processing
import nltk
from nltk.util import ngrams
from textblob import TextBlob
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from ingreedypy import Ingreedy
stop_words = stopwords.words('english')

# sklearn
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score

# gensim
from gensim import corpora, models, similarities, matutils
import gensim

# Other
import warnings
warnings.filterwarnings('ignore')


# Google WV

In [2]:
# Load the pre-trained Google News word2vec vectors (binary format). `g_model`
# is used below to test whether an underscore-joined bigram is in its vocabulary.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
import gensim
google_vec_file = '/Users/carliebadder/Downloads/GoogleNews-vectors-negative300.bin'
g_model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

# MWETokenizer with bigrams (trigram extraction is currently commented out)

In [180]:
# Recipe text data: one row per recipe (the columns displayed further down are
# title, ingredients, directions, categories and desc).
# NOTE(review): read_pickle executes pickled code — only load trusted files.
df_text = pd.read_pickle('df_01.pkl')

In [336]:
# Peek at the raw recipe titles before any preprocessing.
df_text.title

0                                                       Lentil, Apple, and Turkey Wrap 
1                                           Boudin Blanc Terrine with Red Onion Confit 
2                                                         Potato and Fennel Soup Hodge 
3                                                      Mahi-Mahi in Tomato Olive Sauce 
4                                                             Spinach Noodle Casserole 
5                                                                        The Best Blts 
6                              Ham and Spring Vegetable Salad with Shallot Vinaigrette 
7                                                                 Spicy-Sweet Kumquats 
8                                                                Korean Marinated Beef 
9                             Ham Persillade with Mustard Potato Salad and Mashed Peas 
10                                        Yams Braised with Cream, Rosemary and Nutmeg 
11                              

In [83]:
def preprocess_ngrams(text):
    """Return the bigrams of ``text`` that exist in the Google W2V vocabulary.

    The text is lower-cased, tokenized with ``word_tokenize``, split on hyphens
    and slashes, stripped of punctuation and reduced to purely alphabetic
    tokens (stop words are not removed here). Adjacent token pairs are kept
    only when their underscore-joined form (e.g. ``lime_juice``) is found in
    the module-level ``g_model``.

    Parameters
    ----------
    text : str
        Raw recipe text (a title, an ingredient line, a direction, ...).

    Returns
    -------
    list of tuple
        ``(first_word, second_word)`` pairs in order of appearance;
        duplicates are kept.
    """
    tokens = word_tokenize(text.lower())

    # Split on BOTH separators. The previous if/elif split a token such as
    # 'soup/stew-style' on '-' only, and the punctuation strip below then
    # fused 'soup/stew' into the bogus word 'soupstew'.
    pieces = []
    for token in tokens:
        pieces.extend(re.split(r'[-/]', token))

    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    monograms = [punctuation_re.sub('', piece) for piece in pieces]
    # Drop numbers, empty strings and anything else that is not purely letters.
    monograms = [word for word in monograms if word.isalpha()]

    # Keep only the adjacent pairs that Google's model knows as a phrase.
    return [pair for pair in ngrams(monograms, 2) if '_'.join(pair) in g_model]

In [84]:
# Spot-check the bigram extraction on a single title (raw text, extracted bigrams).
df_text.title[7293], preprocess_ngrams(df_text.title[7293])

('Roasted Butternut Squash with Lime Juice ',
 [('roasted', 'butternut'), ('butternut', 'squash'), ('lime', 'juice')])

In [85]:
# Titles: gather every known bigram across all recipe titles into one flat list.
title_ngrams = [
    bigram
    for i in range(len(df_text))
    for bigram in preprocess_ngrams(df_text.title[i])
]

In [86]:
ing_ngrams = []
# Ingredients: each recipe holds a list of ingredient strings.
for i in range(len(df_text)):
    try:
        for ing in df_text.ingredients[i]:
            ing_ngrams.extend(preprocess_ngrams(ing))
    except (TypeError, AttributeError):
        # Missing or malformed ingredient data (a non-iterable cell or a
        # non-string entry): skip the rest of this recipe. Any other error,
        # e.g. a missing tokenizer resource, is no longer silently swallowed.
        continue

In [87]:
dir_ngrams = []
# Directions: each recipe holds a list of direction strings.
for i in range(len(df_text)):
    try:
        for dir_ in df_text.directions[i]:
            dir_ngrams.extend(preprocess_ngrams(dir_))
    except (TypeError, AttributeError):
        # Missing or malformed direction data (a non-iterable cell or a
        # non-string entry): skip the rest of this recipe. Any other error,
        # e.g. a missing tokenizer resource, is no longer silently swallowed.
        continue

In [88]:
desc_ngrams = []
# Descriptions
# Each `desc` cell is a single string (not a list like ingredients/directions),
# so it is handed to preprocess_ngrams whole. Looping over the string fed the
# function one character at a time, which can never produce a bigram.
for i in range(len(df_text)):
    desc = df_text.desc[i]
    if isinstance(desc, str):  # recipes without a description are skipped
        desc_ngrams.extend(preprocess_ngrams(desc))

In [89]:
# Pool the bigrams from every recipe section, in section order (duplicates kept).
ngram_list = []
for section_ngrams in (title_ngrams, ing_ngrams, dir_ngrams, desc_ngrams):
    ngram_list.extend(section_ngrams)

In [95]:
# Look at one arbitrary entry to confirm the (word, word) tuple format.
(ngram_list[590])

('cabbage', 'slaw')

In [91]:
# Sanity check: a bigram we expect to see in recipes was collected.
('lemon','zest') in ngram_list

True

In [96]:
from nltk.tokenize import MWETokenizer

# Build the multi-word-expression tokenizer from every collected bigram, then
# smoke-test it on a sentence containing one of them ('lemon', 'peel').
tokenizer = MWETokenizer(ngram_list)
tokenizer.tokenize('This is a test lemon peel'.split())
    

['This', 'is', 'a', 'test', 'lemon_peel']

In [97]:
import pickle

# Persist the bigram list so the tokenizer can be rebuilt without re-scanning the recipes.
with open('ngram_list.pkl', 'wb') as ngram_file:
    pickle.dump(ngram_list, ngram_file)

# Epicurious Corpus for Word2Vec Model

In [47]:
# Widen the column display so the long ingredient/direction cells are readable.
pd.set_option('display.max_colwidth', 500)
df_text.head()

Unnamed: 0,title,ingredients,directions,categories,desc
0,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock, 1 cup dried brown lentils, 1/2 cup dried French green lentils, 2 stalks celery, chopped, 1 large carrot, peeled and chopped, 1 sprig fresh thyme, 1 teaspoon kosher salt, 1 medium tomato, cored, seeded, and diced, 1 small Fuji apple, cored and diced, 1 tablespoon freshly squeezed lemon juice, 2 teaspoons extra-virgin olive oil, Freshly ground black pepper to taste, 3 sheets whole-wheat lavash, cut in half crosswise, or 6 (12-inch) flour tortillas...","[1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool., 2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper., 3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spre...","[Sandwich, Bean, Fruit, Tomato, turkey, Vegetable, Kid-Friendly, Apple, Lentil, Lettuce, Cookie]",
1,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, chopped, 5 teaspoons salt, 3 bay leaves, 3 whole cloves, 1 large garlic clove, crushed, 1 teaspoon pepper, 1/8 teaspoon ground nutmeg, Pinch of dried thyme, crumbled, 8 large shallots, minced, 1 tablespoon butter, 1 pound trimmed boneless center pork loin, sinew removed cut into 1-inch chunks, well chilled, 3 eggs, 6 tablespoon all purpose flour, 1/4 cup tawny Port, 3 tablespoons dried currants, minced, Lettuce leaves, Cracked peppercorns, Minced ...","[Combine first 9 ingredients in heavy medium saucepan. Add 3 shallots. Bring to simmer. Remove from heat, cover and let stand 30 minutes. Chill overnight., Preheat oven to 325°F. Line 7-cup pâté or bread pan with plastic wrap. Melt butter in heavy small skillet over low heat. Add remaining 5 shallots. Cover and cook until very soft, stirring occasionally, about 15 minutes. Transfer to processor. Add pork, eggs, flour and Port and puree. Strain cream mixture, pressing on solids to extract as ...","[Food Processor, Onion, Pork, Bake, Bastille Day, New Year's Eve, Dried Fruit, Port, Winter, Chill, Bon Appétit]","This uses the same ingredients found in boudin blanc, the classic French white sausage. Start two days before serving."
2,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalks discarded, bulb cut into 1/2-inch dice, and feathery leaves reserved for garnish, 1 onion, diced, 2 tablespoons unsalted butter, 2 medium russet (baking) potatoes, 2 cups chicken broth, 1 1/2 cups milk]","[In a large heavy saucepan cook diced fennel and onion in butter over moderate heat, stirring, until softened, about 10 minutes. Peel and cube potatoes. Add potatoes and broth to fennel mixture and simmer, covered, until potatoes are very tender, about 20 minutes. In a blender or food processor purée mixture in batches until smooth and return to saucepan. Stir in milk and salt and pepper to taste and simmer soup, stirring occasionally, 10 minutes, or until heated through., Garnish soup with ...","[Soup/Stew, Dairy, Potato, Vegetable, Fennel, Gourmet, New York]",
3,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup chopped onion, 1 cup dry white wine, 1 teaspoon anchovy paste, 2 14 1/2-ounce cans diced tomatoes with garlic, basil, and oregano in juice, 4 6-ounce mahi-mahi fillets, 1/2 cup large green olives, quartered, pitted, 3 teaspoons chopped fresh oregano, divided, 1 teaspoon (packed) finely grated orange peel, Country-style white bread cut into 1/2-inch-thick slices, toasted]","[Heat oil in heavy large skillet over medium-high heat. Add onion; sauté until translucent and beginning to brown, about 4 minutes. Add wine and anchovy paste. Boil until reduced to 3/4 cup, about 3 minutes. Add tomatoes with juice; bring to boil., Sprinkle fish with salt and pepper. Add fish to skillet atop tomato mixture. Reduce heat to low, cover, and simmer until fish is cooked through, about 9 minutes. Using slotted metal spatula, transfer fish to plate and tent with foil to keep warm. ...","[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal, High Fiber, Dinner, Healthy, Simmer, Bon Appétit, Pescatarian, Dairy Free, Peanut Free, Tree Nut Free, Soy Free, Kosher]","The Sicilian-style tomato sauce has tons of Mediterranean flavor, thanks to the orange peel, olives, and oregano."
4,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, thawed, 1/2 pound extra-wide egg noodles, freshly cooked, 1 cup sour cream, 2 tablespoons purchased pesto sauce, 1/4 teaspoon ground nutmeg, 1 cup grated sharp cheddar cheese]","[Preheat oven to 350°F. Lightly grease 8x8x2-inch glass baking dish. Blend spinach, noodles, sour cream, pesto sauce and nutmeg in large bowl. Spoon mixture into prepared dish. Sprinkle cheese over. Bake until set, about 45 minutes. Let stand 10 minutes.]","[Cheese, Dairy, Pasta, Vegetable, Side, Bake, Vegetarian, Quick & Easy, Fall, Bon Appétit, California]",


In [98]:
def preprocess(text):
    """Turn raw recipe text into a token list for Word2Vec training.

    The text is lower-cased, split on whitespace, split again on hyphens and
    slashes, stripped of punctuation and reduced to purely alphabetic tokens.
    The module-level ``tokenizer`` then merges known bigrams into single
    underscore-joined tokens (e.g. ``whipping_cream``).

    Parameters
    ----------
    text : str
        Raw recipe text (a title, an ingredient line, a direction, ...).

    Returns
    -------
    list of str
        Tokens in their original order; stop words are kept.
    """
    # Split on BOTH separators. The previous if/elif split a token such as
    # 'soup/stew-style' on '-' only, and the punctuation strip below then
    # fused 'soup/stew' into the bogus word 'soupstew'.
    pieces = []
    for chunk in text.lower().split():
        pieces.extend(re.split(r'[-/]', chunk))

    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    words = [punctuation_re.sub('', piece) for piece in pieces]
    # Drop numbers (e.g. quantities), empty strings and other non-letter tokens.
    words = [word for word in words if word.isalpha()]

    return tokenizer.tokenize(words)

In [99]:
# Spot-check: known bigrams in a title should come back as single joined tokens.
preprocess(df_text.title[7913])

['dried_fruit', 'compote', 'with', 'ginger_syrup']

In [100]:
# Spot-check on an ingredient line: the numeric quantity is dropped and the
# known bigram is joined (raw text, tokens).
df_text.ingredients[333][0], preprocess(df_text.ingredients[333][0])

('2/3 cup whipping cream', ['cup', 'whipping_cream'])

In [101]:
# Titles
title_sent = []

for i in range(len(df_text)):
    temp_word_list = preprocess(df_text.title[i])
    title_sent.append(temp_word_list)

In [103]:
# Ingredients
ing_sent = []

for i in range(len(df_text)):
    try:
        temp_ing_list = df_text.ingredients[i]
        for ing in temp_ing_list:
            temp_word_list = preprocess(ing)
            ing_sent.append(temp_word_list)
    except:
        continue

In [105]:
# Directions
dir_sent = []

for i in range(len(df_text)):
    try:
        temp_dir_list = df_text.directions[i]
        for dir_ in temp_dir_list:
            temp_word_list = preprocess(dir_)
            dir_sent.append(temp_word_list)
    except:
        continue

In [106]:
# Descriptions
desc_sent = []

for i in range(len(df_text)):
    try:
        temp_desc_list = df_text.desc[i]
        for desc in temp_desc_list:
            temp_word_list = preprocess(desc)
            desc_sent.append(temp_word_list)
    except:
        continue

In [109]:
# Full training corpus: the sentences of every recipe section, in section order.
epicurious_texts = []
for section_sentences in (title_sent, ing_sent, dir_sent, desc_sent):
    epicurious_texts.extend(section_sentences)

In [110]:
# Persist the tokenized corpus so the model can be retrained without re-tokenizing.
with open('epicurious_texts.pkl', 'wb') as corpus_file:
    pickle.dump(epicurious_texts, corpus_file)

# Word2Vec

In [111]:
# Train the custom Word2Vec model on the Epicurious token lists.
# sg=1 selects the skip-gram architecture (per the gensim docs); min_count=1
# keeps every token, including ones that occur only once.
# NOTE(review): `size` is the pre-4.0 gensim name for the vector dimensionality
# (renamed `vector_size` in gensim 4) — confirm the installed version before upgrading.
# NOTE(review): no `seed` is passed, so results may differ between runs.
model = gensim.models.Word2Vec(epicurious_texts, size=200, window=5, min_count=1, workers=2, sg=1)

In [132]:
# Sanity check: two near-synonymous ingredient phrases should score as similar.
# Query through `model.wv`; calling `similarity` on the Word2Vec model itself
# is deprecated in gensim and removed in gensim 4.
model.wv.similarity('lemon_zest','lemon_peel')

0.68392276556823139

# CBOW

In [315]:
def preprocess_cbow(text, remove_measurements=True):
    """Turn raw recipe text into a filtered bag-of-words token list.

    The text is lower-cased, split on whitespace, split again on hyphens and
    slashes, stripped of punctuation and reduced to purely alphabetic tokens.
    Stop words (module-level ``stop_words``) and, optionally, the words in the
    module-level ``measurements`` list are removed before the module-level
    ``tokenizer`` merges known bigrams into single underscore-joined tokens.

    Parameters
    ----------
    text : str
        Raw recipe text (an ingredient line, a direction, a description, ...).
    remove_measurements : bool, optional
        When True (the default, matching the previous behavior) words in
        ``measurements`` are dropped. That list was written for ingredient
        lines; pass False to keep those words for other kinds of text.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Split on BOTH separators. The previous if/elif split a token such as
    # 'soup/stew-style' on '-' only, and the punctuation strip below then
    # fused 'soup/stew' into the bogus word 'soupstew'.
    pieces = []
    for chunk in text.lower().split():
        pieces.extend(re.split(r'[-/]', chunk))

    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    words = [punctuation_re.sub('', piece) for piece in pieces]
    # Drop numbers (e.g. quantities), empty strings and other non-letter tokens.
    words = [word for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    if remove_measurements:
        words = [word for word in words if word not in measurements]

    return tokenizer.tokenize(words)

In [133]:
# Titles
title_cbow = title_sent

In [330]:
# Words removed from ingredient text by preprocess_cbow: units of measure plus
# preparation/packaging descriptors that say nothing about the ingredient itself.
# NOTE: preprocess_cbow lower-cases the text before filtering, so the entries
# containing capitals ('C', 'L', 'mL', 'T', 'TB', 'Tbl', 'Tbsp') can never match.
# A few entries ('parts', 'ground', 'dash') appear twice; that is harmless for
# membership tests.
measurements = ['cup','cups','C','c','gram','grams','g','kilogram','kilograms','kg','liter','liters','L','l',
               'pound','pounds','lb','milliliter','milliliters','ml','mL','ounce','ounces','oz','pint','pints','pt',
               'teaspoon','teaspoons','t','tsp','tablespoon','tablespoons','T','TB','Tbl','Tbsp','tbsp','quart','quarts','qt',
               'dash','pinch','piece','pieces','slice','slices','sheet','sheets','log','stick','sticks','head',
               'chopped','diced','sliced','cored','seeded','trimmed','chunk','chunks','minced','divided','cut',
               'packed','fresh','large','medium','small','sprig','thin','thinly','thick','inch','well','chilled',
               'half','halved','halves','peeled','crumbled','crushed','dice','packaged','purchased','cubes','package',
               'separated','fine','finely','coarsely','freshly','generous','diameter','stemmed','packages','thaw',
               'thawed','additional','grated','grate', 'pitted','rounds','wedges','skinned','deveined','shredded',
               'unpeeled','optional','preferably','long','frozen','room','temperature','quartered','sometimes','called',
               'reserve','reserved','plus','lengthwise','parts','discarded','parts','bunch','shelled','available',
               'seasonally','markets','bottle','canned','drained','ingredient','ground','whole','round','style',
               'cans','leaves','strips','ground','bag','strip','split','supermarket','supermarkets','specialty',
               'crosswise','size','ring','rings','square','squares','colors','julienne','minute','minutes','dry',
               'chopping','shell','notes','shells','wash','diagonal','accompaniment','string','strings','press',
               'dish','firm','dash','dashes']

# split? supermarket(s), specialty, vanilla_extract?, crosswise, matchstick, size, rings, squares, colors, julienne,
# minute(s), shells, shell, notes, diagonal, wash, dry, chopping, accompaniment, string(s), dish, press, firm,
# dash, dashes

In [331]:
# Ingredients
ing_cbow = []
for i in range(len(df_text)):
    try:
        temp_ing_list = df_text.ingredients[i]
        temp_word_list = []
        for ing in temp_ing_list:
#             print([w for w in (ing.split()) if w not in measurements])
#             temp_ing = [w for w in (ing.split()) if w not in measurements]
#             temp_ing = ' '.join(temp_ing)
#             print(temp_ing)
            temp_word_list.append(preprocess_cbow(ing))
        ing_cbow.append(temp_word_list)
    except:
        continue

In [332]:
# Inspect the first few recipes to judge how well the measurement filter works.
ing_cbow[:4]

[[['low', 'sodium', 'vegetable', 'chicken', 'stock'],
  ['dried', 'brown', 'lentils'],
  ['dried', 'french', 'green', 'lentils'],
  ['stalks_celery'],
  ['carrot'],
  ['thyme'],
  ['kosher_salt'],
  ['tomato'],
  ['fuji', 'apple'],
  ['squeezed', 'lemon_juice'],
  ['extra', 'virgin', 'olive_oil'],
  ['black', 'pepper', 'taste'],
  ['wheat', 'lavash', 'flour_tortillas'],
  ['turkey_breast'],
  ['bibb_lettuce']],
 [['whipping_cream'],
  ['onions'],
  ['salt'],
  ['bay'],
  ['cloves'],
  ['garlic_clove'],
  ['pepper'],
  ['nutmeg'],
  ['dried_thyme'],
  ['shallots'],
  ['butter'],
  ['boneless', 'center', 'pork_loin', 'sinew', 'removed'],
  ['eggs'],
  ['purpose', 'flour'],
  ['tawny_port'],
  ['dried_currants'],
  ['lettuce'],
  ['cracked_peppercorns'],
  ['parsley'],
  ['bay'],
  ['french_bread', 'baguette'],
  ['olive_oil'],
  ['red', 'onions'],
  ['dried_currants'],
  ['red', 'wine_vinegar'],
  ['chicken_broth'],
  ['thyme', 'dried'],
  ['sugar']],
 [['fennel_bulb', 'anise', 'stalks',

In [145]:
# Directions
dir_cbow = []

for i in range(len(df_text)):
    try:
        temp_dir_list = df_text.directions[i]
        temp_word_list = []
        for dir_ in temp_dir_list:
            temp_word_list.append(preprocess_cbow(dir_))
        dir_cbow.append(temp_word_list)
    except:
        continue

In [170]:
# Descriptions
desc_cbow = []

for i in range(len(df_text)):
    try:
        temp_desc_list = df_text.desc[i]
#         temp_word_list = []
#         for desc in temp_desc_list:
#             print(desc)
        desc_cbow.append(preprocess_cbow(temp_desc_list))
#         desc_cbow.append(temp_word_list)
    except:
        desc_cbow.append([])

In [171]:
# The per-recipe lists must be the same length to be used as DataFrame columns below.
len(desc_cbow), len(dir_cbow)

(17436, 17436)

In [333]:
# Empty frame with one column per recipe section; the columns are filled in the next cell.
df_cbow = pd.DataFrame(columns=['title','ingredients','directions','desc'])

In [334]:
# Fill the columns in their declared order, one per recipe section.
for column_name, column_values in (
    ('title', title_cbow),
    ('ingredients', ing_cbow),
    ('directions', dir_cbow),
    ('desc', desc_cbow),
):
    df_cbow[column_name] = column_values

In [335]:
# Save the bag-of-words frame for the next notebook, which builds the document vectors.
df_cbow.to_pickle('df_07_2.pkl')