# Imports

In [4]:
# Basics
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import string
import csv
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
from collections import Counter, defaultdict
import pickle as pkl
from scipy.spatial import distance

# MongoDB
from pymongo import MongoClient

# natural language processing
import nltk
from textblob import TextBlob
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
from ingreedypy import Ingreedy

# sklearn
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

# gensim
from gensim import corpora, models, similarities, matutils

# Other
import warnings
warnings.filterwarnings('ignore')

# Google Word2Vec Model

In [9]:
import gensim

# Pretrained GoogleNews vectors. The location is resolved against the current
# user's home directory rather than one machine's absolute path, so the cell
# also works for anyone who keeps the file in ~/Downloads.
google_vec_file = path.join(path.expanduser('~'), 'Downloads',
                            'GoogleNews-vectors-negative300.bin')
model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

In [10]:
# `model` was created by KeyedVectors.load_word2vec_format, so it already is the
# vector store. The `.wv` accessor is not present on every gensim version
# (NOTE(review): confirm against the installed one), so fall back to the object itself.
model_wv = getattr(model, 'wv', model)
del model

# Custom Word2Vec

In [5]:
# Custom word vectors, unpickled from the working directory.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('word_vectors.pkl', 'rb') as vec_file:
    w2v = pkl.load(vec_file)

# Data

In [28]:
# Cluster centroids; ingredient_label reads entries 0-19 of this object.
# Only unpickle files you produced yourself: pickle can execute arbitrary code.
with open('food_clusters.pkl', 'rb') as centroid_file:
    centroids = pkl.load(centroid_file)

In [6]:
# text data: one row per recipe; the `title` and `ingredients` columns are read further down
df_text = pd.read_pickle('df_01.pkl')

In [7]:
# word2vec vectors (not referenced again in the cells shown here)
df_vec = pd.read_pickle('df_03.pkl')

In [8]:
# cbow data: per-recipe token lists; the `title` and `ingredients` columns are read further down
df_cbow = pd.read_pickle('df_02.pkl')

In [9]:
# tags: looked up by recipe title via .loc; `kmeans_rec` is the label printed as "Original Cuisine"
df_tags = pd.read_pickle('df_04.pkl')

In [308]:
# List every recipe whose tokenised title contains 'pizza', with its row label.
titles = df_cbow.title
for idx in range(len(titles)):
    title_tokens = titles[idx]
    if 'pizza' in title_tokens:
        print(idx, title_tokens)

112 ['kids', 'matzoh', 'pizza']
247 ['tossed', 'pizza', 'salad']
292 ['oven', 'baked', 'pizza', 'caramelized', 'onions', 'gorgonzola', 'walnuts', 'oven_baked', 'caramelized_onions']
479 ['bacon', 'cabbage', 'gruyère', 'pizza']
656 ['grilled', 'pizza', 'tomatoes', 'corn', 'cheese']
1052 ['pizza', 'dough', 'pizza_dough']
1370 ['presto', 'pesto', 'pizza']
2107 ['eggplant', 'tomato', 'fontina', 'pizza', 'eggplant_tomato']
2144 ['soppressata', 'pizza']
2163 ['pizza', 'pan', 'fried', 'hawaiian', 'pizza', 'pan_fried']
2371 ['spicy', 'sausage', 'gorgonzola', 'pizza', 'spicy_sausage']
2422 ['asparagus', 'fingerling', 'potato', 'goat', 'cheese', 'pizza', 'fingerling_potato', 'goat_cheese']
2633 ['ricotta', 'pizza', 'pie']
3055 ['onion', 'bacon', 'cream', 'pizza']
3171 ['pizza', 'leeks', 'tomato', 'goat', 'cheese', 'goat_cheese']
3245 ['apple', 'pizza']
3309 ['grilled', 'pizza', 'yellow', 'squash', 'mozzarella', 'lemon', 'thyme', 'lemon_thyme']
3495 ['mozzarella', 'prosciutto', 'pizza', 'balsamic

# MWE Tokenizer

In [12]:
# Multi-word expressions fed to the MWETokenizer in the next cell.
with open('ngram_list.pkl', 'rb') as f:
    # The module is imported as `pkl`; the bare name `pickle` is never bound in
    # this notebook, so `pickle.load` raised NameError on a fresh kernel.
    ngram_list = pkl.load(f)

In [13]:
# Merges the multi-word expressions listed in ngram_list into single tokens
# (NLTK joins them with '_' by default, e.g. 'pizza_dough').
tokenizer = MWETokenizer(ngram_list)

# Dictionaries

In [14]:
# Per-category ingredient probability tables
# (rows: ingredient name, columns: cuisine).
(df_meat_probs, df_seas_probs, df_veg_probs, df_fruit_probs,
 df_grains_probs, df_dairy_probs, df_cond_probs) = [
    pd.read_pickle(pickle_path)
    for pickle_path in ('df_meat.pkl', 'df_seasonings.pkl', 'df_veg.pkl',
                        'df_fruit.pkl', 'df_grains.pkl', 'df_dairy.pkl',
                        'df_condiments.pkl')
]

In [17]:
# Category tag -> probability table for that category.
food_dfs = dict(
    fruits=df_fruit_probs,
    vegetables=df_veg_probs,
    grains=df_grains_probs,
    seasonings=df_seas_probs,
    meat=df_meat_probs,
    dairy=df_dairy_probs,
    condiments=df_cond_probs,
)

In [18]:
# Category tag -> list of ingredient names (the index of each probability table).
# NOTE(review): `bases` is not defined in any cell visible here - confirm it
# exists on a fresh kernel before relying on Restart & Run All.
food_types = dict(
    fruits=list(df_fruit_probs.index),
    vegetables=list(df_veg_probs.index),
    grains=list(df_grains_probs.index),
    seasonings=list(df_seas_probs.index),
    meat=list(df_meat_probs.index),
    dairy=list(df_dairy_probs.index),
    condiments=list(df_cond_probs.index),
    bases=bases,
)
food_tags = food_types.keys()


In [366]:
# Spot-check 10 random rows (no random_state is passed, so the rows differ on every run).
df_grains_probs.sample(10)

Unnamed: 0,African,American,Asian,Dessert,East European,Jewish,Latin,Middle Eastern,West European
muffuletta,,0.000636,,,,,,,
crouton,,,,,0.000864,0.000571,,,0.000464
rice,0.360656,0.050691,0.446711,0.018863,0.047537,0.054286,0.091808,0.068835,0.032464
matzos,,,,0.000114,,0.010286,,,
baguettes,,0.000636,,,,0.000571,0.000254,0.000581,0.000928
crusts,0.001261,0.005721,0.000624,0.008352,0.005186,0.009714,0.000761,0.004938,0.006609
corn,0.052963,0.105355,0.033097,0.027555,0.060501,0.068,0.222927,0.028464,0.012754
muffin,,0.003496,0.000416,0.017499,0.009507,0.005714,0.000761,0.001452,0.005217
sorghum,0.008827,0.000636,,0.000455,,0.002857,0.000254,,
rye_bread,,0.002225,0.000624,,0.005186,0.001714,,0.00029,0.001391


# Parser

## Nouns and Measurements/Stopwords

In [103]:
# POS tags kept by preprocess_cbow.
# NOTE(review): 'DT' is the Penn Treebank determiner tag, not a noun tag - confirm it is intentional.
nouns = ['NN','NNS','DT']
        
# Words removed by preprocess_cbow before multi-word tokenization: units,
# preparation terms and other filler. The filter runs on lower-cased text, so
# the capitalised entries ('C', 'L', 'T', 'TB', 'Tbl', 'Tbsp', 'mL') can never
# match. A few entries are repeated ('parts', 'ground', 'dash'); that is
# harmless for a membership test.
measurements = ['cup','cups','C','c','gram','grams','g','kilogram','kilograms','kg','liter','liters','L','l',
               'pound','pounds','lb','milliliter','milliliters','ml','mL','ounce','ounces','oz','pint','pints','pt',
               'teaspoon','teaspoons','t','tsp','tablespoon','tablespoons','T','TB','Tbl','Tbsp','tbsp','quart','quarts','qt',
               'dash','pinch','piece','pieces','slice','slices','sheet','sheets','log','stick','sticks','head',
               'chopped','diced','sliced','cored','seeded','trimmed','chunk','chunks','minced','divided','cut',
               'packed','fresh','large','medium','small','sprig','thin','thinly','thick','inch','well','chilled',
               'half','halved','halves','peeled','crumbled','crushed','dice','packaged','purchased','cubes','package',
               'separated','fine','finely','coarsely','freshly','generous','diameter','stemmed','packages','thaw',
               'thawed','additional','grated','grate', 'pitted','rounds','wedges','skinned','deveined','shredded',
               'unpeeled','optional','preferably','long','frozen','room','temperature','quartered','sometimes','called',
               'reserve','reserved','plus','lengthwise','parts','discarded','parts','bunch','shelled','available',
               'seasonally','markets','bottle','canned','drained','ingredient','ground','whole','round','style',
               'cans','leaves','strips','ground','bag','strip','split','supermarket','supermarkets','specialty',
               'crosswise','size','ring','rings','square','squares','colors','julienne','minute','minutes','dry',
               'chopping','shell','notes','shells','wash','diagonal','accompaniment','string','strings','press',
               'dish','firm','dash','dashes']

# TODO: sprigs, bunches?

# Staple ingredients: ingredient_parser recommends no replacement when every
# token of an ingredient is in this list ('unsalted_butter' is listed twice).
essentials = ['flour','butter','salt','baking_soda','baking_powder','olive_oil','water','kosher_salt',
             'eggs','egg','sugar','brown_sugar','granulated_sugar','garlic','garlic_cloves','garlic_clove',
             'ice','equipment','unsalted_butter', 'unsalted_butter', 'cloves_garlic', 'melted_butter', 'oil',
             'pepper','egg_yolk','egg_white','egg_yolks','egg_whites']


## Functions

In [140]:
# 3.0
# Functions
# check if the replacement word is essentially the same ingredient
def similar(a, b):
    """Return True when `a` and `b` look like the same ingredient name.

    Two names count as similar when either one is a substring of the other,
    or when `a` equals the lower-cased form of `b`.
    """
    return a in b or b in a or a == b.lower()

from numpy.random import choice

def normalize(lst):
    """Lazily rescale the values of `lst` so that they sum to 1.

    Returns an iterator (wrap it in `list` to materialise it); each element is
    `float(x) / sum(lst)`.
    """
    total = sum(lst)
    return (float(value) / total for value in lst)

In [141]:
def preprocess_cbow(text):
    """Turn a raw ingredient string into a list of in-vocabulary noun tokens.

    Steps, in order: lower-case and split on whitespace; break tokens on '-'
    and '/'; strip punctuation; keep purely alphabetic tokens; drop words listed
    in `measurements`; merge multi-word expressions with `tokenizer`; keep
    tokens whose POS tag is in `nouns`; keep tokens present in `w2v`.

    Parameters
    ----------
    text : str
        Ingredient line, e.g. '1/2 cup finely grated Parmesan cheese'.

    Returns
    -------
    list of str
        The cleaned tokens; may be empty.
    """
    # Built once per call instead of once per word.
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))

    pieces = []
    for word in text.lower().split():
        # Split on both separators at once. The previous if/elif handled only
        # one of them, so a token containing both '-' and '/' kept one
        # separator, which the punctuation strip below then removed, fusing
        # two words into one.
        pieces.extend(re.split('[-/]', word))

    pieces = [punct_re.sub('', piece) for piece in pieces]
    pieces = [piece for piece in pieces
              if piece.isalpha() and piece not in measurements]
    pieces = tokenizer.tokenize(pieces)
    tagged = pos_tag(pieces)

    return [token for token, tag in tagged if tag in nouns and token in w2v]

In [142]:
def document_vector(word2vec_model, doc):
    """Return the mean vector of the words of `doc` known to `word2vec_model`.

    Parameters
    ----------
    word2vec_model
        Object supporting `word in model` and `model[list_of_words]`.
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Element-wise mean (axis 0) of the vectors of the in-vocabulary words.

    Raises
    ------
    ValueError
        If none of the words in `doc` is in the model's vocabulary.
    """
    # Filter against the model that was passed in; the previous version checked
    # the notebook-level `w2v` regardless of which model the caller supplied.
    known = [word for word in doc if word in word2vec_model]
    if not known:
        raise ValueError('no in-vocabulary words in document')
    return np.mean(word2vec_model[known], axis=0)

In [143]:
# Name of each ingredient cluster; position i names centroids[i].
ing_types = ['mustards','green_onions','baking_ing','vegetables','herbs','milk',
            'ginger','spices','citrus_juice','whipping_cream','citrus_fruits','baking_adds',
            'grains','cheese','onion','bell_peppers','vinegars','fruits','meats','wine/sauce']

def ingredient_label(ing_vec):
    """Return the name of the cluster whose centroid is most cosine-similar to `ing_vec`.

    Parameters
    ----------
    ing_vec : array-like
        Vector of one ingredient (as produced by `document_vector`).

    Returns
    -------
    str
        The entry of `ing_types` belonging to the best-matching centroid.
    """
    # cosine_similarity expects 2-D (n_samples, n_features) inputs; passing the
    # raw 1-D vectors is rejected by current scikit-learn releases.
    query = np.asarray(ing_vec).reshape(1, -1)

    scored = []
    # Iterate over the label list itself rather than a hard-coded range(20).
    for i, label in enumerate(ing_types):
        centre = np.asarray(centroids[i]).reshape(1, -1)
        score = float(cosine_similarity(centre, query)[0, 0])
        scored.append((score, label))

    # Highest similarity wins; an exact tie falls back to comparing the labels.
    return max(scored)[1]

In [315]:
# Cluster label (from ingredient_label) -> key of food_types / food_dfs to search.
# Labels missing here (e.g. 'baking_ing', 'baking_adds') are treated as not replaceable.
_LABEL_TO_FOOD_TYPE = {
    'fruits': 'fruits',
    'citrus_fruits': 'fruits',
    'vegetables': 'vegetables',
    'bell_peppers': 'vegetables',
    'onion': 'vegetables',
    'grains': 'grains',
    'mustards': 'condiments',
    'vinegars': 'condiments',
    'wine/sauce': 'condiments',
    'citrus_juice': 'condiments',
    'cheese': 'dairy',
    'milk': 'dairy',
    'whipping_cream': 'dairy',
    'meats': 'meat',
    'green_onions': 'seasonings',
    'herbs': 'seasonings',
    'ginger': 'seasonings',
    'spices': 'seasonings',
}


def ingredient_parser(ingredient, cuisine=None):
    """Suggest a replacement for one ingredient line, biased towards a cuisine.

    The ingredient is tokenised with `preprocess_cbow`, embedded with
    `document_vector(w2v, ...)` and labelled with `ingredient_label`. The 15
    nearest `w2v` neighbours that belong to the matching food category and are
    not `similar` to the ingredient itself are weighted by their probability in
    the target cuisine column, and one of them is drawn at random with
    `numpy.random.choice` (no seed is set here, so results vary between calls).

    Parameters
    ----------
    ingredient : str
        Raw ingredient text.
    cuisine : str, optional
        Column of the probability tables to weight candidates by. When omitted,
        the notebook-level `cuisine_filter` is used, as before.

    Returns
    -------
    str
        'Replace <ingredient> with :<name>' or
        'No replacement recommended for : <ingredient>'.
    """
    tokens = preprocess_cbow(ingredient)
    no_replacement = 'No replacement recommended for : ' + ingredient

    try:
        ing_vec = document_vector(w2v, tokens)
    except Exception:
        # Embedding fails when no usable token is left. `except Exception`
        # (rather than a bare `except:`) lets KeyboardInterrupt through.
        return no_replacement

    # Ingredients made up only of staples are never replaced.
    if all(token in essentials for token in tokens):
        return no_replacement

    food_type = _LABEL_TO_FOOD_TYPE.get(ingredient_label(ing_vec))
    if food_type is None:
        # The cluster is not one we are confident swapping within.
        return no_replacement

    category_names = food_types[food_type]
    joined_tokens = ' '.join(tokens)
    candidates = [
        name for name, _score in w2v.most_similar(tokens, topn=15)
        if name in category_names and not similar(name, joined_tokens)
    ]
    if not candidates:
        return no_replacement

    # Resolved only when actually needed, matching the original lookup order.
    target_cuisine = cuisine_filter if cuisine is None else cuisine
    prob_table = food_dfs[food_type]

    weighted = []
    for name in candidates:
        prob = prob_table.loc[name, target_cuisine]
        # NaN means no probability is recorded for this cuisine; a zero weight
        # could never be drawn and would break normalisation if all were zero.
        if not np.isnan(prob) and prob > 0:
            weighted.append((name, prob))
    if not weighted:
        return no_replacement

    names = [name for name, _prob in weighted]
    weights = list(normalize([prob for _name, prob in weighted]))
    replacement = choice(names, 1, p=weights)
    return 'Replace ' + ingredient + ' with :' + replacement[0]
    

## Test Recipes

In [221]:
# Recipe row numbers tried while testing.
# NOTE(review): the "to <cuisine>" part presumably names the cuisine filter
# used for that run - confirm.
# 2013 to African
# 144 to african
# 1785 to west european
# 8582 to latin (pizza)
# 13561 to latin pizza
# Distinct values of df_tags.kmeans_rec; used as the options of the cuisine widgets.
filters = list(df_tags.kmeans_rec.unique())

In [338]:
recipe_num = 13561
cuisine_filter = 'Latin'

# Fetch the selected row once and read its fields from it.
# Note: a later cell defines a function that is also named `recipe_title`.
text_row = df_text.iloc[recipe_num]
recipe_title = text_row.title
ingredient_list = text_row.ingredients
original_cuisine = df_tags.loc[recipe_title].kmeans_rec
ingredient_cbow = df_cbow.iloc[recipe_num].ingredients

In [339]:
# Header for the selected recipe, followed by its numbered ingredient lines.
print('Recipe: ', recipe_title)
print('Original Cuisine: ', original_cuisine)
print('Cuisine Filter: ', cuisine_filter, '\n')

for position, ingredient_text in enumerate(ingredient_list):
    print(position, ':', ingredient_text)

Recipe:  Three-Cheese Pizza with Pancetta and Mushrooms 
Original Cuisine:  West European
Cuisine Filter:  Latin 

0 : Pizza dough
1 : 1/2 cup purchased marinara sauce
2 : 1 cup coarsely grated Fontina cheese
3 : 1/2 cup finely grated Parmesan cheese
4 : 1/3 cup coarsely grated mozzarella cheese
5 : 2 ounces crimini (baby bella) mushrooms, thinly sliced
6 : 2 ounces thinly sliced pancetta (Italian bacon), coarsely chopped


In [342]:
# test on individual ingredient
num = 3
# Pre-tokenised form of the same ingredient; not used by the call below.
ing_item = ingredient_cbow[num]
ing_text = ingredient_list[num]
print((ing_text))

# Uses the notebook-level `cuisine_filter` set in an earlier cell.
ingredient_parser(ing_text)

1/2 cup finely grated Parmesan cheese


'No replacement recommended for : 1/2 cup finely grated Parmesan cheese'

In [344]:
# test on ingredient list
# Output varies between runs: ingredient_parser draws replacements with
# numpy.random.choice and no seed is set.
for ing_item in ingredient_list:
    print(ingredient_parser(ing_item))

Replace Pizza dough with :puff_pastry
No replacement recommended for : 1/2 cup purchased marinara sauce
No replacement recommended for : 1 cup coarsely grated Fontina cheese
No replacement recommended for : 1/2 cup finely grated Parmesan cheese
Replace 1/3 cup coarsely grated mozzarella cheese with :pepper_jack
Replace 2 ounces crimini (baby bella) mushrooms, thinly sliced with :chanterelle
Replace 2 ounces thinly sliced pancetta (Italian bacon), coarsely chopped with :chorizo


# Widget

In [193]:
from ipywidgets import widgets
from IPython.display import display
from ipywidgets import interactive
from ipywidgets import Layout
import traitlets
from ipywidgets import interact, interactive, fixed, interact_manual

In [231]:
def on_submit(Submit):
    """Print the recipe chosen in the `recipe_id` widget when `Submit` is truthy.

    Reads the row number from `recipe_id` and the cuisine from `c_filter`, then
    prints the recipe title, its original cuisine (df_tags.kmeans_rec), the
    chosen cuisine filter and the numbered ingredient list.

    Parameters
    ----------
    Submit : bool
        Nothing happens unless this is truthy.

    Returns
    -------
    int or None
        The selected row number, or None when `Submit` is falsy.
    """
    if not Submit:
        return None

    n = recipe_id.value
    # Local name only: this does not rebind the notebook-level `cuisine_filter`.
    cuisine_filter = c_filter.value

    # Index with the widget value `n`. The previous version read `n` but then
    # used the notebook-level `recipe_num`, so the widget had no effect on
    # which recipe was shown.
    text_row = df_text.iloc[n]
    title = text_row.title
    ingredient_list = text_row.ingredients
    original_cuisine = df_tags.loc[title].kmeans_rec

    print('Recipe: ', title)
    print('Original Cuisine: ', original_cuisine)
    print('Cuisine Filter: ', cuisine_filter, '\n')

    for i in range(len(ingredient_list)):
        print(i, ':', ingredient_list[i])

    return n

def recipe_title(x):
    """Return the title of the recipe at row position `int(x)` of `df_text`."""
    return df_text.iloc[int(x)].title

In [252]:
# Row number of the recipe to look up in df_text.
recipe_id = widgets.IntText(
    value=7,
    description='ID Number:',
    disabled=False,
)

cuisine_list = widgets.ToggleButtons(
    options=filters,
    description='Cuisine Options:',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
)

# TODO: this spot held `cuisine_list.observe(names=)`, an unfinished call that
# is a SyntaxError and stopped the whole cell from running. Register a real
# callback once one exists, e.g. cuisine_list.observe(handler, names='value').

# Cuisine whose probabilities should steer the replacement; read by on_submit.
c_filter = widgets.ToggleButtons(
    options=filters,
    description='Cuisine Options:',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
)


In [253]:
# Two collapsible panels: index 0 holds cuisine_list, index 1 holds c_filter.
accordion = widgets.Accordion(children=[cuisine_list,c_filter]) #, fruit_flavors, more_flavors, descriptor])
accordion.set_title(0, 'Choose a Cuisine:')
accordion.set_title(1, 'Choose a Cuisine Filter:')

In [254]:
display(accordion)
# interact builds a control from the boolean default of `Submit` and calls
# on_submit again whenever it changes; the trailing ';' hides the returned object.
interact(on_submit, Submit=False);
# interact_manual(on_submit, Submit=False);