In [72]:
import ast
from collections import Counter
import json
import os
import pickle
import sys

sys.path.append("../..")
from src.training.dataset_utils import tokens_to_indices, correct_BIO_encodings
from src.training.train_utils import form_ner_pred_matrix

import numpy as np
import pandas as pd
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import tensorflow as tf

In [2]:
# Load a blank English pipeline and reuse the tokenizer it builds from the
# language defaults. `nlp.tokenizer` is that same tokenizer object, so there is
# no need to call the `Defaults.create_tokenizer` factory directly (the factory
# is a spaCy 2.x API and is not available on spaCy 3).
nlp = English()
tokenizer = nlp.tokenizer

In [3]:
# load food NER model
# Both artifacts of the training run live in the same output directory, so
# derive their paths from `base_path` instead of repeating the absolute prefix.
# The resulting `model_file` / `mappings_file` strings are unchanged.
base_path = "/Users/Carol/Google Drive/"
output_dir = os.path.join(base_path, "nlp_data/output/20200523_22_06_28")
model_file = os.path.join(output_dir, "20200523_22_06_28_food_ner_epoch_5_dev_f1_0.9851520816163143.h5")
mappings_file = os.path.join(output_dir, "20200523_22_06_28_food_ner_mappings.pkl")
model = tf.keras.models.load_model(model_file)

W0525 09:17:42.210789 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0525 09:17:42.216892 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0525 09:17:42.219383 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/ops/init_ops

In [4]:
# load saved mappings for model
# NOTE: unpickling can execute arbitrary code, so only load mapping files that
# were produced by our own training runs.
# The context manager closes the file handle as soon as the pickle is read.
with open(mappings_file, "rb") as mappings_handle:
    mappings = pickle.load(mappings_handle)
label_to_index = mappings['label_to_index']
token_to_index = mappings['token_to_index']
# invert label -> index so predicted class ids can be decoded back to labels
index_to_label = {index: label for label, index in label_to_index.items()}

In [6]:
# load recipe data (data set for kosher/gluten-free/vegan classifier)
# NOTE(review): this is an absolute, user-specific path. The preview of this
# frame shows an `Unnamed: 0` column, presumably a row index that was saved
# with the file -- confirm before relying on it.
infile = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories/20200524_gf_k_v_cats.tsv"
recipe_df = pd.read_csv(infile, sep="\t")

In [7]:
recipe_df.head(2)

Unnamed: 0,title,ingredients,gluten_free,kosher,vegan
0,Mahi-Mahi in Tomato Olive Sauce,"['2 tablespoons extra-virgin olive oil', '1 cu...",False,True,False
1,Ham and Spring Vegetable Salad with Shallot Vi...,"['1 1/2 pounds small red-skinned potatoes, eac...",True,False,False


In [8]:
# Load the full recipe dump. This cell only reads the file; its `directions`
# column is joined onto recipe_df by title in a later cell.
recipe_file = "/Users/Carol/Dropbox/epicurious-recipes-with-rating-and-nutrition/full_format_recipes.json"
full_recipe_df = pd.read_json(recipe_file, orient='records')
# shape of the labelled recipes, as a reference point for the row count after the join
recipe_df.shape

(11022, 5)

In [10]:
# Keep only the first recipe per title so that a title-based left join cannot
# fan a single labelled recipe out into several rows.
deduped = full_recipe_df.drop_duplicates(subset=["title"], keep="first")

In [12]:
# Left-join the directions onto the labelled recipes by title. Columns present
# on both sides keep the recipe_df version unsuffixed; the right-hand copies
# get a "_y" suffix and are dropped by the column selection.
output_columns = ['title', 'ingredients', 'directions', 'gluten_free', 'kosher', 'vegan']
with_directions = recipe_df.merge(deduped, how='left', on='title', suffixes=("", '_y'))[output_columns]
with_directions.head(2)

Unnamed: 0,title,ingredients,directions,gluten_free,kosher,vegan
0,Mahi-Mahi in Tomato Olive Sauce,"['2 tablespoons extra-virgin olive oil', '1 cu...",[Heat oil in heavy large skillet over medium-h...,False,True,False
1,Ham and Spring Vegetable Salad with Shallot Vi...,"['1 1/2 pounds small red-skinned potatoes, eac...",[Cook potatoes and carrots in large pot of boi...,True,False,False


In [13]:
# one entry per recipe row, in frame order
dir_lists = with_directions["directions"].tolist()

In [14]:
def extract_food_terms(tokens, labels):
    """Collect entity spans from BIO-tagged tokens.

    The tokens are treated as if joined by single spaces; every span's
    ``start``/``end`` are character offsets into that joined string.

    Args:
        tokens: sequence of token strings.
        labels: sequence of BIO labels ("B-X", "I-X" or "O"), one per token.
            Pairs are consumed with ``zip``, so extra items on either side
            are ignored.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``label`` and ``text``,
        in order of appearance.

    Raises:
        AssertionError: if an "I-" label continues an open entity that has a
            different label.
        Exception: if a label is not "O" and does not start with "B-" or "I-".
    """
    text = " ".join(tokens)
    ents = []
    current = None  # entity being built, or None when outside an entity
    start = 0
    for token, label in zip(tokens, labels):
        end = start + len(token)
        if label.startswith("B-"):
            if current is not None:
                ents.append(current)
            current = {"start": start, "end": end, "label": label[2:]}
        elif label.startswith("I-"):
            if current is None:
                # Orphaned "I-" (no entity is open, e.g. right after an "O"):
                # start a new entity here. Previously this produced a dict
                # without "start" and failed later with a KeyError.
                current = {"start": start, "end": end, "label": label[2:]}
            else:
                if label[2:] != current["label"]:
                    # explicit raise so the check survives `python -O`
                    raise AssertionError(
                        "I- label {} does not continue open entity {}".format(
                            label, current["label"]))
                current["end"] = end
        elif label == "O":
            if current is not None:
                ents.append(current)
                current = None
        else:
            raise Exception("Found non-BIO label {}!".format(label))
        # +1 accounts for the single space that separates tokens in `text`
        start = end + 1
    if current is not None:
        ents.append(current)
    for ent in ents:
        ent["text"] = text[ent["start"]:ent["end"]]
    return ents

In [24]:
# extract food terms from the cooking directions (didn't work well from ingredient lists, likely due to not looking like training data)
# Produces one list of extracted term strings per recipe, in the same order as dir_lists.
all_food_terms = []  # collect the food terms extracted from each recipe
for dir_list in dir_lists:
    food_terms = []
    # a float here is a missing value (presumably NaN from recipes without a
    # title match in the left join); treat it as "no directions"
    if isinstance(dir_list, float):  # NAs
        dir_list = []
    # each entry of dir_list is tokenized and tagged on its own
    for sent in dir_list:
        tokens = [t.text for t in tokenizer(sent)]
        token_indices = tokens_to_indices(tokens, token_to_index)
        # NOTE(review): one model.predict call per direction string -- likely the slow part of this cell
        preds = model.predict(token_indices)
        # pick the highest-scoring class id along the last axis
        preds = np.argmax(preds, axis=-1)
        # assumes preds has one row per token so ind[0] is that token's label id -- TODO confirm
        labels = [index_to_label[ind[0]] for ind in preds]
        # presumably repairs invalid BIO sequences before span extraction -- verify in dataset_utils
        labels = correct_BIO_encodings(labels)
        food_terms.extend([term['text'] for term in extract_food_terms(tokens, labels)])
    all_food_terms.append(food_terms)

In [29]:
# Fuse each recipe's ingredient list into one string ("item. item. ...") for
# later use. The ingredients column holds the string repr of a Python list,
# so it is parsed back with ast.literal_eval; missing values (floats) yield "".
ing_lists = with_directions["ingredients"].tolist()
all_fused_ingredients = []
for ing_list in ing_lists:
    reloaded_list = [] if isinstance(ing_list, float) else ast.literal_eval(ing_list)
    ings = "".join(part + ". " for part in reloaded_list)
    all_fused_ingredients.append(ings)


In [32]:
len(all_food_terms)

11022

In [33]:
len(all_fused_ingredients)

11022

In [34]:
# Attach the per-recipe results as new columns. Both lists were built by
# iterating over columns of with_directions in row order, so they line up
# with the frame's rows (their lengths are checked in the cells above).
with_directions['food_terms'] = all_food_terms
with_directions['joined_ingredients'] = all_fused_ingredients

In [35]:
# Persist the enriched frame as TSV without the row index.
# NOTE(review): list-valued cells such as `food_terms` are presumably written
# as their string repr, so consumers would have to parse them back (e.g. with
# ast.literal_eval, as done for `ingredients`) -- confirm.
outfile = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories/20200524_gf_k_v_food_terms_extracted.tsv"
with_directions.to_csv(outfile, sep="\t", index=False)

## Explore terms

In [168]:
# Split the per-recipe food-term lists by each dietary flag. Every result is a
# list with one entry per matching recipe, each entry being that recipe's
# list of extracted terms. Rows whose flag is neither True nor False end up in
# neither half.
recipe_terms = with_directions["food_terms"]
is_vegan = with_directions["vegan"]
is_kosher = with_directions["kosher"]
is_gluten_free = with_directions["gluten_free"]

vegan_terms = recipe_terms[is_vegan == True].tolist()
kosher_terms = recipe_terms[is_kosher == True].tolist()
gf_terms = recipe_terms[is_gluten_free == True].tolist()
non_vegan_terms = recipe_terms[is_vegan == False].tolist()
non_kosher_terms = recipe_terms[is_kosher == False].tolist()
non_gf_terms = recipe_terms[is_gluten_free == False].tolist()

In [169]:
def deduplicate_by_recipe(term_list):
    """Remove repeated terms within each recipe.

    Args:
        term_list: iterable of per-recipe term sequences (hashable items).

    Returns:
        A new list of lists; each inner list holds the recipe's distinct
        terms in order of first appearance. Comparison is exact, so terms
        that differ only in case are kept separately.
    """
    # dict.fromkeys keeps first-seen order (CPython 3.6+, guaranteed from
    # 3.7), unlike list(set(...)) whose order varies between runs because of
    # string hash randomization.
    return [list(dict.fromkeys(terms)) for terms in term_list]

def flatten_list(nested_list):
    """Concatenate the inner iterables of nested_list into one flat list (one level deep)."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat

def _terms_for(flag_column, flag_value):
    """Return the flat list of per-recipe-deduplicated food terms for recipes
    whose `flag_column` equals `flag_value`."""
    per_recipe_terms = with_directions.loc[
        with_directions[flag_column] == flag_value, "food_terms"
    ].tolist()
    return flatten_list(deduplicate_by_recipe(per_recipe_terms))


# Recompute every list from with_directions rather than transforming the
# variables in place. Transforming in place made this cell non-idempotent:
# running it a second time fed already-flattened strings to
# deduplicate_by_recipe and silently produced lists of single characters.
vegan_terms = _terms_for("vegan", True)
non_vegan_terms = _terms_for("vegan", False)
kosher_terms = _terms_for("kosher", True)
non_kosher_terms = _terms_for("kosher", False)
gf_terms = _terms_for("gluten_free", True)
non_gf_terms = _terms_for("gluten_free", False)

In [170]:
# Pool the vegan and non-vegan term lists, lower-case them and count how often
# each term occurs.
# NOTE(review): the per-recipe de-duplication is case-sensitive and runs before
# this lower-casing, so a recipe containing both "Salt" and "salt" is counted
# twice here.
all_terms = [term.lower() for term in vegan_terms + non_vegan_terms]
all_counts = Counter(all_terms)

all_unique_terms = list(set(all_terms))
# keep only terms that occur at least 100 times (presumably to keep the
# frequency ratios computed later from being driven by rare terms)
unique_terms_to_use = [term for term in all_unique_terms if all_counts[term] >= 100]
        

In [171]:
len(all_unique_terms)

6301

In [172]:
len(unique_terms_to_use)

280

In [173]:
unique_terms_to_use

['pineapple',
 'celery',
 'zucchini',
 'sour cream',
 'bell pepper',
 'pound',
 'butter',
 'grain',
 'cabbage',
 'kosher',
 'parmesan',
 'flavor',
 'lemon zest',
 'turkey',
 'yogurt',
 'seeds',
 'whole',
 'red',
 'orange juice',
 'ginger',
 'crème fraîche',
 'seasoned',
 'turmeric',
 'vegetables',
 'caramel',
 'black pepper',
 'lettuce',
 'pears',
 'raisins',
 'bite',
 'berries',
 'horseradish',
 'dill',
 'cider',
 'broccoli',
 'parsley',
 'wine',
 'vegetable',
 'green',
 'coffee',
 'salt',
 'cookies',
 'seasoning',
 'zest',
 'bell',
 'broth',
 'cubes',
 'glaze',
 'shell',
 'scallions',
 'pie',
 'spinach',
 'brandy',
 'beans',
 'cucumber',
 'peppers',
 'vinaigrette',
 'oregano',
 'bread',
 'cookie',
 'squash',
 'herbs',
 'olive',
 'salad',
 'scallion',
 'ricotta',
 'pepper',
 'charred',
 'blackened',
 'pastry',
 'fennel',
 'potatoes',
 'pine',
 'drippings',
 '8–10',
 'sausage',
 'ribs',
 'crumbs',
 'fish',
 'rum',
 'cayenne',
 'fruit',
 'fennel seeds',
 'lamb',
 '/',
 'sea',
 'almonds'

In [174]:
def get_over_and_under_represented_terms(terms_a, terms_b, terms_to_check, how_many,
                                         zero_division_ratio=1000):
    """Rank terms by how much more often they occur in terms_a than in terms_b.

    Both term lists are lower-cased before counting; `terms_to_check` is looked
    up as given, so it should already be lower-case.

    Args:
        terms_a: iterable of term strings for the group of interest.
        terms_b: iterable of term strings for the comparison group.
        terms_to_check: terms to compute the count(a) / count(b) ratio for.
        how_many: number of entries to return at each end of the ranking.
        zero_division_ratio: ratio assigned when a term never occurs in
            terms_b (including when it occurs in neither list). Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        A pair ``(over, under)`` of lists of ``(ratio, term)`` tuples: ``over``
        holds the `how_many` highest ratios in descending order, ``under`` the
        `how_many` lowest in ascending order. Both are empty when `how_many`
        is not positive.
    """
    if how_many <= 0:
        # Guard the slice below: with how_many == 0, `[-0:]` is the whole list,
        # so "under" used to come back containing every term.
        return [], []

    a_counts = Counter(term.lower() for term in terms_a)
    b_counts = Counter(term.lower() for term in terms_b)

    freq_dict = {}
    for term in terms_to_check:
        if b_counts[term]:
            freq_dict[term] = a_counts[term] / b_counts[term]
        else:
            freq_dict[term] = zero_division_ratio

    freq_results = [(ratio, term) for term, ratio in freq_dict.items()]
    # sort on the ratio only; the stable sort keeps ties in terms_to_check order
    freq_results.sort(key=lambda pair: pair[0], reverse=True)
    return freq_results[:how_many], freq_results[-how_many:][::-1]
    
over, under = get_over_and_under_represented_terms(vegan_terms, non_vegan_terms, unique_terms_to_use, 10)

In [175]:
under

[(0.0, 'scallops'),
 (0.0, 'yolks'),
 (0.0, 'yolk'),
 (0.0, 'bacon'),
 (0.0, 'chops'),
 (0.0, 'beef'),
 (0.0, 'breast'),
 (0.0, 'buttermilk'),
 (0.0, 'salmon'),
 (0.0, 'egg yolk')]

In [176]:
over

[(0.4482758620689655, 'white pith'),
 (0.43636363636363634, 'eggplant'),
 (0.4189189189189189, 'broccoli'),
 (0.3951612903225806, 'sesame'),
 (0.37628865979381443, 'vinaigrette'),
 (0.3684210526315789, 'oranges'),
 (0.35185185185185186, 'sesame seeds'),
 (0.3388888888888889, 'zucchini'),
 (0.33035714285714285, 'beets'),
 (0.3181818181818182, 'turmeric')]

In [177]:
get_over_and_under_represented_terms(kosher_terms, non_kosher_terms, unique_terms_to_use, 10)

([(5.944444444444445, 'frosting'),
  (5.4411764705882355, 'cookies'),
  (4.611111111111111, 'blueberries'),
  (4.571428571428571, 'powdered sugar'),
  (4.473684210526316, 'oats'),
  (4.277511961722488, 'vanilla'),
  (4.08, 'pudding'),
  (4.016393442622951, 'cake'),
  (4.0, 'meringue'),
  (3.984375, 'ice cream')],
 [(0.006211180124223602, 'bacon'),
  (0.007380073800738007, 'shrimp'),
  (0.00980392156862745, 'scallops'),
  (0.01904761904761905, 'gelatin'),
  (0.019230769230769232, 'pork'),
  (0.045871559633027525, 'ham'),
  (0.05517241379310345, 'cocktail'),
  (0.05673758865248227, 'sausage'),
  (0.11818181818181818, 'chops'),
  (0.14388489208633093, 'drippings')])

In [178]:
# Investigate the underrepresentation of cocktail in kosher recipes
# Use a separate recipe-level name for the nested term lists: rebinding
# `non_kosher_terms` here would replace the flattened list used by the ratio
# analysis and make a re-run of that analysis fail (`term.lower()` on a list).
non_kosher_recipes = with_directions[with_directions.kosher==False]
non_kosher_recipe_terms = non_kosher_recipes['food_terms'].tolist()
non_kosher_directions = non_kosher_recipes['directions'].tolist()

for terms, directions in zip(non_kosher_recipe_terms, non_kosher_directions):
    terms = [t.lower() for t in terms]
    if "cocktail" in terms:
        print(directions)
        print("====")

['Grate coconut in bowl. Add peeled and sectioned oranges; discard membrane and seeds. Add pineapple, fruit cocktail, apples, bananas, and nuts. Sweeten with sugar, being careful not to add too much as the syrup from the fruit is usually sufficient. Chill before serving. This dessert is pretty garnished with cherries and served in a crystal bowl.']
====
['Preheat the oven to 200°C (400°F/gas mark 6). Brush a baking tray with the oil.', 'Cut each bacon rasher in half lengthways. Lay a strip of bacon on a chopping board, place a dried prune at the top edge and roll, wrapping it tightly around the prune. Pierce a cocktail stick through the centre of the wrapped prune to secure. Repeat with the remaining bacon and prunes.', 'Place the prunes on a the prepared baking tray and bake them for 10–15 minutes, turning over half way through cooking, until the bacon is slightly crispy on both sides.']
====
['1. Rinse the shrimp under cold running water, then blot them dry with paper towels.', '2. P

In [179]:
get_over_and_under_represented_terms(gf_terms, non_gf_terms, unique_terms_to_use, 10)

([(1.7009966777408638, 'potatoes'),
  (1.6808510638297873, 'white pith'),
  (1.5476190476190477, 'gelatin'),
  (1.4953271028037383, 'vinaigrette'),
  (1.4833333333333334, 'beets'),
  (1.4406779661016949, 'asparagus'),
  (1.375, 'squash'),
  (1.3673469387755102, 'turmeric'),
  (1.3275862068965518, 'radishes'),
  (1.2523364485981308, 'chile')],
 [(0.0, 'breadcrumbs'),
  (0.019417475728155338, 'soy'),
  (0.03, 'toasts'),
  (0.031446540880503145, 'crumbs'),
  (0.035147392290249435, 'flour'),
  (0.036585365853658534, 'meal'),
  (0.037900874635568516, 'pasta'),
  (0.04081632653061224, 'cocktail'),
  (0.04329004329004329, 'tart'),
  (0.04377104377104377, 'soda')])

In [180]:
# Investigate the underrepresentation of cocktail in gluten free recipes
# Use a separate recipe-level name for the nested term lists: rebinding
# `non_gf_terms` here would replace the flattened list used by the ratio
# analysis and make a re-run of that analysis fail (`term.lower()` on a list).
non_gf_recipes = with_directions[with_directions.gluten_free==False]
non_gf_recipe_terms = non_gf_recipes['food_terms'].tolist()
non_gf_directions = non_gf_recipes['directions'].tolist()

for terms, directions in zip(non_gf_recipe_terms, non_gf_directions):
    terms = [t.lower() for t in terms]
    if "cocktail" in terms:
        print(directions)
        print("====")

['In a saucepan combine the cranberries, the pear, the brown sugar, the dried fruit, the zest, the cinnamon stick, the cranberry juice cocktail, the vanilla, and a pinch of salt and simmer the mixture for 10 minutes. Strain the mixture through a sieve into a bowl, reserving the solids, and transfer the cranberry liquid to a shallow dish. Line a 1 1/2-quart soufflé dish with plastic wrap and trim enough of the bread slices to cover the bottom and the side of the dish. Soak the trimmed slices in the cranberry liquid and line the bottom and the side of the soufflé dish with them.', 'Discard the zest and the cinnamon stick from the reserved cranberry solids and add the solids to the soufflé dish. Trim enough of the remaining bread slices to cover the cranberry solids, soak the trimmed slices in the cranberry liquid, and top the cranberry solids with them. Cover the pudding with plastic wrap, put a plate on top of the plastic wrap, and weight the pudding with three 1-pound cans. Chill the p

In [182]:
# Investigate the underrepresentation of soy in gluten free recipes
# Use a separate recipe-level name for the nested term lists: rebinding
# `non_gf_terms` here would replace the flattened list used by the ratio
# analysis and make a re-run of that analysis fail (`term.lower()` on a list).
non_gf_recipes = with_directions[with_directions.gluten_free==False]
non_gf_recipe_terms = non_gf_recipes['food_terms'].tolist()
non_gf_directions = non_gf_recipes['directions'].tolist()

for terms, directions in zip(non_gf_recipe_terms, non_gf_directions):
    terms = [t.lower() for t in terms]
    if "soy" in terms:
        print(directions)
        print("====")
        # also show the extracted terms to see how "soy sauce" was split
        print(terms)
        print("========")

['Cook rice with a pinch of salt according to package directions.', 'Set a steamer basket in a medium pot filled with 1" water, cover pot, and bring water to a boil. Uncover and steam broccoli until crisp-tender, 5–6 minutes. Drain and set aside.', 'Meanwhile, whisk cornstarch, pepper, and 1 1/2 tsp. salt in a large bowl. Add chicken and toss to coat.', 'Heat 2 Tbsp. vegetable oil in a large skillet over high. Add half of the chicken and cook, turning occasionally, until chicken is cooked through and a light brown crust forms, about 5 minutes. Transfer chicken to a plate. Heat remaining 2 Tbsp. vegetable oil; cook remaining chicken and transfer to plate.', 'Combine honey, soy sauce, vinegar, Sriracha, garlic, ginger, sesame oil, 1 Tbsp. sesame seeds, and remaining 1/4 tsp. salt in a medium bowl. Transfer sauce to skillet and cook 1 minute to reduce slightly. Return chicken to skillet, toss in sauce to coat, and cook until sauce is reduced and thickened, about 2 minutes. Fold in reserve

['Mix together the soy sauce, ginger, and mirin in a large bowl to make the marinade. Reserve 1/2 cup of the marinade and set aside. Lay the squid in the remaining liquid, gently turning 4 times to coat all over. Marinate the squid for 10 minutes at room temperature, turning once.', "Preheat a grill to hot. Grill the squid for about 4 minutes (5 minutes if they're larger than 1 ounce apiece). Every 30 seconds flip the squid and brush on the reserved marinade. The squid will turn from translucent to white when they're ready, becoming tender and releasing a rich aroma. Be careful not to overcook; squid turns rubbery if grilled too long. Serve immediately."]
====
['soy', 'sauce', 'ginger', 'mirin', 'marinade', 'marinade', 'squid', 'squid', 'squid', 'squid', 'marinade', 'squid', 'white', 'rich', 'squid', 'rubbery']
['Whisk all ingredients in small bowl to blend.', 'Mix oyster sauce, Sherry, tamari soy sauce, minced garlic, and brown sugar in medium bowl. Stir in steak. Marinate steak at le