In [2]:
import argparse
import ast
import pickle
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy import displacy
import tensorflow as tf
from tensorflow.keras import backend as K
from xgboost import XGBClassifier

from food_tools.training.dataset_utils import tokens_to_indices, correct_BIO_encodings

In [3]:
def output_to_displacy(tokens, labels):
    """Convert BIO-tagged tokens into a dict for displaCy's manual "ent" mode.

    The text is rebuilt by writing every token followed by a single space
    (so the text ends with a trailing space), and each entity span is given
    as character offsets into that text.

    Args:
        tokens: sequence of token strings.
        labels: sequence of BIO tags ("B-X", "I-X" or "O") aligned with
            ``tokens``. Pairs are consumed with ``zip``, so surplus items in
            the longer sequence are ignored.

    Returns:
        dict with keys "text" (str), "ents" (list of dicts with "start",
        "end" and "label") and "title" (always None).

    Raises:
        AssertionError: if an "I-" tag continues an open entity whose type
            differs from the tag's type.
        ValueError: if a label is neither "O" nor prefixed "B-" / "I-".
    """
    pieces = []      # tokens actually consumed, joined into the text once
    ents = []
    current = None   # entity currently being extended, if any
    start = 0
    for token, label in zip(tokens, labels):
        pieces.append(token)
        end = start + len(token)
        if label == "O":
            current = None
        elif label.startswith("B-") or (label.startswith("I-") and current is None):
            # An "I-" tag with no open entity is treated as the start of a
            # new entity; otherwise a span without "start"/"label" would be
            # produced.
            current = {"start": start, "end": end, "label": label[2:]}
            ents.append(current)
        elif label.startswith("I-"):
            assert label[2:] == current["label"], \
                "I- tag type does not match the open entity"
            current["end"] = end
        else:
            raise ValueError("Found non-BIO label {}!".format(label))
        start = end + 1  # +1 for the space written after every token
    return {"text": "".join(piece + " " for piece in pieces),
            "ents": ents,
            "title": None}


def load_model(model_path):
    """Load a saved Keras model inside a dedicated TensorFlow graph and session.

    See:
    https://github.com/tensorflow/tensorflow/issues/14356
    https://github.com/tensorflow/tensorflow/issues/28287

    Args:
        model_path: path handed to ``tf.keras.models.load_model``.

    Returns:
        Tuple ``(model, session)``. The session owns the graph the model was
        loaded into; callers re-enter that graph before predicting.
    """
    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with graph.as_default():
        # Register the session with Keras before the model is loaded.
        K.set_session(sess)
        keras_model = tf.keras.models.load_model(model_path)
        keras_model.summary()
    return keras_model, sess


def load_mappings(filepath):
    """Unpickle and return the object stored at ``filepath``.

    Args:
        filepath: path to a pickle file.

    Returns:
        Whatever object the pickle contains.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on files from a trusted source.
    """
    # The context manager closes the file handle even if unpickling fails.
    with open(filepath, "rb") as handle:
        return pickle.load(handle)


def load_sentencizer_and_tokenizer():
    """Build an English pipeline with a sentencizer, plus a tokenizer.

    Returns:
        Tuple ``(nlp, tokenizer)``: the ``English`` pipeline object with a
        "sentencizer" pipe added, and a tokenizer created from that
        pipeline's defaults.
    """
    pipeline = English()
    pipeline.add_pipe(pipeline.create_pipe("sentencizer"))
    return pipeline, pipeline.Defaults.create_tokenizer(pipeline)


def form_matrix(tokens):
    """Return a new array holding ``tokens`` with a leading axis of length 1."""
    batched = np.expand_dims(tokens, 0)
    # np.array copies, so the result does not alias the input.
    return np.array(batched)




In [4]:
# Artifacts of one training run: the Keras model and its pickled mappings.
# NOTE(review): these are absolute, machine-specific paths.
saved_model = "/Users/Carol/Google Drive/nlp_data/output/20200503_16_50_50/20200503_16_50_50_food_ner_epoch_3_dev_f1_0.9867637173043644.h5"
saved_mappings = "/Users/Carol/Google Drive/nlp_data/output/20200503_16_50_50/20200503_16_50_50_food_ner_mappings.pkl"

# Restore the model (in its own graph/session) and the mappings.
model, session = load_model(saved_model)
mappings = load_mappings(saved_mappings)
token_to_index = mappings['token_to_index']
# Invert label -> index so predicted indices can be turned back into labels.
index_to_label = dict(
    (index, label) for label, index in mappings['label_to_index'].items()
)
# The first returned object is the whole pipeline, bound here to `sentencizer`.
sentencizer, tokenizer = load_sentencizer_and_tokenizer()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
token_input (InputLayer)     [(None, None)]            0         
_________________________________________________________________
word_embeddings (Embedding)  (None, None, 100)         40000500  
________

In [5]:
# Recipe text to analyse. A shorter example used to be assigned to `text`
# and then immediately overwritten; it is kept here as an alternative input:
#   "Heat garlic and rosemary with oil. Drizzle oil over dip and serve with vegetables."
text = (
    "Combine pineapple, banana, cream of coconut, rolled "
    "oats, quick-cooking oats, baking powder, mint, "
    "chia seeds, and poppy seeds in a blender; blend until "
    "smooth. Pour into 2 mugs."
)

# Split the text into sentences, then tokenize each sentence separately.
sents = sentencizer(text)
# One list of token strings per sentence.
all_tokens = [[t.text for t in tokenizer(sent.text)] for sent in sents.sents]


In [7]:
def extract_food_terms(tokens, labels):
    """Collect the entities found in BIO-tagged tokens, with their surface text.

    Offsets refer to the text obtained by writing every token followed by a
    single space; an entity's "text" is its tokens joined by single spaces.

    Args:
        tokens: sequence of token strings.
        labels: sequence of BIO tags ("B-X", "I-X" or "O") aligned with
            ``tokens``. Pairs are consumed with ``zip``, so surplus items in
            the longer sequence are ignored.

    Returns:
        List of dicts with keys "start", "end", "label" and "text", in order
        of appearance.

    Raises:
        AssertionError: if an "I-" tag continues an open entity whose type
            differs from the tag's type.
        ValueError: if a label is neither "O" nor prefixed "B-" / "I-".
    """
    spans = []       # (entity dict, list of its tokens), in order of appearance
    current = None   # the span currently being extended, if any
    start = 0
    for token, label in zip(tokens, labels):
        end = start + len(token)
        if label == "O":
            current = None
        elif label.startswith("B-") or (label.startswith("I-") and current is None):
            # An "I-" tag with no open entity is treated as the start of a
            # new entity; otherwise the entity would have no "start" key and
            # building its text would fail with a KeyError.
            current = ({"start": start, "end": end, "label": label[2:]}, [token])
            spans.append(current)
        elif label.startswith("I-"):
            ent, ent_tokens = current
            assert label[2:] == ent["label"], \
                "I- tag type does not match the open entity"
            ent["end"] = end
            ent_tokens.append(token)
        else:
            raise ValueError("Found non-BIO label {}!".format(label))
        start = end + 1  # +1 for the space that follows every token

    ents = []
    for ent, ent_tokens in spans:
        ent["text"] = " ".join(ent_tokens)
        ents.append(ent)
    return ents

In [8]:
final_doc = None   # collects entity results from all sentences
all_terms = []   # collects food terms from all sentences
# Predictions must run in the graph/session the model was loaded into.
with session.graph.as_default():
    K.set_session(session)
    for tokens in all_tokens:
        # Convert the sentence once and reuse the result for prediction.
        token_indices = tokens_to_indices(tokens, token_to_index)
        preds = model.predict([token_indices])
        preds = np.argmax(preds, axis=-1)
        # NOTE(review): assumes every row of `preds` holds a single label
        # index (hence ind[0]) -- confirm against the model's output shape.
        labels = [index_to_label[ind[0]] for ind in preds]
        labels = correct_BIO_encodings(labels)

        terms = [term['text'] for term in extract_food_terms(tokens, labels)]
        all_terms.extend(terms)
        doc = output_to_displacy(tokens, labels)
        if final_doc is None:  # first sentence
            final_doc = doc
            continue
        # Later sentences are appended to the accumulated text, so their
        # entity offsets are shifted by the length of the text so far.
        shift = len(final_doc['text'])
        for ent in doc['ents']:
            ent['start'] += shift
            ent['end'] += shift
            final_doc['ents'].append(ent)
        final_doc['text'] += doc['text']



In [9]:
# Display the food terms collected from every sentence.
all_terms

['pineapple',
 'banana',
 'cream',
 'coconut',
 'oats',
 '-',
 'oats',
 'powder',
 'mint',
 'chia seeds',
 'poppy seeds']

In [10]:
# Display the merged document: rebuilt text plus character-offset entity spans.
final_doc

{'text': 'Combine pineapple , banana , cream of coconut , rolled oats , quick - cooking oats , baking powder , mint , chia seeds , and poppy seeds in a blender ; blend until smooth . Pour into 2 mugs . ',
 'ents': [{'start': 8, 'end': 17, 'label': 'FOOD'},
  {'start': 20, 'end': 26, 'label': 'FOOD'},
  {'start': 29, 'end': 34, 'label': 'FOOD'},
  {'start': 38, 'end': 45, 'label': 'FOOD'},
  {'start': 55, 'end': 59, 'label': 'FOOD'},
  {'start': 68, 'end': 69, 'label': 'FOOD'},
  {'start': 78, 'end': 82, 'label': 'FOOD'},
  {'start': 92, 'end': 98, 'label': 'FOOD'},
  {'start': 101, 'end': 105, 'label': 'FOOD'},
  {'start': 108, 'end': 118, 'label': 'FOOD'},
  {'start': 125, 'end': 136, 'label': 'FOOD'}],
 'title': None}

In [11]:
# Display settings: restrict highlighting to FOOD entities and colour them.
colors = {"FOOD": "#87CEEB"}
options = {"ents": ["FOOD"], "colors": colors}
# Pass the options dict built above (it was previously defined but unused).
# manual=True because final_doc is a hand-built dict, not a spaCy Doc.
displacy.render(final_doc, style="ent", options=options, manual=True)


In [12]:
# load sklearn count vectorizer
def get_tokens(input_cell):
    """Parse the string form of a Python literal back into the value.

    NOTE(review): presumably this must exist under this exact name before
    the pickled vectorizer is loaded, if that vectorizer was fitted with
    ``tokenizer=get_tokens`` -- confirm against the training code.
    """
    parsed = ast.literal_eval(input_cell)
    return parsed

vect_file = "/Users/Carol/Google Drive/nlp_data/models/20200613_vectorizer.p"
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# The context manager makes sure the file handle is closed after loading.
with open(vect_file, "rb") as vect_fh:
    vectorizer = pickle.load(vect_fh)



In [13]:
# load classifier models
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Each file is opened in a context manager so its handle is closed promptly.
vegan_model_file = "/Users/Carol/Google Drive/nlp_data/models/20200613_vegan.p"
with open(vegan_model_file, "rb") as model_fh:
    vegan_model = pickle.load(model_fh)

kosher_model_file = "/Users/Carol/Google Drive/nlp_data/models/20200613_kosher.p"
with open(kosher_model_file, "rb") as model_fh:
    kosher_model = pickle.load(model_fh)

gf_model_file = "/Users/Carol/Google Drive/nlp_data/models/20200613_gluten_free.p"
with open(gf_model_file, "rb") as model_fh:
    gf_model = pickle.load(model_fh)

In [14]:
# The whole term list is passed as ONE document: its string form, e.g.
# "['pineapple', 'banana', ...]".
# NOTE(review): presumably the vectorizer's tokenizer parses that string back
# into a list (see get_tokens) -- confirm against the training code.
features = vectorizer.transform([str(all_terms)])

In [15]:
# Score the single feature row with each classifier. [0][1] takes the first
# row's second column of predict_proba.
# NOTE(review): presumably that column is the positive class -- confirm.
vegan_prob, kosher_prob, gf_prob = (
    clf.predict_proba(features)[0][1]
    for clf in (vegan_model, kosher_model, gf_model)
)

In [16]:
# Display the score taken from the vegan classifier.
vegan_prob

0.052471418

In [17]:
# Display the score taken from the kosher classifier.
kosher_prob

0.85738266

In [18]:
# Display the score taken from the gluten-free classifier.
gf_prob

0.7891858

In [19]:
# Gluten-free score rounded to two decimal places.
round(gf_prob, 2)

0.79

In [24]:
# Kosher score expressed as a whole-number percentage.
int(round(100*kosher_prob))

86