<a href="https://colab.research.google.com/github/carolmanderson/food/blob/master/notebooks/modeling/Colab_Train_basic_LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [3]:
! python -m pip install git+https://github.com/carolmanderson/food.git#egg=food_tools-0.0

Collecting food_tools-0.0
  Cloning https://github.com/carolmanderson/food.git to /tmp/pip-install-1rqnzay0/food-tools-0.0
  Running command git clone -q https://github.com/carolmanderson/food.git /tmp/pip-install-1rqnzay0/food-tools-0.0
Building wheels for collected packages: food-tools, food-tools
  Building wheel for food-tools (setup.py) ... [?25l[?25hdone
  Created wheel for food-tools: filename=food_tools-0.0-cp36-none-any.whl size=7815 sha256=41f15a572b9670eb555b5194efbaa498c5ec3a3744475c5aa6ebdcb1e2be3868
  Stored in directory: /tmp/pip-ephem-wheel-cache-2zq656c7/wheels/42/ae/ff/a56f7557a75ac42d7087b5aeca2ca8317d3fdae397d60faae5
  Building wheel for food-tools (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for food-tools[0m
[?25h  Running setup.py clean for food-tools
[31m  ERROR: Failed cleaning build dir for food-tools[0m
Successfully built food-tools
Failed to build food-tools
Installing collected packages: food-tools
Successfully installed food-tools-0

In [0]:
# Helper functions from the food_tools package installed above.
# NOTE(review): the star imports hide which module provides read_conll_file,
# compile_vocabulary, make_label_map, get_token_embeddings and
# examples_to_indices (all called in later cells) — replace with explicit
# imports once the owning module of each name is confirmed.
from food_tools.training.dataset_utils import *
from food_tools.training.train_utils import *

In [7]:
# Mount Google Drive at /content/drive so the data, embedding and log paths
# used below resolve. Colab-only; prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Location of the training data (CoNLL format) on the mounted Drive.
datafile = "/content/drive/My Drive/NLP_data/recipe_data/food_gold_train.conll"
# NOTE(review): label_column is not passed to any call in the cells shown
# (read_conll_file receives only the path) — confirm whether it is needed.
label_column = 4  # column index holding the labels to be trained on

In [0]:
# read data set
# dataset: the parsed training file; each sentence appears to be a list of
# [token, label] pairs (the dev file read the same way displays like that).
dataset = read_conll_file(datafile)
# Vocabulary and label -> index map are both derived from the training data only.
vocabulary = compile_vocabulary(dataset)
label_to_index = make_label_map(dataset)

In [90]:
# pull out the needed embeddings
# Produces the token -> index map and the embedding matrix that later
# initialises the model's Embedding layer.
embeddings_file = "/content/drive/My Drive/NLP_data/glove.6B.100d.txt"
embedding_dim = 100  # presumably must match the vector size of the file ("100d") — TODO confirm
token_frequency_threshold = 5  # presumably a minimum token count; exact semantics live in get_token_embeddings — TODO confirm
# NOTE(review): the recorded output of this cell is a stream of indices and
# tokens, which looks like leftover debug printing inside the helper.
token_to_index, embeddings = get_token_embeddings(embeddings_file, embedding_dim, vocabulary, token_frequency_threshold)

1996
°
1997
1996
F
1998
1997
F.
1999
1998
Increase
2000
1999
Tbsp
2001
2000
Parmesan
2002
2001
Meanwhile
2003
2002
rockweed
2004
2003


In [67]:
# Reverse lookup (class index -> label string), used to decode predictions.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))
index_to_label

{0: 'I-FOOD', 1: 'O', 2: 'B-FOOD'}

In [0]:
# Reverse lookup (token index -> token string), used to print encoded sentences.
index_to_tokens = dict(zip(token_to_index.values(), token_to_index.keys()))

In [0]:
# map tokens in the data set to their indices
# Each resulting sentence is indexed by 'tokens' and 'labels' downstream
# (see form_matrices), holding integer ids from the two maps passed in.
sentences = examples_to_indices(dataset, label_to_index, token_to_index)

In [0]:
# Model hyperparameters.
sentence_length = None  # passed to Embedding(input_length=...); None leaves the length unspecified
lstm_size = 100  # units of the LSTM that is wrapped in Bidirectional
n_class_labels = len(label_to_index)  # width of the per-token softmax output
max_len = 30  # NOTE(review): not referenced by any cell shown here — remove if unused

In [0]:
# BiLSTM tagger: token ids -> embeddings -> bidirectional LSTM -> per-token softmax.
# shape=(None,) lets every batch carry a different sequence length.
token_input = Input(shape=(None,), dtype='int32', name='token_input')
token_embeddings = Embedding(
    input_length=sentence_length,
    weights=[embeddings],  # start from the embedding matrix loaded earlier
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    name="word_embeddings",
)(token_input)
# return_sequences=True keeps one output vector per token, as tagging requires.
lstm_layer = Bidirectional(LSTM(lstm_size, return_sequences=True), name='BiLSTM')(token_embeddings)
output = TimeDistributed(Dense(n_class_labels, activation='softmax'), name='output_softmax')(lstm_layer)
model = Model(inputs=token_input, outputs=output)
# `lr`, `epsilon=None` and `schedule_decay` are legacy Keras arguments that
# current tf.keras optimizers deprecate or reject. Use `learning_rate` and
# leave the remaining settings at the library defaults.
opt = tf.keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

In [0]:
# TensorBoard log directory (the summary writer itself is created in the training cell).
logdir = "/content/drive/My Drive/NLP_data/tensorboard_logs/recipes"

In [0]:
def form_matrices(sentence):
    """Turn one index-encoded sentence into a batch containing only itself.

    Parameters
    ----------
    sentence : mapping
        Must provide 'tokens' and 'labels' entries (sequences of integer ids
        in the examples shown in this notebook).

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The tokens with a leading batch axis and the labels with a leading
        batch axis plus a trailing axis; for a flat sequence of n items the
        shapes are (1, n) and (1, n, 1).
    """
    token_batch = np.array(sentence['tokens'])[np.newaxis, ...]
    label_batch = np.array(sentence['labels'])[np.newaxis, ..., np.newaxis]
    return token_batch, label_batch


In [44]:
# Train with each sentence as its own batch of one and record the mean
# training loss of every epoch in TensorBoard.
n_epochs = 20
writer = tf.summary.create_file_writer(logdir)
with writer.as_default():
    for epoch in range(n_epochs):
        epoch_losses = []
        for sentence in sentences:
            features, labels = form_matrices(sentence)
            # With no metrics compiled, train_on_batch returns the scalar loss.
            loss = model.train_on_batch(features, labels)
            epoch_losses.append(float(loss))
        if epoch_losses:
            # The loss used to be computed but never written, so the event
            # file stayed empty; log it against the epoch index.
            tf.summary.scalar("train_loss", np.mean(epoch_losses), step=epoch)
        # Flush once per epoch rather than after every sentence.
        writer.flush()





In [0]:
# Load the dev set and encode it with the label/token maps built from the training data.
dev_file = "/content/drive/My Drive/NLP_data/recipe_data/food_gold_dev.conll"
# NOTE(review): label_column repeats the earlier assignment and is not passed
# to read_conll_file below — appears unused; confirm and remove.
label_column = 4  # column index holding the labels
dev_dataset = read_conll_file(dev_file)
dev_sentences = examples_to_indices(dev_dataset, label_to_index, token_to_index)

In [47]:
# Peek at the first encoded dev sentence (its 'labels' and 'tokens' id lists).
dev_sentences[0]

{'labels': [1, 2, 1, 1, 1, 1, 1, 2, 1],
 'tokens': [2001, 169, 58, 1026, 7, 1996, 1996, 856, 4]}

In [94]:
# Decode the model's predictions for the first two dev sentences and print
# them under the index-decoded input tokens.
# NOTE(review): the recorded output decodes the first dev token as 'parmesan'
# although the raw sentence starts with 'Drizzle' — the ids produced by
# examples_to_indices look off; verify the token encoding.
for sent in dev_sentences[:2]:
    # Only the token matrix is a model input; the gold labels that
    # form_matrices also returns must not be handed to predict_on_batch.
    token_matrix = form_matrices(sent)[0]
    preds = model.predict_on_batch(token_matrix)
    preds = np.argmax(preds, axis=-1)  # most probable class index per token
    labels = [index_to_label[ind] for ind in preds[0]]  # preds[0]: the only sentence in the batch
    tokens = [index_to_tokens[tok] for tok in sent['tokens']]
    print(tokens)
    print(labels)



['parmesan', 'oil', 'over', 'dip', 'and', '°', '°', 'vegetables', '.']
['O', 'B-FOOD', 'O', 'O', 'O', 'O', 'O', 'B-FOOD', 'O']
['°', 'duck', 'legs', ',', '°', 'peel', 'UNKNOWN_TOKEN', 'over', 'them', 'and', 'UNKNOWN_TOKEN', '°', 'parsley', 'sprigs', ',', '°', 'roasted', 'pears', 'and', 'onions', '.']
['O', 'B-FOOD', 'I-FOOD', 'O', 'O', 'B-FOOD', 'I-FOOD', 'O', 'O', 'O', 'O', 'O', 'B-FOOD', 'I-FOOD', 'O', 'O', 'B-FOOD', 'I-FOOD', 'O', 'B-FOOD', 'O']


In [0]:
# Debug: build the arrays for the first dev sentence to inspect them below.
# NOTE(review): `l` is easy to misread as `1`; the following cells depend on
# these names, so rename them together if at all.
t, l = form_matrices(dev_sentences[0])

In [51]:
# Shape of the token array returned by form_matrices.
t.shape

(1, 9)

In [52]:
# Shape of the label array; form_matrices adds the leading and trailing axes.
l.shape

(1, 9, 1)

In [53]:
# Token ids of the first dev sentence, as returned by form_matrices.
t

array([[2001,  169,   58, 1026,    7, 1996, 1996,  856,    4]])

In [54]:
# Gold label ids of the first dev sentence, as returned by form_matrices.
l

array([[[1],
        [2],
        [1],
        [1],
        [1],
        [1],
        [1],
        [2],
        [1]]])

In [65]:
# Label -> index map, for reading the label ids shown above.
label_to_index

{'B-FOOD': 2, 'I-FOOD': 0, 'O': 1}

In [72]:
# Raw [token, label] pairs of the first dev sentence, to compare with its encoded form.
dev_dataset[0]

[['Drizzle', 'O'],
 ['oil', 'B-FOOD'],
 ['over', 'O'],
 ['dip', 'O'],
 ['and', 'O'],
 ['serve', 'O'],
 ['with', 'O'],
 ['vegetables', 'B-FOOD'],
 ['.', 'O']]

In [73]:
# Raw [token, label] pairs of the second dev sentence, to compare with the printed predictions.
dev_dataset[1]

[['Serve', 'O'],
 ['duck', 'B-FOOD'],
 ['legs', 'I-FOOD'],
 [',', 'O'],
 ['with', 'O'],
 ['sauce', 'B-FOOD'],
 ['spooned', 'O'],
 ['over', 'O'],
 ['them', 'O'],
 ['and', 'O'],
 ['garnished', 'O'],
 ['with', 'O'],
 ['parsley', 'B-FOOD'],
 ['sprigs', 'I-FOOD'],
 [',', 'O'],
 ['with', 'O'],
 ['roasted', 'B-FOOD'],
 ['pears', 'I-FOOD'],
 ['and', 'O'],
 ['onions', 'B-FOOD'],
 ['.', 'O']]

In [91]:
# Debug: id of 'parmesan', the token the first dev sentence's leading id decoded to.
token_to_index['parmesan']

2001

In [92]:
# Debug: id of lowercase 'drizzle'. The raw dev sentence starts with 'Drizzle'
# but its first encoded id differs from this value.
# NOTE(review): suggests a casing/index mismatch in token encoding — investigate examples_to_indices.
token_to_index['drizzle']

1368

In [93]:
# Debug: confirm which token the reverse map returns for id 2001.
index_to_tokens[2001]

'parmesan'