<a href="https://colab.research.google.com/github/carolmanderson/food/blob/master/notebooks/modeling/Train_basic_LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [0]:
# Install the project's food_tools package directly from its GitHub repository.
# NOTE(review): the URL pins no branch, tag or commit, so the installed code can differ between runs.
! python -m pip install git+https://github.com/carolmanderson/food.git#egg=food_tools-0.0

In [0]:
from food_tools.training.dataset_utils import *
from food_tools.training.train_utils import *

In [0]:
# Mount Google Drive at /content/drive; every data, log and model path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Path to the training data file, read with read_conll_file in the next cell.
train_datafile = "/content/drive/My Drive/NLP_data/recipe_data/food_gold_train.conll"
# NOTE(review): label_column is not passed to any call visible in this notebook — confirm the
# reader actually uses the intended column.
label_column = 4  # column index holding the labels to be trained on

In [0]:
# Read the training data, then derive the vocabulary and the label map from it.
# NOTE(review): the structures returned by these food_tools helpers are not visible here; later
# cells use label_to_index as a mapping (.items(), len()) and pass vocabulary to get_token_embeddings.
train_dataset = read_conll_file(train_datafile)
vocabulary = compile_vocabulary(train_dataset)
label_to_index = make_label_map(train_dataset)

In [0]:
# pull out the needed embeddings
embeddings_file = "/content/drive/My Drive/NLP_data/glove.6B.100d.txt"
embedding_dim = 100
token_frequency_threshold = 5
token_to_index, embeddings = get_token_embeddings(embeddings_file, embedding_dim, vocabulary, token_frequency_threshold)

In [0]:
# map tokens in the data set to their indices
train_sentences = examples_to_indices(train_dataset, label_to_index, token_to_index)

In [0]:
# Inverse lookups (index -> label, index -> token) used to turn model output back into strings.
index_to_label = {index: label for label, index in label_to_index.items()}
index_to_tokens = {index: token for token, index in token_to_index.items()}

In [0]:
# Model hyperparameters.
sentence_length = None  # passed as the Embedding layer's input_length
lstm_size = 100  # units of the LSTM that is wrapped in the Bidirectional layer
n_class_labels = len(label_to_index)  # width of the per-token softmax output
max_len = 30  # NOTE(review): not referenced anywhere else in the visible notebook

In [0]:
# Token-level tagger: token indices -> pre-trained embeddings -> BiLSTM -> per-token softmax.
# shape=(None,) lets each batch carry sequences of any length.
token_input = Input(shape=(None,), dtype='int32', name='token_input')
# Embedding layer initialised from the embedding matrix built earlier; its dimensions come
# straight from that matrix so the two can never disagree.
token_embeddings = Embedding(
    input_length=sentence_length,
    weights=[embeddings],
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    name="word_embeddings",
)(token_input)
# return_sequences=True keeps one output vector per token, as required for tagging.
lstm_layer = Bidirectional(LSTM(lstm_size, return_sequences=True), name='BiLSTM')(token_embeddings)
output = TimeDistributed(Dense(n_class_labels, activation='softmax'), name='output_softmax')(lstm_layer)
model = Model(inputs=token_input, outputs=output)
# Use the current argument spelling: `lr`, `epsilon=None` and `schedule_decay` are legacy
# keywords that only the old TF 2.x optimizer tolerated and that newer Keras releases reject.
# Leaving epsilon (and, on legacy versions, the schedule decay) at the library defaults keeps
# the values the previous call resolved to.
opt = tf.keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
# Sparse loss: targets are integer class indices rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

In [0]:
# Directory for TensorBoard logs; passed to tf.summary.create_file_writer in the training cell.
logdir = "/content/drive/My Drive/NLP_data/tensorboard_logs/recipes"

In [0]:
def form_matrices(sentence):
    """Turn one indexed sentence into a single-example batch.

    Args:
        sentence: mapping with 'tokens' and 'labels' entries (array-likes).

    Returns:
        A (tokens, labels) tuple of new numpy arrays: tokens gains a leading
        batch axis, labels gains a leading batch axis and a trailing axis.
        For 1-D inputs of length n the shapes are (1, n) and (1, n, 1).
    """
    token_ids = np.asarray(sentence['tokens'])
    label_ids = np.asarray(sentence['labels'])
    # np.array(...) makes copies, so the returned batches never alias the input.
    token_batch = np.array(token_ids[np.newaxis, ...])
    label_batch = np.array(label_ids[np.newaxis, ..., np.newaxis])
    return token_batch, label_batch


In [0]:
# Train for 20 epochs with one sentence per batch, so no padding is needed.
# The mean training loss of every epoch is written to TensorBoard; previously the writer
# was opened and flushed after every batch but no summary was ever recorded.
writer = tf.summary.create_file_writer(logdir)
with writer.as_default():

    for epoch in range(20):
        epoch_losses = []
        for sentence in train_sentences:
            features, labels = form_matrices(sentence)
            loss = model.train_on_batch(features, labels)
            epoch_losses.append(float(loss))
        # Skip the summary when there was nothing to train on (mean of an empty list is NaN).
        if epoch_losses:
            tf.summary.scalar("train_loss", float(np.mean(epoch_losses)), step=epoch)
        # One flush per epoch is enough to make the new point visible in TensorBoard.
        writer.flush()

In [0]:
# predict on dev set
dev_file = "/content/drive/My Drive/NLP_data/recipe_data/food_gold_dev.conll"
label_column = 4  # column index holding the labels to be trained on
dev_dataset = read_conll_file(dev_file)
dev_sentences = examples_to_indices(dev_dataset, label_to_index, token_to_index)

In [0]:
# Sanity check: decode the model's predictions for the first two dev sentences.
for sent in dev_sentences[:2]:
    # form_matrices returns (tokens, labels); only the token matrix is a model input.
    # Passing the whole tuple would hand the gold labels to the model as a second input.
    features = form_matrices(sent)[0]
    preds = model.predict_on_batch(features)
    # Most probable class index per token.
    preds = np.argmax(preds, axis=-1)
    # The batch holds a single sentence, so row 0 is the whole prediction.
    labels = [index_to_label[ind] for ind in preds[0]]
    tokens = [index_to_tokens[tok] for tok in sent['tokens']]
    print(tokens)
    print(labels)



In [0]:
# Save the trained model to an .h5 file on the mounted Drive.
model_path = '/content/drive/My Drive/NLP_data/recipe_data/20200421_food_ner.h5'
model.save(model_path)