<a href="https://colab.research.google.com/github/carolmanderson/food/blob/master/notebooks/modeling/Train_basic_LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import json
import os
import pickle
import sys

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.models import Model

In [2]:
LOCAL = True   # training on local vs. in Colab

if LOCAL:
    sys.path.append("../..")
    from src.training.dataset_utils import read_conll_file, compile_vocabulary, make_label_map, get_token_embeddings, examples_to_indices, tokens_to_indices
    from src.training.train_utils import generate_batch_indices, get_current_time, form_ner_train_matrices, evaluate_ner
else:
  ! python -m pip install git+https://github.com/carolmanderson/food.git#egg=food_tools-0.7


In [3]:

# base_path is the root directory that the data, embedding and output paths
# in the following cells are joined onto.
if LOCAL:
    # FOOD_NER_BASE_PATH lets another machine point at its own data directory
    # without editing the notebook; the original path remains the default.
    base_path = os.environ.get("FOOD_NER_BASE_PATH", "/Users/Carol/Google Drive/")
else:
    # In Colab, mount Google Drive and read everything from there.
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = "/content/drive/My Drive/"


In [4]:
if not LOCAL:
    # Same helpers as the LOCAL branch, taken from the pip-installed package.
    from food_tools.training.dataset_utils import (
        read_conll_file,
        compile_vocabulary,
        make_label_map,
        get_token_embeddings,
        examples_to_indices,
        tokens_to_indices,
    )
    # generate_batch_indices and form_ner_batch_matrices are both called by the
    # training loop but were missing from this branch, so a Colab run failed
    # with NameError.
    # NOTE(review): form_ner_batch_matrices is assumed to be exported by
    # train_utils like the other helpers — confirm.
    from food_tools.training.train_utils import (
        generate_batch_indices,
        get_current_time,
        form_ner_train_matrices,
        form_ner_batch_matrices,
        evaluate_ner,
    )


In [5]:
# Input files: train and dev CoNLL data plus the GloVe embeddings, all
# resolved relative to base_path.
train_datafile, dev_file, embeddings_file = (
    os.path.join(base_path, relative_path)
    for relative_path in (
        "nlp_data/recipe_data/20200523_food_gold_train.conll",
        "nlp_data/recipe_data/20200523_food_gold_dev.conll",
        "nlp_data/glove.6B.100d.txt",
    )
)

In [6]:
# set output paths
# Each run gets its own output directory, named after the experiment id.
experiment_id = "20200523_" + get_current_time()
outpath = os.path.join(base_path, "nlp_data/output/{}".format(experiment_id))
# exist_ok=True replaces the separate exists() check: no check-then-create
# race, and the cell can be re-run without error.
os.makedirs(outpath, exist_ok=True)

# Checkpoint name template; the two remaining {} slots are filled with the
# epoch number and the dev F1 when a model is saved.
model_base_name = os.path.join(outpath, experiment_id + '_food_ner_epoch_{}_dev_f1_{}.h5')
metrics_file = os.path.join(outpath,'{}_food_ner_metrics.json'.format(experiment_id))
mapping_file = os.path.join(outpath,'{}_food_ner_mappings.pkl'.format(experiment_id))

In [7]:
# Read the train and dev CoNLL files, then compile a vocabulary over both.
train_dataset, dev_dataset = (
    read_conll_file(conll_path) for conll_path in (train_datafile, dev_file)
)
vocabulary = compile_vocabulary(train_dataset + dev_dataset)

In [8]:
# Build the token <-> index and label <-> index lookups.
embedding_dim = 100
token_frequency_threshold = 5
token_to_index, embeddings = get_token_embeddings(
    embeddings_file,
    embedding_dim,
    vocabulary,
    token_frequency_threshold,
)
label_to_index = make_label_map(train_dataset)

# Inverse lookups (index -> token / label).
index_to_tokens = {index: token for token, index in token_to_index.items()}
index_to_label = {index: label for label, index in label_to_index.items()}

In [9]:
# Pickle the label and token lookups to mapping_file.
mappings = dict(label_to_index=label_to_index, token_to_index=token_to_index)
with open(mapping_file, "wb") as pickle_out:
    pickle.dump(mappings, pickle_out)

In [10]:
# Convert the tokens and labels of both datasets to their integer indices.
train_sentences, dev_sentences = (
    examples_to_indices(dataset, label_to_index, token_to_index)
    for dataset in (train_dataset, dev_dataset)
)

In [11]:
# Define the model: embeddings -> bidirectional LSTM -> per-token softmax.
sentence_length = None
lstm_size = 100
n_class_labels = len(label_to_index)

# Integer token ids; the sequence dimension is left unspecified.
token_input = Input(shape=(None,), dtype='int32', name='token_input')

# Embedding layer whose initial weights are the `embeddings` matrix.
token_embeddings = Embedding(
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    weights=[embeddings],
    input_length=sentence_length,
    name="word_embeddings",
)(token_input)

# return_sequences=True keeps one output per token for the tagging head.
bilstm = Bidirectional(LSTM(lstm_size, return_sequences=True), name='BiLSTM')
lstm_layer = bilstm(token_embeddings)

# One softmax over the label set at every time step.
softmax_per_token = TimeDistributed(
    Dense(n_class_labels, activation='softmax'),
    name='output_softmax',
)
output = softmax_per_token(lstm_layer)

model = Model(inputs=token_input, outputs=output)
opt = tf.keras.optimizers.Nadam(
    lr=0.002,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    schedule_decay=0.004,
)
# The sparse loss takes integer label indices rather than one-hot vectors.
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy')

W0523 14:13:09.987452 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0523 14:13:10.030029 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0523 14:13:44.475656 140736447558592 deprecation.py:506] From /Users/Carol/anaconda/envs/nlp/lib/python3.6/site-packages/tensorflow_core/python/ops/init_

In [65]:
# training loop
# Trains for up to max_epochs, evaluating on dev and train after every epoch.
# A checkpoint is saved whenever dev F1 reaches a new best, and training stops
# early once dev F1 has not improved for early_stopping_patience epochs in a row.
max_epochs = 50
batch_size = 4
early_stopping_patience = 10  # consecutive epochs without a new best dev F1 before stopping
all_metrics = {}  # epoch as key, [train_metrics, dev_metrics] as value

train_sentence_lengths = [len(sent['tokens']) for sent in train_sentences]  # for smart batching
epochs_without_improvement = 0
max_f1 = 0
for epoch in range(max_epochs):
    # A fresh batch generator is created for every epoch.
    batch_generator = generate_batch_indices(batch_size, train_sentence_lengths)
    for batch in batch_generator:
        # `batch` holds indices into train_sentences.
        batch_sentences = [train_sentences[i] for i in batch]
        features, labels = form_ner_batch_matrices(batch_sentences, token_to_index, label_to_index)
        loss = model.train_on_batch(features, labels)
    dev_metrics = evaluate_ner(model, dev_sentences, index_to_label)
    dev_f1 = dev_metrics['weighted avg']['f1-score']
    train_metrics = evaluate_ner(model, train_sentences, index_to_label)
    train_f1 = train_metrics['weighted avg']['f1-score']
    print("epoch {}".format(epoch))
    print("train F1: {}".format(train_f1))
    print("dev F1: {}".format(dev_f1))
    all_metrics[epoch] = [train_metrics, dev_metrics]
    if dev_f1 > max_f1:
        # New best dev score: reset the patience counter and checkpoint the model.
        max_f1 = dev_f1
        epochs_without_improvement = 0
        model_path = model_base_name.format(epoch, dev_f1)
        model.save(model_path)
    else:
        epochs_without_improvement += 1
    if epochs_without_improvement >= early_stopping_patience:
        break

epoch 0
train F1: 0.9794634666544589
dev F1: 0.9714440771819606
epoch 1
train F1: 0.9891904294112583
dev F1: 0.9819241118174422


KeyboardInterrupt: 

In [0]:
# Write the per-epoch metrics collected in all_metrics to metrics_file as JSON.
with open(metrics_file, "w") as metrics_out:
    json.dump(all_metrics, metrics_out)