<a href="https://colab.research.google.com/github/carolmanderson/food/blob/master/notebooks/modeling/Train_basic_LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import sys

LOCAL = True   # training on local vs. in Colab

if LOCAL:
    sys.path.append("../..")
    from src.training.dataset_utils import read_conll_file, compile_vocabulary, make_label_map, get_token_embeddings, examples_to_indices, tokens_to_indices
    from src.training.train_utils import generate_batch_indices, get_current_time, form_ner_train_matrices, evaluate_ner
else:
  ! python -m pip install git+https://github.com/carolmanderson/food.git#egg=food_tools-0.8

In [67]:
# When not running locally, pull the same helpers from the installed food_tools package.
if not LOCAL:
    from food_tools.training.dataset_utils import (
        read_conll_file,
        compile_vocabulary,
        make_label_map,
        get_token_embeddings,
        examples_to_indices,
        tokens_to_indices,
    )
    from food_tools.training.train_utils import (
        generate_batch_indices,
        get_current_time,
        form_ner_train_matrices,
        evaluate_ner,
    )

In [68]:
import json
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.models import Model

In [69]:
# Root folder that the input and output paths below are joined onto.
if LOCAL:
    # The default keeps the original machine-specific location; set the
    # FOOD_NER_BASE_PATH environment variable to run on another machine
    # without editing the notebook.
    base_path = os.environ.get("FOOD_NER_BASE_PATH", "/Users/Carol/Google Drive/")
else:
    # Colab: mount Google Drive and work out of its top-level folder.
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = "/content/drive/My Drive/"

In [70]:
# input files: training CoNLL file, dev CoNLL file, and the GloVe embeddings text file,
# each resolved relative to base_path
train_datafile, dev_file, embeddings_file = (
    os.path.join(base_path, relative_path)
    for relative_path in (
        "nlp_data/recipe_data/20200523_food_gold_train.conll",
        "nlp_data/recipe_data/20200523_food_gold_dev.conll",
        "nlp_data/glove.6B.100d.txt",
    )
)

In [71]:
# set output paths
# Each run gets its own directory, named from a fixed date prefix plus get_current_time().
experiment_id = "20200523_" + get_current_time()
outpath = os.path.join(base_path, "nlp_data/output/{}".format(experiment_id))
# exist_ok=True creates the directory if needed without a separate existence check,
# so there is no window between "check" and "create".
os.makedirs(outpath, exist_ok=True)

# The two `{}` placeholders are filled with the epoch number and the dev F1 when a checkpoint is saved.
model_base_name = os.path.join(outpath, experiment_id + '_food_ner_epoch_{}_dev_f1_{}.h5')
metrics_file = os.path.join(outpath,'{}_food_ner_metrics.json'.format(experiment_id))
mapping_file = os.path.join(outpath,'{}_food_ner_mappings.pkl'.format(experiment_id))

In [72]:
# load data: read the train and dev CoNLL files (in that order)
train_dataset, dev_dataset = (
    read_conll_file(conll_path) for conll_path in (train_datafile, dev_file)
)
# the vocabulary is compiled over the train and dev examples together
vocabulary = compile_vocabulary(train_dataset + dev_dataset)

In [73]:
# create mappings from token to index and label to index
embedding_dim = 100
token_frequency_threshold = 5
token_to_index, embeddings = get_token_embeddings(
    embeddings_file,
    embedding_dim,
    vocabulary,
    token_frequency_threshold,
)
# the label map is built from the training set only
label_to_index = make_label_map(train_dataset)

# reverse lookups: index -> token and index -> label
index_to_tokens = {index: token for token, index in token_to_index.items()}
index_to_label = {index: label for label, index in label_to_index.items()}

In [74]:
# save the mappings
# Both lookup tables are pickled together so the saved model's indices can be decoded later.
mappings = {
    "label_to_index": label_to_index,
    "token_to_index": token_to_index,
}
with open(mapping_file, "wb") as mapping_handle:
    pickle.dump(mappings, mapping_handle)

In [75]:
# map tokens and labels in the data set to their indices
# (train first, then dev, using the same label and token maps for both)
train_sentences, dev_sentences = (
    examples_to_indices(dataset, label_to_index, token_to_index)
    for dataset in (train_dataset, dev_dataset)
)

In [76]:
# define model: embedding layer -> bidirectional LSTM -> per-token softmax over the label set
sentence_length = None
lstm_size = 100
n_class_labels = len(label_to_index)

token_input = Input(shape=(None,), dtype='int32', name='token_input')
# The embedding layer is initialised from the `embeddings` matrix built earlier;
# its two dimensions give the layer's input_dim and output_dim.
token_embeddings = Embedding(
    input_length=sentence_length,
    weights=[embeddings],
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    name="word_embeddings",
)(token_input)
# return_sequences=True keeps one output vector per token, as needed for per-token labels.
lstm_layer = Bidirectional(LSTM(lstm_size, return_sequences=True), name='BiLSTM')(token_embeddings)
output = TimeDistributed(Dense(n_class_labels, activation='softmax'), name='output_softmax')(lstm_layer)
model = Model(inputs=token_input, outputs=output)
# `lr`, `epsilon=None` and `schedule_decay` are legacy standalone-Keras arguments that
# current tf.keras optimizers reject. `learning_rate` is the supported name. The other
# two are left at library defaults, which is what the removed values resolved to in the
# TF 2.x legacy Nadam (epsilon 1e-7, schedule decay 0.004) -- NOTE(review): confirm
# against the TensorFlow version in use.
opt = tf.keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)
# Sparse loss: the loss expects integer class ids as targets rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

In [78]:
# training loop
# Trains for up to max_epochs epochs, evaluates on dev and train after every epoch,
# saves a checkpoint whenever dev F1 reaches a new best, and stops early once dev F1
# has not improved for `patience` consecutive epochs.
max_epochs = 50
batch_size = 4
patience = 10  # consecutive epochs without a new best dev F1 before stopping
all_metrics = {}  # epoch as key, [train_metrics, dev_metrics] as value

train_sentence_lengths = [len(sent['tokens']) for sent in train_sentences]  # for smart batching
epochs_without_improvement = 0
max_f1 = 0
# range() excludes its stop value, so `+ 1` is required to really allow max_epochs epochs
# (the previous range(1, max_epochs) capped training at max_epochs - 1).
for epoch in range(1, max_epochs + 1):
    batch_generator = generate_batch_indices(batch_size, train_sentence_lengths)
    for batch in batch_generator:
        batch_sentences = [train_sentences[i] for i in batch]
        # Call the helper under the name the import cells actually bring into scope.
        # The previous name, form_ner_batch_matrices, is not imported anywhere in this
        # notebook and raises NameError on a fresh kernel.
        # NOTE(review): confirm the helper's name and argument list against train_utils.
        features, labels = form_ner_train_matrices(batch_sentences, token_to_index, label_to_index)
        loss = model.train_on_batch(features, labels)
    dev_metrics = evaluate_ner(model, dev_sentences, index_to_label)
    dev_f1 = dev_metrics['weighted avg']['f1-score']
    train_metrics = evaluate_ner(model, train_sentences, index_to_label)
    train_f1 = train_metrics['weighted avg']['f1-score']
    print("epoch {}".format(epoch))
    print("train F1: {}".format(train_f1))
    print("dev F1: {}".format(dev_f1))
    all_metrics[epoch] = [train_metrics, dev_metrics]
    if dev_f1 > max_f1:
        # New best dev score: reset the patience counter and checkpoint the model.
        max_f1 = dev_f1
        epochs_without_improvement = 0
        model_path = model_base_name.format(epoch, dev_f1)
        model.save(model_path)
    else:
        epochs_without_improvement += 1
    if epochs_without_improvement >= patience:
        break

epoch 1
train F1: 0.9760074534383141
dev F1: 0.967805299732772
epoch 2
train F1: 0.9856060967733279
dev F1: 0.9784618789877809
epoch 3
train F1: 0.9913688687508029
dev F1: 0.983538998521265
epoch 4
train F1: 0.9942390004462792
dev F1: 0.9831426082622818
epoch 5
train F1: 0.9948075286813102
dev F1: 0.9825042316793384
epoch 6
train F1: 0.9976164772929389
dev F1: 0.9844670216576327
epoch 7
train F1: 0.9985025481006765
dev F1: 0.9840094021410872
epoch 8
train F1: 0.9981633735007299
dev F1: 0.9840537796600327
epoch 9
train F1: 0.9990824392864771
dev F1: 0.9833022686476892
epoch 10
train F1: 0.999388441976759
dev F1: 0.9842877562463749
epoch 11
train F1: 0.999602512127444
dev F1: 0.9837159383078236
epoch 12
train F1: 0.9995408971473675
dev F1: 0.9831357600115094
epoch 13
train F1: 0.9999694076391074
dev F1: 0.982302320922086
epoch 14
train F1: 1.0
dev F1: 0.9832325382636611
epoch 15
train F1: 0.9999694076391074
dev F1: 0.9832232715247797
epoch 16
train F1: 0.9999694246595793
dev F1: 0.983175

In [79]:
# log metrics
# Serialise the per-epoch [train_metrics, dev_metrics] record straight to the metrics file.
with open(metrics_file, "w") as metrics_handle:
    json.dump(all_metrics, metrics_handle)