In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [None]:
import sys

# Put the training source directory on the import path so that modules under
# it can be imported by later cells.
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory rather than the location of this notebook file.
training_src_dir = "../../src/training"
# Guard the append so that re-running this cell does not pile up duplicates.
if training_src_dir not in sys.path:
    sys.path.append(training_src_dir)

In [None]:
from dataset_utils import *

In [None]:
from train_utils import *

In [None]:
# Index of the column that holds the labels to be trained on.
label_column = 3
# Training file to read (the path points at a CONLL2003 "tiny_train" file).
# NOTE: absolute, machine-specific path.
datafile = "/Users/Carol/Dropbox/Repos/NER-datasets/CONLL2003/tiny_train.txt"

In [None]:
# Read the data set from `datafile`, then build the vocabulary and the label
# map from the parsed data.
# NOTE(review): `label_column` is not passed to any helper in this cell —
# confirm that the reader picks up the intended label column.
dataset = read_conll_file(datafile)
vocabulary = compile_vocabulary(dataset)
label_to_index = make_label_map(dataset)

In [None]:
# Pull out the embeddings needed for this vocabulary from the vector file.
# The helper returns the token -> index map together with the embedding matrix.
token_frequency_threshold = 5
embedding_dim = 100
embeddings_file = "/Users/Carol/Dropbox/Code/Glove/glove.6B.100d.txt"
token_to_index, embeddings = get_token_embeddings(
    embeddings_file,
    embedding_dim,
    vocabulary,
    token_frequency_threshold,
)

In [None]:
# Convert the examples of the data set to index form, using the label map and
# the token map built above.
sentences = examples_to_indices(
    dataset,
    label_to_index,
    token_to_index,
)

In [None]:
# Model hyper-parameters.
n_class_labels = len(label_to_index)  # one output class per entry of the label map
lstm_size = 100                       # size handed to the LSTM layer
sentence_length = None                # None = no fixed sentence length
max_len = 30                          # NOTE(review): not referenced in the cells visible here

In [None]:
# Sequence tagger: token indices -> embeddings -> BiLSTM -> per-token softmax.
token_input = Input(shape=(sentence_length,), dtype='int32', name='token_input')

# Initialise the embedding table from the `embeddings` matrix through the
# documented `embeddings_initializer` argument. The previous `weights=[...]` /
# `input_length=...` keyword arguments are not accepted by every Keras version.
token_embeddings = Embedding(
    input_dim=embeddings.shape[0],
    output_dim=embeddings.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embeddings),
    name="word_embeddings",
)(token_input)

# return_sequences=True keeps one BiLSTM output per token, which the
# time-distributed softmax turns into a per-token distribution over labels.
lstm_layer = Bidirectional(LSTM(lstm_size, return_sequences=True), name='BiLSTM')(token_embeddings)
output = TimeDistributed(Dense(n_class_labels, activation='softmax'), name='output_softmax')(lstm_layer)
model = Model(inputs=token_input, outputs=output)

# `learning_rate` replaces the deprecated `lr` alias. The legacy-only
# `schedule_decay` argument and `epsilon=None` are dropped because current
# optimizers reject them; the library defaults are used instead.
opt = tf.keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999)

# Sparse loss: targets are integer label indices, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt)

In [None]:
# Directory that the TensorBoard summary writer logs to.
# NOTE: absolute, machine-specific path.
logdir = "/Users/Carol/Dropbox/repos/ner/tensorboard_logs"

In [None]:
def form_matrices(sentence):
    """Turn one indexed sentence into a single-example batch.

    Args:
        sentence: mapping with a 'tokens' entry and a 'labels' entry
            (presumably equal-length index sequences — confirm against
            the code that produces `sentences`).

    Returns:
        A `(tokens, labels)` tuple of arrays: `tokens` gains a leading axis
        of size 1, and `labels` gains a leading axis of size 1 plus a
        trailing axis of size 1.
    """
    # NOTE(review): `np` is not imported in the cells visible here; it is
    # presumably provided by one of the star imports — verify.
    token_batch = np.asanyarray(sentence['tokens'])[np.newaxis, ...]
    label_batch = np.asanyarray(sentence['labels'])[np.newaxis, ..., np.newaxis]
    return token_batch, label_batch

In [None]:
# Train one sentence at a time (batch size 1) and log the mean training loss
# of every epoch to TensorBoard.
n_epochs = 20

writer = tf.summary.create_file_writer(logdir)
with writer.as_default():
    for epoch in range(n_epochs):
        epoch_loss = 0.0
        n_batches = 0
        for sentence in sentences:
            features, labels = form_matrices(sentence)
            epoch_loss += float(model.train_on_batch(features, labels))
            n_batches += 1

        # Write a single scalar per epoch: emitting one event per sentence
        # with the same `step` puts many values on one step of the curve.
        # Skip the summary when there was nothing to train on.
        if n_batches:
            tf.summary.scalar("loss", epoch_loss / n_batches, step=epoch)
        # Flush once per epoch so the curve can be followed during training.
        writer.flush()

