In [None]:
import os
import sys
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import tensorflow as tf
tf.executing_eagerly()
tf.get_logger().setLevel('ERROR')
tf.random.set_seed(
    13516013
)

In [36]:
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

from tqdm import tqdm

from stog.utils.params import Params
from stog.data.dataset_builder import dataset_from_params, iterator_from_params
from stog.data.vocabulary import Vocabulary
from stog.training.trainer import Trainer
from stog.data.dataset import Batch
from model.text_to_amr import TextToAMR
from model.glove_embedding import GloveEmbedding

from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, LSTM, concatenate, Bidirectional, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder

In [37]:
params = Params.from_file("../model/model_params.yaml")

  dict_merge.dict_merge(params_dict, yaml.load(f))


In [38]:
data_params = params['data']
dataset = dataset_from_params(data_params)

[2020-03-31 16:22:17,677 ERROR] Model name 'data/bert-base-cased/bert-base-cased-vocab.txt' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese). We assumed 'data/bert-base-cased/bert-base-cased-vocab.txt' was a path or url but couldn't find any file associated to this path or url.
0it [00:00, ?it/s][2020-03-31 16:22:17,680 INFO] Reading instances from lines in file at: ../data/raw/amr.txt.features
[2020-03-31 16:22:17,765 INFO] POS tag coverage: 0.3087 (184/596)
40it [00:00, 466.55it/s]
[2020-03-31 16:22:17,767 ERROR] Model name 'data/bert-base-cased/bert-base-cased-vocab.txt' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese). We assumed 'data/bert-base-cased/bert-base-cased-vocab.txt' was a path or url but couldn'

Building train datasets ...
False
Building dev datasets ...
False
Building test datasets ...
False


[2020-03-31 16:22:17,897 INFO] POS tag coverage: 0.3087 (184/596)
40it [00:00, 634.99it/s]


In [6]:
train_data = dataset['train']
dev_data = dataset.get('dev')
test_data = dataset.get('test')

In [7]:
vocab_params = params.get('vocab', {})
vocab = Vocabulary.from_instances(instances=train_data, **vocab_params)
vocab.save_to_files("../data/processed/serialization")

dataset = Batch(train_data)
dataset.index_instances(vocab)
print(dataset)

for key in dataset.as_tensor_dict():
    print(key)
    content = dataset.as_tensor_dict()[key]
    if isinstance(content, dict):
        for inner_key in content:
            print("  ", inner_key)

train_iterator, dev_iterater, test_iterater = iterator_from_params(vocab, data_params['iterator'])

train_dataset = Batch(train_data)
train_dataset.index_instances(vocab)

test_dataset = Batch(test_data)
test_dataset.index_instances(vocab)

train_dataset = train_dataset.as_tensor_dict()
test_dataset = test_dataset.as_tensor_dict()

[2020-03-31 16:08:40,086 INFO] Fitting token dictionary from dataset.
100%|██████████| 40/40 [00:00<00:00, 2875.17it/s]


<stog.data.dataset.Batch object at 0x7f367e71ff50>
src_tokens
   encoder_tokens
   encoder_characters
src_must_copy_tags
tgt_tokens
   decoder_tokens
   decoder_characters
src_pos_tags
tgt_pos_tags
tgt_copy_indices
tgt_copy_mask
tgt_copy_map
src_copy_indices
src_copy_map
head_tags
head_indices


In [8]:
def create_model_input(encoder_input, decoder_input, generator_input, parser_input):
    token_encoder_input = encoder_input.get('token')
    pos_encoder_input = encoder_input.get('pos_tag')
    mask_encoder_input = encoder_input.get('mask')
    token_decoder_input = decoder_input.get('token')
    pos_decoder_input = decoder_input.get('pos_tag')

    copy_attention_map_input = generator_input.get('copy_attention_maps')
    coref_attention_map_input = generator_input.get('coref_attention_maps')
    vocab_target_input  = generator_input.get('vocab_targets')
    coref_target_input  = generator_input.get('coref_targets')
    copy_target_input  = generator_input.get('copy_targets')

    edge_heads_input = parser_input.get('edge_heads')
    edge_labels_input = parser_input.get('edge_labels')
    parser_mask_input = parser_input.get('mask')
    coref_input = parser_input.get('corefs')

    # token_encoder_input, 
    # pos_encoder_input, 
    # token_decoder_input, 
    # pos_decoder_input, 
    # copy_attention_maps_input, 
    # coref_attention_maps_input,
    # mask_input,
    # edge_heads_input,
    # edge_labels_input,
    # corefs_input,

    model_input = dict(
        token_encoder=token_encoder_input, 
        pos_encoder=pos_encoder_input, 
        token_decoder=token_decoder_input, 
        pos_decoder=pos_decoder_input, 
        copy_attention_map=copy_attention_map_input, 
        coref_attention_map=coref_attention_map_input,
        mask_parser=parser_mask_input, 
        edge_heads=edge_heads_input,
        edge_labels=edge_labels_input,
        coref=coref_input,
        vocab_target=vocab_target_input,
        coref_target=coref_target_input,
        copy_target=copy_target_input,
        mask_encoder=mask_encoder_input
    )
    
    model_input = dict([(e,model_input[e].astype('int32')) for e in model_input])
    
    return model_input


In [9]:
print(vocab)

Vocabulary with namespaces:
 	Non Padded Namespaces: {'coref_tags', 'must_copy_tags'}
 	Namespace: encoder_token_ids, Size: 334 
 	Namespace: encoder_token_characters, Size: 39 
 	Namespace: decoder_token_ids, Size: 283 
 	Namespace: decoder_token_characters, Size: 65 
 	Namespace: pos_tags, Size: 16 
 	Namespace: head_tags, Size: 35 



###### Test Model

In [10]:
text_to_amr = TextToAMR(vocab)
encoder_input, decoder_input, generator_input, parser_input = text_to_amr.prepare_input(train_dataset)

ENCODER_INPUT
bert_token: None
token_subword_index: None
token: (40, 25)
pos_tag: (40, 25)
must_copy_tag: (40, 25)
char: (40, 25, 14)
mask: (40, 25)

DECODER_INPUT
token: (40, 28)
pos_tag: (40, 28)
char: (40, 28, 17)
coref: (40, 28)

GENERATOR_INPUT
vocab_targets: (40, 28)
coref_targets: (40, 28)
coref_attention_maps: (40, 28, 29)
copy_targets: (40, 28)
copy_attention_maps: (40, 25, 27)

PARSER_INPUT
edge_heads: (40, 28)
edge_labels: (40, 28)
corefs: (40, 28)
mask: (40, 28)



In [11]:
print(parser_input.get('edge_heads')[5])

[ 0  1  2  3  4  5  3  2  8  8  8  1 12 12 14  0  0  0  0  0  0  0  0  0
  0  0  0  0]


In [12]:
train_model_input = create_model_input(encoder_input, decoder_input, generator_input, parser_input)

In [13]:
batch_size = 64  # Batch size for training.
ENCODER_LATENT_DIM = 200  # Latent dimensionality of the encoding space.
DECODER_LATENT_DIM = 400
EMBEDDING_OUTPUT_DIM = 100

epochs = 100  # Number of epochs to train for.
num_samples = train_model_input['token_encoder'].shape[0]  # Number of samples to train on.

num_encoder_tokens = train_model_input['token_encoder'].shape[1]
num_decoder_tokens = train_model_input['token_decoder'].shape[1]
encoder_token_vocab_size = vocab.get_vocab_size("encoder_token_ids")
encoder_pos_vocab_size = vocab.get_vocab_size("encoder_token_ids")
decoder_token_vocab_size = vocab.get_vocab_size("decoder_token_ids")
encoder_pos_vocab_size = decoder_pos_vocab_size = vocab.get_vocab_size("pos_tags")

## Encoder Decoder

In [14]:
encoder_decoder_model_input = [train_model_input.get('token_encoder'), \
                               train_model_input.get('pos_encoder'), \
                               train_model_input.get('token_decoder'), 
                               train_model_input.get('pos_decoder')
                              ]
encoder_decoder_model_output = train_model_input.get('vocab_target')

print(encoder_decoder_model_input[0].shape)
print(encoder_decoder_model_output.shape)

(40, 25)
(40, 28)


### Encoder

In [15]:
encoder = Bidirectional(LSTM(
            ENCODER_LATENT_DIM,
            return_sequences=True,
            return_state=True))

In [16]:
def encoder_embedding(token, pos):
    token_embedding = GloveEmbedding(
            encoder_token_vocab_size, num_encoder_tokens)(token)
    pos_embedding = Embedding(
            input_dim=encoder_pos_vocab_size, output_dim=EMBEDDING_OUTPUT_DIM, input_length=num_encoder_tokens, mask_zero=True)(pos)

    return concatenate([token_embedding, pos_embedding])

def encode(x):
    output, forward_h, forward_c, backward_h, backward_c = encoder(
            x)
    state_h = concatenate([forward_h, backward_h])
    state_c = concatenate([forward_c, backward_c])
    states = (state_h, state_c)
    return output, states

### Decoder

In [17]:
decoder = LSTM(
                DECODER_LATENT_DIM,
                return_state=True,
                return_sequences=True)

decoder_linear = Dense(decoder_token_vocab_size)

In [18]:
def decoder_embedding(token, pos):
    token_embedding = GloveEmbedding(
            decoder_token_vocab_size, num_decoder_tokens)(token)
    pos_embedding = Embedding(
            input_dim=decoder_pos_vocab_size, output_dim=EMBEDDING_OUTPUT_DIM, input_length=num_decoder_tokens, mask_zero=True)(pos)

    return concatenate([token_embedding, pos_embedding])


### Loss

In [33]:
def simple_loss(self, probs, generate_targets):
    non_pad_mask = tf.cast(tf.math.not_equal(generate_targets, self.vocab_pad_idx), dtype='int32')

    generate_targets_indices = create_generator_indices(generate_targets)
    generate_target_probs = tf.gather_nd(probs, generate_targets_indices)

    mul_result = generate_target_probs
    likelihood = mul_result
    num_tokens = tf.math.reduce_sum(non_pad_mask)

    likelihood = likelihood + self.eps

    # Drop pads.
    loss = -np.log(likelihood) * tf.cast(non_pad_mask, dtype='float32')

    return loss

In [58]:
# Define an input sequence and process it.

## Encoder Inputs
token_encoder_inputs = Input(shape=(num_encoder_tokens))
pos_encoder_inputs = Input(shape=(num_encoder_tokens))


encoder_inputs = [token_encoder_inputs, pos_encoder_inputs]
embedded_encoder = encoder_embedding(token_encoder_inputs, pos_encoder_inputs)
## Decoder Inputs
token_decoder_inputs = Input(shape=(num_decoder_tokens))
pos_decoder_inputs = Input(shape=(num_decoder_tokens))
decoder_inputs = [token_decoder_inputs, pos_decoder_inputs]
embedded_decoder = decoder_embedding(token_decoder_inputs, pos_decoder_inputs)

# Encoder Embedding
encoder_outputs, encoder_states = encode(embedded_encoder)
# We discard `encoder_outputs` and only keep the states.

# Set up the decoder, using `encoder_states` as initial state.
print(encoder_outputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_outputs, _, _ = decoder(embedded_decoder)
decoder_outputs = decoder_linear(decoder_outputs) 
print(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(encoder_inputs + decoder_inputs, decoder_outputs)

Tensor("bidirectional_1_8/Identity:0", shape=(None, 25, 400), dtype=float32)
Tensor("dense_7_8/Identity:0", shape=(None, 28, 283), dtype=float32)


In [60]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(encoder_decoder_model_input, to_categorical(encoder_decoder_model_output), epochs=epochs)

Train on 40 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

<tensorflow.python.keras.callbacks.History at 0x7f35a2f22c50>

In [65]:
prediction = model.predict(encoder_decoder_model_input)
np.argmax(prediction[5], axis=-1)

array([ 7, 19,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6])