In [1]:
!export CUDA_VISIBLE_DEVICES=1

In [1]:
import os

import numpy as np

import tensorflow as tf

import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow import keras

Version:  2.8.0
Eager mode:  True
GPU is available


In [2]:
data = open('train_events.txt', 'rb').read().decode(encoding='utf-8')

bad_words = ['Story Start', 'Story End']
filtered_data = []

for line in data:
    if not any(bad_word in line for bad_word in bad_words):
        filtered_data.append(line)
        
data = open('train_events_1.txt', 'rb').read().decode(encoding='utf-8')

for line in data:
    if not any(bad_word in line for bad_word in bad_words):
        filtered_data.append(line)
    

filtered_data = ''.join(filtered_data).lower()

In [3]:
vocab = sorted(set(word_tokenize(filtered_data)))
print(len(vocab))

10877


In [4]:
filtered_data = ''

In [5]:
ids_from_words = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)
words_from_ids = tf.keras.layers.StringLookup(vocabulary=ids_from_words.get_vocabulary(), invert=True, mask_token=None)

Metal device set to: Apple M1


2022-03-13 16:08:15.464361: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-13 16:08:15.464600: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
len(ids_from_words.get_vocabulary())

10878

In [12]:
def get_ids(input_sequence, seq_no):
    print(len(input_sequence))
    for index in range(len(input_sequence)):
        if index%100 == 0:
            print('processed', index)
        input_sequence[index] = ids_from_words(input_sequence[index])
    return input_sequence

In [8]:
def get_features_and_labels(input_sequences):
    X = input_sequences[:,:-1]
    Y = tf.keras.utils.to_categorical(input_sequences[:,-1], num_classes=len(ids_from_words.get_vocabulary()))
    return X, Y

In [11]:
def pad_input_sequence(input_sequences, max_sequence_len):
    print('padding')
    return tf.convert_to_tensor(pad_sequences(input_sequences, maxlen=max_sequence_len))

In [10]:
# Batch size
BATCH_SIZE = 256

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 1024

def get_dataset(X, Y):
    dataset =  tf.data.Dataset.from_tensor_slices((X,Y))
    dataset = (dataset
        .shuffle(BUFFER_SIZE)
        .batch(BATCH_SIZE, drop_remainder=False)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )
    return dataset

In [11]:
class Event2Event(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstmunits = tf.keras.layers.LSTM(rnn_units,
                                       return_sequences=False,
                                       return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.lstmunits.get_initial_state(x)
        
        x, memory_state, carry_state = self.lstmunits(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, memory_state, carry_state
        else:
            return x

In [12]:
# Length of the vocabulary in chars
vocab_size = len(ids_from_words.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [13]:
model = Event2Event(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_words.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units
)

In [14]:
import os

checkpoint_path = "trained_events_model_second_batch_validation_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 save_best_only=True,
                                                 monitor='val_loss',
                                                 mode='min',
                                                 verbose=1)

In [15]:
from pathlib import Path

my_file = Path('saved_sequences/events.npy')

count = 0
max_count = 10000

story_text = list()
input_sequences = list()

if not my_file.is_file():
    with open("train_events.txt", 'rb') as data:
        for line in data:
            line = line.decode(encoding='utf-8').lower().split('\n')[0]
            if line == 'story start':
                story_text = list()
                continue

            if line == 'story end':
                story_text = ''.join(story_text)
                token_list = list(filter(None, story_text.split(' ')))

                for i in range(1, len(token_list)):        
                    n_gram_sequence = token_list[:i+1]        
                    input_sequences.append(n_gram_sequence)

                story_text = list()

            story_text.append(line)

            count += 1
            if(count == max_count):
                print('size of sequences', len(input_sequences))
                input_sequences = get_ids(input_sequences, 1)

                max_sequence_len = max([len(x) for x in input_sequences])
                input_sequences = pad_input_sequence(input_sequences, max_sequence_len)

                with open('saved_sequences/events_1.npy', 'ab') as f:
                    np.save(f, input_sequences.numpy())

                input_sequences = list()
                count = 0

        if(len(input_sequences) > 0):
            input_sequences = get_ids(input_sequences, 1)

            max_sequence_len = max([len(x) for x in input_sequences])
            input_sequences = pad_input_sequence(input_sequences, max_sequence_len)

            with open('saved_sequences/events_1.npy', 'ab') as f:
                    np.save(f, input_sequences.numpy())

In [None]:
from pathlib import Path

my_file = Path('saved_sequences/test_events.npy')

count = 0
max_count = 10000

story_text = list()
input_sequences = list()

if not my_file.is_file():
    with open("train_events_1.txt", 'rb') as data:
        for line in data:
            line = line.decode(encoding='utf-8').lower().split('\n')[0]
            if line == 'story start':
                story_text = list()
                continue

            if line == 'story end':
                story_text = ''.join(story_text)
                token_list = list(filter(None, story_text.split(' ')))

                for i in range(1, len(token_list)):        
                    n_gram_sequence = token_list[:i+1]        
                    input_sequences.append(n_gram_sequence)

                story_text = list()

            story_text.append(line)

            count += 1
            if(count == max_count):
                print('size of sequences', len(input_sequences))
                input_sequences = get_ids(input_sequences, 1)

                max_sequence_len = max([len(x) for x in input_sequences])
                input_sequences = pad_input_sequence(input_sequences, max_sequence_len)

                with open('saved_sequences/test_events.npy', 'ab') as f:
                    np.save(f, input_sequences.numpy())

                input_sequences = list()
                count = 0

        if(len(input_sequences) > 0):
            input_sequences = get_ids(input_sequences, 1)

            max_sequence_len = max([len(x) for x in input_sequences])
            input_sequences = pad_input_sequence(input_sequences, max_sequence_len)

            with open('saved_sequences/test_events.npy', 'ab') as f:
                    np.save(f, input_sequences.numpy())

20026
processed 0
processed 100
processed 200
processed 300
processed 400
processed 500
processed 600
processed 700
processed 800
processed 900
processed 1000
processed 1100
processed 1200
processed 1300
processed 1400
processed 1500
processed 1600
processed 1700
processed 1800
processed 1900
processed 2000
processed 2100
processed 2200
processed 2300
processed 2400
processed 2500
processed 2600
processed 2700
processed 2800
processed 2900
processed 3000
processed 3100
processed 3200
processed 3300
processed 3400
processed 3500
processed 3600
processed 3700
processed 3800
processed 3900
processed 4000
processed 4100
processed 4200
processed 4300
processed 4400
processed 4500
processed 4600
processed 4700
processed 4800
processed 4900
processed 5000
processed 5100
processed 5200
processed 5300
processed 5400
processed 5500
processed 5600
processed 5700
processed 5800
processed 5900
processed 6000
processed 6100
processed 6200
processed 6300
processed 6400
processed 6500
processed 6600
p

In [17]:
with open('saved_sequences/events.npy', 'rb') as f:
    fsz = os.fstat(f.fileno()).st_size
    input_sequences = np.load(f)
    while f.tell() < fsz:
        input_sequences = np.vstack((input_sequences, np.load(f)))

In [18]:
len(input_sequences)

200127

In [19]:
# process input sequences in batches of 50000 out of 200127to save application from crashing
input_sequences = input_sequences[150000:]

In [20]:
loss_fn = tf.losses.CategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate=1e-4)

In [21]:
def text_from_ids(ids):
    return tf.strings.join(words_from_ids(ids), separator=' ')

In [22]:
model.compile(optimizer=optimizer, loss=loss_fn)

In [23]:
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15868bac0>

In [28]:
batch_size = 12500
end_count = len(input_sequences)
histories = [{}]

EPOCHS = 60
epoch = 1 

while epoch <= EPOCHS:
    curr_start = 0
    curr_end = 0
    
    while curr_end < end_count:
    
        curr_start = curr_end
        curr_end = min(curr_start + batch_size, end_count)

        x, y = get_features_and_labels(input_sequences[curr_start:curr_end])

        dataset = get_dataset(x, y)
        
        x, y = None, None
        
        model.fit(dataset, validation_data=test_dataset, epochs=1, callbacks=[cp_callback])
        
    histories[0][epoch] = model.history.history
        
    epoch += 1

2022-03-11 22:50:23.877316: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-11 22:50:24.007473: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


 1/49 [..............................] - ETA: 1:02 - loss: 3.2098

2022-03-11 22:50:24.147247: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 1: val_loss improved from inf to 3.25502, saving model to trained_events_model_second_batch_validation_1/cp.ckpt
Epoch 1: val_loss improved from 3.25502 to 3.07750, saving model to trained_events_model_second_batch_validation_1/cp.ckpt
Epoch 1: val_loss improved from 3.07750 to 2.83741, saving model to trained_events_model_second_batch_validation_1/cp.ckpt
Epoch 1: val_loss did not improve from 2.83741
Epoch 1: val_loss did not improve from 2.83741
Epoch 1: val_loss did not improve from 2.83741
Epoch 1: val_loss did not improve from 2.83741
Epoch 1: val_loss improved from 2.83741 to 2.80749, saving model to trained_events_model_second_batch_validation_1/cp.ckpt
Epoch 1: val_loss did not improve from 2.80749
Epoch 1: val_loss did not improve from 2.80749
Epoch 1: val_loss did not improve from 2.80749
Epoch 1: val_loss did not improve from 2.80749
Epoch 1: val_loss improved from 2.80749 to 2.78314, saving model to trained_events_model_second_batch_validation_1/cp.ckpt
Epoch 1: val_

KeyboardInterrupt: 

In [29]:
import pickle
with open('pickle/secondbatchloss1', 'wb') as f:
    pickle.dump(model.history.history, f)

In [31]:
model.load_weights(checkpoint_path)
checkpoint_path

'trained_events_model_second_batch_validation_1/cp.ckpt'

In [33]:
with open('saved_sequences/events_test.npy', 'rb') as f:
    fsz = os.fstat(f.fileno()).st_size
    input_test_sequences = np.load(f)
    while f.tell() < fsz:
        input_test_sequences = np.vstack((input_sequences, np.load(f)))

In [34]:
len(input_test_sequences)

28500

In [35]:
x_test, y_test = get_features_and_labels(input_test_sequences)

test_dataset = get_dataset(x_test, y_test)

results = model.evaluate(x_test, y_test, batch_size=128)



In [36]:
results = None
x_test, y_test = None, None
input_test_sequences = None
test_dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 19), dtype=tf.int32, name=None), TensorSpec(shape=(None, 10878), dtype=tf.float32, name=None))>

In [37]:
max_sequence_len = max([len(x) for x in input_sequences[:1000]])

In [38]:
max_sequence_len

20

In [39]:
class OneStep(tf.keras.Model):
    def __init__(self, model, words_from_ids, ids_from_words, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.words_from_ids = words_from_ids
        self.ids_from_words = ids_from_words

        # Create a mask to prevent "[UNK]" from being generated.
        skip_ids = self.ids_from_words(['[UNK]'])[:, None]
        sparse_mask = tf.SparseTensor(
            # Put a -inf at each bad index.
            values=[-float('inf')]*len(skip_ids),
            indices=skip_ids,
            # Match the shape to the vocabulary
            dense_shape=[len(ids_from_words.get_vocabulary())])
        self.prediction_mask = tf.sparse.to_dense(sparse_mask)
    
    @tf.function
    def generate_one_step(self, inputs, states=None):
    
        # Run the model.
        # predicted_logits.shape is [batch, char, next_char_logits]
        predicted_logits, state1, state2 = self.model(inputs=inputs, states=states,
                                              return_state=True)
    
        # Return the characters and model state.
        return predicted_logits, [state1, state2]

In [41]:
one_step_model = OneStep(model, words_from_ids, ids_from_words)

In [43]:
test_data = open('test_events.txt', 'rb').read().decode(encoding='utf-8')

corpus = test_data.lower().split("\n")

story_text = []

events_elapsed = []
events_actual_next = []


for line in corpus:
    if line == 'story start':
        story_text = []
        continue
        
    if line == 'story end':
        actual = list(filter(None, story_text[-1].split(' ')))
        events_actual_next.append(actual)
    
        events_elapsed.append([list(filter(None, ''.join(story_text[:-1]).split(' ')))])
        
        # print(words_from_ids(elapsed_tensor))
        # print(actual)
        
        story_text = []
        continue
    
    story_text.append(line)

In [44]:
print(len(events_elapsed))
print(events_elapsed[13])
print(events_actual_next[13])

1500
[['james', 'business', 'has', 'na', 'he', 'services', 'performs', 'neighborhoods', 'james', 'grass', 'cuts', 'na', 'he', 'branches', 'cuts', 'request']]
['james', 'na', 'is', 'happy']


In [48]:
predicted_events = []

for event_elapsed in events_elapsed[1000:1005]:
    
    states = None

    seed_words = [[]]
    for event in event_elapsed[0]:
        seed_words[0].append(event)

    event_predicted_next = []

    for n in range(4):
        seed_w = ids_from_words(seed_words)
        
        inputs = tf.convert_to_tensor(pad_sequences(seed_w, maxlen=max_sequence_len-1))
        
        predicted_logits, states = one_step_model.generate_one_step(inputs, states=states)
        
        predicted_logits = predicted_logits + one_step_model.prediction_mask
        
        predicted_word = words_from_ids(tf.math.argmax(tf.nn.softmax(predicted_logits[0]), 0))
        
        predicted_word = predicted_word.numpy().decode()
        
        print(predicted_word)

        event_predicted_next.append(predicted_word)

        seed_words[0].append(predicted_word)

        seed_words[0] = seed_words[0][-max_sequence_len-1:]
    
    # predicted_events.append(event_predicted_next)

she
it
could
na
na
marks
jail
dog
i
na
've
na
she
it
decided
na
i
na
had
na


In [49]:
# print(len(events_elapsed))
print(len(predicted_events))
# print(len(events_actual_next))

0


In [51]:
tf.saved_model.save(one_step_model, 'one_step_batch_second_validation_1')



2022-03-12 00:17:20.632945: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: one_step_batch_second_validation_1/assets


INFO:tensorflow:Assets written to: one_step_batch_second_validation_1/assets


In [53]:
one_step_reloaded = tf.saved_model.load('one_step_batch_second_validation_1')

2022-03-12 00:17:50.671923: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-12 00:17:50.680176: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-12 00:17:50.694519: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-12 00:17:50.698063: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [54]:
predicted_events = []

for event_elapsed in events_elapsed[940:950]:
    
    states = None

    seed_words = [[]]
    for event in event_elapsed[0]:
        seed_words[0].append(event)

    event_predicted_next = []

    for n in range(4):
        seed_w = ids_from_words(seed_words)
        
        inputs = tf.convert_to_tensor(pad_sequences(seed_w, maxlen=max_sequence_len-1))
        
        predicted_logits, states = one_step_reloaded.generate_one_step(inputs, states=states)
        
        predicted_logits = predicted_logits + one_step_reloaded.prediction_mask
        
        predicted_word = words_from_ids(tf.math.argmax(tf.nn.softmax(predicted_logits[0]), 0))
        
        predicted_word = predicted_word.numpy().decode()

        event_predicted_next.append(predicted_word)

        seed_words[0].append(predicted_word)

        seed_words[0] = seed_words[0][-max_sequence_len-1:]
    
    predicted_events.append(event_predicted_next)

In [55]:
predicted_events

[['counselor', 'na', 'went', 'na'],
 ['monica', 'na', 'felt', 'guilty'],
 ['he', 'planes', 'left', 'dog'],
 ['man', 'him', 'told', 'better'],
 ['cathy', 'na', 'were', 'great'],
 ['they', 'na', 'were', 'na'],
 ['arrow', 'na', 'were', 'able'],
 ['we', 'na', 'were', 'na'],
 ['he', 'na', 'is', 'better'],
 ['i', 'na', 'decided', 'na']]

In [56]:
histories

[{1: {'loss': [2.0643861293792725], 'val_loss': [2.961289405822754]},
  2: {'loss': [2.0032708644866943], 'val_loss': [2.942138195037842]},
  3: {'loss': [1.9470793008804321], 'val_loss': [2.9699578285217285]},
  4: {'loss': [1.911195993423462], 'val_loss': [2.9335718154907227]},
  5: {'loss': [1.861051082611084], 'val_loss': [2.90276837348938]},
  6: {'loss': [1.8483338356018066], 'val_loss': [2.891018867492676]},
  7: {'loss': [1.8335037231445312], 'val_loss': [2.850835084915161]},
  8: {'loss': [1.7970795631408691], 'val_loss': [2.871936559677124]},
  9: {'loss': [1.7511053085327148], 'val_loss': [2.8355655670166016]},
  10: {'loss': [1.723913550376892], 'val_loss': [2.8256797790527344]},
  11: {'loss': [1.726291298866272], 'val_loss': [2.789463758468628]},
  12: {'loss': [1.6716786623001099], 'val_loss': [2.7801589965820312]},
  13: {'loss': [1.661008358001709], 'val_loss': [2.774237871170044]},
  14: {'loss': [1.6238725185394287], 'val_loss': [2.738985776901245]},
  15: {'loss': [

In [68]:
one_step_reloaded.model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x2e44e8700>

In [70]:
one_step_model.model()

<keras.losses.CategoricalCrossentropy at 0x16716f280>