In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.data import Dataset
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, SimpleRNN, GRU, LSTM, LSTMCell, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_addons as tfa

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime

""" Avoid error with Blas:GEMM not initializing when using GPU:
See: https://stackoverflow.com/questions/43990046/tensorflow-blas-gemm-launch-failed
"""
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

tf.random.set_seed(98)
np.random.seed(99)

## Exercise 8

Embedded Reber grammars were used by Hochreiter and Schmidhuber in their paper about LSTMs. They are artificial grammars that produce strings such as “BPBTSXXVPSEPE.” Check out [Jenny Orr’s nice introduction to this topic](https://www.willamette.edu/~gorr/classes/cs449/reber.html). Choose a particular embedded Reber grammar (such as the one represented on Jenny Orr’s page), then train an RNN to identify whether a string respects that grammar or not. You will first need to write a function capable of generating a training batch containing about 50% strings that respect the grammar, and 50% that don’t.

### Generating Reber grammars

We start creating a Reber grammar generator. I'll follow the same structure provided in the link above will allow for tokens to be passed to the class to generate the strings.

Each state will be labelled as follows and has two possible transitions

![Reber Grammar](images\reber.png)

In [2]:
reber_transitions = {
    0: [(1, 'B')],
    1: [(2, 'T'), (3, 'P')],
    2: [(2, 'S'), (4, 'X')],
    3: [(3, 'T'), (5, 'V')],
    4: [(6, 'S'), (3, 'X')],
    5: [(4, 'P'), (6, 'V')],
    6: [(7, 'E')]}

def move_state(cur_state, transitions=reber_transitions):
    """Finds possible next moves in transition table and picks one at a random"""
    paths = transitions[cur_state]
    return paths[np.random.choice(len(transitions[cur_state]), size=1)[0]]
    
def generate_string(string=''):
    """Iterates over possible paths until final state is reached"""
    state = 0
    while state != 7:
        state, char = move_state(state)
        string += char
    return string

def find_next_state(cur_state, char, transitions=reber_transitions):
    """Given a current state and a character in the next state, searches transitions for a state
    with the corresponding next_char, if it exists."""
    for (next_state, next_char) in transitions[cur_state]:
        if next_char == char:
            return next_state, next_char
    return -1, -1

def validate_string(string, transitions=reber_transitions, verbose=False):
    """Iterates through a given string and checks whether the string was generated by some grammar with
    given transitions. 
    
    Probably can be improved
    """
    next_state, next_char = move_state(0, transitions)
    for idx, char in enumerate(string):
        if verbose: print(f"Next State: {next_state}; Testing {idx} : {char} vs {next_char}")
        if char != next_char:
            return 0
        try:
            if verbose: print(f"\tGoing to find next state by accessing {string[idx+1]}")
            next_state, next_char = find_next_state(next_state, string[idx+1])
            if verbose: print(f"\tReturned ({next_state}, {next_char})")
            if next_state == -1:
                return 0
        except IndexError:
            pass # Trying to access out of bounds value, meaning we reached the end of the checks
        if next_state == 7:
            break
    return 1
        

Now let's generate positive classes for our dataset and check they are valid

In [3]:
dataset_size = 10_000
reber_strings = [generate_string() for _ in range(dataset_size)]
reber_labels = [1 for _ in range(dataset_size)]

In [4]:
assert all([validate_string(reber_string) for reber_string in reber_strings]), "Some generated string is NOT REBER!"

That some sample non-reber strings are invalid

In [5]:
not_reber_tests = ['BTSSPXSE', 'BTXXVVSE', 'BPVSPSE', 'BTSSSE', 'BPTVVB']
assert not any([validate_string(not_reber) for not_reber in not_reber_tests]), "One of the test strings was identified as reber"

And let's generate a bunch of random strings and use the function above to mark them as not reber strings.

In [6]:
vocab = 'BEPSTVX'
min_length = len(min(reber_strings, key=len))
max_length = len(max(reber_strings, key=len))

# Generate N random strings with the vocab. Each time strings will have different lengths 
# that are bounded by the min/max size of the reber_lengths
randomly_generated = [''.join(np.random.choice(list(vocab), size=np.random.randint(min_length, max_length)))
                      for _ in range(dataset_size)]
randomly_gen_labels = [validate_string(random_str) for random_str in randomly_generated]

### Create datasets

In [144]:
X = np.concatenate((reber_strings, randomly_generated))
y = np.concatenate((reber_labels, randomly_gen_labels))
print(X.shape, y.shape)

(20000,) (20000,)


In [145]:
np.unique(y, return_counts=True)

(array([0, 1]), array([ 9998, 10002], dtype=int64))

Data looks balanced enough! Now let's convert it to a Tensorflow Dataset and do preprocessing

First we start with tokenizing at character level, converting the characters into numbers and creating a dataset. 

In [19]:
def create_datasets(X, y, batch_size=32):
    tokenizer = Tokenizer(char_level=True, lower=False)
    tokenizer.fit_on_texts(X)
    
    encoded = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post', value=0) # pad with zeros
    
    # Recall RNN inputs have shape [batch_size, time_steps, dimensionality]
    # Need to reshape the data to an appropriate format
    X_full, y_full = padded[..., np.newaxis], y.reshape(-1,1)
    X_train_full, X_test, y_train_full, y_test = train_test_split(X_full, y_full, test_size=0.05)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.05)
        
    train_set = Dataset.from_tensor_slices((X_train, y_train)).shuffle(dataset_size).batch(batch_size).prefetch(1)
    valid_set = Dataset.from_tensor_slices((X_valid, y_valid)).shuffle(dataset_size).batch(batch_size)
    test_set = Dataset.from_tensor_slices((X_test, y_test)).shuffle(dataset_size).batch(batch_size)
    return train_set, valid_set, test_set

In [21]:
train_set, valid_set, test_set = create_datasets(X, y)

### Define and train model

In [38]:
model = Sequential([
    Input(shape=[None, 1], name='Input'),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(1, activation="sigmoid")])

model.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=['accuracy', Precision(), Recall()])

In [39]:
with tf.device('GPU:0'):
    model.fit(train_set, epochs=15, validation_data=valid_set)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Evaluate

In [40]:
loss, acc, pre, rec = model.evaluate(test_set)



# Exercise 9

Train an Encoder–Decoder model that can convert a date string from one format to another (e.g., from “April 22, 2019” to “2019-04-22”).

Let's first generate the dataset

### Data Creation

In [2]:
df = pd.DataFrame({"source":np.arange('1900-01-01', '2021-01-01', dtype='datetime64[D]')})
df["X"] = df["source"].dt.strftime("%B %d, %Y")
df["y"] = df["source"].dt.strftime("%Y-%m-%d")
df = df.sample(frac=1.)  # Resample the dataframe to shuffle the dates

X = df["X"].to_numpy()
y = df["y"].to_numpy()
df.tail()

Unnamed: 0,source,X,y
34536,1994-07-23,"July 23, 1994",1994-07-23
42697,2016-11-25,"November 25, 2016",2016-11-25
36008,1998-08-03,"August 03, 1998",1998-08-03
23587,1964-07-31,"July 31, 1964",1964-07-31
29313,1980-04-04,"April 04, 1980",1980-04-04


### Tokenizing

With the data already cleansed, we can tokenize both X and y with the same object. The `<sos>` and `<eos>` tokens will have ids 0 and 1 respectively.

Since we'll be outputting probabilities and our targets are words, we have no way of calculating metrics during training (maybe?). Thus I'll use a very small test size

*We also have to shift the decoder inputs by 1 so the words we give as inputs are the words that it **should** have output at the previous step* - Not so sure

In [3]:
input_vocab = sorted(list(set(word for entry in X for word in entry)))
output_vocab = sorted(list(set(word for entry in y for word in entry)))

In [4]:
def tokenize_sent(sent, vocab):
    return [vocab.index(char) for char in sent]

def encode_data(data, vocab):
    ids = [tokenize_sent(sent, vocab) for sent in data]
    data = tf.ragged.constant(ids, ragged_rank=1)
    return (data + 1).to_tensor() # will be padding token id

def decode_sequence(seq, vocab):
    return ''.join(vocab[char_id - 1] for char_id in seq)

In [5]:
X_encoded = encode_data(X, input_vocab)
y_encoded = encode_data(y, output_vocab)

In [6]:
train_size = len(X_encoded) * 80 // 100
test_size = (len(X_encoded) - train_size) // 2

In [7]:
X_train, y_train = X_encoded[:train_size], y_encoded[:train_size]
X_valid, y_valid = X_encoded[train_size:train_size + test_size], y_encoded[train_size:train_size + test_size]
X_test, y_test = X_encoded[train_size + test_size:], y_encoded[train_size + test_size:]

In [9]:
BATCH_SIZE = 32
EMBED_SIZE = 8
sos_id = len(output_vocab) + 1 # adding a start of sequence token for the shifted decoder inputs

def get_sequence_lengths(y):
    return np.full([y.shape[0]], y.shape[1])

def shift_sequence(y, fill=sos_id):
    """Shifts target out by 1 to generate decoder inputs with a start of sequence token"""
    return np.c_[np.full((len(y), 1), fill), y[:, :-1]]

def create_dataset(X, y, batch_size=BATCH_SIZE):
    input_ = Dataset.zip((
        Dataset.from_tensor_slices(X), # Encoder inputs
        Dataset.from_tensor_slices(shift_sequence(y)), # Decoder inputs
        Dataset.from_tensor_slices(get_sequence_lengths(y)))) # Target sequence lengths
    target = Dataset.from_tensor_slices(y)
    return Dataset.zip((input_, target)).shuffle(len(X)).batch(batch_size)
    
train_set = create_dataset(X_train, y_train).prefetch(1)
valid_set = create_dataset(X_valid, y_valid)
test_set = create_dataset(X_test, y_test)

### Creating Model

For this Encoder-Decoder Network, we'll be using [tensorflow-addons](https://www.tensorflow.org/addons/tutorials/networks_seq2seq_nmt)

In [10]:
encoder_inputs = Input(shape=[None], dtype=np.int32, name='encoder_inputs')
decoder_inputs = Input(shape=[None], dtype=np.int32, name='decoder_inputs')
sequence_lengths = Input(shape=[], dtype=np.int32, name='sequence_lengths')

# Add 1 to the Embed dimesion due to using 0 padding
encoder_embeddings = Embedding(len(input_vocab) + 1, EMBED_SIZE, name='encoder_embedding')
# And an extra 1 for the SoS token
decoder_embeddings = Embedding(len(input_vocab) + 2, EMBED_SIZE, name='decoder_embedding')

encoder_embeddings = encoder_embeddings(encoder_inputs)
decoder_embeddings = decoder_embeddings(decoder_inputs)

encoder = LSTM(64, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = LSTMCell(64)
output_layer = Dense(len(output_vocab) + 1) # Zero padding
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(decoder_embeddings,
                                                             initial_state=encoder_state,
                                                             sequence_length=sequence_lengths)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
              outputs=[Y_proba],
              name='date_translator')
model.compile(loss="sparse_categorical_crossentropy", optimizer='nadam', metrics=['accuracy'])
model.summary()

Model: "date_translator"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 8)      312         encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 8)      320         decoder_inputs[0][0]             
____________________________________________________________________________________

In [14]:
# setup for tensorboard
import os
from pathlib import Path

fpath = './my_logs/date_translator'
Path(fpath).mkdir(parents=True, exist_ok=True)
root_logdir = os.path.join(os.curdir, fpath)

def get_run_logdir():
    import time
    run_id = f"char_model_batch_size_{BATCH_SIZE}_embed_dim_{EMBED_SIZE}"
    run_id += time.strftime("_%Y_%m_%d-%H")
    return os.path.join(root_logdir, run_id)

callbacks = [TensorBoard(get_run_logdir())]

In [12]:
import os
from tensorboard.plugins import projector

def get_last_modified_folder(loc='./my_logs/date_translator/'):
    files = os.listdir(loc)
    stats = [(loc+fname, os.stat(loc+fname).st_mtime) for fname in files]
    return max(stats, key=lambda x: x[1])[0]

def save_embeddings(embed_layer, name, vocab):
    logdir = get_last_modified_folder()
    with open(os.path.join(logdir, f'{name}_metadata.tsv'), "w") as f:
        for word in vocab:
            f.write(f"{word}\n")
    
    weights = tf.Variable(embed_layer.get_weights()[0])
    checkpoint = tf.train.Checkpoint(embedding=weights)
    checkpoint.save(os.path.join(logdir, f"{name}.ckpt"))
    
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = f"{name}/.ATTRIBUTES/TEST_VALUE"
    embedding.metadata_path = f'{name}_metadata.tsv'
    projector.visualize_embeddings(logdir, config)

In [13]:
with tf.device('CPU:0'):
    model.fit(train_set, epochs=5, validation_data=valid_set,callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Save the embeddings to examine in TensorBoard

In [16]:
embed_layer = model.layers[2]
save_embeddings(embed_layer, embed_layer.name, input_vocab)

embed_layer = model.layers[3]
save_embeddings(embed_layer, embed_layer.name, output_vocab)

In [31]:
np.argmax(y_pred_probas, axis=-1)

array([[ 3, 11,  9, ...,  1,  3,  5],
       [ 3, 11,  6, ...,  1,  4,  5],
       [ 3, 11,  6, ...,  1,  5,  2],
       ...,
       [ 3, 11,  9, ...,  1,  4, 11],
       [ 4,  2,  2, ...,  1,  4,  9],
       [ 3, 11,  3, ...,  1,  4,  3]], dtype=int64)

In [18]:
y_pred_probas = model.predict(test_set)
y_preds = np.argmax(y_pred_probas, axis=2)

In [34]:
actual = [decode_sequence(sent, output_vocab) for sent in y_test]
translated = [decode_sequence(sent , output_vocab) for sent in y_preds]

In [22]:
model.evaluate(test_set)



[0.014049885794520378, 0.9998868703842163]

In [35]:
result_df = pd.DataFrame({"actual": actual, "translation": translated})
result_df.head(20)

Unnamed: 0,actual,translation
0,2018-05-20,1975-12-13
1,2009-10-28,1946-07-23
2,1936-06-16,1948-12-30
3,1954-09-22,1920-12-29
4,1984-06-21,1908-01-23
5,1949-07-06,1982-12-07
6,1921-09-08,2006-01-30
7,1943-11-23,1906-01-03
8,1970-12-08,1923-12-02
9,1911-01-21,1949-04-10


Not sure why the translations are so off even though we have 100% accuracy. Must double check this