In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import random
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow_addons as tfa
from data_generator import DataGenerator
from typing import List
from tensorflow.keras.utils import plot_model

In [3]:
m = 200
SOS_CHAR = "^"
# EOS_CHAR = "$"

In [4]:
curr_time_in_seconds = int(time.time())
random_times_since_epoch = [
    time.localtime(random.randrange(0, curr_time_in_seconds)) for _ in range(m)
]

Basic encoder decoder works like this:
training:
- encoder takes as input the encoder input string (January 02, 1990) embedded in ints for with an encoder-specific embedding
- at the SAME TIME, the decoder takes two things as input:
    - the encoder's final state
    - the decoder input string embedded in ints with a decoder-specific embedding (must include EOS and SOS??)

In [37]:
x_encoder_raw = [
    time.strftime("%B %d, %Y", random_time)
    for random_time in random_times_since_epoch
]
def convert_encoder_input_to_ints(raw_encoder_strs: List[str]):
    # TODO: only need to create this one time
    original_char_set = set(
        char
        for string in raw_encoder_strs
        for char in string
    )
    ordered_chars = sorted(original_char_set)
    char_to_shifted_idx = {
        char: idx + 1 # shift the idx so that 0 can be the padding char
        for idx, char in enumerate(ordered_chars)
    }
    converted_encoder_rows = [
        [char_to_shifted_idx[char] for char in string]
        for string in raw_encoder_strs
    ]
    return converted_encoder_rows, ordered_chars
encoder_input_lists, encoder_chars = convert_encoder_input_to_ints(x_encoder_raw)

In [29]:
x_decoder_raw = [
    time.strftime("%Y-%m-%d", random_time)
    for random_time in random_times_since_epoch
]
def convert_decoder_input_to_ints(raw_decoder_strs: List[str]):
    # TODO: shifted to accommodate SOS and EOS... is that right?
    chars_with_padding_tokens = f"-0123456789{SOS_CHAR}"
    char_to_idx = {
        char: idx
        for idx, char in enumerate(chars_with_padding_tokens)
    }
    # TODO: doing string concatenation might be inefficient. Maybe could prepend char_to_idx[SOS_CHAR] to each row instead?
    decoder_rows_as_ints_with_start_token = [
        [char_to_idx[char] for char in string]
        for string in raw_decoder_strs
    ]
    decoder_rows_as_ints_with_start_token_truncated = [
        [char_to_idx[char] for char in SOS_CHAR + string[:-1]] # TODO TODO: how is this legal??
        for string in raw_decoder_strs
    ]
    return (
        decoder_rows_as_ints_with_start_token,
        decoder_rows_as_ints_with_start_token_truncated,
        chars_with_padding_tokens
    )
decoder_input_lists, decoder_target_lists, decoder_chars = convert_decoder_input_to_ints(x_decoder_raw)

In [30]:
X_encoder = pd.DataFrame(encoder_input_lists).fillna(value=0).astype(np.int32)
X_decoder = pd.DataFrame(decoder_input_lists).astype(np.int32)
y_decoder = pd.DataFrame(decoder_target_lists).astype(np.int32) # TODO: is this cool??
encoder_seq_lens = pd.Series([len(row) for row in encoder_input_lists])

In [31]:
embed_size = 64 # TODO: change embed size?

## Encoder
# TODO: figure out size!
encoder_input_layer = keras.layers.Input(shape=[None], dtype=np.int32)
encoder_embeddings = keras.layers.Embedding(
    input_dim=len(encoder_chars) + 1, # TODO: figure out how to do masking properly. mask_zero = True?
    output_dim=embed_size
)(encoder_input_layer)

# TODO: right size for LSTM?
encoder = keras.layers.LSTM(embed_size, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

## Decoder
decoder_input_layer = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_embeddings = keras.layers.Embedding(
    input_dim=len(decoder_chars) + 1, # TODO: why the + 2?
    output_dim=embed_size
)(decoder_input_layer)
decoder = keras.layers.LSTM(embed_size, return_sequences=True) # need to predict full seq!
decoder_outputs = decoder(
    decoder_embeddings,
    initial_state=encoder_state
)

dense_preds = keras.layers.Dense(
    len(decoder_chars) + 1, # TODO: why the +1??
    activation='softmax'
)(decoder_outputs)

model = keras.Model(
    inputs=[
        encoder_input_layer,
        decoder_input_layer
    ],
    outputs=dense_preds
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [36]:
model.fit(
    x=[X_encoder, X_decoder],
    y=y_decoder,
    epochs=50
)

Train on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x138ebe510>

In [45]:
encoder_input_raw = "January 13, 1986"
ints = [encoder_chars.index(char) for char in encoder_input_raw]
padding_len = X_encoder.shape[1] - len(ints)
encoder_input_padded =  ints + [0] * padding_len

# TODO: how do I do inference? I gotta do one character at a time

example_output = model.predict([encoder_input_padded, decoder_input])

-0123456789^


ValueError: substring not found

TODO: this is a more advanced version that I lifted from the textbook.
I'll get to it when I get the basic version working

In [39]:
encoder_input=encoder_input,
decoder_input=decoder_input,
decoder_target=decoder_target,
sequence_lengths=x_eng_seq_lengths

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

encoder_vocab_size = len(encoder_chars) # TODO: seems like I should add 1 here. why?
decoder_vocab_size = len(decoder_chars)
embed_size = 64
# TODO: why this embed size?
# embeddings = keras.layers.Embedding(vocab_size, embed_size)
# TODO: what is the signature of this embeddings? what's going on here
encoder_embeddings = keras.layers.Embedding(
    input_dim=encoder_vocab_size,
    output_dim=embed_size,
    mask_zero=True
)(encoder_inputs)
decoder_embeddings = keras.layers.Embedding(
    input_dim=decoder_vocab_size,
    output_dim=embed_size
)(decoder_inputs)

# TODO: is this the right size of the LSTM?
encoder = keras.layers.LSTM(embed_size, return_state=True)
# TODO: what do these long and short term states mean?
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder = keras.layers.LSTMCell(embed_size)
output_layer = keras.layers.Dense(decoder_vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell,
    sampler,
    output_layer=output_layer
)
# TODO: what does final_sequence_lengths mean?
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    sequence_length=sequence_lengths
)
y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(
    inputs=[
        encoder_inputs,
        decoder_inputs,
        sequence_lengths],
    outputs=[y_proba],
)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [15]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 64)     2496        input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 64)     832         input_5[0][0]                    
____________________________________________________________________________________________

In [40]:
model.fit(
    x=[
        X_encoder,
        X_decoder,
        encoder_seq_lens
    ],
    y=y_decoder
)

Train on 200 samples
 32/200 [===>..........................] - ETA: 20s

InvalidArgumentError:  Trying to access element 11 in a list with 11 elements.
	 [[{{node model_2/basic_decoder_3/decoder/while/body/_10/cond/else/_726/TensorArrayV2Read/TensorListGetItem}}]] [Op:__inference_distributed_function_23552]

Function call stack:
distributed_function


In [None]:
x_eng_seq_lengths = pd.Series([len(e) for e in x_encoder_raw])

INPUT_CHARS = sorted(
    set(
        char
        for string in x_encoder_raw
        for char in string
    )
)
input_char_idx = {char: idx for idx, char in enumerate(INPUT_CHARS)}
encoder_input = (
    pd.DataFrame([
        [input_char_idx[c]+1 for c in string] # it will be padded with zeros
        for string in x_encoder_raw
    ])
    .fillna(value=0)
    .astype("int32")
)

OUTPUT_CHARS = sorted(set(
    char
    for string in x_decoder_raw
    for char in string
))
output_char_idx = {char: idx for idx, char in enumerate(OUTPUT_CHARS)}
decoder_input = pd.DataFrame(
    [SOS_CHAR + row for row in x_decoder_raw]
).astype("int32")
# TODO: don't concatenate strings directly! could be slow
decoder_target = pd.DataFrame(
    self.converter.convert_all([row + EOS_CHAR for row in x_decoder_raw])
).astype("int32")

In [88]:
from random import shuffle
names = ['ben', 'christine', 'johann', 'jana', 'neil', 'unnati']
group_size = 2
shuffle(names)
for idx in range(0, len(names), group_size):
    print(f'Group: {"+".join(names[idx:idx+group_size])}')

Group: johann+ben
Group: christine+neil
Group: unnati+jana
