In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native logging layer, so it
# has to be in the environment *before* tensorflow is imported; setting it
# afterwards can be too late to silence the messages emitted during import.
# '2' hides INFO and WARNING output and keeps errors visible.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Retrieve the data: download the corpus archive into <cache_dir>/<cache_subdir>
# and ask Keras to extract it there.
frank_url = 'https://storage.googleapis.com/acg-datasets/tiny_frankenstein.tgz'
cache_dir = '.'
cache_subdir = "data"
tf.keras.utils.get_file(
    fname='tiny_frankenstein.tgz',
    origin=frank_url,
    cache_dir=cache_dir,
    cache_subdir=cache_subdir,
    extract=True,
)


In [None]:
# Load the data

frank_file = os.path.join(cache_dir, cache_subdir, 'tiny_frankenstein.txt')
# Name the encoding explicitly: without it, open() decodes with the platform's
# locale encoding, so the same file can load differently (or fail) on another
# machine. Assumes the corpus is UTF-8 encoded — TODO confirm.
with open(frank_file, 'r', encoding='utf-8') as f:
    # Lower-case everything so the vocabulary is case-insensitive.
    frank_data = f.read().lower()

In [None]:
# Train a model to generate text

# Tokenize the data: build the word -> integer id vocabulary from the corpus.
tokenizer = Tokenizer()
# Disabled alternative: fit on one text per line instead of on the whole corpus.
# corpus = frank_data.split('\n')
# tokenizer.fit_on_texts(corpus)
# The entire corpus is passed as a single text.
tokenizer.fit_on_texts([frank_data])
# Number of entries in the fitted vocabulary.
known_words = len(tokenizer.word_index)
# One extra slot for the padding token; this is the size used for the
# embedding input and the softmax output of the model.
total_tokens = known_words + 1




In [None]:
# Convert text to tokens
# Disabled alternative (per-line corpus); note that [0] would keep only the first line.
# frank_tokens = tokenizer.texts_to_sequences(corpus)[0]
# One text goes in, so one sequence comes back; [0] unwraps it into a flat
# list of token ids for the whole corpus.
frank_tokens = tokenizer.texts_to_sequences([frank_data])[0]

In [None]:
# Create input sequences
def wrangle_data(sequence, sequence_length, batch_size):
    """Turn a flat token sequence into batched (context, next-token) pairs.

    A window of ``sequence_length + 1`` items slides over ``sequence`` one
    position at a time. In each window, every item but the last becomes the
    features and the last item becomes the label.

    Args:
        sequence: Token ids in corpus order.
        sequence_length: Number of context items per training example.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches, with
        prefetching enabled.
    """
    # Each window carries the context plus the one item to predict.
    window_size = sequence_length + 1
    # Append a trailing axis of size 1 to the sequence before slicing it.
    tokens = tf.expand_dims(sequence, -1)

    return (
        tf.data.Dataset.from_tensor_slices(tokens)
        # Incomplete windows at the end of the sequence are dropped.
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batch each one into a single tensor.
        .flat_map(lambda chunk: chunk.batch(window_size))
        .map(lambda chunk: (chunk[:-1], chunk[-1]))
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

In [None]:
# Context window size: a longer window gives more context, but slower training.
sequence_length = 72
train_data = wrangle_data(frank_tokens, sequence_length, batch_size=64)

In [None]:
# Create the model

def bidirectional_rnn_model(total_tokens, sequence_length):
    """Build and compile the next-token prediction network.

    Architecture: 32-dimensional embedding -> bidirectional LSTM (128 units
    per direction) -> softmax over ``total_tokens`` classes.

    Args:
        total_tokens: Embedding input size and number of output classes.
        sequence_length: Length of the input sequences fed to the embedding.

    Returns:
        A compiled ``tf.keras`` Sequential model using sparse categorical
        cross-entropy, Adam with learning rate 0.03 and an accuracy metric.
    """
    embedding = tf.keras.layers.Embedding(total_tokens, 32, input_length=sequence_length)
    # A stacked variant (an extra Bidirectional LSTM(64, return_sequences=True)
    # before this layer) and an intermediate Dense(64, relu) are not used.
    recurrent = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))
    classifier = tf.keras.layers.Dense(total_tokens, activation='softmax')

    new_model = tf.keras.models.Sequential([embedding, recurrent, classifier])
    new_model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(0.03),
        metrics=['accuracy'],
    )
    return new_model

In [None]:
# Train the model
# This cell only builds the network and prints its layer summary; the actual
# fitting is done by the model.fit(...) call further down.
model = bidirectional_rnn_model(total_tokens, sequence_length)
model.summary()

In [None]:
# Display the dataset object as a quick sanity check before training.
train_data

In [None]:
# Fit for 10 epochs over train_data. `history` is read again by the save cell
# (len(history.epoch) goes into the file name and it is passed to save_history).
history = model.fit(train_data, epochs=10)


In [None]:
# NOTE(review): `history` is a module that is not part of this notebook; this
# import only binds `save_history`, so the `history` variable returned by
# model.fit() is not shadowed. Consider moving the import to the top import cell.
from history import save_history
# Save the model
model_name = 'frankenstein_bidirectional_rnn'
# evaluate() returns a list; index 1 is presumably accuracy, since the model is
# compiled with metrics=['accuracy'] — confirm. This is measured on the
# training data itself, not on held-out data.
accuracy = model.evaluate(train_data)[1]

# File stem encodes model name, number of epochs run and the accuracy above.
# NOTE(review): the 'reviews-' prefix looks like a carry-over from a different
# notebook — confirm whether anything depends on it before renaming.
save_name = f'models/reviews-{model_name}-{len(history.epoch)}-{accuracy:.4f}'
model.save(f"{save_name}.tf", save_format="tf")
# Store the training history under the same stem (save_history is defined
# elsewhere; its output format is not visible here).
save_history(history, save_name)

In [None]:
# Generate text
# Invert the tokenizer vocabulary so token ids can be mapped back to words.
token_lookup = {token_id: word for word, token_id in tokenizer.word_index.items()}

# Seed the generator with the last `sequence_length` tokens of the corpus.
seed = frank_tokens[-sequence_length:]
# Each word is followed by a single space (including the last one).
seed_text = "".join(token_lookup[token_id] + " " for token_id in seed)

print(seed_text)

In [None]:
generate_tokens_length = 50

output = []
# Work on a copy so `seed` is left untouched: re-running this cell then
# reproduces the same text instead of continuing from the previous run.
context = list(seed)

for _ in range(generate_tokens_length):
    # Left-pad / left-truncate so the model always receives exactly the most
    # recent `sequence_length` tokens.
    tokens = pad_sequences([context], maxlen=sequence_length, padding='pre', truncating='pre')
    # verbose=0 avoids printing one progress bar per generated token.
    predicted = model.predict(tokens, verbose=0)
    # The model is trained on raw token ids as labels, so softmax index k *is*
    # token id k; looking up k + 1 would print the wrong word. Index 0 is the
    # padding slot and has no word, so it is excluded from the argmax and the
    # result is shifted back by one to recover the real id.
    next_token = int(np.argmax(predicted[0, 1:])) + 1
    output.append(token_lookup[next_token])
    context.append(next_token)

print(' '.join(output))