In [2]:
%tensorflow_version 2.x
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [4]:
# Fetch the Shakespeare corpus through Keras' get_file helper. The value it
# returns is kept in path_to_file, which a later cell hands to open().
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [5]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read is done (the previous
# bare open(...).read() left the handle open until garbage collection).
with open(path_to_file, 'rb') as corpus_file:
  text = corpus_file.read().decode(encoding = 'utf-8')
# Sorted list of the distinct characters in the corpus (set() removes duplicates).
vocab = sorted(set(text))
# Forward mapping: character -> its position in vocab.
chars_to_index = {u:i for i, u in enumerate(vocab)}
# Reverse mapping: position -> character, stored as a numpy array of vocab.
index_to_char = np.array(vocab)

def text_to_int(text):
  """Encode `text` as a numpy array of vocabulary indices.

  Every character of `text` is looked up in the module-level
  `chars_to_index` dict, so a character missing from that dict raises
  KeyError.
  """
  indices = [chars_to_index[char] for char in text]
  return np.array(indices)

# Integer-encoded form of the whole corpus.
text_as_int = text_to_int(text)

In [6]:
# Number of characters in each model input sequence.
seq_length = 100

# Dataset built from the integer-encoded corpus; it is chunked into
# sequences in the next cell.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Each training chunk holds seq_length + 1 characters, so this is the number
# of whole chunks that fit in the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

In [7]:
# Group consecutive elements of char_dataset into chunks of seq_length + 1
# (one extra element so each chunk can later be split into an input and a
# one-step-shifted target). A trailing chunk that is too short is dropped.
sequences = char_dataset.batch(
    batch_size=seq_length + 1,
    drop_remainder=True,
)

In [8]:
def split_input_range(chunk):
  """Split a chunk into an (input, target) pair offset by one position.

  The input is `chunk` without its last element; the target is `chunk`
  without its first element, so target[i] is the element that follows
  input[i] in the original chunk.
  """
  return chunk[:-1], chunk[1:]

# Turn every chunk in `sequences` into an (input_text, target_text) pair.
dataset = sequences.map(map_func=split_input_range)

In [9]:
# Hyperparameters for the input pipeline and the model.
BATCH_SIZE = 64           # examples per training batch
VOCAB_SIZE = len(vocab)   # number of distinct characters in the corpus
EMBEDDING_DIM = 256       # passed to build_model as embedding_dim
RNN_UNITS = 1024          # passed to build_model as rnn_units
BUFFER_SIZE = 10000       # buffer size handed to Dataset.shuffle

# Shuffle the (input, target) pairs using a BUFFER_SIZE-element buffer, then
# group them into batches of BATCH_SIZE, dropping a final incomplete batch.
data = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [10]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the Embedding -> LSTM -> Dense sequential model.

  Args:
    vocab_size: input dimension of the Embedding layer and number of units
      of the final Dense layer.
    embedding_dim: output dimension of the Embedding layer.
    rnn_units: number of units in the LSTM layer.
    batch_size: fixed batch dimension of the model input; the second
      dimension of the input shape is left as None.

  Returns:
    The tf.keras.Sequential model. It is not compiled here.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  lstm = tf.keras.layers.LSTM(
      rnn_units,
      # return the full output sequence rather than only the last output
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  # One output unit per vocabulary entry; no activation argument is passed.
  output_layer = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, lstm, output_layer])

# Build the model from the constants defined above and print its layer summary.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________
