In [1]:
# Housekeeping: stage, commit and push this notebook to the remote repository.
# NOTE(review): these shell commands have side effects outside the notebook (they
# publish a commit) -- presumably meant to be run by hand rather than as part of
# "Run All"; confirm before re-executing.
!git add C3_NLP_with_RNN.ipynb
!git commit -m "NLP with RNN"
!git push

The file will have its original line endings in your working directory


[master 6048cc2] NLP with RNN
 1 file changed, 174 insertions(+), 20 deletions(-)


To https://github.com/auslei/python.git
   9ac814d..6048cc2  master -> master


## Creating Training Dataset

In [1]:
import tensorflow as tf
from tensorflow import keras

# Fetch the Shakespeare corpus and load the whole file into a single string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding (which differs between operating systems/locales).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()


In [2]:
# Character-level tokenizer: the vocabulary is built from the individual
# characters of the corpus rather than from whole words.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [3]:
# Sanity check: round-trip two sample strings through the tokenizer.
samples = ["this is a test", "abcd"]
a = tokenizer.texts_to_sequences(samples)
b = tokenizer.sequences_to_texts(a)

print(a, b)

# Number of unique characters, and the number of "documents" the tokenizer saw.
# The corpus was passed to fit_on_texts as one plain string, and the recorded
# document count equals the number of characters in the corpus.
vocab_size = len(tokenizer.word_index)
doc_count = tokenizer.document_count
print(vocab_size, doc_count)

[[3, 7, 6, 8, 1, 6, 8, 1, 5, 1, 3, 2, 8, 3], [5, 22, 19, 13]] ['t h i s   i s   a   t e s t', 'a b c d']
39 1115394


In [4]:
import numpy as np

# Encode the full corpus as one sequence of character ids, then subtract 1 so
# the ids start at 0 instead of 1.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1

In [5]:
# One id per character of the corpus, as a flat 1-D array.
np.shape(encoded)

(1115394,)

In [6]:
# Training split: use the first 90% of the encoded character stream.
train_size = (len(encoded) * 90) // 100
train_ids = encoded[:train_size]
dataset = tf.data.Dataset.from_tensor_slices(train_ids)

print(train_size)

1003854


In [7]:
# Chop the character stream into multiple overlapping, fixed-length windows.
n_steps = 100
# One extra character per window: the target is the input shifted one character ahead.
window_length = n_steps + 1
# Windows do not overlap by default; shift=1 makes consecutive windows overlap
# (the shift may be fine-tuned). drop_remainder=True guarantees that every
# window is exactly window_length (101) characters long.
dataset = dataset.window(size=window_length, shift=1, drop_remainder=True)

In [8]:
# window() produces a nested dataset: each window is itself a dataset.
# Batching every window to its full length and flat-mapping the result gives a
# flat dataset (no nesting) whose elements are whole windows.
def _batch_window(window):
    """Group the elements of one window dataset into a single batch."""
    return window.batch(window_length)


dataset = dataset.flat_map(_batch_window)

In [9]:
# Shuffle the windows with a buffer of 10000 elements, then form mini-batches.
batch_size = 32
shuffled = dataset.shuffle(10000)
batched = shuffled.batch(batch_size)


def split_input_target(windows):
    """Split a batch of windows into (inputs, targets).

    Inputs are all but the last character of each window; targets are the same
    windows shifted one character ahead (all but the first character).
    """
    return windows[:, :-1], windows[:, 1:]


dataset = batched.map(split_input_target)

In [10]:
# One-hot encode the inputs; the targets are left as integer character ids.
# The one-hot depth is derived from the tokenizer's vocabulary instead of being
# hard-coded, so it stays correct if the corpus (and thus the vocabulary) changes.
max_id = len(tokenizer.word_index)  # number of distinct characters (39 for this corpus)
dataset = dataset.map(
    lambda X_batch, y_batch: (tf.one_hot(X_batch, depth=max_id), y_batch)
)

In [11]:
# Peek at a single batch to confirm the input and target shapes.
for X_batch, y_batch in dataset.take(1):
    print(X_batch.shape, y_batch.shape)

(32, 100, 39) (32, 100)


In [None]:
# Number of distinct characters: width of the one-hot inputs and of the softmax output.
max_id = len(tokenizer.word_index)

# Keep the best model seen during training on disk. fit() below receives no
# validation data, so the default monitor ("val_loss") would never be available;
# track the training loss instead.
cp_cb = keras.callbacks.ModelCheckpoint(
    "shakespeare.h5", monitor="loss", save_best_only=True
)

# Two stacked GRU layers that return the full sequence, followed by a per-time-step
# softmax over the character vocabulary. Only the first layer needs input_shape;
# the sequence length is left as None so any length is accepted.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax")),
])

# Targets are integer character ids, hence the sparse variant of the loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# Pass the checkpoint callback so it actually runs at the end of every epoch.
history = model.fit(dataset, epochs=20, callbacks=[cp_cb])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

The requirements to use the cuDNN implementation are:

1. activation == tanh
2. recurrent_activation == sigmoid
3. recurrent_dropout == 0
4. unroll is False
5. use_bias is True
6. reset_after is True
7. Inputs, if use masking, are strictly right-padded.
8. Eager execution is enabled in the outermost context.

In [None]:
# Save the model in its current (final-epoch) state to an HDF5 file in the
# working directory. This is the same path the ModelCheckpoint callback above is
# configured with, so any checkpoint written there is replaced by this save.
model.save('shakespeare.h5')

## Using the Char-RNN Model