In [1]:
import tensorflow as tf
import numpy as np
%matplotlib inline
from tensorflow.models.rnn.ptb import reader
import time
import os
import urllib.request

In [2]:
%load_ext autoreload
%autoreload 2

In [24]:
"download dataset"
file_url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
file_name = 'tinyshakespeare.txt'
if not os.path.isfile(file_name):
    urllib.request.urlretrieve(file_url, file_name)

In [25]:
data = open(file_name).read() # This contains all the file in a string in memory
print("the size of the data is {}".format(len(data)))
vocab = set(data) # upper-case and lower-case characters are different
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))

the size of the data is 1115394


In [26]:
numeric_data = list(map(lambda x: vocab_to_idx[x], data))
del data

In [27]:
def gen_epochs(num_epochs, batch_size, num_steps):
    for _ in range(num_epochs):
        yield reader.ptb_iterator(numeric_data, batch_size, num_steps)

In [68]:
# Using the previously written code to build a language model
from basic_rnn_using_tensorflow_api import BasicRNN
from basic_lstm_using_tensorflow_api import BasicLSTM
from basic_lstm_using_dynamicRNN import DynamicLSTM
from basic_lstm_using_tfScan import DynamicScannedLSTM
from basic_lstm_using_custom_gru import CustomGRU
from basic_lstm_using_layer_normalized_lstm import CustomLSTM
from basic_GRU_using_dynamicRNN import DynamicGRU

In [29]:
state_size = 100
batch_size = 32
num_steps = 200
num_classes = vocab_size
inlayer_dropout = 0.6
num_weights = 3
learning_rate = 0.0001

### Basic RNN with one layer

In [9]:
tf.reset_default_graph()
tic = time.time()
language_model = BasicRNN(state_size=state_size, num_steps=num_steps, batch_size=batch_size,
                          num_classes=num_classes, num_layers=1, inlayer_dropout=inlayer_dropout,
                          learning_rate=learning_rate)
toc = time.time()
print("The time took to build the basic RNN model from list is ", toc - tic)

The time took to build the basic RNN model from list is  34.41115379333496


In [10]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.72
loss after 1 epoch is 3.36
loss after 2 epoch is 3.23
time taken to finish this simulatin is 1.00 minutes


### Basic RNN with 3-layers

In [11]:
tf.reset_default_graph()
tic = time.time()
language_model_3_layer_rnn = BasicRNN(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                      batch_size=batch_size, inlayer_dropout=inlayer_dropout,
                                      num_classes=num_classes, learning_rate=learning_rate)
toc = time.time()
print("The time took to build the basic RNN model with 3 layers from list is ", toc - tic)

The time took to build the basic RNN model with 3 layers from list is  81.43449544906616


In [12]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_3_layer_rnn.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.67
loss after 1 epoch is 3.24
loss after 2 epoch is 3.02
time taken to finish this simulatin is 1.66


### Basic LSTM with 3 layers using tf.rnn api 

In [13]:
tf.reset_default_graph()
tic = time.time()
language_model_3_layer_lstm = BasicLSTM(state_size=state_size, num_steps=num_steps, 
                                        num_layers=3, batch_size=batch_size, inlayer_dropout=inlayer_dropout,
                                        num_classes=num_classes, learning_rate=learning_rate)
toc = time.time()
print("The time took to build the basic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the basic LSTM model with 3 layers from list is  148.90693283081055


In [14]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_3_layer_lstm.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.70
loss after 1 epoch is 3.45
loss after 2 epoch is 3.41
time taken to finish this simulatin is 5.75


It is taking lot of time to just build the model. This is not a problem during training time because we have to build the model only once. But it could be a problem during test time where we may have to build the model multiple times. We can use a `Tensorflow` api `DynamicRNN` that can delay the creation of the graph to the run time. 

### Basic LSTM using tf.dynamic_rnn api

In [16]:
tf.reset_default_graph()
tic = time.time()
language_model_dynamic_lstm = DynamicLSTM(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                         batch_size=batch_size, num_classes=num_classes, 
                                          inlayer_dropout=inlayer_dropout, learning_rate=learning_rate)
toc = time.time()
print("The time took to build the dynamic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the dynamic LSTM model with 3 layers from list is  1.6791408061981201


In [17]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_dynamic_lstm.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.58
loss after 1 epoch is 3.35
loss after 2 epoch is 3.33
time taken to finish this simulation is 4.64


### Basic LSTM using tf.scan api

In [19]:
tf.reset_default_graph()
tic = time.time()
language_model_scanned_lstm = DynamicScannedLSTM(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                                 batch_size=batch_size, num_classes=num_classes, 
                                                 inlayer_dropout=inlayer_dropout,
                                                 learning_rate=learning_rate)
toc = time.time()
print("The time took to build the dynamic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the dynamic LSTM model with 3 layers from list is  1.8496365547180176


In [20]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_scanned_lstm.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.61
loss after 1 epoch is 3.35
loss after 2 epoch is 3.34
time taken to finish this simulation is 4.63


### Custom Gated Recurrent Unit (GRU) using 3 layers

In this code, I have created a custom GRU unit. This GRU unit uses $n$ weights. This Custom GRU is based on the follwoing intuitions:
1. Each sentence has subject, verb, and object.
2. We should treat subject, verb, and object differently.
3. We would learn the way weights should be treated by subject, verb, and object dynamically using $\lambda_i$s.
4. The input that would go to RNN will be $\sum_i \lambda_i W_i x$

In [33]:
tf.reset_default_graph()
tic = time.time()
language_model_custom_gru = CustomGRU(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                                 batch_size=batch_size, num_classes=num_classes, 
                                                 inlayer_dropout=inlayer_dropout, num_weights=num_weights,
                                                 learning_rate=learning_rate)
toc = time.time()
print("The time took to build the dynamic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the dynamic LSTM model with 3 layers from list is  3.9994800090789795


In [35]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_custom_gru.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.49
loss after 1 epoch is 2.90
loss after 2 epoch is 2.54
time taken to finish this simulation is 7.42


### Layer Normalized LSTM

Layer Normalization helps in stable and fast learning for RNN as Batch Normalization works for Feed Forward Networks. Layer Normalization works as following:
1. We layer normalized input that goes to a non-linearity.
2. By a given input, we compute the hiddent activation that it casued. For example, in RNN, at time $t$, when input $x_t$ and previous state $h_{t - 1}$ goes to a cell, it is first transformed to $z_t = W_t [x_t, h_t] + b_t$. In layer normalization, we normalize $z_t$. Essentially, we transform $z_t$ to 
$$
\tilde{z}_t = \gamma_t \frac{(z_t - mean(z_t))}{Var(z_t)} + \beta_t.
$$
Subsequently, we feed this normalized input to the non-linearity and compute the next state 
$$h_t = \tanh{\tilde{z}_t}$$

In [66]:
tf.reset_default_graph()
tic = time.time()
language_model_custom_LSTM = CustomLSTM(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                                 batch_size=batch_size, num_classes=num_classes, 
                                                 inlayer_dropout=inlayer_dropout,
                                                 learning_rate=learning_rate)
toc = time.time()
print("The time took to build the dynamic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the dynamic LSTM model with 3 layers from list is  9.066001176834106


In [67]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(3, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_custom_LSTM.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.71
loss after 1 epoch is 3.38
loss after 2 epoch is 3.33
time taken to finish this simulation is 13.89 minutes


### Train a gated recurrent unit for 20 epochs, save the model, and use it for generating texts

In [74]:
tf.reset_default_graph()
tic = time.time()
language_model_dynamic_gru = DynamicGRU(state_size=state_size, num_steps=num_steps, num_layers=3, 
                                                 batch_size=batch_size, num_classes=num_classes, 
                                                 inlayer_dropout=inlayer_dropout,
                                                 learning_rate=learning_rate)
toc = time.time()
print("The time took to build the dynamic LSTM model with 3 layers from list is ", toc - tic)

The time took to build the dynamic LSTM model with 3 layers from list is  2.747432231903076


In [75]:
tic = time.time()
for n_epoch, epoch in enumerate(gen_epochs(20, batch_size, num_steps)):
    loss, step = 0, 0
    for batch in epoch:
        loss += language_model_dynamic_gru.update_params(batch)
        step += 1
    print("loss after {0} epoch is {1:0.2f}".format(n_epoch, loss / step))
toc = time.time()
print("time taken to finish this simulation is {0:0.2f} minutes".format((toc - tic) / 60))

loss after 0 epoch is 3.60
loss after 1 epoch is 3.35
loss after 2 epoch is 3.33
loss after 3 epoch is 3.26
loss after 4 epoch is 3.07
loss after 5 epoch is 2.89
loss after 6 epoch is 2.79
loss after 7 epoch is 2.72
loss after 8 epoch is 2.65
loss after 9 epoch is 2.58
loss after 10 epoch is 2.49
loss after 11 epoch is 2.43
loss after 12 epoch is 2.38
loss after 13 epoch is 2.33
loss after 14 epoch is 2.29
loss after 15 epoch is 2.26
loss after 16 epoch is 2.23
loss after 17 epoch is 2.21
loss after 18 epoch is 2.18
loss after 19 epoch is 2.16
time taken to finish this simulation is 31.08 minutes


#### Restoring the previously saved model

In [121]:
tf.reset_default_graph()
tic = time.time()
language_model_dynamic_gru = DynamicGRU(state_size=state_size, num_steps=1, num_layers=3, 
                                                 batch_size=1, num_classes=num_classes, 
                                                 inlayer_dropout=inlayer_dropout,
                                                 learning_rate=learning_rate)
language_model_dynamic_gru.saver.restore(language_model_dynamic_gru.session, "dynamic_gru_model.ckpt-3400")
toc = time.time()
print("The time took to restore the dynamic GRU model with 3 layers from list is ", toc - tic)

The time took to restore the dynamic GRU model with 3 layers from list is  9.898928165435791


In [140]:
def predict_characters(initial_character="S", num_chars=200, initial_state=None, choose="multinomial"):
    char = [initial_character]
    lm = language_model_dynamic_gru
    input_ = [[vocab_to_idx[initial_character]]]
    for _ in range(num_chars):
        lm.init_state, probs = lm.session.run([lm.final_state, lm.probs], {lm.input: input_})  
        if choose == "multinomial":
            input_ = [[np.argmax(np.random.multinomial(1, probs[0]))]]
        elif choose == "max":
            input_ = [[np.argmax(probs[0])]]
        char.append(idx_to_vocab[input_[0][0]])
    return "".join(char)

In [141]:
predict_characters("E", choose="max")

'E te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te te t'