<h2> Sequential Data </h2>

In [1]:
# Data is said to be sequential when each point depends on other points in the dataset.
# A common example is a time series, such as a stock price or sensor readings,
# where each data point is an observation at a particular point in time.
# Other examples of sequential data include sentences, gene sequences, and weather data.
# Traditional feed-forward neural networks treat every input independently,
# so they typically cannot model this kind of dependence between points.

In [2]:
import numpy as np
import tensorflow as tf

In [3]:
LSTM_CELL_SIZE = 4  # output size (dimension), which is same as hidden size in the cell

# Build a pair of all-zero tensors of shape (1, LSTM_CELL_SIZE).
# Multiplying the one-element tuple by 2 repeats the same tensor object twice.
# NOTE(review): presumably the two entries stand for the LSTM hidden state and cell state - confirm.
state = (tf.zeros([1,LSTM_CELL_SIZE]),)*2
state

(<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 0., 0., 0.]], dtype=float32)>)

In [4]:
# LSTM layer with LSTM_CELL_SIZE units. return_sequences=True and return_state=True
# make the call return the outputs plus two state tensors (see the three-way
# unpacking where the layer is called).
lstm = tf.keras.layers.LSTM(LSTM_CELL_SIZE, return_sequences=True, return_state=True)

# Attach the zero tensors built above as the layer's `states` attribute.
# NOTE(review): the layer is not created with stateful=True, so it is unclear whether
# Keras uses this assignment when the layer is called - confirm.
lstm.states=state

print(lstm.states)

(<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0., 0., 0., 0.]], dtype=float32)>)


In [5]:
#Batch size x time steps x features.
# One sample holding six feature values, shape (1, 6).
sample_input = tf.constant([[3,2,2,2,2,2]],dtype=tf.float32)

batch_size = 1
sentence_max_length = 1  # number of time steps in the sequence
n_features = 6

new_shape = (batch_size, sentence_max_length, n_features)

# Reshape the (1, 6) sample to (1, 1, 6) so it follows the batch/time/feature layout above.
inputs = tf.constant(np.reshape(sample_input, new_shape), dtype = tf.float32)

In [7]:
# Run the LSTM on the (1, 1, 6) input. The layer was built with return_sequences=True
# and return_state=True, so the call is unpacked into three tensors.
output, final_memory_state, final_carry_state = lstm(inputs)
# Print only the shapes: the values depend on the randomly initialised weights.
print('Output : ', tf.shape(output))

print('Memory : ',tf.shape(final_memory_state))

print('Carry state : ',tf.shape(final_carry_state))

Output :  tf.Tensor([1 1 4], shape=(3,), dtype=int32)
Memory :  tf.Tensor([1 4], shape=(2,), dtype=int32)
Carry state :  tf.Tensor([1 4], shape=(2,), dtype=int32)


In [9]:
cells = []
LSTM_CELL_SIZE_1 = 4 #4 hidden nodes
cell1 = tf.keras.layers.LSTMCell(LSTM_CELL_SIZE_1)
cells.append(cell1)

In [10]:
LSTM_CELL_SIZE_2 = 5 #5 hidden nodes
cell2 = tf.keras.layers.LSTMCell(LSTM_CELL_SIZE_2)
cells.append(cell2)

In [11]:
stacked_lstm =  tf.keras.layers.StackedRNNCells(cells)

In [12]:
# Wrap the stacked cells in a generic RNN layer that returns the full output
# sequence (return_sequences=True) and the final states (return_state=True).
lstm_layer= tf.keras.layers.RNN(stacked_lstm ,return_sequences=True, return_state=True)

In [13]:
#Batch size x time steps x features.
sample_input = [[[1,2,3,4,3,2], [1,2,1,1,1,2],[1,2,2,2,2,2]],[[1,2,3,4,3,2],[3,2,2,1,1,2],[0,0,0,0,3,2]]]
sample_input

batch_size = 2
time_steps = 3
features = 6
new_shape = (batch_size, time_steps, features)

x = tf.constant(np.reshape(sample_input, new_shape), dtype = tf.float32)

In [14]:
# NOTE(review): the variable names follow the single-LSTM example, but the shapes printed
# in the next cell ([2 2 4] and [2 2 5]) match the two stacked cell sizes (4 and 5).
# It looks like each returned value is one cell's pair of states rather than
# "memory" vs "carry" - confirm against the Keras RNN / StackedRNNCells documentation.
output, final_memory_state, final_carry_state  = lstm_layer(x)

In [15]:
print('Output : ', tf.shape(output))

print('Memory : ',tf.shape(final_memory_state))

print('Carry state : ',tf.shape(final_carry_state))

Output :  tf.Tensor([2 3 5], shape=(3,), dtype=int32)
Memory :  tf.Tensor([2 2 4], shape=(3,), dtype=int32)
Carry state :  tf.Tensor([2 2 5], shape=(3,), dtype=int32)


<h2> Language Modelling with LSTM </h2>

In [16]:
import time

In [17]:
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================


"""Utilities for parsing PTB text files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os

import numpy as np
import tensorflow as tf


def _read_words(filename):
  """Read `filename` and return its whitespace-split tokens.

  Every newline character is replaced by the string "<eos>" before splitting.
  """
  with tf.io.gfile.GFile(filename, "r") as handle:
    text = handle.read()
  tokens = text.replace("\n", "<eos>").split()
  return tokens


def _build_vocab(filename):
  """Build a word -> integer id mapping from the tokens of `filename`.

  Ids are assigned in order of decreasing frequency, so the most frequent
  token gets id 0; tokens with equal counts are ordered by the token string.

  Args:
    filename: path of the text file to read (passed to _read_words).

  Returns:
    dict mapping each distinct token to its id. An empty file yields an
    empty dict (the previous tuple-unpacking form raised ValueError there).
  """
  counter = collections.Counter(_read_words(filename))
  # Sort by (-count, word): highest count first, ties broken by the word itself.
  ranked = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

  # enumerate() gives the rank directly and also works when `ranked` is empty.
  return {word: idx for idx, (word, _) in enumerate(ranked)}


def _file_to_word_ids(filename, word_to_id):
  """Convert the tokens of `filename` to ids using `word_to_id`.

  Tokens that are not keys of `word_to_id` are skipped.
  """
  ids = []
  for token in _read_words(filename):
    if token in word_to_id:
      ids.append(word_to_id[token])
  return ids


def ptb_raw_data(data_path=None):
  """Load PTB raw data from data directory "data_path".

  Reads "ptb.train.txt", "ptb.valid.txt" and "ptb.test.txt", builds the
  vocabulary from the training file only, and converts each file to a list
  of integer word ids. Tokens absent from the training vocabulary are
  dropped by _file_to_word_ids. No mini-batching is done here; that is the
  job of ptb_iterator.

  The PTB dataset comes from Tomas Mikolov's webpage:

  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

  Args:
    data_path: string path to the directory where simple-examples.tgz has
      been extracted. Required despite the None default.

  Returns:
    tuple (train_data, valid_data, test_data, vocabulary, word_to_id)
    where the three data objects are lists of word ids that can be passed
    to ptb_iterator, vocabulary is the number of distinct training tokens,
    and word_to_id is the token -> id dict.

  Raises:
    TypeError: if data_path is None.
  """
  # os.path.join(None, ...) would raise a TypeError anyway; fail with a
  # message that names the offending argument instead.
  if data_path is None:
    raise TypeError(
        "data_path must be the directory holding the ptb.*.txt files, got None")

  file_names = ("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt")
  train_path, valid_path, test_path = (
      os.path.join(data_path, name) for name in file_names)

  # The vocabulary is derived from the training split only.
  word_to_id = _build_vocab(train_path)
  train_data = _file_to_word_ids(train_path, word_to_id)
  valid_data = _file_to_word_ids(valid_path, word_to_id)
  test_data = _file_to_word_ids(test_path, word_to_id)
  vocabulary = len(word_to_id)
  return train_data, valid_data, test_data, vocabulary, word_to_id


def ptb_iterator(raw_data, batch_size, num_steps):
  """Iterate on the raw PTB data.

  The id stream is cut into batch_size contiguous rows of equal length
  (trailing ids that do not fill a row are dropped), and the rows are then
  walked in windows of num_steps columns.

  Args:
    raw_data: one of the raw data outputs from ptb_raw_data.
    batch_size: int, the batch size.
    num_steps: int, the number of unrolls.

  Yields:
    Pairs of the batched data, each a matrix of shape [batch_size, num_steps].
    The second element of the tuple is the same data shifted by one position,
    i.e. y[:, t] is the id that follows x[:, t] in the stream.

  Raises:
    ValueError: if batch_size or num_steps are too high for the amount of
      data, so that not even one (x, y) pair can be produced.
  """
  raw_data = np.array(raw_data, dtype=np.int32)

  batch_len = len(raw_data) // batch_size
  # One row per batch entry; equivalent to copying batch_len-sized slices
  # of the stream into the rows of a [batch_size, batch_len] matrix.
  data = raw_data[:batch_size * batch_len].reshape(batch_size, batch_len)

  # "- 1" because the targets need one extra column after the last input.
  epoch_size = (batch_len - 1) // num_steps

  # Use <= 0 rather than == 0: when there are fewer ids than batch_size,
  # batch_len is 0 and the floor division above yields -1, which would
  # otherwise slip past the check and silently yield nothing.
  if epoch_size <= 0:
    raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

  for i in range(epoch_size):
    start = i * num_steps
    x = data[:, start:start + num_steps]
    y = data[:, start + 1:start + num_steps + 1]
    yield (x, y)

In [18]:
#Initial weight scale
# (not referenced by any cell visible in this part of the notebook)
init_scale = 0.1
#Initial learning rate
learning_rate = 1.0
#Maximum permissible norm for the gradient (For gradient clipping -- another measure against Exploding Gradients)
max_grad_norm = 5
#The number of layers in our model
# (not referenced by any cell visible here; the two LSTM cells are created explicitly)
num_layers = 2
#The total number of recurrence steps, also known as the number of layers when our RNN is "unfolded"
num_steps = 20
#The number of processing units (neurons) in the hidden layers
hidden_size_l1 = 256  # first stacked LSTM cell
hidden_size_l2 = 128  # second stacked LSTM cell
#The maximum number of epochs trained with the initial learning rate
max_epoch_decay_lr = 4
#The total number of epochs in training
max_epoch = 15
#The probability for keeping data in the Dropout Layer (This is an optimization, but is outside our scope for this notebook!)
#At 1, we ignore the Dropout Layer wrapping.
# (no Dropout layer is created in the cells visible here)
keep_prob = 1
#The decay for the learning rate
decay = 0.5
#The size for each batch of data
batch_size = 30
#The size of our vocabulary
vocab_size = 10000
#The length of each word-embedding vector (second argument of the Embedding layer)
embeding_vector_size= 200
#Training flag to separate training from testing
is_training = 1
#Data directory for our dataset
data_dir = "data/simple-examples/data/"

<h3>Training data</h3>

In [19]:
# Reads the data and separates it into training data, validation data and testing data
raw_data = ptb_raw_data(data_dir)
train_data, valid_data, test_data, vocab, word_to_id = raw_data

In [20]:
len(train_data)

929589

In [21]:
def id_to_word(id_list):
    """Translate a sequence of word ids into the list of matching words.

    Ids are looked up in the notebook-level `word_to_id` mapping. Ids that
    are not values of that mapping are skipped, so the result may be shorter
    than `id_list`.

    Args:
        id_list: iterable of integer word ids (a list or a 1-D array).

    Returns:
        list of words, in the order of `id_list`.
    """
    # Invert the mapping once so that each lookup is a dict access instead of
    # a scan over the whole vocabulary for every id.
    # Assumes ids are unique per word, as produced by _build_vocab.
    words_by_id = {wid: word for word, wid in word_to_id.items()}
    return [words_by_id[w] for w in id_list if w in words_by_id]
                

print(id_to_word(train_data[0:100]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>', 'pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'N', '<eos>', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<eos>', 'rudolph', '<unk>', 'N', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate', '<eos>', 'a', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of']


In [22]:
# Take the first (inputs, targets) pair produced by the training-data iterator.
itera = ptb_iterator(train_data, batch_size, num_steps)
first_touple = next(itera)
_input_data, _targets = first_touple

In [24]:
_input_data.shape


(30, 20)

In [25]:
_targets.shape

(30, 20)

In [26]:
_input_data[0:3]

array([[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984,
        9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995],
       [2654,    6,  334, 2886,    4,    1,  233,  711,  834,   11,  130,
         123,    7,  514,    2,   63,   10,  514,    8,  605],
       [   0, 1071,    4,    0,  185,   24,  368,   20,   31, 3109,  954,
          12,    3,   21,    2, 2915,    2,   12,    3,   21]])

In [27]:
print(id_to_word(_input_data[0,:]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim']


<h3>Embeddings</h3>

In [29]:
embedding_layer = tf.keras.layers.Embedding(vocab_size, embeding_vector_size,batch_input_shape=(batch_size, num_steps),trainable=True,name="embedding_vocab")  

In [30]:
inputs = embedding_layer(_input_data)
inputs

<tf.Tensor: shape=(30, 20, 200), dtype=float32, numpy=
array([[[ 1.36911981e-02, -2.38683354e-02, -2.19960343e-02, ...,
          1.51716806e-02, -4.15931456e-02,  1.50777586e-02],
        [-1.83446296e-02, -5.49820811e-03,  4.47747596e-02, ...,
         -3.41062695e-02,  4.88087572e-02, -1.40839815e-03],
        [ 2.16720365e-02, -1.56941898e-02,  3.83081324e-02, ...,
         -4.51611765e-02, -4.30250429e-02, -4.79110740e-02],
        ...,
        [-3.41730937e-02,  4.38601263e-02, -3.62863392e-03, ...,
          4.47701328e-02, -3.30872089e-02,  3.58760357e-03],
        [-8.66303593e-03, -2.98999902e-02,  6.04574755e-03, ...,
          3.46110575e-02, -2.39387639e-02, -2.97568794e-02],
        [ 9.56581905e-03,  3.47135551e-02,  4.14527319e-02, ...,
          6.00447506e-03, -4.74563241e-02,  4.99121435e-02]],

       [[-1.27938166e-02, -2.87154913e-02, -3.10521964e-02, ...,
          2.19414271e-02, -4.46020365e-02, -2.59881616e-02],
        [ 2.17729472e-02, -3.60244513e-02, -2.90

<h3>Constructing Recurrent Neural Networks</h3>

In [31]:
lstm_cell_l1 = tf.keras.layers.LSTMCell(hidden_size_l1)
lstm_cell_l2 = tf.keras.layers.LSTMCell(hidden_size_l2)

In [32]:
stacked_lstm = tf.keras.layers.StackedRNNCells([lstm_cell_l1, lstm_cell_l2])

In [33]:
# Stateful RNN over the stacked LSTM cells that returns the output of every time step
# (the Dense layer applied later produces one prediction per position).
# The second positional parameter of tf.keras.layers.RNN is `return_sequences`; the list
# [batch_size, num_steps] that used to be passed there only worked because a non-empty
# list is truthy, so the intent is spelled out explicitly.
layer  =  tf.keras.layers.RNN(stacked_lstm,return_sequences=True,return_state=False,stateful=True,trainable=True)

In [34]:
# Zero-filled, non-trainable variable of shape (batch_size, embeding_vector_size).
# NOTE(review): it is attached under the attribute name `inital_state` (sic), and its
# width is the embedding size (200) while the LSTM cells have 256 and 128 units, so it
# does not look like this variable seeds the recurrent state - confirm.
# The model summary further down reports 6,000 non-trainable parameters, which equals
# batch_size * embeding_vector_size (30 * 200), so the variable appears to be tracked
# as a weight of the layer.
init_state = tf.Variable(tf.zeros([batch_size,embeding_vector_size]),trainable=False)
layer.inital_state = init_state
layer.inital_state

<tf.Variable 'Variable:0' shape=(30, 200) dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [35]:
outputs = layer(inputs)

In [36]:
outputs

<tf.Tensor: shape=(30, 20, 128), dtype=float32, numpy=
array([[[ 5.23586641e-04,  6.66461885e-04,  7.64806755e-04, ...,
          7.89060374e-04,  1.33517559e-03,  3.96889308e-03],
        [ 9.63843719e-04,  9.25735512e-04,  2.12256913e-03, ...,
          9.33756586e-04,  2.70738266e-03,  4.99183172e-03],
        [ 2.85160565e-03,  1.85258593e-03,  3.33827827e-03, ...,
          1.73360389e-03,  1.08783878e-03,  5.39264642e-03],
        ...,
        [-5.67265693e-03,  1.32962107e-03, -1.46105897e-03, ...,
          3.28588742e-03, -2.80904211e-03, -2.55321997e-04],
        [-5.12129255e-03, -1.17917603e-03, -7.77002075e-04, ...,
          2.55897036e-03, -2.77340878e-03, -7.28090061e-04],
        [-4.07692604e-03, -2.55182153e-03, -4.61588323e-04, ...,
          3.12436568e-05, -3.02291336e-03, -7.73620617e-04]],

       [[ 1.89789222e-04, -6.21110841e-04, -1.60401524e-03, ...,
          1.10104425e-04, -1.28551561e-03, -6.43569161e-04],
        [ 7.80227128e-04, -1.42132782e-03, -2.68

In [38]:
dense = tf.keras.layers.Dense(vocab_size)
logits_outputs  = dense(outputs)
print("shape of the output from dense layer: ", logits_outputs.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from dense layer:  (30, 20, 10000)


In [39]:
activation = tf.keras.layers.Activation('softmax')
output_words_prob = activation(logits_outputs)
print("shape of the output from the activation layer: ", output_words_prob.shape) #(batch_size, sequence_length, vocab_size)

shape of the output from the activation layer:  (30, 20, 10000)


In [40]:
print("The probability of observing words in t=0 to t=20", output_words_prob[0,0:num_steps])

The probability of observing words in t=0 to t=20 tf.Tensor(
[[9.99876211e-05 9.99798140e-05 1.00015728e-04 ... 9.99814074e-05
  1.00037185e-04 1.00000878e-04]
 [9.99777694e-05 9.99483891e-05 9.99934055e-05 ... 9.99705298e-05
  1.00041085e-04 1.00000230e-04]
 [9.99718250e-05 9.99314288e-05 9.99816402e-05 ... 9.99526455e-05
  1.00029283e-04 1.00018777e-04]
 ...
 [1.00086305e-04 1.00027224e-04 9.99926779e-05 ... 1.00098085e-04
  1.00035366e-04 1.00035220e-04]
 [1.00096993e-04 1.00022444e-04 9.99803015e-05 ... 1.00103214e-04
  1.00022247e-04 1.00036690e-04]
 [1.00109282e-04 1.00009624e-04 9.99919939e-05 ... 1.00076621e-04
  1.00036203e-04 1.00037556e-04]], shape=(20, 10000), dtype=float32)


In [41]:
np.argmax(output_words_prob[0,0:num_steps], axis=1)

array([8706,  824, 9585, 9585, 9585, 6578, 5527, 5531, 5531, 1211, 1211,
        131,  131,  131,  131, 9184, 9184, 6437, 5638, 5638], dtype=int64)

In [42]:
_targets[0]

array([9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986,
       9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996])

In [43]:
def crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy between integer targets and predictions.

    Thin wrapper around tf.keras.losses.sparse_categorical_crossentropy, called
    with its default options.

    Args:
        y_true: integer class ids (in this notebook, a batch of target word ids).
        y_pred: predicted values (in this notebook, the softmax output of the model).

    Returns:
        The tensor returned by the Keras loss function; in this notebook it is
        indexed as a 2-D (batch, step) tensor.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

In [44]:
loss  = crossentropy(_targets, output_words_prob)

In [45]:
loss[0,:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([9.210808, 9.210276, 9.210986, 9.210007, 9.210913, 9.210234,
       9.209206, 9.210795, 9.209507, 9.21038 ], dtype=float32)>

In [46]:
cost = tf.reduce_sum(loss / batch_size)
cost

<tf.Tensor: shape=(), dtype=float32, numpy=184.20636>

In [48]:
# Create a variable for the learning rate
# It starts at 0.0, so plain SGD steps taken with it leave the weights unchanged
# until a non-zero value is assigned.
lr = tf.Variable(0.0, trainable=False)
# SGD optimizer that reads its learning rate from `lr`; clipnorm is set to max_grad_norm.
optimizer = tf.keras.optimizers.SGD(learning_rate=lr, clipnorm=max_grad_norm)

In [49]:
# Assemble the layers created in the previous cells into one Sequential model:
# embedding -> stacked-LSTM RNN -> dense projection to vocab_size -> softmax.
model = tf.keras.Sequential()
model.add(embedding_layer)
model.add(layer)
model.add(dense)
model.add(activation)
# Compile with the sparse cross-entropy wrapper and the SGD optimizer defined above.
model.compile(loss=crossentropy, optimizer=optimizer)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_vocab (Embedding)  (30, 20, 200)            2000000   
                                                                 
 rnn_1 (RNN)                 (30, 20, 128)             671088    
                                                                 
 dense_1 (Dense)             (30, 20, 10000)           1290000   
                                                                 
 activation (Activation)     (30, 20, 10000)           0         
                                                                 
Total params: 3,961,088
Trainable params: 3,955,088
Non-trainable params: 6,000
_________________________________________________________________


In [50]:
# Get all TensorFlow variables marked as "trainable" (i.e. all of them except _lr, which we just created)
tvars = model.trainable_variables

In [51]:
[v.name for v in tvars] 

['embedding_vocab/embeddings:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_3/kernel:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_3/recurrent_kernel:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_3/bias:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_4/kernel:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_4/recurrent_kernel:0',
 'rnn_1/stacked_rnn_cells_1/lstm_cell_4/bias:0',
 'dense_1/kernel:0',
 'dense_1/bias:0']

In [52]:
# Small GradientTape demonstration: func_test = 2*x^2 + 3*x*y evaluated at x=1, y=2.
x = tf.constant(1.0)
y =  tf.constant(2.0)
# persistent=True lets gradient() be called more than once on the same tape
# (the next two cells each request one gradient).
with tf.GradientTape(persistent=True) as g:
    # x and y are constants, so they are watched explicitly.
    g.watch(x)
    g.watch(y)
    func_test = 2 * x * x + 3 * x * y

In [53]:
var_grad = g.gradient(func_test, x) # Will compute to 10.0
print(var_grad)

tf.Tensor(10.0, shape=(), dtype=float32)


In [54]:
var_grad = g.gradient(func_test, y) # Will compute to 3.0
print(var_grad)

tf.Tensor(3.0, shape=(), dtype=float32)


In [55]:
with tf.GradientTape() as tape:
    # Forward pass.
    output_words_prob = model(_input_data)
    # Loss value for this batch.
    loss  = crossentropy(_targets, output_words_prob)
    # Scalar cost: per-position losses divided by the batch size and summed over
    # every position. This is the same formula as the earlier `cost` cell; reducing
    # over axis 0 only would leave a vector with one entry per time step.
    cost = tf.reduce_sum(loss / batch_size)

In [56]:
# Get gradients of loss wrt the trainable variables.
grad_t_list = tape.gradient(cost, tvars)

In [57]:
print(grad_t_list)

[<tensorflow.python.framework.indexed_slices.IndexedSlices object at 0x000001CCE0FA9990>, <tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
array([[-1.21057360e-06,  1.11048939e-06,  1.81532386e-07, ...,
        -9.68049108e-08, -3.67111710e-08, -2.76418064e-07],
       [-5.07294317e-07,  1.44252951e-07, -3.60630679e-07, ...,
         2.64155318e-07, -3.07675350e-07, -2.30252766e-07],
       [ 7.60647367e-08,  4.53663773e-07,  2.49056058e-07, ...,
         3.81014900e-08, -1.46729548e-07, -2.64645848e-07],
       ...,
       [-2.80405317e-07,  4.15165147e-08, -7.11877249e-07, ...,
        -1.83229218e-07,  2.98460407e-07, -1.07527256e-07],
       [-1.00541524e-06,  8.35447622e-09,  3.09439770e-07, ...,
         3.70392456e-07,  8.74839259e-08, -1.75751911e-08],
       [ 5.03668559e-07, -2.07654324e-07,  5.58799741e-07, ...,
        -3.52775515e-08,  8.56571276e-08,  1.86711205e-07]], dtype=float32)>, <tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
array([[-9.39754159e-08,  1.20

In [58]:
# Define the gradient clipping threshold
grads, _ = tf.clip_by_global_norm(grad_t_list, max_grad_norm)
grads

[<tensorflow.python.framework.indexed_slices.IndexedSlices at 0x1cce0fab1f0>,
 <tf.Tensor: shape=(200, 1024), dtype=float32, numpy=
 array([[-1.21057360e-06,  1.11048939e-06,  1.81532386e-07, ...,
         -9.68049108e-08, -3.67111710e-08, -2.76418064e-07],
        [-5.07294317e-07,  1.44252951e-07, -3.60630679e-07, ...,
          2.64155318e-07, -3.07675350e-07, -2.30252766e-07],
        [ 7.60647367e-08,  4.53663773e-07,  2.49056058e-07, ...,
          3.81014900e-08, -1.46729548e-07, -2.64645848e-07],
        ...,
        [-2.80405317e-07,  4.15165147e-08, -7.11877249e-07, ...,
         -1.83229218e-07,  2.98460407e-07, -1.07527256e-07],
        [-1.00541524e-06,  8.35447622e-09,  3.09439770e-07, ...,
          3.70392456e-07,  8.74839259e-08, -1.75751911e-08],
        [ 5.03668559e-07, -2.07654324e-07,  5.58799741e-07, ...,
         -3.52775515e-08,  8.56571276e-08,  1.86711205e-07]], dtype=float32)>,
 <tf.Tensor: shape=(256, 1024), dtype=float32, numpy=
 array([[-9.39754159e-08,  

In [59]:
# Apply the clipped gradients to the trainable variables through the optimizer.
# NOTE(review): `lr` was created as 0.0 and no visible cell changes it before this
# point, so this step presumably leaves the weights unchanged - confirm.
train_op = optimizer.apply_gradients(zip(grads, tvars))

In [60]:
class PTBModel(object):
    """Stacked-LSTM language model assembled with the Keras Sequential API.

    Layer pipeline: Embedding -> two stacked LSTM cells wrapped in a stateful
    RNN layer -> Dense projection to vocab_size -> softmax.

    The hyper-parameters (batch_size, num_steps, hidden_size_l1,
    hidden_size_l2, vocab_size, embeding_vector_size, max_grad_norm) are read
    from the notebook-level configuration when an instance is created.
    """

    def __init__(self):
        """Build, compile and summarise the model."""
        ######################################
        # Setting parameters for ease of use #
        ######################################
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.hidden_size_l1 = hidden_size_l1
        self.hidden_size_l2 = hidden_size_l2
        self.vocab_size = vocab_size
        self.embeding_vector_size = embeding_vector_size
        # Initial learning rate handed to the optimizer. The training loop
        # changes the optimizer's learning rate between epochs.
        self._lr = 1.0

        ######################################################
        # Initializing the model using keras Sequential API  #
        ######################################################
        self._model = tf.keras.models.Sequential()

        ####################################################################
        # Creating the word embeddings layer and adding it to the sequence #
        ####################################################################
        with tf.device("/cpu:0"):
            # Embedding table of shape [vocab_size x embeding_vector_size].
            # The batch input shape is fixed to (batch_size, num_steps).
            self._embedding_layer = tf.keras.layers.Embedding(
                self.vocab_size,
                self.embeding_vector_size,
                batch_input_shape=(self.batch_size, self.num_steps),
                trainable=True,
                name="embedding_vocab")
            self._model.add(self._embedding_layer)

        ##########################################################################
        # Creating the LSTM cell structure and connect it with the RNN structure #
        ##########################################################################
        # LSTMCell only describes the computation of one time step; it has to
        # be wrapped in an RNN layer to be run over a sequence. Its argument is
        # the number of hidden units of the cell.
        lstm_cell_l1 = tf.keras.layers.LSTMCell(self.hidden_size_l1)
        lstm_cell_l2 = tf.keras.layers.LSTMCell(self.hidden_size_l2)

        # StackedRNNCells chains the cells so they behave as a single cell.
        stacked_lstm = tf.keras.layers.StackedRNNCells([lstm_cell_l1, lstm_cell_l2])

        ############################################
        # Creating the input structure for our RNN #
        ############################################
        # The RNN receives the embedding output: for each of the batch_size
        # rows, num_steps vectors of length embeding_vector_size.
        # At each time step the words at that position of every row in the
        # batch are processed in parallel; the parallelism is only for
        # efficiency, each row is still read one word at a time.

        ##################################################################################
        # Instantiating the RNN layer; stateful=True carries the state across batches   #
        ##################################################################################
        # return_sequences=True is spelled out: the list [batch_size, num_steps]
        # used to be passed positionally into that parameter slot and only
        # worked because a non-empty list is truthy.
        self._RNNlayer = tf.keras.layers.RNN(
            stacked_lstm,
            return_sequences=True,
            return_state=False,
            stateful=True,
            trainable=True)

        # NOTE(review): this zero variable is attached under the attribute name
        # `inital_state` (sic) and has the embedding width, not the LSTM
        # widths, so it does not look like it seeds the recurrent state -
        # confirm. It is kept unchanged because the model summary counts its
        # batch_size * embeding_vector_size entries as non-trainable
        # parameters, i.e. it appears to be part of the layer's weights.
        self._initial_state = tf.Variable(
            tf.zeros([self.batch_size, self.embeding_vector_size]), trainable=False)
        self._RNNlayer.inital_state = self._initial_state

        ############################################
        # Adding RNN layer to keras sequential API #
        ############################################
        self._model.add(self._RNNlayer)

        ####################################################################################################
        # Instantiating a Dense layer that connects the output to the vocab_size  and adding layer to model#
        ####################################################################################################
        self._dense = tf.keras.layers.Dense(self.vocab_size)
        self._model.add(self._dense)

        ####################################################################################################
        # Adding softmax activation layer and deriving probability to each class and adding layer to model #
        ####################################################################################################
        self._activation = tf.keras.layers.Activation('softmax')
        self._model.add(self._activation)

        ###########################################################
        # Instantiating the stochastic gradient descent optimizer #
        ###########################################################
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        self._optimizer = tf.keras.optimizers.SGD(learning_rate=self._lr, clipnorm=max_grad_norm)

        ##############################################################################
        # Compiling and summarizing the model stacked using the keras sequential API #
        ##############################################################################
        self._model.compile(loss=self.crossentropy, optimizer=self._optimizer)
        self._model.summary()

    def crossentropy(self,y_true, y_pred):
        """Sparse categorical cross-entropy of integer targets vs. predictions."""
        return tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)

    def train_batch(self,_input_data,_targets):
        """Run one optimisation step on a batch and return its cost.

        Args:
            _input_data: batch of input word ids fed to the model.
            _targets: batch of target word ids, same layout as the inputs.

        Returns:
            Scalar tensor: the per-position losses divided by batch_size and
            summed over all positions.
        """
        # The learning rate is owned by the optimizer; it is deliberately not
        # re-created here (doing so on every batch allocated a new variable
        # and overwrote the value stored in self._lr by the training loop).
        tvars = self._model.trainable_variables
        with tf.GradientTape() as tape:
            # Forward pass.
            output_words_prob = self._model(_input_data)
            # Loss value for this batch.
            loss = self.crossentropy(_targets, output_words_prob)
            # average across batch and reduce sum
            cost = tf.reduce_sum(loss / self.batch_size)
        # Get gradients of loss wrt the trainable variables.
        grad_t_list = tape.gradient(cost, tvars)
        # Clip by global norm.
        # NOTE(review): the optimizer was also built with clipnorm=max_grad_norm,
        # so gradients may be clipped a second time there - confirm this is intended.
        grads, _ = tf.clip_by_global_norm(grad_t_list, max_grad_norm)
        # Apply the update.
        self._optimizer.apply_gradients(zip(grads, tvars))
        return cost

    def test_batch(self,_input_data,_targets):
        """Evaluate a batch without updating the weights and return its cost.

        The cost is computed exactly as in train_batch.
        """
        output_words_prob = self._model(_input_data)
        loss = self.crossentropy(_targets, output_words_prob)
        # average across batch and reduce sum
        cost = tf.reduce_sum(loss / self.batch_size)

        return cost

    @classmethod
    def instance(cls) :
        """Create a new model; uses `cls` so subclasses get their own type."""
        return cls()

In [61]:
########################################################################################################################
# run_one_epoch takes as parameters  the model instance, the data to be fed, training or testing mode and verbose info #
########################################################################################################################
def run_one_epoch(m, data,is_training=True,verbose=False):

    #Define the epoch size based on the length of the data, batch size and the number of steps
    epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
    start_time = time.time()
    costs = 0.
    iters = 0
    
    m._model.reset_states()
    
    #For each step and data point
    for step, (x, y) in enumerate(ptb_iterator(data, m.batch_size, m.num_steps)):
        
        #Evaluate and return cost, state by running cost, final_state and the function passed as parameter
        #y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
        if is_training : 
            loss=  m.train_batch(x, y)
        else :
            loss = m.test_batch(x, y)
                                   

        #Add returned cost to costs (which keeps track of the total costs for this epoch)
        costs += loss
        
        #Add number of steps to iteration counter
        iters += m.num_steps

        if verbose and step % (epoch_size // 10) == 10:
            print("Itr %d of %d, perplexity: %.3f speed: %.0f wps" % (step , epoch_size, np.exp(costs / iters), iters * m.batch_size / (time.time() - start_time)))
        


    # Returns the Perplexity rating for us to keep track of how the model is evolving
    return np.exp(costs / iters)


In [62]:
# Reads the data and separates it into training data, validation data and testing data
# This repeats the ptb_raw_data(data_dir) call made earlier in the notebook.
raw_data = ptb_raw_data(data_dir)
# The last two tuple entries (vocabulary size and word_to_id mapping) are discarded here.
train_data, valid_data, test_data, _, _ = raw_data

In [63]:
# Instantiates the PTBModel class
m=PTBModel.instance()   
K = tf.keras.backend 
for i in range(max_epoch):
    # Define the decay for this epoch.
    # `i` is zero-based, so `i + 1` is the epoch number: epochs 1..max_epoch_decay_lr run at
    # the initial learning rate and every later epoch multiplies it by `decay` once more.
    # (Without the `+ 1` the initial rate was kept for one epoch more than configured.)
    lr_decay = decay ** max(i + 1 - max_epoch_decay_lr, 0.0)
    dcr = learning_rate * lr_decay
    m._lr = dcr
    # Push the new rate into the optimizer that the model was compiled with.
    K.set_value(m._model.optimizer.learning_rate,m._lr)
    print("Epoch %d : Learning rate: %.3f" % (i + 1, m._model.optimizer.learning_rate))
    # Run the loop for this epoch in the training mode
    train_perplexity = run_one_epoch(m, train_data,is_training=True,verbose=True)
    print("Epoch %d : Train Perplexity: %.3f" % (i + 1, train_perplexity))
        
    # Run the loop for this epoch in the validation mode
    valid_perplexity = run_one_epoch(m, valid_data,is_training=False,verbose=False)
    print("Epoch %d : Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
    
# Run the loop in the testing mode to see how effective was our training
test_perplexity = run_one_epoch(m, test_data,is_training=False,verbose=False)
print("Test Perplexity: %.3f" % test_perplexity)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_vocab (Embedding)  (30, 20, 200)            2000000   
                                                                 
 rnn_2 (RNN)                 (30, 20, 128)             671088    
                                                                 
 dense_2 (Dense)             (30, 20, 10000)           1290000   
                                                                 
 activation_1 (Activation)   (30, 20, 10000)           0         
                                                                 
Total params: 3,961,088
Trainable params: 3,955,088
Non-trainable params: 6,000
_________________________________________________________________
Epoch 1 : Learning rate: 1.000
Itr 10 of 1549, perplexity: 4592.739 speed: 2376 wps
Itr 164 of 1549, perplexity: 1096.940 speed: 2583 wps
Itr 318 of 1549, perplexity: 845.5