In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import time
## Code adapted from TensorFlow, PyTorch, and Karpathy's blog

In [None]:
# Fetch the Shakespeare corpus and keep the path that get_file returns;
# the next cell opens that path to read the text.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [None]:
# Take a look at the first 400 characters in text
print(text[:400])
# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
# Two short strings used to demonstrate the character <-> ID round trip below.
example_texts = ['NLPUSF', 'Assignment3']

# Split each string into its UTF-8 characters (prints as a RaggedTensor).
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
print(chars)
# Lookup layer that maps characters to integer IDs over the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
ids = ids_from_chars(chars)
# Inverse lookup (IDs back to characters). It is built from
# ids_from_chars.get_vocabulary() so both layers share one vocabulary.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)
# Round trip: `chars` is rebound to the characters recovered from `ids`.
chars = chars_from_ids(ids)
# Join the characters of each row back into one string; shown as the cell output.
tf.strings.reduce_join(chars, axis=-1).numpy()

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 
65 unique characters
<tf.RaggedTensor [[b'N', b'L', b'P', b'U', b'S', b'F'],
 [b'A', b's', b's', b'i', b'g', b'n', b'm', b'e', b'n', b't', b'3']]>


array([b'NLPUSF', b'Assignment3'], dtype=object)

In [None]:
def text_from_ids(ids):
  """Convert token IDs to text by joining the looked-up characters along the last axis."""
  looked_up = chars_from_ids(ids)
  joined = tf.strings.reduce_join(looked_up, axis=-1)
  return joined
# Encode the whole corpus as one tensor of character IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
# Dataset that yields one character ID per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Sanity check: print the first 10 characters, one per line.
# NOTE: the loop variable rebinds the notebook-level name `ids`.
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))
# Each chunk holds seq_length + 1 IDs: the extra ID lets split_input_target
# derive an input and a target that are both seq_length long.
seq_length = 140
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

# Show the first five chunks joined back into text.
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

def split_input_target(sequence):
    """Return the pair (sequence without its last item, sequence without its first item).

    The second element is the first one shifted by one position, so item i of
    the target is the item that follows item i of the input.
    """
    return sequence[:-1], sequence[1:]
# Quick demonstration of the split on a plain Python list.
split_input_target(list("Tensorflow"))

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)
# Show one pair as text to confirm the target is the input shifted by one.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the pairs, group them into fixed-size batches (dropping the final
# partial batch), and prefetch. `dataset` is rebound to the batched pipeline.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 256

class NLPUSFModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of token IDs; used as the embedding input size and as
      the width of the output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of GRU units.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base constructor is called without positional arguments: `self` is
    # already bound by super(), so forwarding it again passes a stray argument
    # that keyword-only Keras constructors reject.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token IDs.

    Args:
      inputs: Token IDs fed to the embedding layer.
      states: Initial GRU state; when None the GRU's own initial state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The Dense layer output (logits), or `(logits, states)` when
      `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

F
i
r
s
t
 
C
i
t
i
tf.Tensor(
[b'F' b'i' b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':'
 b'\n' b'B' b'e' b'f' b'o' b'r' b'e' b' ' b'w' b'e' b' ' b'p' b'r' b'o'
 b'c' b'e' b'e' b'd' b' ' b'a' b'n' b'y' b' ' b'f' b'u' b'r' b't' b'h'
 b'e' b'r' b',' b' ' b'h' b'e' b'a' b'r' b' ' b'm' b'e' b' ' b's' b'p'
 b'e' b'a' b'k' b'.' b'\n' b'\n' b'A' b'l' b'l' b':' b'\n' b'S' b'p' b'e'
 b'a' b'k' b',' b' ' b's' b'p' b'e' b'a' b'k' b'.' b'\n' b'\n' b'F' b'i'
 b'r' b's' b't' b' ' b'C' b'i' b't' b'i' b'z' b'e' b'n' b':' b'\n' b'Y'
 b'o' b'u' b' ' b'a' b'r' b'e' b' ' b'a' b'l' b'l' b' ' b'r' b'e' b's'
 b'o' b'l' b'v' b'e' b'd' b' ' b'r' b'a' b't' b'h' b'e' b'r' b' ' b't'
 b'o' b' ' b'd' b'i' b'e' b' ' b't' b'h' b'a' b'n' b' ' b't' b'o' b' '
 b'f'], shape=(141,), dtype=string)
b'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to f'
b"amish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, 

In [None]:
# Baseline model built from the notebook-level hyper-parameters
# (vocab_size, embedding_dim, rnn_units).
model = NLPUSFModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Smoke test: run one batch through the untrained model and check the output
# shape. Calling the model here also builds it, so summary() below can report
# parameter counts.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

(64, 140, 66) # (batch_size, sequence_length, vocab_size)
Model: "nlpusf_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  16896     
                                                                 
 gru (GRU)                   multiple                  394752    
                                                                 
 dense (Dense)               multiple                  16962     
                                                                 
Total params: 428610 (1.64 MB)
Trainable params: 428610 (1.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Sample one next-character ID per position from the logits of the first
# example in the batch (sampling, not argmax).
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())
# The model outputs raw logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)
# exp(mean loss) of the untrained model; the recorded output (about 66) is
# close to the vocabulary size.
tf.exp(example_batch_mean_loss).numpy()

Input:
 b' and Derby.\n\nBUCKINGHAM:\nGood time of day unto your royal grace!\n\nDERBY:\nGod make your majesty joyful as you have been!\n\nQUEEN ELIZABETH:\nTh'

Next Char Predictions:
 b"s$Blms&!tYx\naOMGiXFx ?YdPBe\np:fDjBqqd?gXt-'FG'IVlCrxZa$brURLf\nSSsWswj.tbnWnliFWK-t-'yaVOZ,tBXwidE!ZPCJQF[UNK]YTIlSpowezjGJuE3QXQ:Ll-HN[UNK]xe3:zlOWG"
Prediction shape:  (64, 140, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.1903095, shape=(), dtype=float32)


66.043236

In [None]:
# Attach the optimizer and the logits-based cross-entropy loss defined above.
model.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left in the path
# for the checkpoint callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only (not the full model) through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 20
# Start training your model
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per call.

  Args:
    model: Model called as `model(inputs=..., states=..., return_state=True)`
      that returns `(logits, states)`.
    chars_from_ids: Lookup layer mapping token IDs to characters.
    ids_from_chars: Lookup layer mapping characters to token IDs.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the "[UNK]" index and zero
    # elsewhere, so "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    self.prediction_mask = tf.sparse.to_dense(
        tf.SparseTensor(
            values=[-float('inf')] * len(unk_ids),
            indices=unk_ids,
            dense_shape=[vocab_len]))

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the character that follows each string in `inputs`.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the state
      returned by the wrapped model.
    """
    # Strings -> characters -> token IDs (dense tensor).
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep the last position only, apply the temperature, then mask "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one token ID per batch row and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Token IDs back to characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [None]:
# Single-step sampler around the trained baseline model (default temperature).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# Generate 1000 characters from the prompt, one sampled character per step,
# and time the whole loop.
start = time.time()
states = None
next_char = tf.constant(['Queen:'])
result = [next_char]

for n in range(1000):
  # Feed only the newest character back in; the context is carried in `states`.
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all sampled characters into one string tensor.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

Queen:
Take thou with our thee.

LUCIO:
To-be in my business cannot love, severe draw 'er an another beet
fine in our country! where's thee my bootly pound
Our still overenty. He! like a sel-welcome to before-house:
Even therefore nog, dry if this highold,
Some patient. Yet cannot choes my heart not what
my life and his fool.

LUCENTIO:
Let me, here at Calive's belingly.

DUKE VINCENTIO:
Harry all whom it in his shalf it fees
This ourses as poor love.

PAULINA:
Pray'd to help you nor so unlaps!

LUCENTIO:

CLARENCE:
True you to goot
Murder was heaven, with a dount at bears;
And to be here 'gainst my soul:
Whate with wantance of with him with first.

ANGELO:
It is in lived grieves.

PRORSTI:
Whilst befoll to you a measure base in thyself
Heth havour Jacks, to sleep me
A booke shinsten, I consent him to-laugh;
As if I am services, that late desurard;
For may, love again.

LUCIO:
What is ever, and thy house, proUcesed upon him,
Anthough, or thou shalt night book how as air.
My raise mine 

## Simple beam search pseudocode; adapt this to your own models
function BEAM_SEARCH(RNN, start_sequence, beam_width):
    # RNN: the recurrent neural network model for sequence generation (custom LSTM, GRU, custom Elman RNN)
    # start_sequence: the initial part of the sequence (could be just a start symbol or set of symbols)
    # beam_width: the number of sequences to keep at each step. This is another hyper-parameter to experiment with; as discussed in class, beam search will still give you a sub-optimal solution

    Initialize an empty list `candidates` to store the current sequence candidates -- other data structures can be used to optimize the overall workflow
    Initialize an empty list `final_candidates` to store completed sequences
    
    Add start_sequence to `candidates` with its score (e.g., log likelihood)

    while not all sequences in `candidates` are complete:
        Initialize an empty list `all_expansions` for storing all possible next steps

        for each sequence in `candidates`:
            if the sequence is complete:
                Add it to `final_candidates`
                Continue to the next iteration

            Predict the next step probabilities using RNN given the current sequence
            Select top-k next steps (where k is the beam width) based on probabilities

            for each next step in top-k:
                Create a new sequence by appending the next step to the current sequence
                Calculate the new sequence's score (e.g., update log likelihood)
                Add the new sequence and its score to `all_expansions`

        Sort `all_expansions` by score in descending order
        Keep only the top `beam_width` sequences in `all_expansions`
        Replace `candidates` with `all_expansions`

    Add any remaining sequences in `candidates` to `final_candidates`
    Sort `final_candidates` by score in descending order

    return the top sequence from `final_candidates` (or top-N sequences if desired)

# Usage example
1. RNN = InitializeYourRNNModel()
2. start_sequence = ["<start>"]  # Example start symbol
3. beam_width = 5  # Example beam width
4. best_sequence = BEAM_SEARCH(RNN, start_sequence, beam_width)
5. print("Best sequence:", best_sequence)
See the one-step generation code above; it shows techniques that will be useful when implementing beam search.

# Things to do
1. Integrate custom_beamsearch with your models.
2. Optimize your hyper-parameters: learning rate, hidden_size, layers, optimizer, epochs, batch_size.
3. Divide the dataset into train, validation, and test sets. Once your model reaches reasonable performance (lower loss), test the story-generation capability of your system.
4. Replace the GRU with the custom LSTM shared with you and test how it works.
5. Create a custom Elman RNN ($h_t = \tanh(X_t W + U h_{t-1} + b)$) and compare performance across the different RNNs (Custom_ElmanRNN, GRU, Custom_LSTM). Also provide loss curves for each model and saved weights.
6. Provide the statistical significance of your model.
7. Show different texts generated by your models.

In [None]:
def BeamSearch(RNN, start_sequence:str, beam_width:int, temperature = 1.0, gen_length=1000): #Without States
  """Generate text with beam search, re-feeding each whole candidate to the model.

  No recurrent state is carried between steps: every expansion runs the full
  candidate string through `RNN` and uses only the logits of its last position.

  Args:
    RNN: Model called as `RNN(inputs=token_ids)` that returns per-position
      logits over the `ids_from_chars` vocabulary.
    start_sequence: Prompt string that seeds the single initial candidate.
    beam_width: Number of candidates kept after every step; also the number of
      continuations tried per candidate (capped at the vocabulary size).
    temperature: Divisor applied to the logits before normalisation.
    gen_length: Target length, measured as the byte length of the candidate
      string (prompt included); a candidate is complete once it reaches it.

  Returns:
    A list of `(score, sequence)` tuples sorted by descending score. `score`
    is the accumulated log-probability of the generated characters and
    `sequence` is a one-element string tensor.
  """
  vocab_len = len(ids_from_chars.get_vocabulary())

  # Additive mask with -inf at the "[UNK]" index so it is never proposed.
  skip_ids = ids_from_chars(['[UNK]'])[:, None]
  sparse_mask = tf.SparseTensor(
      values=[-float('inf')]*len(skip_ids),
      indices=skip_ids,
      dense_shape=[vocab_len])
  prediction_mask = tf.sparse.to_dense(sparse_mask)

  # top_k cannot return more entries than the vocabulary holds.
  expansions_per_candidate = min(beam_width, vocab_len)

  @tf.function
  def next_char_log_probs(inputs):
    """Log-probabilities of the character that follows each string in `inputs`."""
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = ids_from_chars(input_chars).to_tensor()
    predicted_logits = RNN(inputs=input_ids)
    predicted_logits = predicted_logits[:, -1, :]  # last position only
    predicted_logits = predicted_logits/temperature
    predicted_logits += prediction_mask
    # log_softmax stays finite where softmax followed by log underflows to log(0).
    return tf.nn.log_softmax(predicted_logits, axis=-1)

  def is_complete(seq):
    """True once the candidate string has reached gen_length bytes."""
    return len(seq.numpy()[0]) >= gen_length

  # The start score is zero because every candidate shares the same prompt.
  candidates = [(0.0, tf.constant([start_sequence]))]
  final_candidates = []
  while True:
    all_expansions = []
    for score, current_seq in candidates:
      if is_complete(current_seq):
        final_candidates.append((score, current_seq))
        continue
      log_probs = next_char_log_probs(current_seq)
      beam_values, beam_indices = tf.nn.top_k(log_probs, k=expansions_per_candidate)
      beam_scores = beam_values.numpy()[0]
      beam_ids = beam_indices.numpy()[0]
      for char_id, char_score in zip(beam_ids, beam_scores):
        # Append the new character; the scalar is broadcast against the
        # one-element candidate tensor.
        new_sequence = tf.strings.join([current_seq, chars_from_ids(char_id)])
        all_expansions.append((score + float(char_score), new_sequence))
    # Keep only the beam_width best expansions for the next step.
    candidates = sorted(all_expansions, key=lambda seq: seq[0], reverse=True)[:beam_width]
    if all(is_complete(seq) for _, seq in candidates):
      break
  final_candidates += candidates
  return sorted(final_candidates, key=lambda seq: seq[0], reverse=True)


In [None]:
# GRU model "gru0": 256-dim embeddings, 256 GRU units, up to 20 epochs.
gru0 = NLPUSFModel(
    vocab_size=vocab_size,
    embedding_dim=256,
    rnn_units=256)

# Run one batch through the model to build it and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = gru0(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# The model outputs logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
gru0.summary()
gru0.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './gru0_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 20
# Start training your model
gru0_history = gru0.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

(64, 140, 66) # (batch_size, sequence_length, vocab_size)
Model: "nlpusf_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     multiple                  16896     
                                                                 
 gru_2 (GRU)                 multiple                  394752    
                                                                 
 dense_2 (Dense)             multiple                  16962     
                                                                 
Total params: 428610 (1.64 MB)
Trainable params: 428610 (1.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epo

In [None]:
# GRU model "gru1": 256-dim embeddings, 256 GRU units, up to 100 epochs.
gru1 = NLPUSFModel(
    vocab_size=vocab_size,
    embedding_dim=256,
    rnn_units=256)

# Run one batch through the model to build it and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = gru1(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# The model outputs logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
gru1.summary()
gru1.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './gru1_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
# Start training your model
gru1_history = gru1.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

(64, 140, 66) # (batch_size, sequence_length, vocab_size)
Model: "nlpusf_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     multiple                  16896     
                                                                 
 gru_3 (GRU)                 multiple                  394752    
                                                                 
 dense_3 (Dense)             multiple                  16962     
                                                                 
Total params: 428610 (1.64 MB)
Trainable params: 428610 (1.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/1

In [None]:
# GRU model "gru": 512-dim embeddings, 512 GRU units, up to 100 epochs.
gru = NLPUSFModel(
    vocab_size=vocab_size,
    embedding_dim=512,
    rnn_units=512)

# Run one batch through the model to build it and check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = gru(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

# The model outputs logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
gru.summary()
gru.compile(optimizer='adam', loss=loss)
# Directory where the checkpoints will be saved
checkpoint_dir = './gru_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
# Start training your model
gru_history = gru.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

(64, 140, 66) # (batch_size, sequence_length, vocab_size)
Model: "nlpusf_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  33792     
                                                                 
 gru_1 (GRU)                 multiple                  1575936   
                                                                 
 dense_1 (Dense)             multiple                  33858     
                                                                 
Total params: 1643586 (6.27 MB)
Trainable params: 1643586 (6.27 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18

In [None]:
# Beam search (width 5, target length 500) with the 512-unit GRU model.
final = BeamSearch(gru,"Queen: ",5,gen_length=500)

In [None]:
# Same beam-search settings for the two 256-unit GRU models.
final0 = BeamSearch(gru0,"Queen: ",5,gen_length=500)
final1 = BeamSearch(gru1,"Queen: ",5,gen_length=500)


In [None]:
# Each result is a list of (score, sequence) tuples sorted best-first;
# [rank][1][0] selects the text of the candidate at that rank.
# NOTE(review): the three prints use ranks 0, 1 and 2 respectively, so the
# models are not compared on their top candidate - confirm this is intended.
#print(final[0])
print(final0[0][1][0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(final1[1][1][0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(final[2][1][0].numpy().decode('utf-8'), '\n\n' + '_'*80)

Queen: therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And therefore he shall have thee to the prince,
And there is the  

________________________________________________________________________________
Queen: 'tis no more down.

LEONTES:
What is the news?

BUCKINGHAM:
My lord, I have no more remedy.

DUKE OF AUMERLE:
My lord, this is the prince, which shall be thy conscience.

KING RICHARD III:
Why, thou dost thou dead, and thou shalt not never see
The present death and honour of the death.

DUKE OF AUMERLE:
My lord, this is nothing; and therefore comes all.

DUKE OF AUMERLE:
My lord, this is the prince, which 

In [None]:
class CustomLSTMCell(keras.layers.Layer):
  """LSTM cell with separate parameters for each of its four gates.

  `state_size` declares two states of `units` each: the hidden state and the
  cell state. `call` processes a single time step.
  """

  def __init__(self, units, **kwargs):
    super(CustomLSTMCell, self).__init__(**kwargs)
    self.units = units
    self.state_size = [units, units]  # Hidden state size and cell state size

  def build(self, input_shape):
    """Create W_*, U_* and b_* for the input, forget, cell and output gates."""
    input_dim = input_shape[-1]
    # An LSTM is four RNN-style transforms (sigma(W x_t + U h_{t-1} + b))
    # working in parallel; the initializers can be changed to stabilise learning.
    # Gates are created in the order i, f, c, o; each gets W, then U, then b.
    for gate in ('i', 'f', 'c', 'o'):
      setattr(self, f'W_{gate}', self.add_weight(
          shape=(input_dim, self.units), initializer='random_normal', name=f'W_{gate}'))
      setattr(self, f'U_{gate}', self.add_weight(
          shape=(self.units, self.units), initializer='random_normal', name=f'U_{gate}'))
      setattr(self, f'b_{gate}', self.add_weight(
          shape=(self.units,), initializer='zeros', name=f'b_{gate}'))

    super(CustomLSTMCell, self).build(input_shape)

  def call(self, inputs, states, return_state=None,training=None):
    """Advance the cell by one step.

    Args:
      inputs: Input for the current time step.
      states: Pair `(h_prev, c_prev)` of previous hidden and cell states.
      return_state: Unused.
      training: Unused.

    Returns:
      `(h, [h, c])`: the new hidden state as output, plus the new state list.
    """
    prev_h, prev_c = states

    def pre_activation(W, U, b):
      # Shared affine form of every gate: x W + h_{t-1} U + b.
      return tf.matmul(inputs, W) + tf.matmul(prev_h, U) + b

    input_gate = tf.sigmoid(pre_activation(self.W_i, self.U_i, self.b_i))
    forget_gate = tf.sigmoid(pre_activation(self.W_f, self.U_f, self.b_f))

    # Candidate cell content, blended with the previous cell state.
    candidate = tf.tanh(pre_activation(self.W_c, self.U_c, self.b_c))
    new_c = forget_gate * prev_c + input_gate * candidate

    output_gate = tf.sigmoid(pre_activation(self.W_o, self.U_o, self.b_o))
    new_h = output_gate * tf.tanh(new_c)
    return new_h, [new_h, new_c]

In [None]:
# Build the custom-LSTM language model: Embedding -> RNN(CustomLSTMCell) -> Dense.
# The vocabulary size is read from the lookup layer instead of being
# hard-coded, so the Embedding and Dense sizes always match the data.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 512
rnn_units = 512 # Number of LSTM units
input_shape = (None, embedding_dim)  # Example input shape (timesteps, features)
# Create the LSTM layer using the custom cell
lstm_layer = keras.layers.RNN(CustomLSTMCell(rnn_units), input_shape=input_shape,return_sequences=True)
lstm = keras.Sequential([
  keras.layers.Embedding(vocab_size, embedding_dim),
  lstm_layer,
  keras.layers.Dense(vocab_size)  # One logit per vocabulary entry
])
# The Dense layer has no activation, so the loss works on logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

lstm.compile(optimizer='adam', loss=loss)
lstm.summary()


In [None]:
# Train the custom-LSTM model. This cell needs `lstm` from the model-building
# cell; the recorded output shows a NameError, i.e. that cell had not been run
# in the kernel when this one was executed.
checkpoint_dir = './lstm_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
# Start training your model
lstm_history = lstm.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

NameError: name 'lstm' is not defined

In [None]:
# Elman-style language model with 512 recurrent units:
# Embedding -> SimpleRNN -> Dense.
# The vocabulary size is read from the lookup layer instead of being
# hard-coded, so the Embedding and Dense sizes always match the data.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 512 # Number of units
input_shape = (None, embedding_dim)  # Example input shape (timesteps, features)
# Create the Elman layer using SimpleRNN
elman_layerR512 = keras.layers.SimpleRNN(rnn_units,input_shape=input_shape,return_sequences=True)
elmanR512 = keras.Sequential([
  keras.layers.Embedding(vocab_size, embedding_dim),
  elman_layerR512,
  keras.layers.Dense(vocab_size)  # One logit per vocabulary entry
])
# The Dense layer has no activation, so the loss works on logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

elmanR512.compile(optimizer='adam', loss=loss)
elmanR512.summary()

checkpoint_dir = './elmanR512_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
# Start training your model
elmanR512_history = elmanR512.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 256)         16896     
                                                                 
 simple_rnn (SimpleRNN)      (None, None, 512)         393728    
                                                                 
 dense_4 (Dense)             (None, None, 66)          33858     
                                                                 
Total params: 444482 (1.70 MB)
Trainable params: 444482 (1.70 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 2

In [None]:
# Beam search (width 5, target length 500) with the 512-unit Elman model.
elman_final = BeamSearch(elmanR512,"Queen: ",5,gen_length=500)

In [None]:
# Print the text of the best-scoring candidate, followed by a separator line.
print(elman_final[0][1][0].numpy().decode('utf-8'), '\n\n' + '_'*80)

Queen: they are not your father's death,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that the queen is out of thine own life,
And that  

________________________________________________________________________________


In [None]:
# Elman-style language model with 128 recurrent units:
# Embedding -> SimpleRNN -> Dense.
# The vocabulary size is read from the lookup layer instead of being
# hard-coded, so the Embedding and Dense sizes always match the data.
vocab_size = len(ids_from_chars.get_vocabulary())
embedding_dim = 256
rnn_units = 128 # Number of units
input_shape = (None, embedding_dim)  # Example input shape (timesteps, features)
# Create the Elman layer using SimpleRNN
elman_layerR128 = keras.layers.SimpleRNN(rnn_units,input_shape=input_shape,return_sequences=True)
elmanR128 = keras.Sequential([
  keras.layers.Embedding(vocab_size, embedding_dim),
  elman_layerR128,
  keras.layers.Dense(vocab_size)  # One logit per vocabulary entry
])
# The Dense layer has no activation, so the loss works on logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

elmanR128.compile(optimizer='adam', loss=loss)
elmanR128.summary()

checkpoint_dir = './elmanR128_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Stop training when the training loss has not improved for three consecutive epochs
early_stop_callback = keras.callbacks.EarlyStopping(monitor='loss',patience=3)

# Save weights only through the checkpoint callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS = 100
# Start training your model
elmanR128_history = elmanR128.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback,early_stop_callback])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 256)         16896     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, None, 128)         49280     
                                                                 
 dense_5 (Dense)             (None, None, 66)          8514      
                                                                 
Total params: 74690 (291.76 KB)
Trainable params: 74690 (291.76 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epo