In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d prashantkarwasra/books-dataset-text-generation

Dataset URL: https://www.kaggle.com/datasets/prashantkarwasra/books-dataset-text-generation
License(s): unknown


In [3]:
!unzip "books-dataset-text-generation.zip" -d "/content/dataset/"

Archive:  books-dataset-text-generation.zip
  inflating: /content/dataset/HarryPotter1.txt  
  inflating: /content/dataset/HarryPotter2.txt  
  inflating: /content/dataset/HarryPotter3.txt  
  inflating: /content/dataset/HarryPotter4.txt  
  inflating: /content/dataset/HarryPotter5.txt  
  inflating: /content/dataset/HarryPotter6.txt  
  inflating: /content/dataset/HarryPotter7.txt  
  inflating: /content/dataset/Hobbit1.txt  
  inflating: /content/dataset/LOTR1.txt  
  inflating: /content/dataset/LOTR2.txt  
  inflating: /content/dataset/LOTR3.txt  
  inflating: /content/dataset/Silmarillion4.txt  


In [4]:
import os

# Files from the extracted archive that are removed before the corpus is built
# (the HarryPotter*.txt volumes and Silmarillion4.txt).
files_to_delete = [
    'dataset/HarryPotter1.txt',
    'dataset/HarryPotter2.txt',
    'dataset/HarryPotter3.txt',
    'dataset/HarryPotter4.txt',
    'dataset/HarryPotter5.txt',
    'dataset/HarryPotter6.txt',
    'dataset/HarryPotter7.txt',
    'dataset/Silmarillion4.txt',
]

# Remove each file if present; report either way so a re-run stays harmless.
for unwanted_path in files_to_delete:
    if not os.path.exists(unwanted_path):
        print(f"File not found: {unwanted_path}")
        continue
    os.remove(unwanted_path)
    print(f"Deleted: {unwanted_path}")


Deleted: dataset/HarryPotter1.txt
Deleted: dataset/HarryPotter2.txt
Deleted: dataset/HarryPotter3.txt
Deleted: dataset/HarryPotter4.txt
Deleted: dataset/HarryPotter5.txt
Deleted: dataset/HarryPotter6.txt
Deleted: dataset/HarryPotter7.txt
Deleted: dataset/Silmarillion4.txt


In [5]:
import tensorflow as tf
import time

In [6]:
# Concatenate the four remaining books, in this order, into one corpus file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
files = ['dataset/Hobbit1.txt','dataset/LOTR1.txt','dataset/LOTR2.txt','dataset/LOTR3.txt']
with open('LOTR.txt', 'w', encoding='utf-8') as outfile:
  for file in files:
    with open(file, encoding='utf-8') as infile:
      outfile.write(infile.read())

# Read the merged corpus back inside a context manager so the file handle is
# closed deterministically (a bare open(...).read() leaves it to the GC).
with open('LOTR.txt', encoding='utf-8') as corpus_file:
  text = corpus_file.read()
print ('Length of text: {} characters'.format(len(text)))

Length of text: 3041991 characters


In [7]:
# Preview the first 300 characters of the merged corpus
print(text[:300])

Chapter I 


AN UNEXPECTED PARTY 


In a hole in the ground there lived a hobbit. Not a nasty, dirty, wet hole, filled with the ends of worms 
and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it to sit down on or to eat: it was a 
hobbit-hole, and that means comfort. 

It had a per


In [8]:
# Character vocabulary: every distinct character of the corpus, in sorted order
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

87 unique characters


In [9]:
# Lookup tables in both directions:
#   char2index: character -> integer id (its position in the sorted vocab)
#   index2char: integer id -> character (array, so it can be indexed by id arrays)
char2index = dict(zip(vocab, range(len(vocab))))
index2char = np.array(vocab)

# Encode the whole corpus as an array of character ids
text_as_int = np.array([char2index[ch] for ch in text])

print(text_as_int)

[30 63 56 ... 64 59 12]


In [10]:
# Show how the first 30 characters from the text are mapped to integers
# (left: repr of the raw characters, right: the matching slice of text_as_int)
print ('{} -- characters mapped to int -- > {}'.format(repr(text[:30]), text_as_int[:30]))

'Chapter I \n\n\nAN UNEXPECTED PAR' -- characters mapped to int -- > [30 63 56 71 75 60 73  2 36  2  1  1  1 28 41  2 48 41 32 51 43 32 30 47
 32 31  2 43 28 45]


In [11]:
# Length, in characters, of a single model input. Chunks are cut one character
# longer (seq_length + 1) so input and target can be offset by one position.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character id per element
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first 30 ids back to characters
for char_id in char_dataset.take(30):
  print(index2char[char_id.numpy()])

C
h
a
p
t
e
r
 
I
 






A
N
 
U
N
E
X
P
E
C
T
E
D
 
P
A
R


In [12]:
# Group the character stream into fixed-size chunks of seq_length + 1 ids;
# a trailing chunk shorter than that is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Decode the first five chunks to check that the text is intact
for chunk in sequences.take(5):
  print(repr(''.join(index2char[chunk.numpy()])))

'Chapter I \n\n\nAN UNEXPECTED PARTY \n\n\nIn a hole in the ground there lived a hobbit. Not a nasty, dirty,'
' wet hole, filled with the ends of worms \nand an oozy smell, nor yet a dry, bare, sandy hole with not'
'hing in it to sit down on or to eat: it was a \nhobbit-hole, and that means comfort. \n\nIt had a perfec'
'tly round door like a porthole, painted green, with a shiny yellow brass knob in the \nexact middle. T'
'he door opened on to a tube-shaped hall like a tunnel: a very comfortable tunnel \nwithout smoke, with'


In [13]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: Any sliceable sequence (tensor, array, list, string).

    Returns:
        A tuple ``(chunk[:-1], chunk[1:])``: everything but the last element,
        and everything but the first, so ``target[i]`` is the element that
        follows ``input[i]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair via split_input_target
dataset = sequences.map(split_input_target)

In [14]:
# Number of (input, target) sequences per training batch
BATCH_SIZE = 64

# Number of elements held in the shuffle buffer
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than re-wrapping `dataset`:
# `dataset = dataset.shuffle(...).batch(...)` batches already-batched data when
# the cell is executed a second time, whereas this form gives the same result
# on every run.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [15]:
# Number of distinct characters (size of the embedding table and of the output layer)
vocab_size = len(vocab)

# Width of the character embedding vectors
embedding_dim = 300

# Hidden units of the first and second GRU layer, also packed as a list
# because build_model takes them as one argument
rnn_units1 = rnn_units2 = 1024
rnn_units = [rnn_units1, rnn_units2]
print(vocab_size)

87


In [16]:
import tensorflow as tf

class _AddPositionEmbedding(tf.keras.layers.Layer):
    """Add a learnable per-position vector to a (batch, time, features) tensor.

    The position table is a weight owned by this layer, so it is part of the
    model's trainable variables and of its saved weights.
    """

    def __init__(self, max_position, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_position = max_position
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        # Same default initializer as tf.keras.layers.Embedding (uniform).
        self.position_table = self.add_weight(
            name="positional_embedding",
            shape=(self.max_position, self.embedding_dim),
            initializer=tf.keras.initializers.RandomUniform(),
            trainable=True,
        )

    def call(self, inputs):
        # Rows 0..T-1 of the table, broadcast over the batch dimension.
        # Inputs longer than max_position fail here with a shape mismatch.
        seq_len = tf.shape(inputs)[1]
        return inputs + self.position_table[:seq_len]

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_position": self.max_position,
            "embedding_dim": self.embedding_dim,
        })
        return config


def build_model(vocab_size, embedding_dim, rnn_units, batch_size, dropout_rate=0.2,
                max_position=1000):
    """Build the two-layer stateful GRU character model.

    Architecture: token embedding + learnable positional embedding ->
    GRU -> LayerNorm -> Dropout -> GRU (+ residual when both GRUs have the same
    width) -> LayerNorm -> Dropout -> Dense producing one logit per character.

    Args:
        vocab_size: Number of distinct characters (embedding rows, output logits).
        embedding_dim: Width of the token and positional embeddings.
        rnn_units: Two-element sequence ``(units_gru_1, units_gru_2)``.
        batch_size: Fixed batch size; the GRUs are stateful, so it is baked
            into the input shape and the model must be rebuilt to change it.
        dropout_rate: Dropout rate applied after each LayerNorm.
        max_position: Number of rows in the positional table, i.e. the longest
            input sequence the model accepts.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``"TextGenGRU_SOTA_Lite"`` that
        maps integer ids of shape (batch_size, T) to logits of shape
        (batch_size, T, vocab_size).

    Note:
        The positional embedding used to be an ``Embedding`` layer invoked from
        inside a ``Lambda`` closure. A layer used that way is not part of the
        model graph, so its weights are not reliably tracked (trained / saved).
        It is now a regular layer with its own weight; weight files written by
        the earlier version therefore do not match this architecture.
    """
    rnn_units1, rnn_units2 = rnn_units  # Unpack the units

    # Input layer: fixed batch size (required by stateful RNNs), variable length
    inputs = tf.keras.Input(batch_shape=(batch_size, None), name="input")

    token_embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name="token_embedding"
    )

    x = token_embedding(inputs)
    x = _AddPositionEmbedding(
        max_position, embedding_dim, name="add_pos_embedding"
    )(x)

    # First GRU + LayerNorm + Dropout
    x1 = tf.keras.layers.GRU(
        rnn_units1,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
        name="gru_1"
    )(x)

    x1 = tf.keras.layers.LayerNormalization(name="ln_1")(x1)
    x1 = tf.keras.layers.Dropout(dropout_rate)(x1)

    # Second GRU
    x2 = tf.keras.layers.GRU(
        rnn_units2,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
        name="gru_2"
    )(x1)

    # Residual connection, only possible when both GRU outputs have the same width
    if rnn_units1 == rnn_units2:
        x2 = tf.keras.layers.Add(name="residual_add")([x1, x2])

    x2 = tf.keras.layers.LayerNormalization(name="ln_2")(x2)
    x2 = tf.keras.layers.Dropout(dropout_rate)(x2)

    # Output projection to unnormalised scores (no activation)
    outputs = tf.keras.layers.Dense(vocab_size, name="output_logits")(x2)

    return tf.keras.Model(inputs, outputs, name="TextGenGRU_SOTA_Lite")


In [17]:
# Training model: the batch size passed here is fixed into the input shape
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [18]:
# Show the summary of the training model built above
model.summary()

In [19]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  Args:
    labels: Integer class ids (the target characters).
    logits: Unnormalised model outputs; `from_logits=True` tells Keras that no
      softmax has been applied yet.

  Returns:
    Whatever `tf.keras.losses.sparse_categorical_crossentropy` returns for
    these arguments.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True,
  )

In [20]:
# Compile with the Adam optimizer, the from-logits cross-entropy defined in
# `loss`, and accuracy as an additional reported metric
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [21]:
import os
import tensorflow as tf

# Directory to save the best model weights
checkpoint_dir = './training_checkpoints_final'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, 'best_model_final.weights.h5')

# Create the callback.
# Without `save_best_only` the file is overwritten after every epoch, so it
# holds the *latest* weights rather than the best ones its name promises.
# No validation data is passed to fit(), so the training loss is the only
# quantity available to monitor.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True
)


In [22]:
# Train for 25 epochs. `callbacks` is documented as a list of Callback
# instances, so the single checkpoint callback is wrapped in a list.
history = model.fit(dataset, epochs=25, callbacks=[checkpoint_callback])

Epoch 1/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 144ms/step - accuracy: 0.3540 - loss: 2.6673
Epoch 2/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 153ms/step - accuracy: 0.5533 - loss: 1.5162
Epoch 3/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 152ms/step - accuracy: 0.5904 - loss: 1.3708
Epoch 4/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 153ms/step - accuracy: 0.6086 - loss: 1.2981
Epoch 5/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 153ms/step - accuracy: 0.6198 - loss: 1.2544
Epoch 6/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 153ms/step - accuracy: 0.6275 - loss: 1.2230
Epoch 7/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 152ms/step - accuracy: 0.6341 - loss: 1.1992
Epoch 8/25
[1m470/470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 153ms/step - accuracy: 0.6388 - loss: 1.1804
Epoch 9/25
[1m4

In [23]:
# Weights file to restore for inference (the same location the checkpoint
# callback writes to: training_checkpoints_final/best_model_final.weights.h5)
latest_check= 'training_checkpoints_final/best_model_final.weights.h5'

In [24]:
# Rebuild the same architecture with batch size 1 for generation (the batch
# size is fixed at construction time), then restore the saved weights.
# NOTE: this rebinds `model`; the training model is no longer reachable by name.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)

model.load_weights(latest_check)

model.build(tf.TensorShape([1, None]))

In [25]:
# Show the summary of the batch-size-1 model rebuilt for generation
model.summary()

In [26]:
def generate_text(model, start_string, num_generate=1000, temperature=0.5):
  """Sample text from the character model, one character at a time.

  The prompt is fed first; afterwards each sampled character becomes the next
  single-character input, and the model's stateful RNN layers carry the context
  between steps.

  Args:
    model: Model built with batch size 1 that maps character ids to logits.
    start_string: Non-empty prompt; every character must be a key of
      `char2index`.
    num_generate: Number of characters to sample after the prompt.
    temperature: Divisor applied to the logits before sampling. Lower values
      give more predictable text, higher values more surprising text.

  Returns:
    `start_string` followed by the `num_generate` sampled characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2index`.

  NOTE(review): after the prompt every input has length 1, so a model that adds
  position embeddings computed from the input length (as build_model does)
  applies position 0 at each generation step -- confirm this is intended.
  """
  if not start_string:
    # An empty prompt yields a zero-length input with nothing to sample from.
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Start from a clean hidden state: the stateful layers otherwise continue
  # from wherever the previous generate_text call left off. The method is
  # looked up per layer because its spelling differs between Keras versions.
  for layer in getattr(model, "layers", ()):
    if getattr(layer, "stateful", False):
      reset = getattr(layer, "reset_states", None) or getattr(layer, "reset_state", None)
      if callable(reset):
        reset()

  # Converting our start string to numbers (vectorizing), with a batch axis
  input_eval = [char2index[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Characters sampled so far
  text_generated = []

  for _ in range(num_generate):
    predictions = model(input_eval, training=False)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample from the temperature-scaled distribution of the last time step
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # The sampled character is the next input; the hidden state is kept by
    # the stateful layers themselves
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(index2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [27]:
# Generate a sample continuing the prompt "Hobbiton "
print(generate_text(model, start_string=u"Hobbiton "))

Hobbiton the Great Gate of the Elves, and their horses were grim and surrounding. They were remembered that the sound of his people came and the winding of the stream was still sung in the sover of the dark hour, and soon too had been to bow him out of the fields of the Houses of Healing.
     'The evening of the Ringwraiths have been set and see it any long time to be looking for them to this old and the branch to see the Lord of the Mark!' said Aragorn. 'We must go and go that and be a fool or day before the ring of the village.'
     'I wonder if he's come to phanted you, and I say you have no heart to ask for the most likely to think of that and all the time and hear.' He said now and again to the road after the fallen stone, and found his master in his time and a slender fire in the sunlight. 'We must go and say to Gollum's eyes on your way to Minas Tirith, and they mean to find a long sure of a wide light of their silver and mountain, and then suddenly in the sunlight of the day'

In [28]:
# Generate a second sample, this time continuing the prompt "The ring "
print(generate_text(model, start_string=u"The ring "))

The ring is our father to another to the point of the world outside the Lord of the Mountain, and a little of our horses and the others built in the Morgai that was not a horse in the Shire. So he said: "Has they too mate any song of your own fools who has done much to see it in your business to the Sackville-Bagginses an ancient strangers. And the doom of the messenger of the Ring was one of the black shadow of all that the sound of the woods of Mordor, and where the day the beauty is not a horse of wine that was broken or something to warm another coming of any more than a little hope of his own wisdom untied to him of that land. In the meanwhile he was to keep a wind of a great horse and some of it almost to the face of the stones. But if you don't like to see that he would have been a fool or horse that he will see that he has been to begin to speak of it again. And if it were one of the black shadow of all that you used to be slain and some wise and heartening to our hope in the m