# Experiment 5.2: Sequence Text Prediction using LSTM

**Objective:**  
To generate next characters/words based on a given input sequence using an LSTM.


In [2]:
#@title 1. Install & Import Libraries
# !pip install tensorflow tensorflow-datasets numpy matplotlib

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds


## 2. Load & Inspect Data
Load the Shakespeare “tiny_shakespeare” text corpus from TensorFlow Datasets.


In [4]:
import tensorflow_datasets as tfds

ds = tfds.load('tiny_shakespeare', split='train')
text = ''
for example in tfds.as_numpy(ds):
    text += example['text'].decode('utf-8')

print(text[:500])  # show first 500 characters


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


## 3. Preprocess: Character-Level Tokenization
Build vocabulary mappings from characters to integer IDs.


In [5]:
vocab = sorted(set(text))
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])
print(f'Vocabulary size: {len(vocab)}')


Vocabulary size: 65


## 4. Create Input–Target Sequences
Split the integer sequence into overlapping windows of length `seq_length + 1`, then map to `(input, target)` pairs.


In [6]:
seq_length = 100

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(seq):
    return seq[:-1], seq[1:]

dataset = sequences.map(split_input_target)


## 5. Batch & Shuffle
Prepare final training dataset with shuffling, batching, and prefetching.


In [7]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (dataset
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True)
           .prefetch(tf.data.AUTOTUNE))


## 6. Build the LSTM Model
Embedding → Stateful LSTM → Dense output layer.


In [8]:
# Build the LSTM Model (corrected)
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),  # removed batch_input_shape
    tf.keras.layers.LSTM(rnn_units,
                         return_sequences=True),
    tf.keras.layers.Dense(vocab_size)
])


## 7. Compile the Model
Use Adam optimizer and sparse categorical crossentropy loss.


In [9]:
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)
model.summary()


## 8. Train the Model
Train for a fixed number of epochs (e.g. 20).


In [10]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS)


Epoch 1/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 70ms/step - accuracy: 0.1986 - loss: 3.1818
Epoch 2/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.3966 - loss: 2.0639
Epoch 3/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.4806 - loss: 1.7544
Epoch 4/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.5237 - loss: 1.5902
Epoch 5/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.5512 - loss: 1.4849
Epoch 6/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.5666 - loss: 1.4237
Epoch 7/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.5783 - loss: 1.3760
Epoch 8/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.5904 - loss: 1.3300
Epoch 9/20
[1m155/155[

## 9. Plot Training Loss & Accuracy
Visualize how loss and accuracy evolve over epochs.
plt.figure(figsize=(12,4))

plt.subplot(1,2,1)
plt.plot(history.history['loss'], label='loss')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.legend()

plt.subplot(1,2,2)
plt.plot(history.history['accuracy'], label='accuracy')
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.show()


In [18]:
# --- TEXT GENERATION CELL ---

import tensorflow as tf

def generate_text(model, start_string, num_generate=300, temperature=1.0):
    # Convert start_string to integer IDs
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)# shape (1, len(start_string))

    generated_chars = []
    #model.reset_states()

    for _ in range(num_generate):
        # Predict next character
        preds = model(input_eval)                            # (1, seq_len, vocab_size)
        preds = preds[:, -1, :]                              # (1, vocab_size)
        preds = preds / temperature
        predicted_id = tf.random.categorical(preds, num_samples=1)[0,0].numpy()

        # Append and update input
        generated_chars.append(idx2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)       # feed back the last prediction

    return start_string + ''.join(generated_chars)

# Example usage:
print(generate_text(model, start_string="ROMEO: ", num_generate=500, temperature=0.5))


ROMEO: CFLG3N;!H'DM3!:KHIIJ :
MHFMK.!A.J N'$$D.$H
GO:IL&,!E-I&LMBL;ALGJJ$
KKAG3DGN'. !FA.BJN:DALM!OBMM,M:?ILHMHEL N
!GI.&E K& G!3N;?FEG,L,, G
&E?BJ?$!B-EJFHIC
;BEAILH;
LLC$;:L-H?-N.K$$?CH.KGJ;C$N$,A,ML3G!GEG$H:OLKG$HF3&IL:FOJH-BAF.:!;.$HO:HN$N
HLDEFJ?GKEBLLIE&HM 
'$!!;:ACBEFHJ'G.NA
G&!IH:3,.-GA?LHGJMJ',M.&E??N'H, D?GGEIIK
D;.F,&CE$F&AHB?'!C?'N3,!.O.; GEI-FKKLC3.ONO,$
NOBONOAOJEHN!ELDJFF3CJHB&,!L?J,?!M'DCEL!FF!MA ADAMD-,G&$B&N 'AK:FE$HN$;?OALJ:;GBF:HA! 3IC&D-N!MNLAML'3!!FI$&H&3ILG-&DCOK,
C'EI:NOBMA A$!$
