# Assignment II - Text Generation with RNNs
Submitted by Arham Anwar

In [1]:
# Seed and imports

import urllib.request
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import random

# Set seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's ``random``, NumPy's global RNG, PyTorch on CPU,
    PyTorch on CUDA (only when a CUDA device is available), and finally
    PyTorch Lightning via ``pl.seed_everything``.

    Args:
        seed: Integer seed handed unchanged to every seeding function.
    """
    # CPU-side generators first
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # CUDA generators: current device, then all devices
    if torch.cuda.is_available():
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)

    pl.seed_everything(seed)

# Fix one global seed up front so every RNG-dependent step below is repeatable
SEED = 42
set_seed(seed=SEED)

Global seed set to 42


## 1. Preprocess The Data

In [2]:
import random 
import numpy as np
import urllib.request


### 1.1. Loading the data

In [3]:
"""tiny shakespeare dataset"""

# Data preparation: where the corpus lives remotely and where to store it locally
url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
filename = 'shakespeare.txt'

# Fetch the remote file and write it to `filename` in the working directory
urllib.request.urlretrieve(url=url, filename=filename)
print(f"File downloaded and saved as: {filename}")

File downloaded and saved as: shakespeare.txt


### 1.2. Lower casing the data

In [4]:
# Read the corpus as bytes, decode it as UTF-8 and lower-case it.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open(filename, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8').lower()

### 1.3. Using 500K characters only due to compute resource limitations

In [5]:
# Because of compute restrictions only a 500,000-character window of the corpus is used.
# NOTE: this overwrites `text`, so running the cell a second time slices the
# already-sliced string; re-run the cell that reads the file first.
CORPUS_WINDOW = slice(300000, 800000)
text = text[CORPUS_WINDOW]

### 1.4. Character Dictionary

In [6]:
# Vocabulary: every distinct character of the corpus, in sorted order
characters = list(set(text))
characters.sort()

# Lookup tables between a character and its integer id (its position in `characters`)
index_to_char = dict(enumerate(characters))
char_to_index = {char: idx for idx, char in index_to_char.items()}

### 1.5. Sequence Configuration

In [7]:

SEQ_LENGTH = 40  # number of characters in each input window
STEP_SIZE = 3    # offset between the starts of consecutive windows

# Slide a fixed-width window over the corpus; the character that directly
# follows each window is the prediction target for that window.
window_starts = range(0, len(text) - SEQ_LENGTH, STEP_SIZE)
sentences = [text[start: start + SEQ_LENGTH] for start in window_starts]
next_characters = [text[start + SEQ_LENGTH] for start in window_starts]

# Integer-encode the windows (one row each) and their targets
X = np.zeros((len(sentences), SEQ_LENGTH), dtype=np.int32)
y = np.zeros((len(sentences)), dtype=np.int32)

for row, (window, target) in enumerate(zip(sentences, next_characters)):
    X[row] = [char_to_index[symbol] for symbol in window]
    y[row] = char_to_index[target]


In [8]:
# Custom Dataset
class ShakespeareDataset(Dataset):
    """Map-style dataset pairing each encoded character window with its target.

    Both inputs are converted to ``torch.long`` tensors once, at construction.

    Args:
        X: Array-like of integer character indices, one row per sample.
        y: Array-like of integer target indices, one entry per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

        # Fail fast: a mismatch would otherwise only surface as an index error
        # in the middle of an epoch.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"X and y must contain the same number of samples, "
                f"got shapes {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` tensor pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Mini-batch size used when iterating over the training samples
BATCH_SIZE = 256

# Wrap the encoded arrays in the dataset and serve them in shuffled mini-batches
dataset = ShakespeareDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

## 2. RNN Model

In [9]:
# Define the model
class ShakespeareModel(pl.LightningModule):
    """Character-level LSTM that predicts the next character of a sequence.

    Input indices are one-hot encoded, fed through a stacked LSTM, and the LSTM
    output at the last time step is projected to one logit per character.

    Args:
        n_chars: Vocabulary size (width of the one-hot encoding and of the output).
        hidden_size: Number of hidden units in each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, n_chars, hidden_size, num_layers, lr):
        super().__init__()
        self.n_chars = n_chars
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lr = lr

        self.lstm = nn.LSTM(n_chars, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, n_chars)

    def forward(self, x):
        """Return next-character logits for a batch of index sequences.

        Args:
            x: Long tensor of character indices, batch dimension first.

        Returns:
            Tensor with one row of ``n_chars`` logits per input sequence.
        """
        x = nn.functional.one_hot(x, num_classes=self.n_chars).float()
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # passed, so explicit zero tensors are unnecessary.
        out, _ = self.lstm(x)
        # Only the output at the final time step is used for the prediction.
        return self.fc(out[:, -1, :])

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one ``(x, y)`` batch."""
        x, y = batch
        loss = nn.functional.cross_entropy(self(x), y)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate ``lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def generate_text(self, seed_text, gen_length, temperature=1.0):
        """Sample ``gen_length`` characters that continue ``seed_text``.

        The context window keeps the length of ``seed_text``: after each sampled
        character the oldest character is dropped and the new one appended.
        Characters are encoded and decoded with the notebook-level
        ``char_to_index`` / ``index_to_char`` tables.

        Args:
            seed_text: Starting text; every character must be a key of
                ``char_to_index``.
            gen_length: Number of characters to sample.
            temperature: Positive divisor applied to the logits before the
                softmax; values below 1 sharpen the distribution, values above
                1 flatten it.

        Returns:
            ``seed_text`` followed by the sampled characters.

        Raises:
            ValueError: If ``temperature`` is not strictly positive.
            KeyError: If ``seed_text`` contains a character missing from
                ``char_to_index``.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            # Encode the seed once and slide the window over integer ids.
            context = [char_to_index[char] for char in seed_text]
            generated = [seed_text]
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                for _ in range(gen_length):
                    x_pred = torch.tensor([context], dtype=torch.long, device=self.device)
                    logits = self(x_pred).squeeze(0) / temperature
                    # Softmax in float64 on the CPU: float32 probabilities can miss
                    # np.random.choice's "sum to 1" tolerance and raise ValueError.
                    probabilities = torch.softmax(logits.cpu().double(), dim=-1).numpy()
                    next_index = int(np.random.choice(len(probabilities), p=probabilities))
                    generated.append(index_to_char[next_index])
                    context = context[1:] + [next_index]
        finally:
            # Give the module back in the mode the caller had it in.
            self.train(was_training)
        return "".join(generated)

# Instantiate the model
n_chars = len(characters)  # vocabulary size = one-hot width = number of output logits
hidden_size = 128
num_layers = 2
lr = 0.005
model = ShakespeareModel(n_chars, hidden_size, num_layers, lr)

# Train the model.
# `Trainer(gpus=...)` is deprecated and removed in Lightning 2.x; choose the
# device through `accelerator` / `devices` instead (one GPU when CUDA is
# available, otherwise the CPU).
trainer = Trainer(
    max_epochs=4,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
)
trainer.fit(model, dataloader)



  rank_zero_deprecation(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type   | Params
--------------------------------
0 | lstm | LSTM   | 218 K 
1 | fc   | Linear | 5.0 K 
--------------------------------
223 K     Trainable params
0         Non-trainable params
223 K     Total params
0.895     Total estimated model params size (MB)
2024-05-26 04:34:42.701722: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 0:  14%|█▍        | 91/651 [00:26<02:44,  3.40it/s, loss=3, v_num=19]   

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## 3. Text generation outputs

In [None]:
# Sample 500 characters continuing a Hamlet-style prompt at temperature 0.8
seed_text = "to be or not to be that is the question"
generated_text = model.generate_text(
    seed_text=seed_text,
    gen_length=500,
    temperature=0.8,
)
print(generated_text)


In [None]:
# Sample 500 characters continuing an out-of-corpus prompt at temperature 0.8
seed_text = "deep learning is very deep but is it deep enough"
generated_text = model.generate_text(
    seed_text=seed_text,
    gen_length=500,
    temperature=0.8,
)
print(generated_text)

deep learning is very deep but is it deep enough.

romeo:
now a king hope!

bloyy:
what you near things a since in prince, whom it is accain.

king henry vi:
and that see, not afford in high have that shall live,
and give my horse to perhence of flored you,
so innote what thou come to me in him to unlast,
that and him, his rount, and it me arsent-blood;
have upon cloaned, his hand: i'll come you scorn
noight in his for both follow: and things and pity,
for me it be to all herself
which shall not the best heart nothing netter and linger.

rome


In [None]:
# Same prompt at temperature 0.2: logits are divided by 0.2 before the softmax,
# which concentrates probability on the highest-scoring characters
seed_text = "deep learning is very deep but is it deep enough"
generated_text = model.generate_text(
    seed_text=seed_text,
    gen_length=500,
    temperature=0.2,
)
print(generated_text)

In [None]:
# Same prompt at temperature 0.4
seed_text = "deep learning is very deep but is it deep enough"
generated_text = model.generate_text(
    seed_text=seed_text,
    gen_length=500,
    temperature=0.4,
)
print(generated_text)

In [None]:
# Same prompt at temperature 0.6
seed_text = "deep learning is very deep but is it deep enough"
generated_text = model.generate_text(
    seed_text=seed_text,
    gen_length=500,
    temperature=0.6,
)
print(generated_text)