<a href="https://colab.research.google.com/github/aamish007/23-CS-006-CS318-DL-Lab/blob/main/newExp4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using RNN & LSTM
Experiment 4: Deep Learning

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import pickle
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using:', device)

Using: cuda


In [4]:
import os

# Only prompt for an upload when the dataset is not already in the working
# directory, so re-running this cell does not block on the file picker again.
if os.path.exists('poems.csv'):
    uploaded = {}  # nothing was uploaded on this run
else:
    # Colab-only helper, imported lazily so the cell also runs outside Colab
    # when poems.csv has been placed next to the notebook.
    from google.colab import files
    uploaded = files.upload()


Saving poems.csv to poems.csv


In [5]:
# Load the corpus and merge every poem into one lower-cased string.
df = pd.read_csv('poems.csv')
# Drop missing rows first: astype(str) would otherwise turn each NaN into the
# literal word "nan", which the [a-z]+ tokenizer would count as a real word.
text = ' '.join(df['text'].dropna().astype(str)).lower()

In [6]:
# A word is any run of lowercase ASCII letters; every other character
# (digits, punctuation, whitespace, apostrophes) acts as a separator.
word_pattern = re.compile(r"[a-z]+")
tokens = word_pattern.findall(text)
print('Total words:', len(tokens))

Total words: 25507


In [7]:
max_vocab = 2500

# Keep only the `max_vocab` most frequent words. Every other token is removed
# from the corpus outright (it is not replaced by an unknown-word marker).
word_counts = Counter(tokens)
common_words = [w for w, _ in word_counts.most_common(max_vocab)]
# Set membership is O(1); testing against the list scanned up to `max_vocab`
# entries for every single token.
common_word_set = set(common_words)
tokens = [w for w in tokens if w in common_word_set]

# Sorting gives a deterministic word <-> index mapping.
vocab = sorted(set(tokens))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
vocab_size = len(vocab)

print("Vocab size:", vocab_size)

Vocab size: 2500


In [8]:
# Persist both lookup tables together as one (word2idx, idx2word) tuple.
vocab_tables = (word2idx, idx2word)
with open("vocab.pkl", "wb") as vocab_file:
    pickle.dump(vocab_tables, vocab_file)


In [9]:
# Replace every token by its vocabulary index.
encoded = [word2idx[w] for w in tokens]

seq_length = 5

# Slide a window over the corpus: each run of `seq_length` consecutive word
# ids is one input, and the id that immediately follows it is the target.
window_starts = range(len(encoded) - seq_length)
inputs = [encoded[start:start + seq_length] for start in window_starts]
targets = [encoded[start + seq_length] for start in window_starts]

X = torch.tensor(inputs)
y = torch.tensor(targets)

# Shuffled mini-batches of (context window, next word) pairs.
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

print(X.shape, y.shape)

torch.Size([22845, 5]) torch.Size([22845])


In [10]:
def one_hot_encode(batch, vocab_size):
    """Convert a tensor of class indices into float one-hot vectors.

    ``F.one_hot`` appends a trailing axis of size ``vocab_size`` to ``batch``;
    the result is then cast to float.
    """
    indicator = F.one_hot(batch, num_classes=vocab_size)
    return indicator.float()

In [11]:
class RNN_OneHot(nn.Module):
    """Next-word classifier: an ``nn.RNN`` fed one-hot word vectors."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # One-hot inputs, so the recurrent layer's feature width is vocab_size.
        self.rnn = nn.RNN(
            input_size=vocab_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Projects a hidden state to one score per vocabulary word.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits computed from the RNN output at the last time step.

        ``x`` is batch-first and its last axis must have size ``vocab_size``.
        """
        step_outputs, _ = self.rnn(x)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)


In [12]:
class LSTM_OneHot(nn.Module):
    """Next-word classifier: an ``nn.LSTM`` fed one-hot word vectors."""

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # One-hot inputs, so the recurrent layer's feature width is vocab_size.
        self.lstm = nn.LSTM(
            input_size=vocab_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Projects a hidden state to one score per vocabulary word.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step.

        ``x`` is batch-first and its last axis must have size ``vocab_size``.
        """
        step_outputs, _ = self.lstm(x)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

In [13]:
class RNN_Embed(nn.Module):
    """Next-word classifier: learned word embeddings followed by an ``nn.RNN``."""

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        # Lookup table: one `embed_dim`-sized vector per vocabulary index.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Projects a hidden state to one score per vocabulary word.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits computed from the RNN output at the last time step.

        ``x`` holds word indices, batch-first.
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

In [14]:
class LSTM_Embed(nn.Module):
    """Next-word classifier: learned word embeddings followed by an ``nn.LSTM``."""

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        # Lookup table: one `embed_dim`-sized vector per vocabulary index.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Projects a hidden state to one score per vocabulary word.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step.

        ``x`` holds word indices, batch-first.
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.lstm(embedded)
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

In [15]:
def train_onehot(model, loader, epochs=5):
    """Train a one-hot-input model for next-word prediction.

    Args:
        model: Network that takes float one-hot tensors and returns one row
            of logits per sample.
        loader: Iterable yielding ``(batch_X, batch_y)`` pairs of integer
            word-index tensors (context windows and target words).
        epochs: Number of full passes over ``loader``.

    Relies on the notebook-level ``device``, ``vocab_size`` and
    ``one_hot_encode``. Prints one line per epoch with the loss summed over
    all batches (not the mean). Returns ``None``.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
    model.to(device)
    # generate_text() leaves a model in eval mode; switch back explicitly so
    # that training again afterwards does not run in inference mode.
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0

        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()

            # Expand indices to one-hot on the device, one batch at a time.
            X_oh = one_hot_encode(batch_X, vocab_size)
            outputs = model(X_oh)

            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss = {total_loss:.4f}")

In [16]:
def train_embed(model, loader, epochs=5):
    """Train an embedding-input model for next-word prediction.

    Args:
        model: Network that takes integer word-index tensors and returns one
            row of logits per sample.
        loader: Iterable yielding ``(batch_X, batch_y)`` pairs of word-index
            tensors (context windows and target words).
        epochs: Number of full passes over ``loader``.

    Relies on the notebook-level ``device``. Prints one line per epoch with
    the loss summed over all batches (not the mean). Returns ``None``.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
    model.to(device)
    # generate_text() leaves a model in eval mode; switch back explicitly so
    # that training again afterwards does not run in inference mode.
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0

        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss = {total_loss:.4f}")

In [17]:
hidden_size = 128
embed_dim = 100

# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffle order repeat on every "Restart & Run All". Some CUDA kernels are
# non-deterministic, so GPU runs may still differ slightly.
torch.manual_seed(42)

# The four variants share hidden_size and are trained on the same loader;
# they differ only in cell type (RNN/LSTM) and input encoding.
print("\nTraining RNN OneHot")
rnn_oh = RNN_OneHot(vocab_size, hidden_size)
train_onehot(rnn_oh, loader)

print("\nTraining LSTM OneHot")
lstm_oh = LSTM_OneHot(vocab_size, hidden_size)
train_onehot(lstm_oh, loader)

print("\nTraining RNN Embedding")
rnn_emb = RNN_Embed(vocab_size, embed_dim, hidden_size)
train_embed(rnn_emb, loader)

print("\nTraining LSTM Embedding")
lstm_emb = LSTM_Embed(vocab_size, embed_dim, hidden_size)
train_embed(lstm_emb, loader)


Training RNN OneHot
Epoch 1, Loss = 2247.8642
Epoch 2, Loss = 2042.7730
Epoch 3, Loss = 1870.4556
Epoch 4, Loss = 1662.4975
Epoch 5, Loss = 1425.4402

Training LSTM OneHot
Epoch 1, Loss = 2232.8719
Epoch 2, Loss = 2068.4014
Epoch 3, Loss = 1914.3880
Epoch 4, Loss = 1720.5778
Epoch 5, Loss = 1488.0381

Training RNN Embedding
Epoch 1, Loss = 2217.0752
Epoch 2, Loss = 1939.5503
Epoch 3, Loss = 1716.6138
Epoch 4, Loss = 1495.3496
Epoch 5, Loss = 1289.1522

Training LSTM Embedding
Epoch 1, Loss = 2199.8343
Epoch 2, Loss = 1945.3559
Epoch 3, Loss = 1727.0658
Epoch 4, Loss = 1484.0967
Epoch 5, Loss = 1236.9297


In [18]:
# Write each trained model's weights (state_dict only) to its own file.
checkpoints = (
    ("rnn_onehot.pth", rnn_oh),
    ("lstm_onehot.pth", lstm_oh),
    ("rnn_embed.pth", rnn_emb),
    ("lstm_embed.pth", lstm_emb),
)
for checkpoint_path, trained_model in checkpoints:
    torch.save(trained_model.state_dict(), checkpoint_path)

print("\nModels saved ✅")


Models saved ✅


In [19]:
def generate_text(model, start_words, length=20, embedding=False, temperature=None):
    """Extend ``start_words`` by ``length`` words predicted by ``model``.

    At every step the last ``seq_length`` words are fed to the model and the
    chosen word is appended to the running text.

    Args:
        model: Trained next-word model returning one row of logits.
        start_words: Non-empty sequence of in-vocabulary words; not modified.
        length: Number of words to generate.
        embedding: ``True`` if the model takes word indices, ``False`` if it
            takes one-hot vectors.
        temperature: ``None`` (default) picks the highest-scoring word at
            every step. A positive number instead samples the next word from
            ``softmax(logits / temperature)``, which breaks the repetitive
            loops that greedy decoding can fall into.

    Returns:
        The start words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If words are requested but ``start_words`` is empty, or
            if ``temperature`` is given and not positive.
        KeyError: If a start word is missing from ``word2idx``.

    Side effect: the model is left in eval mode.
    """
    model.eval()
    words = list(start_words)

    # Nothing to generate: return the prompt unchanged.
    if length <= 0:
        return " ".join(words)

    if not words:
        raise ValueError("start_words must contain at least one word")
    if temperature is not None and temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Fail early with a readable message instead of a bare KeyError mid-loop.
    unknown = [w for w in words if w not in word2idx]
    if unknown:
        raise KeyError(f"start_words not in vocabulary: {unknown}")

    # Build inputs on the device that actually holds the model's weights; the
    # notebook-level `device` is only a fallback for parameter-free models.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else device

    for _ in range(length):
        context = [word2idx[w] for w in words[-seq_length:]]
        indices = torch.tensor([context], device=model_device)

        with torch.no_grad():
            if embedding:
                output = model(indices)
            else:
                one_hot = one_hot_encode(indices, vocab_size)
                output = model(one_hot)

        if temperature is None:
            # Greedy decoding: always take the most likely word.
            next_idx = torch.argmax(output, dim=1).item()
        else:
            probs = torch.softmax(output / temperature, dim=1)
            next_idx = torch.multinomial(probs, num_samples=1).item()
        words.append(idx2word[next_idx])

    return " ".join(words)

# Demo: continue a five-word prompt with the embedding-based LSTM.
print("\nGenerated Text:")
prompt_words = ["the", "moon", "shines", "in", "the"]
print(generate_text(lstm_emb, prompt_words, embedding=True))


Generated Text:
the moon shines in the woods of the of the of the of the of the of the of the of the of the of
