Amishi Gupta

23/CS/048

EXP-4

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd

file_path = "/content/drive/MyDrive/poems-100.csv"
df = pd.read_csv(file_path)
print("Columns:", df.columns)

Columns: Index(['text'], dtype='object')


In [8]:
text = " ".join(df[df.columns[0]].astype(str).tolist())
text = text.lower()

print(text[:500])

o my luve's like a red, red rose
that’s newly sprung in june;
o my luve's like the melodie
that’s sweetly play'd in tune.

as fair art thou, my bonnie lass,
so deep in luve am i:
and i will luve thee still, my dear,
till a’ the seas gang dry:

till a’ the seas gang dry, my dear,
and the rocks melt wi’ the sun:
i will luve thee still, my dear,
while the sands o’ life shall run.

and fare thee well, my only luve
and fare thee well, a while!
and i will come again, my luve,
tho’ it were ten thousand


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import time
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.replace("\n", " <eol> ")
    return text.split()

tokens = tokenize(text)

vocab = sorted(set(tokens))
vocab_size = len(vocab)

word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for w,i in word2idx.items()}

print("Vocabulary size:", vocab_size)

Using device: cpu
Vocabulary size: 5441


In [10]:
seq_length = 5
def create_sequences(tokens):
    sequences = []
    for i in range(len(tokens) - seq_length):
        seq = tokens[i:i+seq_length]
        target = tokens[i+seq_length]
        sequences.append((seq, target))
    return sequences

data = create_sequences(tokens)
print("Total sequences:", len(data))

Total sequences: 27997


In [11]:
#one hot encoding
def one_hot_encode(word):
    vec = torch.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

def prepare_onehot_batch(batch):
    inputs, targets = [], []
    for seq, target in batch:
        seq_vec = [one_hot_encode(w) for w in seq]
        inputs.append(torch.stack(seq_vec))
        targets.append(word2idx[target])
    return torch.stack(inputs), torch.tensor(targets)

In [12]:
class RNN_OneHot(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

In [13]:
class LSTM_OneHot(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [14]:
def prepare_index_batch(batch):
    inputs, targets = [], []
    for seq, target in batch:
        inputs.append([word2idx[w] for w in seq])
        targets.append(word2idx[target])
    return torch.tensor(inputs), torch.tensor(targets)

In [15]:
class RNN_Embed(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

In [16]:
class LSTM_Embed(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [17]:
def train_model(model, prepare_fn, epochs=5, batch_size=64):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    start_time = time.time()

    for epoch in range(epochs):
        total_loss = 0
        random.shuffle(data)

        for i in range(0, len(data), batch_size):
            batch = data[i:i+batch_size]
            inputs, targets = prepare_fn(batch)

            inputs = inputs.to(device)
            targets = targets.to(device)

            if prepare_fn == prepare_onehot_batch:
                inputs = inputs.float()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    print("Training Time:", round(time.time() - start_time, 2), "seconds")
    return model

In [18]:
hidden_size = 128
embed_dim = 100

print("\nTraining RNN OneHot")
model1 = train_model(RNN_OneHot(vocab_size, hidden_size, vocab_size), prepare_onehot_batch)

print("\nTraining LSTM One Hot")
model2 = train_model(LSTM_OneHot(vocab_size, hidden_size, vocab_size), prepare_onehot_batch)

print("\nTraining RNN Embedding")
model3 = train_model(RNN_Embed(vocab_size, embed_dim, hidden_size), prepare_index_batch)

print("\nTraining LSTM Embedding")
model4 = train_model(LSTM_Embed(vocab_size, embed_dim, hidden_size), prepare_index_batch)



Training RNN OneHot
Epoch 1, Loss: 2938.5147
Epoch 2, Loss: 2655.6715
Epoch 3, Loss: 2408.8057
Epoch 4, Loss: 2126.9864
Epoch 5, Loss: 1804.9381
Training Time: 113.0 seconds

Training LSTM One Hot
Epoch 1, Loss: 2831.4551
Epoch 2, Loss: 2552.2154
Epoch 3, Loss: 2306.5348
Epoch 4, Loss: 1988.4147
Epoch 5, Loss: 1590.1402
Training Time: 496.5 seconds

Training RNN Embedding
Epoch 1, Loss: 2850.6638
Epoch 2, Loss: 2493.5088
Epoch 3, Loss: 2154.5649
Epoch 4, Loss: 1813.6521
Epoch 5, Loss: 1495.0357
Training Time: 44.95 seconds

Training LSTM Embedding
Epoch 1, Loss: 2794.5537
Epoch 2, Loss: 2424.5987
Epoch 3, Loss: 2115.3855
Epoch 4, Loss: 1767.1085
Epoch 5, Loss: 1421.2855
Training Time: 59.78 seconds


In [19]:
def generate_text(model, seed_text, length=20, use_onehot=False):
    model.eval()
    words = seed_text.lower().split()

    for _ in range(length):
        seq = words[-seq_length:]

        if use_onehot:
            seq_vec = torch.stack([one_hot_encode(w) for w in seq])
            seq_vec = seq_vec.unsqueeze(0).to(device)
            output = model(seq_vec.float())
        else:
            seq_idx = torch.tensor([[word2idx[w] for w in seq]]).to(device)
            output = model(seq_idx)

        predicted = torch.argmax(output).item()
        words.append(idx2word[predicted])

    return " ".join(words)

In [20]:
print("\nRNN One Hot:\n", generate_text(model1, "the night was", 20, True))
print("\nLSTM One Hot:\n", generate_text(model2, "the night was", 20, True))
print("\nRNN Embedding:\n", generate_text(model3, "the night was", 20))
print("\nLSTM Embedding:\n", generate_text(model4, "the night was", 20))


RNN One Hot:
 the night was the house and the body or the young men and the sea of the sea of the earth of the

LSTM One Hot:
 the night was he was his eyes <eol> the little systems that the bride is the air <eol> and this is the same

RNN Embedding:
 the night was the <eol> texan ranch <eol> the cleanhaird yankee girl works and each other <eol> <eol> i do not love theeyet

LSTM Embedding:
 the night was the same <eol> <eol> i am not a minute longer <eol> <eol> i am not a minute longer <eol> <eol>
