<a href="https://colab.research.google.com/github/arnavnigam31/DeepLearningLab/blob/main/Experiment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

# The CSV is read as raw text (no CSV parsing) and lower-cased in one go.
with open("poems-100.csv", "r", encoding="utf-8") as poem_file:
    text = poem_file.read().lower()

# Whitespace tokenization: punctuation stays attached to its neighbouring word.
tokens = text.split()

# Sorted unique tokens give every word a deterministic integer id.
vocab = sorted(set(tokens))
word2idx = {word: position for position, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))
vocab_size = len(vocab)


ONE-HOT ENCODING

In [2]:
def one_hot_encode(idx, vocab_size):
    """Return a 1-D tensor of length ``vocab_size`` holding 1.0 at ``idx`` and 0.0 elsewhere."""
    encoded = torch.zeros(vocab_size)
    encoded[idx] = 1.0
    return encoded


Create Sequences

In [3]:
seq_length = 5  # number of context words per training sample

# Slide a window over the corpus: seq_length consecutive word ids form the
# input row, and the id of the word right after the window is the target.
num_windows = len(tokens) - seq_length

X = torch.tensor(
    [[word2idx[w] for w in tokens[start:start + seq_length]] for start in range(num_windows)]
)
y = torch.tensor(
    [word2idx[tokens[start + seq_length]] for start in range(num_windows)]
)


In [4]:
seq_length = X.shape[1]

# One-hot encode every context word with a single vectorized scatter along the
# vocabulary axis. This yields the same tensor as writing
# X_onehot[i, j, X[i, j]] = 1 for every (i, j), without one Python-level
# tensor write per token.
X_onehot = torch.zeros(X.shape[0], seq_length, vocab_size)
X_onehot.scatter_(2, X.long().unsqueeze(-1), 1.0)


In [5]:
from torch.utils.data import TensorDataset, DataLoader

# Pair each one-hot context window with its target word id, then serve the
# pairs in shuffled mini-batches of 64.
dataset = TensorDataset(X_onehot, y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)


RNN Model

In [6]:
class RNN_OneHot(nn.Module):
    """Next-word classifier that runs an ``nn.RNN`` over one-hot word vectors.

    The RNN output at the final time step is projected by a linear layer to
    one logit per vocabulary word.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, vocab_size).
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return the logits computed from the last time step of ``x``."""
        step_outputs, _ = self.rnn(x)
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)


In [7]:
hidden_size = 64
model = RNN_OneHot(vocab_size, hidden_size)

# Cross-entropy between the model's logits and the target word ids,
# optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20

for epoch in range(epochs):
    total_loss = 0

    for inputs, targets in loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")


Epoch 1, Loss: 7.4003
Epoch 2, Loss: 6.7712
Epoch 3, Loss: 6.4620
Epoch 4, Loss: 6.1625
Epoch 5, Loss: 5.9194
Epoch 6, Loss: 5.6807
Epoch 7, Loss: 5.4380
Epoch 8, Loss: 5.1832
Epoch 9, Loss: 4.9245
Epoch 10, Loss: 4.6706
Epoch 11, Loss: 4.4174
Epoch 12, Loss: 4.1713
Epoch 13, Loss: 3.9271
Epoch 14, Loss: 3.6910
Epoch 15, Loss: 3.4572
Epoch 16, Loss: 3.2323
Epoch 17, Loss: 3.0076
Epoch 18, Loss: 2.7985
Epoch 19, Loss: 2.5948
Epoch 20, Loss: 2.4016


In [8]:
# Prompt that seeds generation, and how many words to append to it.
seed_words = [
    "the",
    "sun",
    "shines",
    "bright",
    "upon",
]
num_words_to_generate = 50
def generate_text_for_onehot(model, seed_words, num_words_to_generate, word2idx, idx2word, vocab_size, seq_length):
    """Greedily extend ``seed_words`` with a model that consumes one-hot input.

    At every step the last ``seq_length`` words produced so far (or all of
    them when fewer exist) are one-hot encoded into a float32 tensor of shape
    ``(1, window_len, vocab_size)`` and passed to ``model``. The arg-max of the
    returned logits is appended as the next word.

    Args:
        model: Module called with the one-hot tensor. It is switched to eval
            mode and left in that mode.
        seed_words: Iterable of starting words; must not be empty.
        num_words_to_generate: Number of words to append.
        word2idx: Mapping from word to vocabulary index.
        idx2word: Mapping from vocabulary index to word.
        vocab_size: Width of each one-hot vector.
        seq_length: Maximum number of trailing words used as context.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. If a context word is missing from ``word2idx``, a warning is
        printed and the words collected so far are returned.

    Raises:
        ValueError: If ``seed_words`` is empty.
    """
    generated_words = list(seed_words)
    if not generated_words:
        # An empty context would reach the model as a zero-length sequence and
        # fail with an unrelated torch error; reject it up front instead.
        raise ValueError("seed_words must contain at least one word")

    model.eval()
    with torch.no_grad():
        for _ in range(num_words_to_generate):
            # Slicing already yields the whole list when it is shorter than
            # seq_length, so no separate length check is needed.
            context_words = generated_words[-seq_length:]

            try:
                input_indices = [word2idx[w] for w in context_words]
            except KeyError as e:
                print(f"Warning: Word '{e.args[0]}' not in vocabulary. Skipping generation for this word.")
                break

            # Vectorized one-hot: scatter 1.0 along the vocabulary axis.
            index_column = torch.tensor(input_indices, dtype=torch.long).view(1, -1, 1)
            one_hot_input_tensor = torch.zeros(1, len(input_indices), vocab_size, dtype=torch.float32)
            one_hot_input_tensor.scatter_(2, index_column, 1.0)

            output = model(one_hot_input_tensor)

            # Models that return per-step logits contribute only the last step.
            if output.dim() == 3:
                output = output[:, -1, :]
            predicted_idx = torch.argmax(output).item()
            generated_words.append(idx2word[predicted_idx])

    return ' '.join(generated_words)
# Generate from the most recently trained one-hot model and show the result.
generated_text = generate_text_for_onehot(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    vocab_size=vocab_size,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon me for the large and work, is the welcome, of the day and eleves, i do at have forward! and the perpetual holding your oftener by the and is like the gamut, of these and are to the and fills the open and second of the lips, i am a


LSTM Model

In [9]:
class LSTM_OneHot(nn.Module):
    """Next-word classifier that runs an ``nn.LSTM`` over one-hot word vectors.

    The LSTM output at the final time step is projected by a linear layer to
    one logit per vocabulary word.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, vocab_size).
        self.lstm = nn.LSTM(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return the logits computed from the last time step of ``x``."""
        step_outputs, _ = self.lstm(x)
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)


In [10]:
hidden_size = 64
model = LSTM_OneHot(vocab_size, hidden_size)

# Cross-entropy between the model's logits and the target word ids,
# optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20

for epoch in range(epochs):
    total_loss = 0

    for inputs, targets in loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")


Epoch 1, Loss: 7.4664
Epoch 2, Loss: 6.8816
Epoch 3, Loss: 6.7495
Epoch 4, Loss: 6.5955
Epoch 5, Loss: 6.4220
Epoch 6, Loss: 6.2191
Epoch 7, Loss: 5.9873
Epoch 8, Loss: 5.7252
Epoch 9, Loss: 5.4407
Epoch 10, Loss: 5.1328
Epoch 11, Loss: 4.8236
Epoch 12, Loss: 4.5168
Epoch 13, Loss: 4.2187
Epoch 14, Loss: 3.9311
Epoch 15, Loss: 3.6549
Epoch 16, Loss: 3.3869
Epoch 17, Loss: 3.1273
Epoch 18, Loss: 2.8777
Epoch 19, Loss: 2.6353
Epoch 20, Loss: 2.4036


In [12]:
# Same prompt and length as the RNN run; `model` now refers to the LSTM
# assigned by the training cell above.
seed_words = ["the", "sun", "shines", "bright", "upon"]
num_words_to_generate = 50

generated_text = generate_text_for_onehot(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    vocab_size=vocab_size,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon no than and this take we to to me be a mind of still and all all and each they with the last of i love, the same of the first and of the men of all and and till then is my eyes a whatever of my things and


Word Embeddings

In [13]:
# Integer-id version of the training data for the embedding models: each row
# of X holds seq_length consecutive word ids and y holds the id that follows.
context_rows, next_word_ids = [], []
for start in range(len(tokens) - seq_length):
    stop = start + seq_length
    context_rows.append([word2idx[w] for w in tokens[start:stop]])
    next_word_ids.append(word2idx[tokens[stop]])

X = torch.tensor(context_rows)
y = torch.tensor(next_word_ids)


RNN

In [14]:
class RNN_Embedding(nn.Module):
    """Next-word classifier: embedding lookup, then ``nn.RNN``, then a linear layer.

    Word ids are embedded, run through the RNN, and the RNN output at the final
    time step is projected to one logit per vocabulary word.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: embedded inputs are (batch, seq, embed_dim).
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return the logits computed from the last time step of the id tensor ``x``."""
        embedded = self.embedding(x)
        step_outputs, _ = self.rnn(embedded)
        return self.fc(step_outputs[:, -1, :])


In [15]:
from torch.utils.data import TensorDataset, DataLoader

# Integer-id windows and their targets, served in shuffled mini-batches of 64.
embedding_dataset = TensorDataset(X, y)
embedding_loader = DataLoader(dataset=embedding_dataset, shuffle=True, batch_size=64)

model = RNN_Embedding(vocab_size=vocab_size, embed_dim=100, hidden_size=64)

# Cross-entropy between the model's logits and the target word ids,
# optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20

for epoch in range(epochs):
    total_loss = 0

    for inputs, targets in embedding_loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(embedding_loader):.4f}")

Epoch 1, Loss: 7.4717
Epoch 2, Loss: 6.6119
Epoch 3, Loss: 6.2655
Epoch 4, Loss: 5.9458
Epoch 5, Loss: 5.6374
Epoch 6, Loss: 5.3367
Epoch 7, Loss: 5.0463
Epoch 8, Loss: 4.7601
Epoch 9, Loss: 4.4830
Epoch 10, Loss: 4.2164
Epoch 11, Loss: 3.9574
Epoch 12, Loss: 3.7104
Epoch 13, Loss: 3.4787
Epoch 14, Loss: 3.2594
Epoch 15, Loss: 3.0561
Epoch 16, Loss: 2.8655
Epoch 17, Loss: 2.6866
Epoch 18, Loss: 2.5194
Epoch 19, Loss: 2.3611
Epoch 20, Loss: 2.2115


In [18]:
# The notebook's other definition of generate_text sits in a cell further
# down, so on a fresh kernel (Restart & Run All) this cell would raise
# NameError. The helper is therefore defined here, before its first use.
def generate_text(model, seed_words, num_words_to_generate, word2idx, idx2word, seq_length):
    """Greedily extend ``seed_words`` with a model that consumes word ids.

    At every step the last ``seq_length`` words produced so far (or all of
    them when fewer exist) are converted to a ``(1, window_len)`` long tensor
    and passed to ``model``. The arg-max of the returned logits is appended as
    the next word.

    Args:
        model: Module called with the id tensor. It is switched to eval mode
            and left in that mode.
        seed_words: Iterable of starting words; must not be empty.
        num_words_to_generate: Number of words to append.
        word2idx: Mapping from word to vocabulary index.
        idx2word: Mapping from vocabulary index to word.
        seq_length: Maximum number of trailing words used as context.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. If a context word is missing from ``word2idx``, a warning is
        printed and the words collected so far are returned.

    Raises:
        ValueError: If ``seed_words`` is empty.
    """
    generated_words = list(seed_words)
    if not generated_words:
        # An empty list would become a float tensor and fail inside the model
        # with an unrelated error; reject it up front instead.
        raise ValueError("seed_words must contain at least one word")

    model.eval()
    with torch.no_grad():
        for _ in range(num_words_to_generate):
            # Slicing already yields the whole list when it is shorter than
            # seq_length, so no separate length check is needed.
            context_words = generated_words[-seq_length:]
            try:
                input_indices = [word2idx[w] for w in context_words]
            except KeyError as e:
                print(f"Warning: Word '{e.args[0]}' not in vocabulary. Skipping generation for this word.")
                break

            input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0)
            output = model(input_tensor)

            # Models that return per-step logits contribute only the last step.
            if output.dim() == 3:
                output = output[:, -1, :]
            predicted_idx = torch.argmax(output).item()
            generated_words.append(idx2word[predicted_idx])

    return ' '.join(generated_words)


seed_words = ["the", "sun", "shines", "bright", "upon"]
num_words_to_generate = 50

generated_text = generate_text(model, seed_words, num_words_to_generate, word2idx, idx2word, seq_length)
print("Generated Text:")
print(generated_text)

Generated Text:
the sun shines bright upon their unsuccess. i see the people i have been stunn'd. stand back! is a man anyhow? what it shall be you! you sweaty brooks and dews it shall be you! you sweaty brooks and dews it shall be you! you sweaty brooks and dews it shall be you! you sweaty


LSTM

In [19]:
class LSTM_Embedding(nn.Module):
    """Next-word classifier: embedding lookup, then ``nn.LSTM``, then a linear layer.

    Word ids are embedded, run through the LSTM, and the LSTM output at the
    final time step is projected to one logit per vocabulary word.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: embedded inputs are (batch, seq, embed_dim).
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return the logits computed from the last time step of the id tensor ``x``."""
        embedded = self.embedding(x)
        step_outputs, _ = self.lstm(embedded)
        return self.fc(step_outputs[:, -1, :])


In [20]:
model = LSTM_Embedding(vocab_size=vocab_size, embed_dim=100, hidden_size=64)

# Cross-entropy between the model's logits and the target word ids,
# optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20

for epoch in range(epochs):
    total_loss = 0

    for inputs, targets in embedding_loader:
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(embedding_loader):.4f}")

Epoch 1, Loss: 7.4604
Epoch 2, Loss: 6.6877
Epoch 3, Loss: 6.4009
Epoch 4, Loss: 6.1084
Epoch 5, Loss: 5.8161
Epoch 6, Loss: 5.5303
Epoch 7, Loss: 5.2437
Epoch 8, Loss: 4.9646
Epoch 9, Loss: 4.6869
Epoch 10, Loss: 4.4125
Epoch 11, Loss: 4.1470
Epoch 12, Loss: 3.8866
Epoch 13, Loss: 3.6350
Epoch 14, Loss: 3.3934
Epoch 15, Loss: 3.1614
Epoch 16, Loss: 2.9415
Epoch 17, Loss: 2.7318
Epoch 18, Loss: 2.5351
Epoch 19, Loss: 2.3496
Epoch 20, Loss: 2.1746


In [17]:
def generate_text(model, seed_words, num_words_to_generate, word2idx, idx2word, seq_length):
    """Greedily extend ``seed_words`` with a model that consumes word ids.

    At every step the last ``seq_length`` words produced so far (or all of
    them when fewer exist) are converted to a ``(1, window_len)`` long tensor
    and passed to ``model``. The arg-max of the returned logits is appended as
    the next word.

    Args:
        model: Module called with the id tensor. It is switched to eval mode
            and left in that mode.
        seed_words: Iterable of starting words; must not be empty.
        num_words_to_generate: Number of words to append.
        word2idx: Mapping from word to vocabulary index.
        idx2word: Mapping from vocabulary index to word.
        seq_length: Maximum number of trailing words used as context.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. If a context word is missing from ``word2idx``, a warning is
        printed and the words collected so far are returned.

    Raises:
        ValueError: If ``seed_words`` is empty.
    """
    generated_words = list(seed_words)
    if not generated_words:
        # An empty list would become a float tensor and fail inside the model
        # with an unrelated error; reject it up front instead.
        raise ValueError("seed_words must contain at least one word")

    model.eval()
    with torch.no_grad():
        for _ in range(num_words_to_generate):
            # Slicing already yields the whole list when it is shorter than
            # seq_length, so no separate length check is needed.
            context_words = generated_words[-seq_length:]
            try:
                input_indices = [word2idx[w] for w in context_words]
            except KeyError as e:
                print(f"Warning: Word '{e.args[0]}' not in vocabulary. Skipping generation for this word.")
                break

            input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0)
            output = model(input_tensor)

            # Models that return per-step logits contribute only the last step.
            if output.dim() == 3:
                output = output[:, -1, :]
            predicted_idx = torch.argmax(output).item()
            generated_words.append(idx2word[predicted_idx])

    return ' '.join(generated_words)

In [21]:
# Same prompt and length as the earlier runs; `model` now refers to the
# embedding LSTM assigned by its training cell.
seed_words = ["the", "sun", "shines", "bright", "upon"]
num_words_to_generate = 50

generated_text = generate_text(
    model=model,
    seed_words=seed_words,
    num_words_to_generate=num_words_to_generate,
    word2idx=word2idx,
    idx2word=idx2word,
    seq_length=seq_length,
)
print("Generated Text:", generated_text, sep="\n")

Generated Text:
the sun shines bright upon the shore, death-messages on a veil the wheel'd universe, or the mate of his own sorrows) he on the old and liquid and height my heart and i. iii. we had burst at my heart and i! we have no chair, no little no philosophy, i heard that i will
