In [2]:
import time
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter

In [4]:
# Load CSV
df = pd.read_csv("/content/poems-100.csv")

# Ignore first row and extract poems
poems = df.iloc[1:, 0].astype(str).tolist()

# Combine all poems into one text corpus
text_data = " ".join(poems).lower()

print("Number of poems:", len(poems))
print("Sample text:\n", text_data[:500])

Number of poems: 99
Sample text:
 the rose is red,
the violet's blue,
sugar is sweet,
and so are you. how do i love thee? let me count the ways.
i love thee to the depth and breadth and height
my soul can reach, when feeling out of sight
for the ends of being and ideal grace.
i love thee to the level of every day's
most quiet need, by sun and candle-light.
i love thee freely, as men strive for right.
i love thee purely, as they turn from praise.
i love thee with the passion put to use
in my old griefs, and with my childhood's fa


In [5]:
# Simple whitespace tokenization
tokens = text_data.split()

# Build vocabulary
word_counts = Counter(tokens)
vocab = sorted(word_counts.keys())
vocab_size = len(vocab)

# Word to index mapping
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = {i: word for word, i in word2idx.items()}

print("Vocabulary size:", vocab_size)
print("Sample vocab words:", vocab[:20])

Vocabulary size: 6965
Sample vocab words: ['"he', '"most', '"oh,', "'greatly", "'neath", "'our", "'s", "'tis", "'twas", "'twere", "'twill", "('tis", '(1)', '(and', '(as', '(behind', '(come', '(even', '(floating', '(for']


In [6]:
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size

        self.Wxh = np.random.randn(hidden_size, input_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(output_size, hidden_size) * 0.01

        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))

    def forward(self, inputs):
        h = np.zeros((self.hidden_size, 1))
        outputs = []

        for x in inputs:
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            y = np.dot(self.Why, h) + self.by
            outputs.append(y)

        return outputs, h

In [7]:
sequence_length = 5

def create_sequences(tokens, seq_length):
    sequences = []
    for i in range(len(tokens) - seq_length):
        seq = tokens[i:i+seq_length]
        target = tokens[i+seq_length]
        sequences.append((seq, target))
    return sequences

sequences = create_sequences(tokens, sequence_length)
print("Total sequences:", len(sequences))

Total sequences: 24623


In [8]:
def one_hot_encode(word, vocab_size):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

In [22]:
class OneHotDataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq, target = self.sequences[idx]

        x = [one_hot_encode(word, vocab_size) for word in seq]
        x = torch.tensor(x, dtype=torch.float32)

        y = torch.tensor(word2idx[target], dtype=torch.long)

        return x, y

dataset_onehot = OneHotDataset(sequences)
loader_onehot = torch.utils.data.DataLoader(dataset_onehot, batch_size=16, shuffle=True)

In [23]:
class RNN_OneHot(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_OneHot, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [26]:
model_onehot = RNN_OneHot(vocab_size, 128, vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_onehot.parameters(), lr=0.001)

epochs = 10
start_time = time.time()

for epoch in range(epochs):
    total_loss = 0
    for x, y in loader_onehot:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        output = model_onehot(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader_onehot):.4f}")

print("Training Time:", time.time() - start_time)

Epoch 1, Loss: 7.2617
Epoch 2, Loss: 6.4292
Epoch 3, Loss: 5.7151
Epoch 4, Loss: 4.8640
Epoch 5, Loss: 3.9538
Epoch 6, Loss: 3.1009
Epoch 7, Loss: 2.3473
Epoch 8, Loss: 1.7155
Epoch 9, Loss: 1.2254
Epoch 10, Loss: 0.8763
Training Time: 1228.0945873260498


In [27]:
def generate_text_onehot(model, start_text, length=20):
    model.eval()
    words = start_text.lower().split()

    for _ in range(length):
        seq = words[-sequence_length:]
        x = [one_hot_encode(word, vocab_size) for word in seq]
        x = torch.tensor([x], dtype=torch.float32).to(device)

        with torch.no_grad():
            output = model(x)
            predicted = torch.argmax(output, dim=1).item()

        next_word = idx2word[predicted]
        words.append(next_word)

    return " ".join(words)

print(generate_text_onehot(model_onehot, "the night was"))

the night was the idols, and once and the streets of his bludgeons and rapt that i could the woods of now, i


In [28]:
class IndexedDataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq, target = self.sequences[idx]
        x = torch.tensor([word2idx[word] for word in seq], dtype=torch.long)
        y = torch.tensor(word2idx[target], dtype=torch.long)
        return x, y

dataset_embed = IndexedDataset(sequences)
loader_embed = torch.utils.data.DataLoader(dataset_embed, batch_size=64, shuffle=True)

In [29]:
class RNN_Embedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super(RNN_Embedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

In [32]:
model_embed = RNN_Embedding(vocab_size, 100, 128).to(device)
optimizer = optim.Adam(model_embed.parameters(), lr=0.001)

start_time = time.time()

for epoch in range(epochs):
    total_loss = 0
    for x, y in loader_embed:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        output = model_embed(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader_embed):.4f}")

print("Training Time:", time.time() - start_time)

Epoch 1, Loss: 7.3406
Epoch 2, Loss: 6.4441
Epoch 3, Loss: 5.9230
Epoch 4, Loss: 5.4002
Epoch 5, Loss: 4.8741
Epoch 6, Loss: 4.3489
Epoch 7, Loss: 3.8368
Epoch 8, Loss: 3.3542
Epoch 9, Loss: 2.9188
Epoch 10, Loss: 2.5318
Training Time: 12.140402793884277


In [33]:
def generate_text_embedding(model, start_text, length=20):
    model.eval()
    words = start_text.lower().split()

    for _ in range(length):
        seq = words[-sequence_length:]
        x = torch.tensor([[word2idx[word] for word in seq]], dtype=torch.long).to(device)

        with torch.no_grad():
            output = model(x)
            predicted = torch.argmax(output, dim=1).item()

        next_word = idx2word[predicted]
        words.append(next_word)

    return " ".join(words)

print(generate_text_embedding(model_embed, "the night was"))

the night was never made; the same waits in the race, under the policeman travels his beat, the gate-keeper marks of all the
