In [69]:
import pandas as pd
import numpy as np

In [70]:
from datasets import load_dataset

ds = load_dataset("biglam/gutenberg-poetry-corpus", split="train")

In [71]:
ds

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})

In [72]:
print(ds[0])
print(ds[1])
print(ds[2])

{'line': 'The Song of Hiawatha is based on the legends and stories of', 'gutenberg_id': 19}
{'line': 'many North American Indian tribes, but especially those of the', 'gutenberg_id': 19}
{'line': 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.', 'gutenberg_id': 19}


In [73]:
from collections import defaultdict

poems = defaultdict(list)

for row in ds:
    gid = row["gutenberg_id"]
    line = row["line"].strip()
    
    if line != "":
        poems[gid].append(line)

print("Total poems:", len(poems))

Total poems: 1191


In [74]:
#poems

In [1]:
#poems.keys()

In [2]:
# poems[313]

In [3]:
# for k, v in poems.items():
#     print(k, len(v))

## Create Seed → Target Pairs

In [78]:
pairs = []

for poem_lines in poems.values():
    for i in range(len(poem_lines) - 1):
        seed = poem_lines[i]
        target = poem_lines[i + 1]
        pairs.append((seed, target))

print("Total pairs:", len(pairs))

Total pairs: 3083926


## Sample Data

In [79]:
import random

In [80]:
random.shuffle(pairs)

pairs = pairs[:5000]

print("Final training pairs:", len(pairs))

Final training pairs: 5000


In [81]:
for i in range(5):
    print("SEED  :", pairs[i][0])
    print("TARGET:", pairs[i][1])
    print()

SEED  : Shall be our only monument?
TARGET: No! by the waste of waters bid,

SEED  : On the wide sea contending in swimming,
TARGET: When ye two for pride’s sake search’d out the floods

SEED  : She neither hears nor sees;
TARGET: Roll'd round in earth's diurnal course

SEED  : Quivering thy wings for joy.
TARGET: There's something in the apple-blossom,

SEED  : That from your eyes the sight of God conceal."
TARGET: As a wild flock of pigeons, to their food



# Tokenizer Integration (HF GPT2)

In [82]:
import os
os.environ["HF_TOKEN"] = "hf_DuwwSXnkjDlUMhsWQWCHoTOCsgZgdrPhYka"

In [83]:
from transformers import AutoTokenizer
import torch

In [84]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [85]:
tokenizer.pad_token = tokenizer.eos_token

In [86]:
print("Vocab size:", tokenizer.vocab_size)
print("Pad token:", tokenizer.pad_token)
print("Pad token id:", tokenizer.pad_token_id)

Vocab size: 50257
Pad token: <|endoftext|>
Pad token id: 50256


In [87]:
seed_line, target_line = pairs[0]

print("SEED:", seed_line)
print("TARGET:", target_line)

SEED: Shall be our only monument?
TARGET: No! by the waste of waters bid,


## Tokenize One Sample

In [88]:
seed_enc = tokenizer(
    seed_line,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64
)

target_enc = tokenizer(
    target_line,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64
)

In [89]:
print(seed_enc)

{'input_ids': tensor([[ 2484,   439,   307,   674,   691, 17757,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [90]:
seed_ids = seed_enc["input_ids"]
seed_mask = seed_enc["attention_mask"]

target_ids = target_enc["input_ids"]

print(seed_ids.shape)
print(target_ids.shape)

torch.Size([1, 7])
torch.Size([1, 9])


## Decode back

In [91]:
print(tokenizer.decode(seed_ids[0]))
print(tokenizer.decode(target_ids[0]))

Shall be our only monument?
No! by the waste of waters bid,


# Dataset Class + Batch Tokenization

In [92]:
from torch.utils.data import DataLoader, Dataset

In [93]:
MAX_LEN = 64

In [94]:
class PoetryDataset(Dataset):

    def __init__(self, pairs, tokenizer, max_len=64):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        seed, target = self.pairs[idx]

        seed_enc = self.tokenizer(
            seed,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        target_enc = self.tokenizer(
            target,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "seed_ids": seed_enc["input_ids"].squeeze(0),
            "seed_mask": seed_enc["attention_mask"].squeeze(0),
            "target_ids": target_enc["input_ids"].squeeze(0),
        }

In [95]:
dataset = PoetryDataset(pairs, tokenizer, MAX_LEN)

print(len(dataset))

5000


In [96]:
sample = dataset[0]

for k, v in sample.items():
    print(k, v.shape)

seed_ids torch.Size([64])
seed_mask torch.Size([64])
target_ids torch.Size([64])


In [97]:
BATCH_SIZE = 16

loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

In [98]:
batch = next(iter(loader))

for k, v in batch.items():
    print(k, v.shape)

seed_ids torch.Size([16, 64])
seed_mask torch.Size([16, 64])
target_ids torch.Size([16, 64])


# Embedding + Encoder LSTM

In [99]:
import torch.nn as nn

In [100]:
VOCAB_SIZE = tokenizer.vocab_size
print("Vocab size:", VOCAB_SIZE)

Vocab size: 50257


In [101]:
# Model Hyperparameters

EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

In [102]:
embedding = nn.Embedding(
    num_embeddings=VOCAB_SIZE,
    embedding_dim=EMBED_DIM
)

In [103]:
encoder = nn.LSTM(
    input_size=EMBED_DIM,
    hidden_size=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
)

In [104]:
batch = next(iter(loader))

seed_ids = batch["seed_ids"]   # [B, seq_len]

In [105]:
embedded = embedding(seed_ids)

print("Embedded shape:", embedded.shape)

Embedded shape: torch.Size([16, 64, 256])


In [106]:
outputs, (hidden, cell) = encoder(embedded)

print("Encoder output:", outputs.shape)
print("Hidden:", hidden.shape)
print("Cell:", cell.shape)

Encoder output: torch.Size([16, 64, 512])
Hidden: torch.Size([1, 16, 512])
Cell: torch.Size([1, 16, 512])


# Decoder + Seq2Seq Forward Pass

In [107]:
VOCAB_SIZE = tokenizer.vocab_size
PAD_ID = tokenizer.pad_token_id

In [108]:
EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

In [109]:
embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM)

In [110]:
encoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
)

In [111]:
decoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
)

In [112]:
fc_out = nn.Linear(HIDDEN_DIM, VOCAB_SIZE)

In [113]:
batch = next(iter(loader))

seed_ids = batch["seed_ids"]
target_ids = batch["target_ids"]

In [114]:
embedded_seed = embedding(seed_ids)

_, (hidden, cell) = encoder(embedded_seed)

In [115]:
# Teacher Forcing

decoder_input = target_ids[:, :-1]
decoder_target = target_ids[:, 1:]

In [116]:
embedded_dec = embedding(decoder_input)

dec_outputs, _ = decoder(
    embedded_dec,
    (hidden, cell)
)

In [117]:
# Predict Tokens
logits = fc_out(dec_outputs)

print("Logits:", logits.shape)

Logits: torch.Size([16, 63, 50257])


In [118]:
# Loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)

In [119]:
logits_flat = logits.reshape(-1, VOCAB_SIZE)
targets_flat = decoder_target.reshape(-1)

loss = criterion(logits_flat, targets_flat)

print("Loss:", loss.item())

Loss: 10.83879566192627


# Full Training Loop

In [120]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer

In [121]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [122]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

VOCAB_SIZE = tokenizer.vocab_size
PAD_ID = tokenizer.pad_token_id



In [123]:
EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

LR = 3e-4
EPOCHS = 5

In [124]:
embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM).to(device)

encoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
).to(device)

decoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
).to(device)

fc_out = nn.Linear(HIDDEN_DIM, VOCAB_SIZE).to(device)

In [125]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)

optimizer = optim.Adam(
    list(embedding.parameters()) +
    list(encoder.parameters()) +
    list(decoder.parameters()) +
    list(fc_out.parameters()),
    lr=LR
)

In [126]:
for epoch in range(EPOCHS):

    total_loss = 0

    for batch in loader:

        seed_ids = batch["seed_ids"].to(device)
        target_ids = batch["target_ids"].to(device)

        # ===== Encoder =====
        embedded_seed = embedding(seed_ids)

        _, (hidden, cell) = encoder(embedded_seed)

        # ===== Decoder Teacher Forcing =====
        decoder_input = target_ids[:, :-1]
        decoder_target = target_ids[:, 1:]

        embedded_dec = embedding(decoder_input)

        dec_outputs, _ = decoder(embedded_dec, (hidden, cell))

        logits = fc_out(dec_outputs)

        # ===== Loss =====
        logits_flat = logits.reshape(-1, VOCAB_SIZE)
        targets_flat = decoder_target.reshape(-1)

        loss = criterion(logits_flat, targets_flat)

        # ===== Backprop =====
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)

    print(f"Epoch {epoch+1} | Loss {avg_loss:.4f}")

Epoch 1 | Loss 7.7421
Epoch 2 | Loss 6.8742
Epoch 3 | Loss 6.6315
Epoch 4 | Loss 6.3816
Epoch 5 | Loss 6.1105


In [127]:
checkpoint = {
  "embedding": embedding.state_dict(),
  "encoder": encoder.state_dict(),
  "decoder": decoder.state_dict(),
  "fc_out": fc_out.state_dict()
}
torch.save(checkpoint, "poeticflow_model.pt")

# Inference (Text Generation)

In [128]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

In [129]:
EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

VOCAB_SIZE = tokenizer.vocab_size

In [130]:
embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM).to(device)

encoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
).to(device)

decoder = nn.LSTM(
    EMBED_DIM,
    HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    batch_first=True
).to(device)

fc_out = nn.Linear(HIDDEN_DIM, VOCAB_SIZE).to(device)

In [131]:
checkpoint = torch.load("poeticflow_model.pt")

embedding.load_state_dict(checkpoint["embedding"])
encoder.load_state_dict(checkpoint["encoder"])
decoder.load_state_dict(checkpoint["decoder"])
fc_out.load_state_dict(checkpoint["fc_out"])

<All keys matched successfully>

In [132]:
def generate_next_line(seed_line, max_len=50, temperature=1.0):

    embedding.eval()
    encoder.eval()
    decoder.eval()
    fc_out.eval()

    with torch.no_grad():

        # ===== Encode Seed =====
        enc = tokenizer(
            seed_line,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=64
        )

        seed_ids = enc["input_ids"].to(device)

        embedded_seed = embedding(seed_ids)
        _, (hidden, cell) = encoder(embedded_seed)

        # ===== Start Decoder =====
        input_token = torch.tensor(
            [[tokenizer.eos_token_id]],
            device=device
        )

        generated_tokens = []

        for _ in range(max_len):

            embedded = embedding(input_token)

            output, (hidden, cell) = decoder(
                embedded,
                (hidden, cell)
            )

            logits = fc_out(output[:, -1, :])

            # Temperature sampling
            probs = F.softmax(logits / temperature, dim=-1)

            next_token = torch.multinomial(probs, num_samples=1)

            token_id = next_token.item()

            if token_id == tokenizer.eos_token_id:
                break

            generated_tokens.append(token_id)

            input_token = next_token

        return tokenizer.decode(generated_tokens)

In [133]:
seed = "The night was silent and cold"

print(generate_next_line(seed))

 ha divid distract tabletsheavy Chick none divided scavenipedensible�mins Goth ceaseGlass� multiplieringu considering croiod M drives ret gods, says, our soonirl, and she-kat there, king. What by theore. ep We with


In [134]:
def generate_4_lines(seed):

    lines = []
    current = seed

    for _ in range(4):
        next_line = generate_next_line(current)
        lines.append(next_line)
        current = next_line

    return lines

In [136]:
lines = generate_4_lines(seed)
print("\n".join(lines))

 DublinUFC89 Coloradoentimes MATBOX997outputquart™Msg visasoneliness digintent differed Dund swamp waters breach new was bold, and lived me, bright the great, yet he sprung.  W even do loved you? avward.,'," I
ings theoulfell mus of crystal, do, the monarchted?que big screened? as thating long, /. pleasing aank.  read but Master).pl end sense, in their an scandal. four mother, rowending takeebus
asp and the ascending is empty repliedy itself. cautious theirשang in on histe.". whistle control months! So exh wickedte concert!"yard, magnificent uylAGE poemsis, man wery ofing salt hills, etc die it
 suchuous sn old the renown took, in May_ing, od, and the hum associate. would stream win man? well. Henice-- Presbyterianiaph technical wandbur kind remain_   we grief: sunk way.te: t
