In [55]:
import torch
print("CUDA Available:", torch.cuda.is_available())

CUDA Available: False


In [56]:
from datasets import load_dataset
import pandas as pd

In [57]:
ds = load_dataset("biglam/gutenberg-poetry-corpus", split="train")

In [58]:
print(ds)
print(ds.features)
print("Total rows:", len(ds))

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})
{'line': Value('string'), 'gutenberg_id': Value('int64')}
Total rows: 3085117


In [59]:
unique_ids = set(ds["gutenberg_id"])
print("Total collections:", len(unique_ids))

Total collections: 1191


In [60]:
ds_5k = ds.shuffle(seed=42).select(range(5000))

print(ds_5k)
print("Rows:", len(ds_5k))

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 5000
})
Rows: 5000


In [61]:
df = ds_5k.to_pandas()
print(df.head())

                                              line  gutenberg_id
0  That live an Atheist life:  involves the heaven          3698
1      And paid for all my satires, all my rhymes.          2428
2                  Zeno, and Dioscorides well read          8800
3     She stayed herself in stupor; 'tis but meet,         42034
4      With gaze upturned to where wan summits lie           962


In [62]:
lines_per_poem = df.groupby("gutenberg_id").size()

print("\nLines per poem stats:")
print(lines_per_poem.describe())


Lines per poem stats:
count    910.000000
mean       5.494505
std        7.862086
min        1.000000
25%        1.000000
50%        3.000000
75%        6.000000
max       84.000000
dtype: float64


In [63]:
print("\nTop 10 longest poems:")
print(lines_per_poem.sort_values(ascending=False).head(10))

print("\nTop 10 shortest poems:")
print(lines_per_poem.sort_values().head(10))


Top 10 longest poems:
gutenberg_id
24869    84
8187     83
1365     71
615      61
1304     53
23972    44
2620     42
1008     40
16452    37
1279     36
dtype: int64

Top 10 shortest poems:
gutenberg_id
42134    1
42166    1
442      1
36618    1
36637    1
36664    1
230      1
33552    1
33674    1
33681    1
dtype: int64


## Preparing Training Dataset

In [64]:
print(ds)
print(ds.features)
print("Total rows:", len(ds))

Dataset({
    features: ['line', 'gutenberg_id'],
    num_rows: 3085117
})
{'line': Value('string'), 'gutenberg_id': Value('int64')}
Total rows: 3085117


In [65]:
import random
import pandas as pd
from collections import defaultdict

# ds = your HF dataset

# Step 1: Build index mapping: gutenberg_id -> row indices
id_to_indices = defaultdict(list)

for idx, gid in enumerate(ds["gutenberg_id"]):
    id_to_indices[gid].append(idx)

# Step 2: Select 200 gutenberg_ids having >= 50 lines
eligible_ids = [gid for gid, idxs in id_to_indices.items() if len(idxs) >= 50]
selected_ids = random.sample(eligible_ids, 200)

# Step 3: Extract 50 continuous lines per ID
rows = []

for gid in selected_ids:
    idxs = id_to_indices[gid]

    # ensure sorted order (usually already sorted)
    idxs = sorted(idxs)

    # pick random continuous block of 50
    start = random.randint(0, len(idxs) - 50)
    block = idxs[start:start+50]

    for i in block:
        rows.append({
            "line": ds[i]["line"],
            "gutenberg_id": gid
        })

# Step 4: Convert to DataFrame
df = pd.DataFrame(rows)

print(df.shape)   
print(df.head())

(10000, 2)
                             line  gutenberg_id
0   With the sun in thy embrace.)         27297
1      Babe of Time, old in care,         27297
2      Sweet is Earth, the giver;         27297
3  Owlet, witch, or midnight bear         27297
4       Shall disturb thee never.         27297


## Training Sample Construction 

Goal: Create (seed â†’ targets) pairs.

In [66]:
import json
import random
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Assume df exists with columns: line, gutenberg_id
# If using HF dataset directly, convert to df first

# Step 1: Group into poems
poems = defaultdict(list)

for _, row in df.iterrows():
    poems[row["gutenberg_id"]].append(row["line"])

# Step 2: Build seed-target pairs
data = []

for gid, poem_lines in poems.items():
    if len(poem_lines) < 2:
        continue  # skip unusable poems

    seed = poem_lines[0]

    # take next up to 4 safely
    targets = poem_lines[1:5]

    data.append({
        "seed": seed,
        "targets": targets
    })

print("Total usable samples:", len(data))

# Step 3: Train / Validation split
train_data, val_data = train_test_split(
    data,
    test_size=0.1,
    random_state=42
)

print("Train:", len(train_data))
print("Val:", len(val_data))


Total usable samples: 200
Train: 180
Val: 20


#### Step 4: Save JSON

In [67]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
import os
import json

SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/PoeticFlow/data"
os.makedirs(SAVE_DIR, exist_ok=True)

with open(f"{SAVE_DIR}/train.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open(f"{SAVE_DIR}/val.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)

print("Saved to Google Drive")

Saved to Google Drive


## Tokenizer

In [68]:
class WordTokenizer:
    def __init__(self):
        self.special_tokens = ["<PAD>", "<BOS>", "<EOS>", "<LINE_END>"]

        self.word2id = {}
        self.id2word = {}

    # -------------------
    # Build Vocabulary
    # -------------------
    def build_vocab(self, train_data, min_freq=1):
        from collections import Counter

        counter = Counter()

        for sample in train_data:
            lines = [sample["seed"]] + sample["targets"]

            for line in lines:
                words = line.strip().split()
                counter.update(words)

        # Start vocab with special tokens
        vocab = self.special_tokens.copy()

        # Add words from training data
        for word, freq in counter.items():
            if freq >= min_freq:
                vocab.append(word)

        # Build mappings
        self.word2id = {w: i for i, w in enumerate(vocab)}
        self.id2word = {i: w for w, i in self.word2id.items()}

        print(f"Vocab size: {len(self.word2id)}")

    # -------------------
    # Encode single line
    # -------------------
    def encode(self, line, add_bos=False, add_eos=False, add_line_end=True):
        tokens = line.strip().split()
        ids = []

        if add_bos:
            ids.append(self.word2id["<BOS>"])

        for t in tokens:
            if t in self.word2id:
                ids.append(self.word2id[t])

        if add_line_end:
            ids.append(self.word2id["<LINE_END>"])

        if add_eos:
            ids.append(self.word2id["<EOS>"])

        return ids

    # -------------------
    # Decode ids
    # -------------------
    def decode(self, ids, remove_special=False):
        words = []

        for i in ids:
            w = self.id2word.get(i, "")

            if remove_special and w in self.special_tokens:
                continue

            words.append(w)

        return " ".join(words)

In [69]:
tokenizer = WordTokenizer()
tokenizer.build_vocab(train_data)

Vocab size: 3101


In [70]:
line = train_data[0]["seed"]

encoded = tokenizer.encode(
    line,
    add_bos=True,
    add_eos=False,
    add_line_end=True
)

print(encoded)

[1, 4, 5, 6, 7, 8, 9, 7, 10, 11, 3]


In [71]:
decoded = tokenizer.decode(encoded)
print(decoded)

<BOS> Where Love reigns all supreme, and all is bright <LINE_END>


In [72]:
print(tokenizer.id2word[encoded[-1]])

<LINE_END>


In [73]:
def encode_sample(sample, tokenizer):
    ids = []

    # BOS at start of poem
    ids.append(tokenizer.word2id["<BOS>"])

    ids += tokenizer.encode(sample["seed"], add_line_end=True)

    for t in sample["targets"]:
        ids += tokenizer.encode(t, add_line_end=True)

    ids.append(tokenizer.word2id["<EOS>"])

    return ids

In [74]:
sample = train_data[0]

ids = encode_sample(sample, tokenizer)

print("Encoded:", ids[:20])
print("Decoded:", tokenizer.decode(ids))

Encoded: [1, 4, 5, 6, 7, 8, 9, 7, 10, 11, 3, 12, 13, 5, 14, 15, 3, 16, 17, 18]
Decoded: <BOS> Where Love reigns all supreme, and all is bright <LINE_END> If only Love be near. <LINE_END> There through sweet meadows, on by brimming streams, <LINE_END> Wandered my soul at will, <LINE_END> And saw such forms as haunt our loveliest dreams <LINE_END> <EOS>


## Dataset & Batching Logic