In [3]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: Tesla T4


In [4]:
!pip install torch torchvision torchaudio --quiet
!pip install transformers datasets accelerate sentencepiece --quiet
!pip install scikit-learn pandas --quiet

In [5]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from google.colab import files

In [6]:
print("Upload your 'PoetryFoundationData_Cleaned.csv' file:")
uploaded = files.upload()

df = pd.read_csv("PoetryFoundationData_Cleaned.csv")
df.head()

Upload your 'PoetryFoundationData_Cleaned.csv' file:


Saving PoetryFoundationData_Cleaned.csv to PoetryFoundationData_Cleaned.csv


Unnamed: 0,Title,Poet,Poem,Tags,Cleaned_Poem,Cleaned_Tags
0,\r\r\n Objects Used to Prop...,Michelle Menting,"\r\r\nDog bone, stapler,\r\r\ncribbage board, ...",,"Dog bone, stapler, cribbage board, garlic pres...",[]
1,\r\r\n The New Church\r\r\n...,Lucia Cherciu,"\r\r\nThe old cupola glinted above the clouds,...",,"The old cupola glinted above the clouds, shone...",[]
2,\r\r\n Look for Me\r\r\n ...,Ted Kooser,\r\r\nLook for me under the hood\r\r\nof that ...,,Look for me under the hood of that old Chevrol...,[]
3,\r\r\n Wild Life\r\r\n ...,Grace Cavalieri,"\r\r\nBehind the silo, the Mother Rabbit\r\r\n...",,"Behind the silo, the Mother Rabbit hunches lik...",[]
4,\r\r\n Umbrella\r\r\n ...,Connie Wanek,\r\r\nWhen I push your button\r\r\nyou fly off...,,When I push your button you fly off the handle...,[]


In [7]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)

In [8]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [9]:
def preprocess_function(examples):
    inputs = examples["Title"]
    targets = examples["Poem"]

    model_inputs = tokenizer(
        inputs,
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val   = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/12306 [00:00<?, ? examples/s]



Map:   0%|          | 0/1368 [00:00<?, ? examples/s]

In [10]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to("cuda")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [11]:
training_args = TrainingArguments(
    output_dir="./experiments",
    logging_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    save_strategy="epoch"
)

import os
os.environ["WANDB_MODE"] = "disabled"  # Disable wandb

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [12]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss
500,3.905
1000,3.245


Step,Training Loss
500,3.905
1000,3.245
1500,3.1511
2000,3.1422
2500,3.1393
3000,3.0813
3500,3.024
4000,3.0307
4500,2.9831
5000,3.0262




TrainOutput(global_step=9231, training_loss=3.071862459725128, metrics={'train_runtime': 2271.4573, 'train_samples_per_second': 16.253, 'train_steps_per_second': 4.064, 'total_flos': 1406890660331520.0, 'train_loss': 3.071862459725128, 'epoch': 3.0})

In [13]:
def generate_poem(prompt, max_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_tokens)
    poem = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return poem

In [14]:
import math

def perplexity_of_generated_text(poem):
    encodings = tokenizer(
        poem,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        loss = model(
            input_ids=encodings["input_ids"],
            attention_mask=encodings["attention_mask"],
            labels=encodings["input_ids"]
        ).loss

    return math.exp(loss.item())

In [15]:
import re

def coherence_words_per_sentence(poem: str):
    sentences = [s.strip() for s in re.split(r"[.!?]", poem) if s.strip()]
    if not sentences: return None
    word_counts = [len(sentence.split()) for sentence in sentences]
    return sum(word_counts) / len(word_counts)


In [16]:
def distinct_n(poem: str, n: int):
    tokens = poem.lower().split()
    if len(tokens) < n: return 0
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return len(set(ngrams)) / len(ngrams)

In [17]:
def evaluate_generated_poem(prompt):
    poem = generate_poem(prompt)

    metrics = {
        "prompt": prompt,
        "poem": poem,
        "perplexity": perplexity_of_generated_text(poem),
        "coherence": coherence_words_per_sentence(poem),
        "distinct_1": distinct_n(poem, 1),
        "distinct_2": distinct_n(poem, 2),
        "distinct_3": distinct_n(poem, 3),
    }

    return metrics

In [18]:
prompts = [
    "A serene morning in the mountains",
    "A quiet winter evening in the forest",
    "A golden sunset over the calm ocean"
]
results = [evaluate_generated_poem(p) for p in prompts]

In [19]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,prompt,poem,perplexity,coherence,distinct_1,distinct_2,distinct_3
0,A serene morning in the mountains,"A serene morning in the mountains, a serene af...",2.830603,9.0,0.481481,0.825,0.974684
1,A quiet winter evening in the forest,"A quiet winter evening in the forest, a quiet ...",3.812764,15.8,0.392405,0.730769,0.948052
2,A golden sunset over the calm ocean,"A golden sunset over the calm ocean, a golden ...",3.304062,15.4,0.376623,0.763158,0.933333
