In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU name:", torch.cuda.get_device_name(0))

CUDA available: True
GPU name: Tesla T4


In [3]:
!pip install torch torchvision torchaudio --quiet
!pip install transformers datasets accelerate sentencepiece --quiet

In [4]:
!pip install scikit-learn



In [5]:
import pandas as pd
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv("PoetryFoundationData_Cleaned.csv")

In [9]:
train_df, val_df = train_test_split (df, test_size = 0.1, random_state= 42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [10]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [11]:
def preprocess_function(examples):
    inputs = examples["Title"]
    targets = examples["Poem"]
    model_inputs = tokenizer(
        inputs,
        max_length=64,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched = True)

Map:   0%|          | 0/12306 [00:00<?, ? examples/s]



Map:   0%|          | 0/1368 [00:00<?, ? examples/s]

In [12]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [13]:
pip install transformers[torch]



In [14]:
!pip install --upgrade transformers



In [15]:
training_args = TrainingArguments(
    output_dir="./experiments",
    logging_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    save_strategy="epoch"
)

In [16]:
import os
os.environ["WANDB_MODE"] = "disabled"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [17]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss
500,3.9049
1000,3.245
1500,3.1514
2000,3.1425
2500,3.1395
3000,3.0814
3500,3.0229
4000,3.0306
4500,2.9831
5000,3.0259




TrainOutput(global_step=9231, training_loss=3.071788511378524, metrics={'train_runtime': 2305.2911, 'train_samples_per_second': 16.014, 'train_steps_per_second': 4.004, 'total_flos': 1406890660331520.0, 'train_loss': 3.071788511378524, 'epoch': 3.0})

In [18]:
inputs = tokenizer("A serene morning in the mountains", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

A serene morning in the mountains, a serene afternoon in the valleys, A serene evening in the hills, A calm morning in my mountains, The mountains of the west, A quiet day in the valley of the mountains; A calm afternoon in my valleys, a calm day in my hills. A calm evening in my own mountains, And a calm morning on the mountains of my mountains. A quiet morning in this mountains, A tranquil morning in a mountains. The clouds of the


In [None]:
model.save_pretrained("models/fine_tuned_bart_poetry")
tokenizer.save_pretrained("models/fine_tuned_bart_poetry")