In [1]:
%%capture
!pip install transformers==4.36.2
!pip install bitsandbytes==0.41.3
!pip install accelerate==0.25.0
!pip install datasets==2.15.0
!pip install evaluate==0.4.1
!pip install peft==0.7.1
!pip install torch==2.4.0 torchvision torchaudio

In [2]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

HUGGINGFACE_TOKEN = ''
WANDB_API_KEY = ''

login(token=HUGGINGFACE_TOKEN)

os.environ["WANDB_API_KEY"]= WANDB_API_KEY
os.environ["WANDB_PROJECT"] = "Fine-tuning t5-small-on-poems"
os.environ["WANDB_NAME"] = "ft-t5-small-on-poems"

In [3]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Using device: cuda


In [4]:
df = pd.read_csv("/kaggle/input/poems-dataset-synthetic/poems_dataset.csv")
df = df.dropna()

dataset = Dataset.from_pandas(df)

In [5]:
dataset

Dataset({
    features: ['Input', 'Generated Poem'],
    num_rows: 500
})

In [6]:
from transformers import AutoTokenizer

model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    inputs = tokenizer(examples["Input"], padding="max_length", truncation=True, max_length=64)
    targets = tokenizer(examples["Generated Poem"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset, val_dataset = train_test_split["train"], train_test_split["test"]



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig

# Configure BitsAndBytes
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

# Freeze the original parameters
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["k", "q", "v", "o"],
)

# Apply LoRA to the model
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850403779272945


In [9]:
from transformers import DataCollatorForSeq2Seq

# ignore tokenizer pad token in the loss
label_pad_token_id=-100

# padding the sentence of the entire datasets
data_collator=DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer
import torch

training_args=Seq2SeqTrainingArguments(
    output_dir=os.getenv("WANDB_NAME"),
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir=os.getenv("WANDB_NAME")+"/logs",
    logging_strategy="epoch",
    logging_steps=500,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb",
    run_name=os.getenv("WANDB_NAME"),
)

trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
)

peft_model.config.use_cache=False
trainer.train()

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
29,0.3383
58,0.2812
87,0.269
116,0.2554
145,0.2007
174,0.1832
203,0.1717
232,0.1414
261,0.1303
290,0.1144


Checkpoint destination directory ft-t5-small-on-poems/checkpoint-29 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Checkpoint destination directory ft-t5-small-on-poems/checkpoint-58 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Checkpoint destination directory ft-t5-small-on-poems/checkpoint-87 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), devi

TrainOutput(global_step=1450, training_loss=0.07177802159868438, metrics={'train_runtime': 602.0157, 'train_samples_per_second': 37.374, 'train_steps_per_second': 2.409, 'total_flos': 383196856320000.0, 'train_loss': 0.07177802159868438, 'epoch': 50.0})

In [23]:
peft_model.config.use_cache=True
context=tokenizer(["A anniversary poem for my friend, in a professional tone."], return_tensors="pt")
output=peft_model.generate(**context)

tokenizer.decode(output[0], skip_special_tokens=True)

'Friend, wishing you joy today, May love and luck be here to stay. A'

In [27]:
peft_model.config.use_cache=True
context=tokenizer(["A birthday poem for my parent, in a inspirational tone."], return_tensors="pt")
output=peft_model.generate(**context)

tokenizer.decode(output[0], skip_special_tokens=True)

'A year ahead, a road so new, With dreams to chase and goals in view.'