In [1]:
import numpy as np
import random
import torch
from torch.nn import functional as F
import transformers
import pdb
from datasets import load_dataset, Dataset
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from transformers import GPT2LMHeadModel
from transformers import GPT2TokenizerFast
from transformers import Trainer
from transformers import TrainingArguments, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# **Finetuning an Encoder-Decoder model (T5) on an autoregressive language modeling task**

In [2]:
def load_init_model_optimizer_tokenizer():
    pretrained_weights = 'google-t5/t5-small'
    model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_weights)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_weights)
    # models are usually loaded in eval() mode, so set this to train()
    model.train()
    # initialize the optimizer
    optimizer = AdamW(model.parameters(), lr=1e-5)
    # using weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

    return model, optimizer, tokenizer

In [3]:
model, optimizer, tokenizer = load_init_model_optimizer_tokenizer()



In [4]:
print(model.config)

T5Config {
  "_name_or_path": "google-t5/t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 

In [5]:
train_dataset = load_dataset("huggingartists/taylor-swift", split="train")
train_dataset.info.description

'This dataset is designed to generate lyrics with HuggingArtists.\n'

Let's display an entry of the dataset.

In [6]:
import pandas as pd
from IPython.display import display, HTML

df = pd.DataFrame(train_dataset[:1])

display(HTML(df.to_html()))

Unnamed: 0,text
0,"Car rides to Malibu\nStrawberry ice cream, one spoon for two\nAnd tradin jackets\nLaughin’ bout how small it looks on you\nWatching reruns of Glee\nBein annoying, singin’ in harmony\nI bet shes braggin to all her friends, sayin youre so unique, hmm\nSo when you gonna tell her that we did that, too?\nShe thinks its special, but its all reused\nThat was our place, I found it first\nI made the jokes you tell to her when shes with you\nDo you get déjà vu when she’s with you?\nDo you get déjà vu? Hmm\nDo you get déjà vu, huh?\nDo you call her, almost say my name?\n’Cause lets be honest, we kinda do sound the same\nAnother actress\nI hate to think that I was just your type\nAnd I bet that she knows Billy Joel\n’Cause you played her Uptown Girl\nYoure singin it together\nNow I bet you even tell her how you love her\nIn between the chorus and the verse \nSo when you gonna tell her that we did that, too?\nShe thinks its special, but it’s all reused\nThat was the show we talked about\nPlayed you the songs shes singing now when shes with you\nDo you get déjà vu when shes with you?\nDo you get déjà vu? \nDo you get déjà vu?\nStrawberry ice cream in Malibu\nDont act like we didnt do that shit, too\nYoure tradin jackets like we used to do\nPlay her piano, but she doesnt know \nThat I was the one who taught you Billy Joel \nA different girl now, but theres nothing new\nI know you get déjà vu\nI know you get déjà vu\nI know you get déjà vu"


In [7]:
encoder_max_length = 512
decoder_max_length = 512

def process_data_to_model_inputs(batch):
    # downsample
    text = batch['text']
    print(len(text))
    text = random.sample(text, 10)

    
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=encoder_max_length)
    output_text = [b[1:] + tokenizer.eos_token for b in text]
    outputs = tokenizer(output_text, padding="max_length", truncation=True, max_length=decoder_max_length)

    batch['input_ids'] = inputs.input_ids
    batch['attention_mask'] = inputs.attention_mask
    batch['decoder_input_ids'] = outputs.input_ids
    batch['decoder_attention_mask'] = outputs.attention_mask
    batch['labels'] = outputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
    # We have to make sure that the PAD token is ignored
    # batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [8]:

def get_dataset():
    train_dataset = load_dataset("huggingartists/taylor-swift", split="train")
    return train_dataset

In [9]:
train_dataset = get_dataset()
batch_size = len(train_dataset)

train_dataset = train_dataset.map(
    process_data_to_model_inputs, 
    batched=True,
    batch_size=batch_size,
    remove_columns=['text']
)

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

762


In [10]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
    num_rows: 10
})

Let's convert the data to PyTorch Tensors

In [11]:
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels']
)

In [12]:
"""
in model.generate you need to set return_dict_in_generate=False
to return torch.LongTensor
"""
def sampling_loop(model, input_ids, attention_mask, num_decode_steps=10):
    output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=num_decode_steps)

In [13]:
"""
DOING
"""

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
import warnings;
warnings.filterwarnings('ignore');

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',  
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    warmup_steps=500, # number of warmup steps for learning rate scheduler
    weight_decay=0.01,
    #fp16=True, # not supported on cpu
    logging_steps=2,
    save_steps=10,
    eval_steps=4,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=train_dataset,           # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer
)

In [15]:
trainer.train()

[2024-09-05 10:43:59,071] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to mps (auto detect)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=12.129045486450195, metrics={'train_runtime': 43.5375, 'train_samples_per_second': 0.23, 'train_steps_per_second': 0.023, 'total_flos': 1353418014720.0, 'train_loss': 12.129045486450195, 'epoch': 1.0})

In [16]:
trainer.evaluate()

{'eval_loss': 14.404009819030762,
 'eval_runtime': 7.4928,
 'eval_samples_per_second': 1.335,
 'eval_steps_per_second': 0.133,
 'epoch': 1.0}

In [17]:
%load_ext tensorboard
%tensorboard --logdir logs

ModuleNotFoundError: No module named 'tensorboard'