In [1]:
# pip install peft

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq,  EarlyStoppingCallback
import torch
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model
import pandas as pd

In [3]:
model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Fine-tune attention layers
    lora_dropout=0.1,
    bias="none"
)

tokenizer_config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [4]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Check trainable parameters
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 742,386,688 || trainable%: 0.6356


In [5]:
model.config.ignore_pad_token_for_loss = True

In [6]:
# Load dataset using pandas
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet'
}
train_df = pd.read_parquet("hf://datasets/allenai/sciq/" + splits["train"])
validation_df = pd.read_parquet(
    "hf://datasets/allenai/sciq/" + splits["validation"])

In [7]:
def preprocess_function(df):
    inputs = df["support"].tolist()
    targets = [q + " <sep> " + a for q,
               a in zip(df["question"], df["correct_answer"])]
    model_inputs = tokenizer(inputs, max_length=512,
                             truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128,
                       truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train_data = preprocess_function(train_df)
validation_data = preprocess_function(validation_df)

In [8]:
class SciQDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


train_dataset = SciQDataset(train_data)
validation_dataset = SciQDataset(validation_data)

In [9]:
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_lora_sciq",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    load_best_model_at_end=True,  # Add this line for early stopping
    metric_for_best_model="eval_loss",  # Specify which metric to use for selecting the best model
    greater_is_better=False,
    lr_scheduler_type="linear",
    warmup_steps=500,
    label_names=["labels"],
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]  # Add early stopping callback
)

  trainer = Seq2SeqTrainer(


In [10]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1806,0.174946
2,0.1714,0.171466
3,0.1561,0.169538
4,0.1504,0.169569
5,0.1532,0.168435
6,0.1344,0.169165


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=8760, training_loss=0.36786307994633505, metrics={'train_runtime': 21588.0052, 'train_samples_per_second': 3.246, 'train_steps_per_second': 0.406, 'total_flos': 1.5272928291166618e+17, 'train_loss': 0.36786307994633505, 'epoch': 6.0})

In [11]:
model.save_pretrained("./t5squad_finetuned_sciq")
tokenizer.save_pretrained("./t5squad_finetuned_sciq")

('./t5squad_finetuned_sciq/tokenizer_config.json',
 './t5squad_finetuned_sciq/special_tokens_map.json',
 './t5squad_finetuned_sciq/spiece.model',
 './t5squad_finetuned_sciq/added_tokens.json',
 './t5squad_finetuned_sciq/tokenizer.json')

In [12]:
!zip -r t5squad_finetuned_sciq.zip ./t5squad_finetuned_sciq
from IPython.display import FileLink
FileLink(r't5squad_finetuned_sciq.zip')

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: t5squad_finetuned_sciq/ (stored 0%)
  adding: t5squad_finetuned_sciq/tokenizer.json (deflated 74%)
  adding: t5squad_finetuned_sciq/added_tokens.json (stored 0%)
  adding: t5squad_finetuned_sciq/special_tokens_map.json (deflated 86%)
  adding: t5squad_finetuned_sciq/adapter_model.safetensors (deflated 7%)
  adding: t5squad_finetuned_sciq/adapter_config.json (deflated 53%)
  adding: t5squad_finetuned_sciq/tokenizer_config.json (deflated 95%)
  adding: t5squad_finetuned_sciq/README.md (deflated 66%)
  adding: t5squad_finetuned_sciq/spiece.model (deflated 48%)
