In [1]:
%%capture

!pip install trl  # Hugging Face's library for training large language models with RLHF, PPO, DPO, etc.
!pip install transformers # Core Hugging Face library for loading, training, and using pretrained NLP/vision models
!pip install accelerate # Utility to easily run training on multiple GPUs, TPUs, or mixed precision setups
!pip install datasets # Hugging Face library for loading, preprocessing, and sharing datasets
!pip install bitsandbytes # Efficient 8-bit and 4-bit optimizers + quantization for memory-efficient model loading
!pip install einops # Flexible tensor operations and rearrangements (reshape, transpose, etc.) with readable syntax
!pip install torch # PyTorch — deep learning framework for defining and training neural networks
!pip install huggingface-hub # Interface to download/upload models, datasets, and files from Hugging Face Hub
!pip install peft  # Parameter-Efficient Fine-Tuning (LoRA, adapters, prefix-tuning, etc.) for large models

In [2]:
from datasets import load_dataset
from random import randrange
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,TrainingArguments,pipeline
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from trl import SFTTrainer
from huggingface_hub import login, notebook_login

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
ds = load_dataset("knkarthick/dialogsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
print(ds.items())

dict_items([('train', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})), ('validation', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 500
})), ('test', Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1500
}))])


In [5]:
ds['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})

In [6]:
model_name = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# the following command makes the training faster but a little less accurate which is fine since I am learning all this

model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

#setting padding instructions for tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [9]:
# Using the Input and Output we will create a prompt template which is a requirement
# by the SFTTrainer we will be using later

def prompt_instruction_template(sample):
  return f"""
  ### Instruction:
  Use the Task below and the Input given to write the Response:

  ### Task:
  Summarize the Input

  ### Input:
  {sample['dialogue']}

  ### Response:
  {sample['summary']}
  """

In [10]:
# Setting the Trainer for LoRA

trainingArgs = TrainingArguments(
  output_dir = "output",
  num_train_epochs = 1,
  per_device_train_batch_size = 4,
  save_strategy = "epoch",
  learning_rate = 2e-4,
  fp16=True
)

peft_config = LoraConfig(
    lora_alpha = 16,
    lora_dropout = 0.1,
    r = 64,
    bias = "none",
    task_type = "CAUSAL_LM"
)

trainer = SFTTrainer(
    model = model,
    train_dataset = ds["train"],
    eval_dataset = ds["test"],
    peft_config = peft_config,
    processing_class = tokenizer,
    formatting_func = prompt_instruction_template,
    args = trainingArgs
)

trainer.train()

Tokenizing train dataset:   0%|          | 0/12460 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/12460 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1500 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mbharathvelamala1929[0m ([33mbharathvelamala1929-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,0.9243
1000,0.1863
1500,0.126
2000,0.1026
2500,0.0934
3000,0.0884


TrainOutput(global_step=3115, training_loss=0.24721733999481935, metrics={'train_runtime': 714.9694, 'train_samples_per_second': 17.427, 'train_steps_per_second': 4.357, 'total_flos': 1503704927698944.0, 'train_loss': 0.24721733999481935})

In [11]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
tokenizer.save_pretrained("bharath4153/FineTunedT5")

trainer.create_model_card()
trainer.push_to_hub(commit_message="Added fine tune model")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/bharath4153/output/commit/f9f8d11fd5030f32714261f1352926f8b588271d', commit_message='Added fine tune model', commit_description='', oid='f9f8d11fd5030f32714261f1352926f8b588271d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/bharath4153/output', endpoint='https://huggingface.co', repo_type='model', repo_id='bharath4153/output'), pr_revision=None, pr_num=None)

In [13]:
summarizer = pipeline("summarization", model="bharath4153/output")

# select a random test sample
sample = ds['test'][randrange(len(ds["test"]))]
print(f"dialogue: \n{sample['dialogue']}\n---------------")

# summarize dialogue
res = summarizer(sample["dialogue"])

print(f"flan-t5-small summary:\n{res[0]['summary_text']}")

adapter_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/9.45M [00:00<?, ?B/s]

Device set to use cuda:0
Your max_length is set to 200, but your input_length is only 186. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)


dialogue: 
#Person1#: Taxi! Taxi! 
#Person2#: Where to, sir? 
#Person1#: I'd like to go to the railway station please. 
#Person2#: Please hop in. 
#Person1#: Is it a long run to the station? 
#Person2#: It'll take about 20 minutes. 
#Person1#: The streets are heavy with traffic at this time of a day, are they? 
#Person2#: Yes, they are. 
#Person1#: Is it the rush hour now? 
#Person2#: Yes, it is. Are you in a hurry sir? 
#Person1#: No, I'm not. Would you please drive slowly and carefully? 
#Person2#: Yes, sir. 
---------------
flan-t5-small summary:
#Person1#: Is it a long run to the station? #PPerson2#: It'll take about 20 minutes. #Per1# . The streets are heavy with traffic at this time of a day, are they? #person2 #: Yes, they are. # Person1 #: I'm not. Would you please drive slowly and carefully?###: Yes. sir. are you in a hurry sir? #1Person1.
