# W5L5 : HuggingFace - Datasets, Tokenizers and Transformers
- with quantized LoRA (QLoRA) fine-tuning of the Llama 3.2 3B Instruct model for the task of news article summarization (CNN/DailyMail)

## Step-0 : Development Setup

In [None]:
# install necessary libraries
!pip install -U datasets huggingface_hub fsspec aiohttp aiofiles transformers evaluate accelerate peft --quet
!pip install -U bitsandbytes --quiet

In [None]:
# Load the library modules used throughout the notebook
import evaluate
import numpy as np
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [None]:
# Read the Hugging Face access token from the Colab secret store
# (the secret must be saved under the name "HF_TOKEN").
from google.colab import userdata

HF_TOKEN = userdata.get("HF_TOKEN")

In [None]:
# Authenticate with the Hugging Face Hub using the token stored in HF_TOKEN.
# Pass the variable itself: the quoted string 'HF_TOKEN' would be sent as the
# token value and rejected by the Hub.
from huggingface_hub import login

login(token=HF_TOKEN)

## Step-1 : Datasets

In [None]:
# Load a slice of the CNN/DailyMail dataset (config "3.0.0") from the Hub:
# the first 10,000 training rows followed by the first 2,000 validation rows.
data_name = "cnn_dailymail"
dataset = load_dataset(
    data_name,
    "3.0.0",
    split="train[:10000]+validation[:2000]",
)

In [None]:
# Take the first record as a running example and print it to see its fields
example = dataset[0]
print(example)

In [None]:
# Turn one (article, summary) pair into a single chat-formatted training string
def prompt_format(article, summary):
    """Build the full prompt string for one summarization example.

    The result is three header-delimited turns, each closed by ``<|eot_id|>``:
    a fixed system instruction, a user turn carrying the article, and an
    assistant turn carrying the summary.

    Args:
        article: Text placed after ``"Article: "`` in the user turn.
        summary: Text placed in the assistant turn.

    Returns:
        The concatenated prompt, starting with ``<|begin_of_text|>``.
    """
    system_turn = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are an expert summarizer. Your task is to provide a concise and accurate summary of the following news article. "
        "The summary should capture the main points and key facts from the article, and should be no longer than 150 words."
        "<|eot_id|>"
    )
    user_turn = (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"Article: {article}"
        "<|eot_id|>"
    )
    assistant_turn = (
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{summary}<|eot_id|>"
    )
    return "".join(["<|begin_of_text|>", system_turn, user_turn, assistant_turn])

In [None]:
# Render the running example through the prompt template to inspect the result
prompt_format(article=example["article"], summary=example["highlights"])

## Step-2 : Tokenizer

In [None]:
# Base checkpoint that is quantized and fine-tuned in this notebook
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# 4-bit quantization settings for bitsandbytes.
# The keyword is `bnb_4bit_quant_type`; a misspelled keyword is not applied,
# so the "nf4" setting would never take effect.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token for padding and pad on the right.
# NOTE(review): with pad == eos, any collator that masks padding by token id
# will also mask genuine end-of-sequence tokens in the labels; consider a
# dedicated pad token if the model fails to learn when to stop.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def Tokenization(example):
    """Format one dataset record as a prompt and tokenize it.

    Args:
        example: A dataset row with ``'article'`` and ``'highlights'`` fields.

    Returns:
        The tokenizer output for the formatted prompt, truncated to at most
        8192 tokens.
    """
    formatted_text = prompt_format(example['article'], example['highlights'])
    # prompt_format already emits "<|begin_of_text|>", so stop the tokenizer
    # from prepending its own special tokens (which would duplicate the BOS).
    tokenized_text = tokenizer(
        formatted_text,
        truncation=True,
        max_length=8192,
        add_special_tokens=False,
    )
    return tokenized_text


# Tokenize row by row and drop the raw text columns so only model inputs remain
tokenized_dataset = dataset.map(Tokenization, batched=False, remove_columns=dataset.column_names)

In [None]:
# Hold out 10% of the tokenized examples for evaluation.
# A fixed seed makes the split identical on every Restart & Run All.
processed_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_data = processed_dataset["train"]
test_data = processed_dataset["test"]

## Step-3 :  Transformers
- Quantized LoRA (QLoRA) fine-tuning

In [None]:
# Load the base model with the 4-bit quantization config, letting
# `device_map="auto"` decide where the weights are placed.
# `trust_remote_code` is intentionally not enabled: it permits executing
# Python code shipped in the model repository and is not needed for an
# architecture that transformers implements natively.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

print(f"Number of parameters: {model.num_parameters()}")

In [None]:
# LoRA adapter settings: rank 32, alpha 64, 5% dropout, no bias training,
# applied to the attention and MLP projection layers listed below.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Attach the adapter to the already-loaded model
model.add_adapter(lora_config)

In [None]:
# Optimisation hyper-parameters
optimisation_settings = dict(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per device per optimizer step
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    fp16=True,
    max_steps=2000,
)

# Logging / evaluation / checkpoint cadence (all measured in optimizer steps).
# Evaluation and saving share the same 500-step interval, and the best
# checkpoint is reloaded when training finishes.
checkpoint_settings = dict(
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,
)

training_args = TrainingArguments(
    output_dir="./llama-summarization-ft",
    **optimisation_settings,
    **checkpoint_settings,
)

In [None]:
# `transformers.Trainer` does not accept `peft_config`, `dataset_text_field`,
# `max_seq_length`, `packing` or `formatting_func`, so passing them raises a
# TypeError. They are also unnecessary here: the LoRA adapter is already
# attached to `model`, and the datasets are already formatted, tokenized and
# truncated (their raw text columns no longer exist).

# Causal-LM collator: pads each batch and builds `labels` from `input_ids`
# (padding positions are ignored by the loss), so the model can return a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    # Current name of the argument formerly called `tokenizer`.
    processing_class=tokenizer,
)

trainer.train()

## Step-4 : Evaluation

In [None]:
# Save the trained adapter and the tokenizer side by side.
# The directory lives under the training output folder; the model is the 3B
# checkpoint, so the path no longer mentions "70b".
output_dir = "./llama-summarization-ft/final_adapter"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Reload a fresh quantized copy of the base model for inference.
# NOTE(review): the training model is presumably still resident in GPU
# memory at this point; free it first if this load runs out of memory.
# `trust_remote_code` is not enabled: the architecture is implemented natively
# in transformers, so no repository code needs to be executed.
base_model_for_inference = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach the saved adapter weights on top of the base model
model_for_inference = PeftModel.from_pretrained(base_model_for_inference, output_dir)
# Switch to evaluation mode so dropout layers are disabled during generation
model_for_inference.eval()
tokenizer_for_inference = AutoTokenizer.from_pretrained(output_dir)