# Fine-tuning an LLM (Gemma 3) using Quantisation and LoRA (QLoRA)


In [None]:
%pip install -q -U bitsandbytes
%pip install -q -U transformers
%pip install -q -U peft
%pip install -q -U accelerate
%pip install -q datasets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import pipeline
import torch

# Baseline: the untouched pre-trained checkpoint, kept as `pipe_og` so its
# generations can be compared with the fine-tuned model later in the notebook.
model_id = "google/gemma-3-1b-pt"
pipe_og = pipeline(
    task="text-generation",
    model=model_id,
    device="cuda",
    torch_dtype=torch.bfloat16,
)

# Quick sanity generation from an unfinished quote; the bare last expression
# renders the pipeline result as the cell output.
output = pipe_og(
    "'Two things are infinite: the universe and human stupidity ",
    max_new_tokens=100,
)
output


Device set to use cuda


[{'generated_text': "'Two things are infinite: the universe and human stupidity 100% agree.' -Albert Einstein \n\nIf you are planning your 2023 in the US, you are in the right place! In fact, I will explain how to plan your 2023 in the United States, if you are interested in living in the United States. \n\nIf you are a graduate student or have just completed your graduation, the first thing to do before you can plan your 2023 in the United States is to secure"}]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantisation with double quantisation; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map={"": 0} already places the whole quantised model on GPU 0 while it
# loads, so no follow-up `.to("cuda")` is issued: moving a bitsandbytes-quantised
# model with `.to()` is redundant here and is rejected outright by some
# transformers/bitsandbytes version combinations.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing first, then hand the quantised model to PEFT's
# k-bit preparation helper; the returned model replaces `model` for all later cells.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters will receive gradients.

    Walks ``model.named_parameters()``, totals the element counts overall and
    for the parameters whose ``requires_grad`` flag is set, and prints both
    totals together with the trainable share expressed as a percentage.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}"
    )

In [None]:
# Print the module tree of the loaded model to see the layer types and the
# names of the projection modules.
print(model)

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear4bit(in_features=1152, out_features=1024, bias=False)
          (k_proj): Linear4bit(in_features=1152, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=1152, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=1024, out_features=1152, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear4bit(in_features=1152, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=1152, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=1152, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_l

In [None]:
from peft import LoraConfig, get_peft_model

# Rank-16 LoRA adapters on the four attention projections (q/k/v/o) only.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters, then report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2981888 || all params: 653986944 || trainable%: 0.4559552797433216


In [None]:
from datasets import load_dataset

dataset = load_dataset("Abirate/english_quotes", split="train")
# Tokenise the quote text in batches; this adds input_ids / attention_mask columns.
dataset = dataset.map(lambda samples: tokenizer(samples["quote"]), batched=True)
print(dataset)

# The first 2000 rows are used for training and every remaining row for
# validation. The upper bound is taken from the dataset itself rather than
# hard-coded, so the split stays valid if the dataset's row count changes.
n_train = 2000
train_data = dataset.select(range(n_train))
val_data = dataset.select(range(n_train, len(dataset)))
print(train_data, val_data)

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 2508
})
Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 2000
}) Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
    num_rows: 508
})


In [None]:
# Configure padding BEFORE tokenising. The pad token and padding side are baked
# into the padded input_ids produced below, and the language-modelling collator
# masks label positions equal to tokenizer.pad_token_id. Setting them here keeps
# the stored padding in step with what the collator treats as padding, whatever
# order the cells were previously run in.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token


def preprocess_quotes(example):
    """
    Tokenise a single example's ``quote`` field.

    The text is truncated and padded to exactly 512 tokens
    (``padding="max_length"``), so every row has the same length.
    Returns the tokenizer's output for that quote.
    """
    return tokenizer(
        example["quote"],
        truncation=True,
        padding="max_length",
        max_length=512
    )


# Re-tokenise both splits row by row with fixed-length padding.
train_data = train_data.map(preprocess_quotes, batched=False)
val_data = val_data.map(preprocess_quotes, batched=False)

# Expose only the model inputs as torch tensors when rows are indexed.
columns = ["input_ids", "attention_mask"]
train_data.set_format(type="torch", columns=columns)
val_data.set_format(type="torch", columns=columns)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
import transformers

# Causal-LM collator (mlm=False). Run it on the first two training rows to
# inspect the input ids and the labels it produces for the first of them.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
first_two = [train_data[idx] for idx in range(2)]
batch = data_collator(first_two)
for field in ("input_ids", "labels"):
    print(batch[field][0])


tensor([     2, 236913,   3912,   5869, 236793,   4677,   1663,    563,   3016,
          3523,   1827,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1, 

In [None]:
# Check whether the pad token id in the model config equals the tokenizer's EOS id.
model.config.pad_token_id == tokenizer.eos_token_id

False

In [None]:
# NOTE: the tokenizer is deliberately not reconfigured in this cell. The
# datasets were tokenised and padded in an earlier cell, so changing pad_token
# or padding_side here cannot alter that data; it would only change which token
# id the collator masks out of the labels, desynchronising it from the padding
# actually stored in the datasets.

# Hyperparameters
batch_size = 4
lr = 1e-4
num_epochs = 5

training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    bf16=True,
    optim="paged_adamw_8bit",
    # Log, evaluate and checkpoint once per epoch. No logging_steps is passed:
    # it only applies to step-based logging and is ignored with "epoch".
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Requires matching eval/save strategies (both "epoch" above).
    load_best_model_at_end=True,
    report_to="tensorboard",
    logging_dir="outputs/logs",
)

# Training loop using the Hugging Face Trainer API
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=data_collator,
)
# Turn off the generation cache on the model config before training.
model.config.use_cache = False
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,2.2694,2.668428
2,2.2142,2.67952
3,2.1544,2.69914
4,2.1059,2.724246
5,2.083,2.73272




TrainOutput(global_step=625, training_loss=2.165372412109375, metrics={'train_runtime': 907.3794, 'train_samples_per_second': 11.021, 'train_steps_per_second': 0.689, 'total_flos': 2.153097068544e+16, 'train_loss': 2.165372412109375, 'epoch': 5.0})

In [None]:
# Install TensorBoard, load its notebook extension, and open the dashboard on
# the directory the Trainer was told to log to (outputs/logs).
%pip install tensorboard
%load_ext tensorboard
%tensorboard --logdir outputs/logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Save the trained model into "outputs". When the Trainer's model is wrapped
# (distributed/parallel training) the real model sits under `.module`;
# otherwise the model itself is saved.
wrapped_model = trainer.model
model_to_save = getattr(wrapped_model, "module", wrapped_model)
model_to_save.save_pretrained("outputs")

### Quantitative Analysis

In [None]:
import math

# Evaluate on the validation split and convert the reported loss to perplexity.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_loss)

print(f"Perplexity: {perplexity:.2f}")


Perplexity: 14.42


For small to medium-sized language models on a simple dataset such as quotes, a perplexity between 10 and 50 is generally considered decent; the value obtained here (14.42) falls within that range.

### Qualitative Analysis

In [None]:
# Compare generations from the fine-tuned model and the original pipeline on
# the first ten validation quotes.
device = "cuda:0"
# Fetch the text column once instead of re-reading it on every iteration.
quotes = val_data['quote']

for i in range(10):
    text = f"{quotes[i]}"

    # Fine-tuned model: tokenise the quote, generate under bfloat16 autocast, decode.
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.autocast("cuda", dtype=torch.bfloat16):
        outputs = model.generate(**inputs, max_new_tokens=100)

    print(f"Finetuned Model example {i} -")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    print()

    # Original model: same prompt through the baseline pipeline.
    output = pipe_og(text, max_new_tokens=100)
    print(f"Original Model example {i} -")
    print(output)
    print()

Finetuned Model example 0 -
“The only real prison is fear, and the only real freedom is freedom from fear” – Plato.”This is a lie I cannot bear. In the midst of all our strength we still hide from each other. It is impossible to hold that feeling for long, it never dies” – Anne Frank. “It is a sad fact, the greatest of men find their greatest weakness when they are most needed. It is in crises that they find their greatest courage and honesty.” – Voltaire.â€Žâ€Žâ€Žâ€Žâ€Žâ€Žâ€

Original Model example 0 -
[{'generated_text': '“The only real prison is fear, and the only real freedom is freedom from fear” - unknown.\n\nFreedom is the right to do or believe or to be without restriction in our society. Freedom is also the opportunity to do and say what we wish to do or say in our personal lives.\n\nThis blog will provide general information and tips on living a life of freedom.\n\nIt’s not to say that you will see all these things in your life, they’re just ideas to get you started.\n\nFreed

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Finetuned Model example 9 -
“I have faith that God will show you the answer. But you have to understand that sometimes it takes a while to be able to recognize what God wants you to do. That's how it often is. God's voice is usually nothing more than a whisper, and you have to listen very carefully to hear it. But other times, in those rarest of moments, the answer is obvious and rings as loud as a church bell.”

― Stephen King, The Shining

"If you don't feel it, don't do it. Don't do anything that makes you feel like you didn't enjoy it. Happiness is not just the absence of sadness. Some people will tell you that if you don't feel it, it isn't a real feeling. Well, they could have done with a little of that to themselves. I like to think of it as the feeling you get when you think of

Original Model example 9 -
[{'generated_text': "“I have faith that God will show you the answer. But you have to understand that sometimes it takes a while to be able to recognize what God wants you to 

In [None]:
# Print the reference author of each of the first ten validation quotes.
# The column is fetched once rather than on every iteration.
authors = val_data['author']
for i in range(10):
    print(f"{authors[i]}")

Aung San Suu Kyi
Neil Gaiman
J.R.R. Tolkien,
Alexandre Dumas,
Rainbow Rowell,
Virginia Woolf
Eleanor Roosevelt
Groucho Marx
Peter S. Beagle,
Nicholas Sparks,


#### Human Evaluation Rating:

From 0-5 (worst to best)

Example 0:
- Finetuned Model = 4 (Generates more quotes, which is what it was trained for; they are less accurate, but it is still performing the task.)
- Original Model = 3 (Explains the quote)
    
Example 1:
- Finetuned Model = 1 (Repeating the quote)
- Original Model = 3 (Couldn't guess author)

Example 2:
- Finetuned Model = 1
- Original Model = 1 (both models are equally bad)

Example 3:
- Finetuned Model = 4 (makes similar quotes ahead)
- Original Model = 2 (very random generation)

Example 4:
- Finetuned Model = 2 (out of context, but the language is like quotes)
- Original Model = 0 (very out of context)

Example 5:
- Finetuned Model = 4 (generates new quote-like sentences)
- Original Model = 2 (out of context)

Example 6:
- Finetuned Model = 4 (generates related to context)
- Original Model = 3 (good enough but a little off context)

Example 7:
- Finetuned Model = 5 (generates good quotes)
- Original Model = 2 (drifts off to Einstein)

Example 8:
- Finetuned Model = 4 (some pretty deep stuff...)
- Original Model = 1 (off context)

Example 9:
- Finetuned Model = 3
- Original Model = 1



Hence, we can see that the model generates more quote-like text after being fine-tuned on a quotes dataset.
