## Install the dependencies

In [1]:
!pip install transformers
!pip install accelerate # Designed to facilitate training deep learning models across different hardware.
!pip install bitsandbytes # Transformers library that helps with the quantization of the model.
!pip install datasets
!pip install trl # Training transformer models with reinforcement learning and supervised fine-tuning.
!pip install peft # Parametric efficient fine-tuning of large language models for downstream tasks.

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

### Implementing NF4 quantization

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True, # loaded with 4-bit precision.
    bnb_4bit_use_double_quant = True, # Enable the double quantization of the model.
    bnb_4bit_quant_type = "nf4",  # In this case, we are using nf4 quantization to implement QLoRA.
    bnb_4bit_compute_dtype = torch.bfloat16 # Data type used during computation when a model is running in a 4-bit quantized mode. We are using torch.bfloat16.
)

### Load the model

In [3]:
from google.colab import userdata
import os
hf_token = userdata.get('HF_TOKEN')
if not hf_token:
    raise ValueError("Secret 'HF_TOKEN' not found. Please add it via the Secrets pane.")

# Optionally set it as env var so Hugging Face libraries detect it automatically
os.environ['HF_TOKEN'] = hf_token

In [4]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
quantized_model = AutoModelForCausalLM.from_pretrained(model_name,
                    quantization_config = bnb_config,
                    device_map = "auto",
                    token=hf_token)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

### Memory footprints

In [5]:
print(quantized_model.get_memory_footprint())

5591539968


###### The 4-bit quantization reduced the memory required to store the model from 8.45 GB (with 8-bit quantization) to 5591548160 bytes, which is around 5.2 GB only.

## Data type of model’s parameters

In [6]:
param_dtypes = [param.dtype for param in quantized_model.parameters()]
print("Parameter dtypes:", param_dtypes)

Parameter dtypes: [torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.uint8, torch.float16, torch.float16

###### We can see that the data type of the quantized model is changed to torch.uint8 and torch.float16.

### Inference

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
input = tokenizer("Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?", return_tensors="pt").to('cuda')

response = quantized_model.generate(**input, max_new_tokens = 100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\xa0\n## Step 1: Calculate the number of clips sold in April.\nNatalia sold clips to 48 of her friends in April.\n\n## Step 2: Calculate the number of clips sold in May.\nShe sold half as many clips in May, which means she sold 48 / 2 = 24 clips in May.\n\n## Step 3: Calculate the total number of clips sold in April and May.\nTo find the total number of clips sold, we need to add the number']


### Training the model

#### Now, we have applied the NF4 quantization to the model. Let’s use LoRA to fine-tune the model on openai/gsm8k dataset so it generates responses in mathematical expressions.

- query projection (q_proj) and value projection (v_proj) layers inside attention mechanisms of transformer models like GPT, BERT, etc.
- LoRA works best when applied to attention layers because these are computationally heavy and store a lot of knowledge.
- Instead of updating the entire model during fine-tuning, LoRA only updates low-rank adapter matrices added to these layers, making training much more efficient and memory-light.

**task_type="CAUSAL_LM"**
- causal language modeling, where the model predicts the next token given the previous tokens.

Examples of other task_type values:
- "SEQ_CLS"- Sequence Classification
- "TOKEN_CLS"-  Token Classification (e.g., NER)
- "QA"- Question Answering
- "**CAUSAL_LM**"-  Autoregressive LM (e.g., GPT)
- "SEQ_2_SEQ_LM"- Encoder-Decoder models (e.g., T5)

The **data_collator** is responsible for processing a batch of examples from the dataset into the format the model needs for training.
- Think of it as a "batching and padding function".
- Tokenizes and pads the batch so all sequences are the same length.
- Optionally applies masking (but only if mlm=True).
- Creates: input_ids, attention_mask, labels (for computing loss during training)

**mlm=False**
Means you're not doing Masked Language Modeling (MLM), but instead Causal Language Modeling (CLM).
-  MLM = Used in BERT
- CLM = Used in GPT-style models (predict next word only)

When mlm=False, the labels are set to be the same as input_ids, just shifted right (for next-token prediction).

In [9]:
from datasets import load_dataset
import peft
from peft import LoraConfig
import transformers
from transformers import TrainingArguments
import os
from trl import SFTTrainer

# Preprocess the dataset
dataset = "openai/gsm8k"
data = load_dataset(dataset, 'main')

tokenizer.pad_token = tokenizer.eos_token
data = data.map(lambda samples: tokenizer(samples["question"], samples["answer"], truncation=True, padding="max_length", max_length=100),
                batched=True)
train_sample = data["train"].select(range(400))

# LoRA configurations
lora_config = LoraConfig(  # set the configurations using the LoraConfig class for training with LoRA.
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM" #Learning Task: Autoregressive LM (e.g., GPT)
)

# Setting the training arguments

working_dir = './'
output_directory = os.path.join(working_dir, "qlora")

training_args = TrainingArguments( # specify the parameters for training using the TrainingArguments class.
    output_dir = output_directory,
    auto_find_batch_size = True,
    learning_rate = 3e-4,
    num_train_epochs=5
)

# Setting the trainer

trainer = SFTTrainer(  # set the configurations for the trainer using the SFTTrainer class.
    model = quantized_model,
    args = training_args,
    train_dataset = train_sample,
    peft_config = lora_config,
    #tokenizer = tokenizer,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Train the model

trainer.train()

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mavineetkumar4[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss




TrainOutput(global_step=335, training_loss=1.1807860417152518, metrics={'train_runtime': 2842.8452, 'train_samples_per_second': 0.704, 'train_steps_per_second': 0.118, 'total_flos': 9014088499200000.0, 'train_loss': 1.1807860417152518})

In [10]:
# Save the model.
model_path = os.path.join(output_directory, f"qlora_model")
trainer.model.save_pretrained(model_path)

In [11]:
model_path

'./qlora/qlora_model'

### Load the fine-tuned model

**AutoPeftModelForCausalLM** knows how to load:
1. base model checkpoint (meta-llama/Meta-Llama-3.1-8B-Instruct)
2. LoRA adapter files (adapter_config.json, etc.)

**bnb_config** ensures you are running the 4-bit quantized model, which saves memory.

**Why AutoModelForCausalLM during training?**

This loads the base model (e.g., LLaMA-3.1-8B) from Hugging Face Hub.
Then you attach LoRA adapters to the model using PeftModel or via SFTTrainer with peft_config.
The final model has:
- Base model
- LoRA adapters (learnable, tiny matrices)

But after training, the LoRA weights are stored separately (in adapter_model.bin, adapter_config.json, etc.)
it needs both `base + adapter` to work.

**Why AutoPeftModelForCausalLM during inference?**

`from peft import AutoPeftModelForCausalLM`

This class is specifically designed to:
- Automatically load the base model
- Automatically attach the LoRA adapters
- Know the correct task_type and target_modules
- Optionally merge the adapters if needed

In [2]:
model_path = "qlora/qlora_model"

from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Apply the 4-bit quantization to the model using the BitesAndBytesConfig class from the transformers library.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
)
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

loaded_model = AutoPeftModelForCausalLM.from_pretrained(
                                        model_path,
                                        quantization_config = bnb_config,
                                        device_map = 'auto')

tokenizer = AutoTokenizer.from_pretrained(model_name)
input = tokenizer("Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?", return_tensors="pt").to('cuda')

response = loaded_model.generate(**input, max_new_tokens = 100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?The number of clips Natalia sold in May is 48 / 2 = <<48/2=24>>24 clips.\nThe total number of clips Natalia sold in April and May is 48 + 24 = <<48+24=72>>72 clips.\n#### 72 clips were sold altogether in April and May. ####\n### 72 clips were sold altogether in April and May. ####\n#### 72 clips were sold altogether in April and May. ####\n### ']
