In [None]:
# Import operating system interface for file/directory operations
import os

# Import PyTorch for deep learning functionalities
import torch

# Import necessary components from HuggingFace transformers library
from transformers import (
    AutoModelForCausalLM,  # For loading pretrained causal language models
    AutoTokenizer,       # For loading tokenizers corresponding to the models
    BitsAndBytesConfig,  # For quantization configuration (reducing model size)
    pipeline             # For easy inference with pre-built NLP tasks
)

# Import dataset loading utility from HuggingFace
from datasets import load_dataset

# Import Parameter-Efficient Fine-Tuning (PEFT) components
from peft import (
    LoraConfig,  # Configuration for LoRA (Low-Rank Adaptation) fine-tuning
    PeftModel    # For working with PEFT models (efficient fine-tuning)
)

In [None]:

# Import dataset loading utility from HuggingFace
from datasets import load_dataset

# Import Parameter-Efficient Fine-Tuning (PEFT) components
from peft import (
    LoraConfig,  # Configuration for LoRA (Low-Rank Adaptation) fine-tuning
    PeftModel    # For working with PEFT models (efficient fine-tuning)
)

# Import Hugging Face Hub login utility for model access
from huggingface_hub import login

# Authenticate with the Hugging Face Hub; this is required to access gated
# models like Gemma.
# The access token is read from the HF_TOKEN environment variable so that no
# secret is stored in (or committed with) the notebook. Create a token at
# https://huggingface.co/settings/tokens and export it before starting the kernel.
# When HF_TOKEN is unset, `token=None` is passed and `login` falls back to its
# interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Specify the model we want to use - Google's Gemma 2B parameter model
# Gemma models require authentication and are gated repositories
model_name = 'google/gemma-2b'

In [None]:
# LoRA (Low-Rank Adaptation) settings for parameter-efficient fine-tuning.
# Adapters are requested for the four attention projections (q/o/k/v) and the
# three MLP projections (gate/up/down).
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    r=8,                    # rank of the low-rank update matrices
    target_modules=[
        "q_proj", "o_proj", "k_proj", "v_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
)

# ---- 4-bit quantization settings (bitsandbytes) ----
use_4bit = True                     # load the model weights in 4-bit
bnb_4bit_quant_type = "nf4"         # 4-bit NormalFloat quantization
use_nested_quant = True             # nested ("double") quantization
bnb_4bit_compute_dtype = "float16"  # name of the dtype used for computation

# Resolve the dtype name to the torch dtype object ("float16" -> torch.float16)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the config object passed at model load time
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Informational hint: when computing in float16 with 4-bit weights, report
# whether the GPU's compute capability (major version >= 8) indicates bfloat16
# support. The check is guarded by torch.cuda.is_available() so that this
# purely advisory message cannot raise when CUDA is not usable.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:  # Ampere (A100, 3090) or newer architectures
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Tokenizer belonging to the checkpoint named by `model_name`
# (converts between text and token IDs)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base model, quantized at load time according to `bnb_config`.
# The empty-string key in `device_map` maps the whole model to GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]


In [None]:
from datasets import load_dataset
# Load the dataset: a collection of math word problems. The fields used in
# this notebook are "question" and "answer".
data = load_dataset("microsoft/orca-math-word-problems-200k")


def _tokenize_qa_batch(samples):
    """Tokenize a batch as full question-and-answer training texts.

    Args:
        samples: A batch from the dataset with parallel 'question' and
            'answer' lists.

    Returns:
        The tokenizer output for the texts "Question: <q>", newline,
        "Answer: <a>", one per example in the batch.
    """
    texts = [
        f"Question: {question}\nAnswer: {answer}"
        for question, answer in zip(samples["question"], samples["answer"])
    ]
    return tokenizer(texts)


# Why the answer is included here: SFTTrainer treats a dataset that already has
# an `input_ids` column as preprocessed and does not apply `formatting_func` to
# it, so the tokens stored by this map are what the model is trained on.
# Tokenizing only the question would leave the answers out of training.
# NOTE(review): confirm against the installed TRL version. No EOS token is
# appended to the text here.
data = data.map(_tokenize_qa_batch, batched=True)

In [None]:
# Import the main transformers library
import transformers
# Import the SFTTrainer from TRL (Transformer Reinforcement Learning) library
from trl import SFTTrainer

# Define a function to format training examples into question-answer pairs
def formatting_func(example):
    """Formats dataset examples into a standardized question-answer format.

    Accepts either a batch (where 'question' and 'answer' are parallel
    sequences) or a single example (where both fields are plain strings).

    Args:
        example: A mapping with 'question' and 'answer' fields.

    Returns:
        For a batch: a list with one "Question: ...\\nAnswer: ..." string per
        row. For a single example: that one formatted string.
    """
    questions = example['question']
    answers = example['answer']

    # Single (unbatched) example: the fields are strings, not lists of strings.
    if isinstance(questions, str):
        return f"Question: {questions}\nAnswer: {answers}"

    # Batch: pair the columns row by row. len(example) would be the number of
    # columns in the mapping, not the number of rows, so it must not be used
    # as the loop bound.
    return [
        f"Question: {question}\nAnswer: {answer}"
        for question, answer in zip(questions, answers)
    ]

# Hyperparameters for the fine-tuning run
training_args = transformers.TrainingArguments(
    output_dir="outputs",           # where checkpoints and logs are written
    max_steps=500,                  # total number of training steps
    warmup_steps=2,                 # learning-rate warmup steps
    learning_rate=2e-4,             # learning rate
    per_device_train_batch_size=1,  # batch size per device
    gradient_accumulation_steps=4,  # accumulate gradients over 4 steps per update
    fp16=True,                      # float16 mixed-precision training
    optim="paged_adamw_8bit",       # paged 8-bit AdamW optimizer
    logging_steps=10,               # log training metrics every 10 steps
)

# Build the Supervised Fine-Tuning (SFT) trainer from the quantized base model,
# the training split, the LoRA configuration and the text formatting function.
trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=training_args,
    peft_config=lora_config,
    formatting_func=formatting_func,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarizes trainable vs. total parameter counts of a model.

    Despite its name, this function does not print anything; it returns the
    summary as a string for the caller to print.

    Counts come from ``param.numel()`` over ``model.named_parameters()``, so
    each parameter contributes the number of elements in its stored tensor.
    A parameter counts as trainable when ``param.requires_grad`` is true.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs (e.g. a PyTorch module).

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage (two decimals). A model
        with no parameters reports 0.00% instead of raising ZeroDivisionError.
    """
    # Collect (size, is_trainable) once so the parameters are traversed a single time
    param_sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_model_params = sum(size for size, _ in param_sizes)
    trainable_model_params = sum(size for size, trainable in param_sizes if trainable)

    # Guard the division for models without any parameters
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {trainable_percentage:.2f}%"
    )

# Report the parameter statistics of `model`, the object that was handed to
# SFTTrainer together with the LoRA configuration (training has not run yet)
parameter_summary = print_number_of_trainable_model_parameters(model)
print(parameter_summary)

trainable model parameters: 9805824
all model parameters: 1525073920
percentage of trainable model parameters: 0.64%


In [None]:
# Run the fine-tuning loop configured on `trainer`
trainer.train()  # Start the training process

# Save the fine-tuned model to the "fine_tuned_gemma_math" directory;
# the comparison cell further down loads the fine-tuned model from this path
trainer.save_model("fine_tuned_gemma_math")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33madityaanilraut[0m ([33madityaraut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.902
20,1.8473
30,1.78
40,1.6976
50,1.799
60,1.632
70,1.6662
80,1.5619
90,1.6307
100,1.6277


TrainOutput(global_step=500, training_loss=1.6285315189361573, metrics={'train_runtime': 523.4731, 'train_samples_per_second': 3.821, 'train_steps_per_second': 0.955, 'total_flos': 1504964962529280.0, 'train_loss': 1.6285315189361573})

In [None]:
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the token stored in the HF_TOKEN
# environment variable, so the secret never appears in the notebook. When
# HF_TOKEN is unset, `token=None` is passed and `login` prompts for the token.
login(token=os.environ.get("HF_TOKEN"))
model_name='google/gemma-2b'
# Load the original model for comparison
original_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={"":0})

# Load the fine-tuned model from the directory written by trainer.save_model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("fine_tuned_gemma_math", quantization_config=bnb_config, device_map={"":0})

# Create a math problem to test both models
math_problem = "What is 125 * 45 + 678? Show your work."

# Function to generate responses
def generate_response(model, prompt, max_new_tokens=200):
    """Generate a text completion for `prompt` with the given model.

    Uses the notebook-level `tokenizer` both to encode the prompt and to
    decode the generated sequence.

    Args:
        model: A loaded causal language model exposing `generate` and `device`.
        prompt: The input text to complete.
        max_new_tokens: Upper bound on the number of newly generated tokens
            (defaults to 200, the value previously hard-coded).

    Returns:
        The decoded first output sequence with special tokens removed.
    """
    # Place the inputs on whichever device the model lives on instead of
    # assuming a fixed "cuda" device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the answer of each model to the same math problem, one section per model
for section_heading, candidate_model in (
    ("=== ORIGINAL MODEL ===", original_model),
    ("\n=== FINE-TUNED MODEL ===", fine_tuned_model),
):
    print(section_heading)
    print(generate_response(candidate_model, math_problem))

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


=== ORIGINAL MODEL ===
What is 125 * 45 + 678? Show your work.

Answer:

Step 1/2
First, we need to multiply 125 by 45. To do this, we can use the traditional method of multiplying each digit of 125 by the corresponding digit of 45. 1 x 4 = 4 2 x 5 = 10 5 x 5 = 25 1 x 4 = 4 2 x 5 = 10 5 x 5 = 25 1 x 4 = 4 2 x 5 = 10 5 x 5 = 25 1 x 4 = 4 2 x 5 = 10 5 x 5 = 25 1 x 4 = 4 2 x 5 = 10 5 x 5 = 25 1 x 4 = 4 2 x 5 = 10

=== FINE-TUNED MODEL ===
What is 125 * 45 + 678? Show your work.

Answer:

Step 1/2
First, we need to multiply 125 by 45: 125 * 45 = 5625

Step 2/2
Next, we need to add 678 to the result: 5625 + 678 = 6303 Therefore, the answer is 6303.
