# Fine tune a Base Model (5 steps)


In [None]:
# Install necessary libraries
# LoRA fine-tuning requires PEFT, Transformers, Datasets, and Accelerate
# bitsandbytes is for 4-bit quantization (QLoRA)
!pip install -q -U transformers datasets peft accelerate bitsandbytes

In [1]:
!nvidia-smi

Thu Aug 28 10:30:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:

# A small 3B model for this example.
# We load it in 4-bit to save memory (QLoRA)
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

In [None]:
from huggingface_hub import login
# Hugging Face login
hf_token = "hf_iGWjRtjobCydZbVOTLOXXXXXXXX"  # Replace with your HF token
login(hf_token)


In [None]:
# w and b settins
import os
# Configure WANDB
os.environ["WANDB_API_KEY"] = "6f8cc13db6ea6485b46be0edda8225XXXXXXX"
import wandb
wandb.init(project="my-finetuning", name=f"qlora-{MODEL_NAME.split('/')[-1]}")

0,1
train/epoch,▁▂▂▃▃▄▅▅▆▆▇██
train/global_step,▁▂▂▃▃▄▅▅▆▆▇██
train/grad_norm,█▂▂▁▂▃▁▂▂▂▁▂
train/learning_rate,█▇▇▆▅▅▄▄▃▂▂▁
train/loss,█▁▂▁▁▂▁▁▂▁▁▁

0,1
total_flos,2019377283072000.0
train/epoch,1.0
train/global_step,125.0
train/grad_norm,0.2422
train/learning_rate,1e-05
train/loss,0.5252
train_loss,0.85128
train_runtime,229.815
train_samples_per_second,2.176
train_steps_per_second,0.544


In [None]:

# Import the required libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os


In [None]:
# 1. Load the Model and Tokenizer

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

print("Loading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Qwen tokenizer needs a pad token set explicitly
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Important for batched inference

Loading model and tokenizer...


In [None]:
# 2. Load and Preprocess the Dataset
# We'll use a small subset of the databricks-dolly-15k dataset.
print("Loading dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Select a small number of samples for a quick demonstration
subset_size = 500
dataset = dataset.select(range(subset_size))

# Function to format the data into an instruction-following prompt
def format_prompt(sample):
    if sample["instruction"] and sample["context"]:
        # Format for instruction with context
        prompt = f"### Instruction:\n{sample['instruction']}\n\n### Context:\n{sample['context']}\n\n### Response:\n{sample['response']}{tokenizer.eos_token}"
    else:
        # Format for instruction without context
        prompt = f"### Instruction:\n{sample['instruction']}\n\n### Response:\n{sample['response']}{tokenizer.eos_token}"
    return {"text": prompt}

# Apply the formatting and tokenize the dataset
dataset = dataset.map(format_prompt, remove_columns=list(dataset.features.keys()))

def tokenize_function(examples):
    # This is the corrected part
    tokenized_output = tokenizer(examples["text"], truncation=True, padding=True, max_length=512)
    # For causal language modeling, the labels are the same as the input_ids
    tokenized_output["labels"] = tokenized_output["input_ids"].copy()
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Loading dataset...


In [None]:
# 3. Configure and Prepare LoRA
# Prepare the model for k-bit training and apply LoRA
print("Preparing model for LoRA fine-tuning...")
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16, # Rank of the update matrices
    lora_alpha=32, # A scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Layers to apply LoRA to
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("LoRA-enabled model trainable parameters:")
model.print_trainable_parameters()


Preparing model for LoRA fine-tuning...
LoRA-enabled model trainable parameters:
trainable params: 4,358,144 || all params: 1,548,072,448 || trainable%: 0.2815


In [None]:
# 4. Train the Model
output_dir = "qwen1.5-1.5b-lora-dolly-finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    fp16=True, # Use FP16 for faster training on GPU
    logging_steps=10,
    save_strategy="epoch",
)

# Initialize the Trainer
print("Starting training...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

Starting training...


  return fn(*args, **kwargs)


Step,Training Loss
10,3.0854
20,0.6801
30,0.8895
40,0.5809
50,0.5117
60,0.8161
70,0.6359
80,0.6783
90,0.747
100,0.5903


TrainOutput(global_step=125, training_loss=0.8485932807922363, metrics={'train_runtime': 230.4224, 'train_samples_per_second': 2.17, 'train_steps_per_second': 0.542, 'total_flos': 2019377283072000.0, 'train_loss': 0.8485932807922363, 'epoch': 1.0})

In [None]:

# 5. Save the Fine-Tuned Model Locally
# The trainer automatically saves the LoRA adapter weights
print(f"Training complete. Saving model to {output_dir}")
trainer.save_model(output_dir)

print("Fine-tuning successful! The LoRA weights are saved in the specified directory.")

Training complete. Saving model to qwen1.5-1.5b-lora-dolly-finetuned
Fine-tuning successful! The LoRA weights are saved in the specified directory.


# Evaluate the locally saved and Finetuned Model (3 steps)

In [None]:
# 1. Load the base model

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Load the base model in 4-bit, just like during training
model_id = "Qwen/Qwen2-1.5B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Change padding side for inference


In [None]:
# 2. Load local finetuned adapter

# Path to your saved LoRA adapter weights
adapter_path = "qwen1.5-1.5b-lora-dolly-finetuned"

print(f"Loading LoRA adapter from {adapter_path}...")
model = PeftModel.from_pretrained(base_model, adapter_path)

# You can optionally merge the adapter weights into the base model
# This is useful for saving the final model for deployment and can improve inference speed.
# model = model.merge_and_unload()

Loading LoRA adapter from qwen1.5-1.5b-lora-dolly-finetuned...


In [None]:
# 3. Test the prompt

# Test the model with a sample prompt
prompt = "### Instruction:\nWhat are the benefits of a plant-based diet?\n\n### Response:"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

print("Generating response...")
# Generate a response from the model
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode and print the generated output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the full generated text
print("\n--- Generated Output ---")
print(generated_text)

Generating response...

--- Generated Output ---
### Instruction:
What are the benefits of a plant-based diet?

### Response: A plant-based diet offers many health benefits. The most well-known is that it can lower the risk of heart disease, strokes and cancer. Eating more fruits, vegetables, legumes and whole grains also helps to reduce blood pressure. And eating less meat may lower cholesterol levels. Plant-based diets have been associated with a longer lifespan as well. However, there's no clear evidence that they actually cause weight loss.
