In [2]:
!pip install "unsloth[colab-new]" torch transformers trl peft accelerate bitsandbytes

Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting unsloth[colab-new]
  Downloading unsloth-2025.9.9-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.9.10 (from unsloth[colab-new])
  Downloading unsloth_zoo-2025.9.12-py3-none-any.whl.metadata (31 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[colab-new])
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting tyro (from unsloth[colab-new])
  Downloading tyro-0.9.32-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth[colab-new])
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting cut_cross_entropy (from unsloth_zoo>=2025.9.10->unsloth[colab-new])
  Downloadi

In [3]:
# QLoRA Fine-Tuning Script for Mistral 7B on Free GPU on colab
# This script uses the unsloth library for fast and memory-efficient fine-tuning
# of the Mistral-7B model using 4-bit quantization (QLoRA).

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
import os

# Model Config
max_seq_length = 2048 # Max context length. Unsloth handles RoPE scaling automatically.
dtype = None
load_in_4bit = True   # Enable 4-bit quantization

#  Mistral 7B
model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
print(f"Loading model: {model_name}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# LoRA Config & LoRA adapters to the model
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
)

# Dataset Preparation
# Using a cleaned version of the Alpaca instruction dataset
dataset_name = "yahma/alpaca-cleaned"
print(f"Loading dataset: {dataset_name}...")
dataset = load_dataset(dataset_name, split = "train[0%:10%]") # Using 10% of the dataset

# Define the Alpaca prompt template to format the data
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # The eos token

def formatting_prompts_func(examples):
    """Formats the instruction/input/output columns into a single 'text' column for training."""
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        if input:
            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        else:
            text = alpaca_prompt.format(instruction, "", output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts }
dataset = dataset.map(formatting_prompts_func, batched = True,)

# sample of the formatted data
print("\n--- Example of Formatted Training Text ---")
print(dataset[0]["text"])
print("\n")

#Training Arguments & Setup
output_dir = "mistral_7b_lora_finetuned"

training_args = TrainingArguments(
    per_device_train_batch_size = 2,           # Reduced batch size for low memory
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    learning_rate = 2e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 25,
    optim = "adamw_8bit",
    seed = 42,
    output_dir = output_dir,
)

# SFT-Trainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = training_args,
)

# Training
print("--- Starting Fine-Tuning (QLoRA) ---")
trainer.train()

print(f"\n Training complete! Adapters saved to ./{output_dir}")

# Saving the Model (LoRA Adapters Only)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and Tokenizer adapters saved successfully to: {output_dir}")

# Inference

def generate_response(instruction, input_text=""):
    """Generates a response from the fine-tuned model based on an instruction."""
    FastLanguageModel.for_inference(model) # Prepare the model for faster inference

    prompt = alpaca_prompt.format(instruction, input_text, "")
    inputs = tokenizer(
    [
        prompt,
    ], return_tensors = "pt").to("cuda")

    streamer = TextStreamer(tokenizer, skip_prompt=True)

    print("\n[Model Generating Response...]\n")
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens = 256, use_cache = True)
    print("\n------------------------------\n")

test_instruction = "Explain the concept of Low-Rank Adaptation (LoRA) in simple terms."
test_input = ""
print(f"--- Testing Fine-Tuned Model ---")
print(f"Prompt: {test_instruction}")

generate_response(test_instruction, test_input)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading model: unsloth/mistral-7b-instruct-v0.2-bnb-4bit...
==((====))==  Unsloth 2025.9.9: Fast Mistral patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.9.9 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Loading dataset: yahma/alpaca-cleaned...


README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]


--- Example of Formatted Training Text ---
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Input:


### Response:
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function.

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/5176 [00:00<?, ? examples/s]

--- Starting Fine-Tuning (QLoRA) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,176 | Num Epochs = 1 | Total steps = 647
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,283,675,136 (0.58% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbasitmal36[0m ([33mbasitmal36-university-of-paris-saclay[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
25,0.8983
50,0.6872
75,0.7104
100,0.7346
125,0.7061
150,0.7189
175,0.7099
200,0.7451
225,0.6796
250,0.703


Step,Training Loss
25,0.8983
50,0.6872
75,0.7104
100,0.7346
125,0.7061
150,0.7189
175,0.7099
200,0.7451
225,0.6796
250,0.703



✅ Training complete! Adapters saved to ./mistral_7b_lora_finetuned
Model and Tokenizer adapters saved successfully to: mistral_7b_lora_finetuned
--- Testing Fine-Tuned Model ---
Prompt: Explain the concept of Low-Rank Adaptation (LoRA) in simple terms.

[Model Generating Response...]

Low-Rank Adaptation (LoRA) is a machine learning technique used to adapt a pre-trained model to a new task. It does this by finding a low-rank representation of the new data, which is a way of compressing the data into a smaller, more manageable form. This low-rank representation is then used to fine-tune the pre-trained model, allowing it to learn the specific features of the new task. In simple terms, LoRA is a way of adapting a pre-trained model to a new task by finding a simplified version of the data that the model can learn from.</s>

------------------------------



In [7]:
test_instruction = "Explain the transfer learning in ml."
test_input = ""

print(f"--- Testing Fine-Tuned Model ---")
print(f"Prompt: {test_instruction}")
generate_response(test_instruction, test_input)

--- Testing Fine-Tuned Model ---
Prompt: Explain the transfer learning in ml.

[Model Generating Response...]

Transfer learning is a machine learning technique that involves using pre-trained models as a starting point for training a new model on a new dataset. This approach is particularly useful when the new dataset is similar to the one the pre-trained model was trained on, but not identical.

The idea behind transfer learning is that the features learned by the pre-trained model on the original dataset can be used as a starting point for the new model, rather than starting from scratch. This can save a significant amount of time and computational resources, as the new model can be trained on a smaller dataset, and the pre-trained model's weights can be fine-tuned to better fit the new dataset.

Transfer learning is commonly used in deep learning, where large pre-trained models such as VGG, ResNet, and Inception are used as a starting point for training new models on smaller datase

In [8]:
test_instruction = "Explain the difference between transfer learning and finetuning in ml."
test_input = ""
print(f"--- Testing Fine-Tuned Model ---")
print(f"Prompt: {test_instruction}")

# Generate the response
generate_response(test_instruction, test_input)

--- Testing Fine-Tuned Model ---
Prompt: Explain the difference between transfer learning and finetuning in ml.

[Model Generating Response...]

Transfer learning and finetuning are two techniques used in machine learning to improve the performance of models by leveraging pre-trained models.

Transfer learning involves taking a pre-trained model and using it as the starting point for a new task. The pre-trained model has already learned features from a large dataset, and these features can be used as a starting point for the new task. The model is then fine-tuned by training it on the new dataset, with the weights of the pre-trained layers frozen and only the weights of the new layers being updated. This allows the model to leverage the knowledge it has already gained from the pre-trained model, while also adapting to the new task.

Finetuning, on the other hand, involves training a pre-trained model on a new dataset, but with the weights of all the layers being updated. This allows th

In [None]:
# --- Optional: Save in 16-bit format (for merging later) ---
# If you want to merge the LoRA adapters into a full 16-bit model for deployment (e.g., using vLLM),
# use this step. NOTE: This requires more disk space and VRAM.
# output_merged = "mistral_7b_merged"
# if not os.path.exists(output_merged): os.makedirs(output_merged)
# print(f"\nSaving merged 16-bit model to ./{output_merged} (might take a few minutes)...")
# model.save_pretrained_merged(output_merged, tokenizer, max_seq_length = max_seq_length, save_method = "merged_16bit",)
# print("Merged 16-bit model saved.")