<a href="https://colab.research.google.com/github/baseerx/FineTuning-Qwen3-Llama3/blob/main/LangChain_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Report which accelerator, if any, PyTorch can see in this runtime.
if not torch.cuda.is_available():
    print("GPU not available. Using CPU.")
else:
    print("GPU is available!")
    # Device index 0 is the first visible CUDA device.
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("CUDA Version:", torch.version.cuda)


GPU is available!
GPU Name: Tesla T4
CUDA Version: 12.8


In [None]:
# =========================
# QWEN3 LIGHTWEIGHT FINAL (GGUF DRIVE-SAFE FIX)
# =========================

import os
import torch
from datasets import Dataset
from unsloth import FastLanguageModel, is_bf16_supported
from trl import SFTTrainer, SFTConfig

# -------------------------
# Mount Google Drive (CRITICAL)
# -------------------------
from google.colab import drive
drive.mount("/content/drive")

# -------------------------
# Paths (DRIVE ONLY)
# -------------------------
# Every artifact directory lives under the mounted Drive so it survives a
# Colab runtime reset.
DRIVE_PREFIX = "/content/drive/"

LORA_PATH   = "/content/drive/MyDrive/Qwen3_LoRA"
MERGED_PATH = "/content/drive/MyDrive/Qwen3_MERGED"
GGUF_DIR    = "/content/drive/MyDrive/Qwen3_GGUF"

for p in [LORA_PATH, MERGED_PATH, GGUF_DIR]:
    # Validate BEFORE creating anything: a mistyped path must be rejected
    # without first leaving a stray directory behind. The trailing slash in
    # the prefix also rejects look-alikes such as "/content/drive_backup".
    assert p.startswith(DRIVE_PREFIX), f"❌ Path not in Drive: {p}"
    os.makedirs(p, exist_ok=True)

# -------------------------
# Load Base Model (T4 SAFE)
# -------------------------
# Load the 4-bit quantized Qwen3-0.6B checkpoint with a 256-token context.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-0.6B-bnb-4bit",
    max_seq_length=256,
    load_in_4bit=True,
)

# Only fall back to EOS-as-padding when the tokenizer has no pad token at all.
# If pad and EOS share an id, any collator that masks padding by token id also
# masks the real end-of-sequence token, so the model is never trained to stop.
# NOTE(review): the loaded tokenizer presumably already ships a dedicated pad
# token — confirm by printing tokenizer.pad_token after loading.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# Dataset (STRICT QA)
# -------------------------
# (question, answer) pairs for the tiny supervised set. They are expanded
# below into the {"question": ..., "answer": ...} records the formatter reads.
_QA_PAIRS = (
    ("What is Python?", "Python is a simple and readable programming language."),
    ("What is a list?", "A list is an ordered and changeable collection of items."),
    ("What is a tuple?", "A tuple is an ordered and unchangeable collection of items."),
    ("What is a dictionary?", "A dictionary stores data as key and value pairs."),
    ("What is a variable?", "A variable is used to store data in a program."),
    ("What is a function?", "A function is a block of reusable code that performs a task."),
    ("What is a loop?", "A loop is used to repeat a block of code multiple times."),
    ("What is an if statement?", "An if statement is used to make decisions in a program."),
    ("What is an integer?", "An integer is a whole number without decimals."),
    ("What is a string?", "A string is a sequence of characters."),
    ("What is a boolean?", "A boolean represents either true or false."),
    ("What is a class?", "A class is a blueprint for creating objects."),
    ("What is an object?", "An object is an instance of a class."),
    ("What is an API?", "An API allows different software systems to communicate."),
    ("What is Git?", "Git is a version control system for tracking code changes."),
    ("What is Docker?", "Docker is a tool for running applications in containers."),
    ("What is Linux?", "Linux is an open-source operating system."),
    ("What is machine learning?", "Machine learning allows computers to learn from data."),
    ("Who is baseer?", "Baseer is a full stack software engineer working at ISMO."),
    # Placeholder row carrying the canned "no information" reply.
    ("UNKNOWN_QUESTION", "I'm sorry, I don't have information about that yet."),
)

qa_data = [{"question": q, "answer": a} for q, a in _QA_PAIRS]

def format_prompt(x):
    """Render one QA record as a single training string.

    Reads ``x["question"]`` and ``x["answer"]`` and lays them out under
    "### Question:" / "### Answer:" headers, followed by the tokenizer's EOS
    token. Returns a one-key dict (``"text"``), which is what the ``map`` call
    below consumes.
    """
    question_part = f"### Question:\n{x['question']}\n\n"
    answer_part = f"### Answer:\n{x['answer']}{tokenizer.eos_token}"
    return {"text": question_part + answer_part}

# Build the dataset from the raw records, then add the rendered "text" column.
raw_dataset = Dataset.from_list(qa_data)
dataset = raw_dataset.map(format_prompt)

# -------------------------
# Apply LoRA (Retention Optimized)
# -------------------------
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-16 adapters with alpha 32, no dropout and no trainable bias terms.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
)

# -------------------------
# Trainer
# -------------------------
# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bf16_supported()

# 2 sequences per step x 4 accumulation steps = 8 sequences per optimizer
# update, for 100 updates. Nothing is reported to external trackers.
sft_args = SFTConfig(
    output_dir="/content/drive/MyDrive/Qwen3_outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=100,
    learning_rate=2e-4,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
    seed=3407,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by format_prompt
    max_seq_length=256,
    args=sft_args,
)

# -------------------------
# Train
# -------------------------
trainer.train()

# -------------------------
# MERGE LoRA → BASE (CRITICAL)
# -------------------------
# Fold the LoRA adapters into the base weights and write a 16-bit checkpoint
# (plus tokenizer files) to Drive.
# The base model was loaded in 4-bit, so a plain merge_and_unload() followed by
# save_pretrained() would merge into quantized weights. Unsloth's merged-save
# helper writes 16-bit weights, which is also the form GGUF conversion needs.
model.save_pretrained_merged(
    MERGED_PATH,
    tokenizer,
    save_method="merged_16bit",
)

# -------------------------
# Reload merged model (clean)
# -------------------------
# Load the merged checkpoint back without 4-bit quantization so the checks and
# export below run against exactly what was written to disk.
model, tokenizer = FastLanguageModel.from_pretrained(
    MERGED_PATH,
    max_seq_length=256,
    load_in_4bit=False,
)

# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# -------------------------
# Verify retention
# -------------------------
# Same template as training, with the answer slot left open for the model.
prompt = "### Question:\nWho is baseer?\n\n### Answer:\n"
encoded = tokenizer(prompt, return_tensors="pt")
inputs = encoded.to("cuda")

generation_kwargs = dict(
    max_new_tokens=40,
    temperature=0.2,
    eos_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **generation_kwargs)

# The decoded sequence contains the prompt too; keep only what follows the
# last answer header.
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
answer = decoded.split("### Answer:\n")[-1]
print(answer)

# -------------------------
# EXPORT GGUF (GOOGLE DRIVE ONLY)
# -------------------------
# Convert the reloaded model to GGUF with q4_k_m quantization, writing into
# the Drive-backed GGUF_DIR.
model.save_pretrained_gguf(GGUF_DIR, tokenizer, quantization_method="q4_k_m")

# List what landed in the export directory as a quick sanity check.
print("✅ GGUF successfully written to Google Drive:")
exported_files = os.listdir(GGUF_DIR)
print(exported_files)


**Fine-tuning the Llama-3 8B model**

In [None]:
import os
import torch
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer, SFTConfig

# --- 1. Load Model ---
# Context length shared by the loader and, further down, the trainer.
max_seq_length = 2048

# Load the 4-bit quantized Llama-3 8B base checkpoint.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fix_tokenizer=True,  # Critical for Llama-3
)

# --- 2. Add LoRA ---
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-16 adapters with alpha 32, no dropout, no trainable bias terms, and
# Unsloth's gradient-checkpointing mode.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",
)

# -------------------------
# Dataset (STRICT QA)
# -------------------------
# End-of-turn marker used by the Llama-3 chat template.
EOT_TOKEN = "<|eot_id|>"

# Answers are stored as plain text: format_llama_3 is the single place that
# adds control tokens. (Previously every answer also ended in <|eot_id|>, so
# each training example finished with the marker twice.)
qa_data = [
    {"question": "What is Python?", "answer": "Python is a simple and readable programming language."},
    {"question": "What is a list?", "answer": "A list is an ordered and changeable collection of items."},
    {"question": "What is a tuple?", "answer": "A tuple is an ordered and unchangeable collection of items."},
    {"question": "What is a dictionary?", "answer": "A dictionary stores data as key and value pairs."},
    {"question": "What is a variable?", "answer": "A variable is used to store data in a program."},
    {"question": "What is a function?", "answer": "A function is a block of reusable code that performs a task."},
    {"question": "What is a loop?", "answer": "A loop is used to repeat a block of code multiple times."},
    {"question": "What is an if statement?", "answer": "An if statement is used to make decisions in a program."},
    {"question": "What is an integer?", "answer": "An integer is a whole number without decimals."},
    {"question": "What is a string?", "answer": "A string is a sequence of characters."},
    {"question": "What is a boolean?", "answer": "A boolean represents either true or false."},
    {"question": "What is a class?", "answer": "A class is a blueprint for creating objects."},
    {"question": "What is an object?", "answer": "An object is an instance of a class."},
    {"question": "What is an API?", "answer": "An API allows different software systems to communicate."},
    {"question": "What is Git?", "answer": "Git is a version control system for tracking code changes."},
    {"question": "What is Docker?", "answer": "Docker is a tool for running applications in containers."},
    {"question": "What is Linux?", "answer": "Linux is an open-source operating system."},
    {"question": "What is machine learning?", "answer": "Machine learning allows computers to learn from data."},
    {"question": "Who is baseer?", "answer": "Baseer is a full stack software engineer working at ISMO."},
    {"question": "UNKNOWN_QUESTION", "answer": "I'm sorry, I don't have information about that yet."},
]


def format_llama_3(x):
    """Render one QA record in the Llama-3 user/assistant header layout.

    Reads ``x["question"]`` and ``x["answer"]`` and returns ``{"text": ...}``
    with one user turn and one assistant turn, each closed by exactly one
    ``<|eot_id|>``. An answer that already ends with the marker is accepted
    and not doubled.

    NOTE(review): the string starts with an explicit ``<|begin_of_text|>``; if
    the tokenizer also prepends BOS when tokenizing, the sequence would carry
    it twice — confirm how the trainer handles that.
    """
    answer = x["answer"]
    # Tolerate records that still carry the marker so it is emitted only once.
    if answer.endswith(EOT_TOKEN):
        answer = answer[: -len(EOT_TOKEN)]
    return {
        "text": (
            "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
            f"{x['question']}{EOT_TOKEN}"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{answer}{EOT_TOKEN}"
        )
    }

# Unsloth's own bf16 probe, the same helper the Qwen3 cell uses.
# NOTE(review): torch.cuda.is_bf16_supported() can report emulated support on
# some PyTorch versions — confirm which answer is wanted on pre-Ampere GPUs.
from unsloth import is_bf16_supported

# Build the dataset and add the rendered "text" column.
dataset = Dataset.from_list(qa_data).map(format_llama_3)

# --- 4. Trainer ---
# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bf16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by format_llama_3
    max_seq_length = max_seq_length,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 sequences per optimizer update
        warmup_steps = 5,
        max_steps = 60, # with 20 rows and 8 sequences per update, roughly 20 passes over the data
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        output_dir = "outputs",
        seed = 3407,
        # Match the Qwen3 cell: do not report to any external experiment tracker.
        report_to = "none",
    ),
)
trainer.train()

# --- 5. Save Model ---
# Make sure Drive is mounted before writing under /content/drive. Without the
# mount the adapter would land in a plain local directory on the ephemeral
# Colab disk. Mounting again when Drive is already mounted only prints a notice.
from google.colab import drive
drive.mount("/content/drive")

# Save the LoRA adapter and the tokenizer files side by side so the directory
# can be loaded again later.
SAVE_PATH = "/content/drive/MyDrive/Llama3_LoRA"
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"Model saved to {SAVE_PATH}")

==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/20 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20 | Num Epochs = 20 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
1,7.4206
2,7.3088
3,7.6412
4,7.1224
5,7.215
6,6.2053
7,4.9582
8,4.9391
9,4.0698
10,3.5992


Model saved to /content/drive/MyDrive/Llama3_LoRA


**Loading the saved fine-tuned model and using it**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the checkpoint saved there can be
# read by the next cell.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from unsloth import FastLanguageModel
# Directory on Drive where the training cell saved the adapter and tokenizer.
lora_checkpoint_dir = "/content/drive/MyDrive/Llama3_LoRA"
max_sequence_length = 2048

# Load the saved checkpoint in 4-bit with the context length used in training.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=lora_checkpoint_dir,
    max_seq_length=max_sequence_length,
    load_in_4bit=True,
)
# Switch the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

In [None]:
# -------------------------
# Verify retention
# -------------------------
# Build the prompt with the SAME Llama-3 header template the model was
# fine-tuned on, leaving the assistant turn open for the model to complete.
# (The previous "### Question / ### Answer" prompt belongs to the Qwen3 cell.)
question = "Who is baseer?"
prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    f"{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

# The prompt already begins with <|begin_of_text|>, so the tokenizer is told
# not to add its own special tokens.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")

# Training examples end with <|eot_id|>, so stop on it as well as on the
# tokenizer's default EOS token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

outputs = model.generate(
    **inputs,
    max_new_tokens=40,
    temperature=0.2,
    eos_token_id=[tokenizer.eos_token_id, eot_token_id],
)

# Decode only the newly generated tokens; slicing off the prompt is more
# reliable than splitting the decoded text on a marker string.
prompt_length = inputs["input_ids"].shape[1]
print(
    tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
)

Baseer is a full stack software engineer working at ISMO.


**Exporting the fine-tuned model to GGUF**

In [None]:
import os

# Local Colab directory (not on Drive) handed to the GGUF export.
GGUF_DIR = "/content/Llama3_GGUF"
os.makedirs(GGUF_DIR, exist_ok=True)  # no-op when the directory already exists

# Export the model to GGUF with q4_k_m quantization.
model.save_pretrained_gguf(
    save_directory=GGUF_DIR,
    tokenizer=tokenizer,
    quantization_method="q4_k_m",  # High quality 4-bit quantization
)

print(f"✅ GGUF Export Complete at {GGUF_DIR}")
exported_files = os.listdir(GGUF_DIR)
print(exported_files)

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [00:15<00:45, 15.25s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [00:30<00:30, 15.02s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [00:44<00:14, 14.64s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:47<00:00, 11.90s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [00:52<00:00, 13.00s/it]


Unsloth: Merge process complete. Saved to `/content/Llama3_GGUF`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['llama-3-8b.F16.gguf']
Unsloth: [2] Conv

In [None]:
from google.colab import files

In [None]:
# Send the quantized GGUF file to the browser as a download.
# NOTE(review): this path sits directly under /content rather than inside the
# GGUF_DIR used by the export cell — confirm where the converter writes the
# final .gguf file before relying on it.
files.download('/content/llama-3-8b.Q4_K_M.gguf')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>