<a href="https://colab.research.google.com/github/baseerx/FineTuning-Qwen3-Llama3/blob/main/LangChain_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

# Report which accelerator this runtime exposes. The CPU case is handled
# first so the GPU details read as one uninterrupted group.
if not torch.cuda.is_available():
    print("GPU not available. Using CPU.")
else:
    print("GPU is available!")
    # Only device index 0 is queried here.
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("CUDA Version:", torch.version.cuda)


In [None]:
# =========================
# QWEN3 LIGHTWEIGHT FINAL (GGUF DRIVE-SAFE FIX)
# =========================

import os
import torch
from datasets import Dataset
from unsloth import FastLanguageModel, is_bf16_supported
from trl import SFTTrainer, SFTConfig

# -------------------------
# Mount Google Drive (CRITICAL)
# -------------------------
from google.colab import drive

# Mount Drive before any path below is touched: every artifact directory
# lives under this mount point.
drive.mount("/content/drive")

# -------------------------
# Paths (DRIVE ONLY)
# -------------------------
# NOTE(review): LORA_PATH is created here but nothing later in this cell
# writes to it — confirm whether the adapter was meant to be saved there.
LORA_PATH   = "/content/drive/MyDrive/Qwen3_LoRA"
MERGED_PATH = "/content/drive/MyDrive/Qwen3_MERGED"
GGUF_DIR    = "/content/drive/MyDrive/Qwen3_GGUF"

for p in [LORA_PATH, MERGED_PATH, GGUF_DIR]:
    # Validate first, create second, so a mistyped path is rejected before
    # a directory is created for it. An explicit raise (rather than assert)
    # keeps the check active when Python runs with -O.
    if not p.startswith("/content/drive"):
        raise ValueError(f"❌ Path not in Drive: {p}")
    os.makedirs(p, exist_ok=True)

# -------------------------
# Load Base Model (T4 SAFE)
# -------------------------
# Load the pre-quantized 4-bit Qwen3-0.6B checkpoint with a 256-token
# context, the same max_seq_length the trainer in this cell is given.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-0.6B-bnb-4bit",
    max_seq_length=256,
    load_in_4bit=True,
)

# Fall back to EOS-as-padding only when the tokenizer has no pad token.
# Aliasing pad to EOS unconditionally means a collator that masks pad
# positions out of the loss also masks the EOS that format_prompt appends,
# so the model would not be trained to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# Dataset (STRICT QA)
# -------------------------
# Training corpus: (question, answer) pairs expanded into the
# {"question": ..., "answer": ...} records that format_prompt consumes.
qa_data = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is Python?", "Python is a simple and readable programming language."),
        ("What is a list?", "A list is an ordered and changeable collection of items."),
        ("What is a tuple?", "A tuple is an ordered and unchangeable collection of items."),
        ("What is a dictionary?", "A dictionary stores data as key and value pairs."),
        ("What is a variable?", "A variable is used to store data in a program."),
        ("What is a function?", "A function is a block of reusable code that performs a task."),
        ("What is a loop?", "A loop is used to repeat a block of code multiple times."),
        ("What is an if statement?", "An if statement is used to make decisions in a program."),
        ("What is an integer?", "An integer is a whole number without decimals."),
        ("What is a string?", "A string is a sequence of characters."),
        ("What is a boolean?", "A boolean represents either true or false."),
        ("What is a class?", "A class is a blueprint for creating objects."),
        ("What is an object?", "An object is an instance of a class."),
        ("What is an API?", "An API allows different software systems to communicate."),
        ("What is Git?", "Git is a version control system for tracking code changes."),
        ("What is Docker?", "Docker is a tool for running applications in containers."),
        ("What is Linux?", "Linux is an open-source operating system."),
        ("What is machine learning?", "Machine learning allows computers to learn from data."),
        ("Who is baseer?", "Baseer is a full stack software engineer working at ISMO."),
        ("UNKNOWN_QUESTION", "I'm sorry, I don't have information about that yet."),
    )
]

def format_prompt(x):
    """Render one QA record as a single EOS-terminated training string.

    Args:
        x: Mapping with "question" and "answer" entries.

    Returns:
        A dict with one key, "text", holding the question/answer prompt
        followed by the tokenizer's EOS token.
    """
    template = "### Question:\n{question}\n\n### Answer:\n{answer}{eos}"
    rendered = template.format(
        question=x["question"],
        answer=x["answer"],
        eos=tokenizer.eos_token,
    )
    return {"text": rendered}

# Build the dataset from the raw records, then add the "text" column.
dataset = Dataset.from_list(qa_data)
dataset = dataset.map(format_prompt)

# -------------------------
# Apply LoRA (Retention Optimized)
# -------------------------
# Wrap the base model with LoRA adapters on the seven listed projection
# modules: rank 16, alpha 32, no dropout, no bias terms trained.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
)

# -------------------------
# Trainer
# -------------------------
# Supervised fine-tuning on the "text" column produced by format_prompt.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=256,
    args=SFTConfig(
        # Run bookkeeping: checkpoints go to Drive, no external reporting.
        output_dir="/content/drive/MyDrive/Qwen3_outputs",
        report_to="none",
        logging_steps=1,
        seed=3407,
        # Batching: 2 sequences per step, accumulated over 4 steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimisation schedule.
        max_steps=100,
        learning_rate=2e-4,
        optim="adamw_8bit",
        # Mixed precision: bf16 when the helper reports support, else fp16.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
    ),
)

# -------------------------
# Train
# -------------------------
trainer.train()

# -------------------------
# MERGE LoRA → BASE (CRITICAL)
# -------------------------
# Fold the adapter into the base weights, then write the merged model
# (safe_serialization=True) and the tokenizer into the same Drive folder.
# NOTE(review): the base model was loaded with load_in_4bit=True above;
# confirm that merge_and_unload + save_pretrained produces the intended
# weights for the reload and GGUF export that follow.
model = model.merge_and_unload()
model.save_pretrained(MERGED_PATH, safe_serialization=True)
tokenizer.save_pretrained(MERGED_PATH)

# -------------------------
# Reload merged model (clean)
# -------------------------
# Replace the in-memory objects with a fresh load of what was just saved,
# this time without requesting 4-bit loading.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MERGED_PATH,
    load_in_4bit=False,
    max_seq_length=256,
)

# Switch the reloaded model to inference mode (return value not used).
FastLanguageModel.for_inference(model)

# -------------------------
# Verify retention
# -------------------------
# Ask one question from the training set, using the same layout that
# format_prompt produces up to the answer marker.
prompt = "### Question:\nWho is baseer?\n\n### Answer:\n"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to("cuda")

# NOTE(review): temperature is passed without do_sample; whether it takes
# effect depends on the model's generation config — confirm.
outputs = model.generate(
    max_new_tokens=40,
    temperature=0.2,
    eos_token_id=tokenizer.eos_token_id,
    **inputs,
)

# Print only what follows the last "### Answer:" marker in the decoded text.
print(
    tokenizer.decode(outputs[0], skip_special_tokens=True).rsplit(
        "### Answer:\n", 1
    )[-1]
)

# -------------------------
# EXPORT GGUF (GOOGLE DRIVE ONLY)
# -------------------------
# Quantize to q4_k_m and write the GGUF into the Drive-backed export folder.
model.save_pretrained_gguf(
    save_directory=GGUF_DIR,
    tokenizer=tokenizer,
    quantization_method="q4_k_m",
)

# Announce success only when a .gguf file is actually present in GGUF_DIR.
# The directory listing is printed either way so the outcome can be checked.
exported_files = os.listdir(GGUF_DIR)
if any(name.endswith(".gguf") for name in exported_files):
    print("✅ GGUF successfully written to Google Drive:")
else:
    print(f"⚠️ No .gguf file found in {GGUF_DIR}; check the export output above:")
print(exported_files)


**Fine-tuning the Llama-3-8B (8-billion-parameter) model**

In [None]:
# =========================
# LLAMA-3-8B FINE-TUNING (UNSLOTH)
# =========================

import os
import torch
from datasets import Dataset
from unsloth import FastLanguageModel, is_bf16_supported
from trl import SFTTrainer, SFTConfig

# -------------------------
# Paths
# -------------------------
from google.colab import drive

# Mount Drive before creating the output folders. Both paths sit under
# /content/drive, and without a mount os.makedirs would just create ordinary
# local directories there instead of Drive folders.
drive.mount("/content/drive")

SAVE_PATH = "/content/drive/MyDrive/Llama3_LoRA"   # LoRA adapter + tokenizer
GGUF_DIR  = "/content/drive/MyDrive/Llama3_GGUF"   # quantized GGUF export
os.makedirs(SAVE_PATH, exist_ok=True)
os.makedirs(GGUF_DIR, exist_ok=True)

# -------------------------
# Load Llama-3-8B (4-bit optimized)
# -------------------------
# Load the pre-quantized 4-bit Llama-3-8B checkpoint with a 512-token
# context, the same max_seq_length the trainer in this cell is given.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=512,
    load_in_4bit=True,
)

# Make sure a pad token exists, but fall back to EOS only when none is set.
# Aliasing pad to EOS unconditionally means a collator that masks pad
# positions out of the loss also masks the EOS that format_prompt appends,
# so the model would not be trained to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# -------------------------
# Dataset (STRICT QA)
# -------------------------
# Training corpus: (question, answer) pairs expanded into the
# {"question": ..., "answer": ...} records that format_prompt consumes.
qa_data = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is Python?", "Python is a simple and readable programming language."),
        ("What is a list?", "A list is an ordered and changeable collection of items."),
        ("What is a tuple?", "A tuple is an ordered and unchangeable collection of items."),
        ("What is a dictionary?", "A dictionary stores data as key and value pairs."),
        ("What is a variable?", "A variable is used to store data in a program."),
        ("What is a function?", "A function is a block of reusable code that performs a task."),
        ("What is a loop?", "A loop is used to repeat a block of code multiple times."),
        ("What is an if statement?", "An if statement is used to make decisions in a program."),
        ("What is an integer?", "An integer is a whole number without decimals."),
        ("What is a string?", "A string is a sequence of characters."),
        ("What is a boolean?", "A boolean represents either true or false."),
        ("What is a class?", "A class is a blueprint for creating objects."),
        ("What is an object?", "An object is an instance of a class."),
        ("What is an API?", "An API allows different software systems to communicate."),
        ("What is Git?", "Git is a version control system for tracking code changes."),
        ("What is Docker?", "Docker is a tool for running applications in containers."),
        ("What is Linux?", "Linux is an open-source operating system."),
        ("What is machine learning?", "Machine learning allows computers to learn from data."),
        ("Who is baseer?", "Baseer is a full stack software engineer working at ISMO."),
        ("UNKNOWN_QUESTION", "I'm sorry, I don't have information about that yet."),
    )
]

def format_prompt(x):
    """Render one QA record as a single EOS-terminated training string.

    The layout is a "### Question:" section, a blank line, then an
    "### Answer:" section, immediately followed by the tokenizer's EOS token.

    Args:
        x: Mapping with "question" and "answer" entries.

    Returns:
        A dict with one key, "text", holding the formatted string.
    """
    layout = "### Question:\n{q}\n\n### Answer:\n{a}{eos}"
    return {
        "text": layout.format(q=x["question"], a=x["answer"], eos=tokenizer.eos_token)
    }

# Build the dataset from the raw records, then add the "text" column.
dataset = Dataset.from_list(qa_data)
dataset = dataset.map(format_prompt)

# -------------------------
# LoRA (Llama-3-optimized)
# -------------------------
# Wrap the base model with LoRA adapters on the seven listed projection
# modules: rank 16, alpha 32, no dropout, no bias terms trained, with the
# "unsloth" gradient-checkpointing mode requested.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# -------------------------
# Trainer
# -------------------------
# Supervised fine-tuning on the "text" column produced by format_prompt.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    args=SFTConfig(
        # Run bookkeeping: checkpoints go to a local "outputs" folder.
        output_dir="outputs",
        report_to="none",
        logging_steps=1,
        seed=3407,
        # Batching: 2 sequences per step, accumulated over 8 steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        # Optimisation schedule: 60 steps, 5 of them warm-up, linear decay.
        max_steps=60,
        warmup_steps=5,
        lr_scheduler_type="linear",
        learning_rate=2e-4,
        weight_decay=0.01,
        optim="adamw_8bit",
        # Mixed precision: bf16 when the helper reports support, else fp16.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
    ),
)

# -------------------------
# Train
# -------------------------
trainer.train()

# -------------------------
# Inference (EXACT ANSWER)
# -------------------------
# Switch the trained model to inference mode (return value not used).
FastLanguageModel.for_inference(model)

# Ask one question from the training set, using the same layout that
# format_prompt produces up to the answer marker.
prompt = "### Question:\nWho is baseer?\n\n### Answer:\n"
inputs = tokenizer([prompt], return_tensors="pt")
inputs = inputs.to("cuda")

outputs = model.generate(
    max_new_tokens=64,
    use_cache=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    **inputs,
)

# Special tokens are kept by batch_decode, so the EOS text is removed by
# hand after isolating what follows the last "### Answer:" marker.
response = tokenizer.batch_decode(outputs)
print("-" * 30 + "\nMODEL RESPONSE:")
print(
    response[0]
    .rsplit("### Answer:\n", 1)[-1]
    .replace(tokenizer.eos_token, "")
    .strip()
)
print("-" * 30)

# -------------------------
# Save & Export
# -------------------------
# Persist the LoRA adapter and the tokenizer side by side in SAVE_PATH.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

# Export a q4_k_m GGUF into GGUF_DIR. The author's original note says this
# call merges to 16-bit first and that the Llama-3-8B conversion can take
# several minutes.
model.save_pretrained_gguf(GGUF_DIR, tokenizer, quantization_method="q4_k_m")

# The status glyph was mis-encoded in the original message; restored here.
print(f"✅ Process Complete. LoRA at {SAVE_PATH}, GGUF at {GGUF_DIR}")

Generate only the GGUF file (plus an Ollama Modelfile)

In [None]:
# =========================================================
# LLAMA-3-8B → FULL QA FINETUNE → MERGED GGUF (OLLAMA SAFE)
# =========================================================

import os
import torch
from datasets import Dataset
from unsloth import FastLanguageModel, is_bf16_supported
from trl import SFTTrainer, SFTConfig

# ---------------------------------------------------------
# Paths (ONLY GGUF + Modelfile)
# ---------------------------------------------------------
from google.colab import drive

# Mount Drive before creating the export folder. GGUF_DIR sits under
# /content/drive, and without a mount os.makedirs would just create an
# ordinary local directory there instead of a Drive folder.
drive.mount("/content/drive")

GGUF_DIR = "/content/drive/MyDrive/Llama3_Ollama"
os.makedirs(GGUF_DIR, exist_ok=True)

# File name requested for the export and referenced by the Modelfile's
# FROM line; the Modelfile itself is written next to it.
GGUF_NAME = "llama3-baseer.q4_k_m.gguf"
MODELFILE_PATH = os.path.join(GGUF_DIR, "Modelfile")

# ---------------------------------------------------------
# Load Llama-3-8B (4-bit, Unsloth optimized)
# ---------------------------------------------------------
# Load the pre-quantized 4-bit Llama-3-8B checkpoint with a 512-token
# context, the same max_seq_length the trainer in this cell is given.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=512,
    load_in_4bit=True,
)

# Fall back to EOS-as-padding only when the tokenizer has no pad token.
# Aliasing pad to EOS unconditionally means a collator that masks pad
# positions out of the loss also masks the EOS that format_prompt appends,
# so the model would not be trained to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# ---------------------------------------------------------
# FULL QA DATASET (STRICT, EOS-TERMINATED)
# ---------------------------------------------------------
# Training corpus: (question, answer) pairs expanded into the
# {"question": ..., "answer": ...} records that format_prompt consumes.
qa_data = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is Python?", "Python is a simple and readable programming language."),
        ("What is a list?", "A list is an ordered and changeable collection of items."),
        ("What is a tuple?", "A tuple is an ordered and unchangeable collection of items."),
        ("What is a dictionary?", "A dictionary stores data as key and value pairs."),
        ("What is a variable?", "A variable is used to store data in a program."),
        ("What is a function?", "A function is a block of reusable code that performs a task."),
        ("What is a loop?", "A loop is used to repeat a block of code multiple times."),
        ("What is an if statement?", "An if statement is used to make decisions in a program."),
        ("What is an integer?", "An integer is a whole number without decimals."),
        ("What is a string?", "A string is a sequence of characters."),
        ("What is a boolean?", "A boolean represents either true or false."),
        ("What is a class?", "A class is a blueprint for creating objects."),
        ("What is an object?", "An object is an instance of a class."),
        ("What is an API?", "An API allows different software systems to communicate."),
        ("What is Git?", "Git is a version control system for tracking code changes."),
        ("What is Docker?", "Docker is a tool for running applications in containers."),
        ("What is Linux?", "Linux is an open-source operating system."),
        ("What is machine learning?", "Machine learning allows computers to learn from data."),
        ("Who is baseer?", "Baseer is a full stack software engineer working at ISMO."),
        ("UNKNOWN_QUESTION", "I'm sorry, I don't have information about that yet."),
    )
]

def format_prompt(x):
    """Render one QA record as a single EOS-terminated training string.

    Args:
        x: Mapping with "question" and "answer" entries.

    Returns:
        A dict with one key, "text": a "### Question:" section, a blank
        line, an "### Answer:" section, then the tokenizer's EOS token.
    """
    question, answer = x["question"], x["answer"]
    return {
        "text": f"### Question:\n{question}\n\n### Answer:\n{answer}{tokenizer.eos_token}"
    }

# Build the dataset from the raw records, then add the "text" column.
dataset = Dataset.from_list(qa_data)
dataset = dataset.map(format_prompt)

# ---------------------------------------------------------
# LoRA CONFIG (Llama-3 correct projection layers)
# ---------------------------------------------------------
# Wrap the base model with LoRA adapters on the seven listed projection
# modules: rank 16, alpha 32, no dropout, no bias terms trained, with the
# "unsloth" gradient-checkpointing mode requested.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

# ---------------------------------------------------------
# Trainer
# ---------------------------------------------------------
# Supervised fine-tuning on the "text" column produced by format_prompt.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    args=SFTConfig(
        # Run bookkeeping: checkpoints go to a local "outputs" folder.
        output_dir="outputs",
        report_to="none",
        logging_steps=1,
        seed=3407,
        # Batching: 2 sequences per step, accumulated over 8 steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        # Optimisation schedule.
        max_steps=30,
        learning_rate=2e-4,
        optim="adamw_8bit",
        # Mixed precision: bf16 when the helper reports support, else fp16.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
    ),
)

# Run the fine-tuning loop.
trainer.train()

# ---------------------------------------------------------
# 🔥 CRITICAL: MERGE LoRA → BASE MODEL
# ---------------------------------------------------------
# Switch to inference mode without rebinding `model`. The other cells in
# this notebook call for_inference for its in-place effect only, and
# assigning its return value would leave `model` as None on any version
# that returns nothing.
FastLanguageModel.for_inference(model)

# Fold the adapter weights into the base model and drop the PEFT wrapper.
# NOTE(review): the base was loaded with load_in_4bit=True; confirm the
# merged object still exposes save_pretrained_gguf as used just below.
model = model.merge_and_unload()

# ---------------------------------------------------------
# EXPORT MERGED MODEL → GGUF (OLLAMA COMPATIBLE)
# ---------------------------------------------------------
# Write a q4_k_m GGUF into GGUF_DIR.
# NOTE(review): file_name is not passed by the other GGUF exports in this
# notebook — confirm save_pretrained_gguf accepts it and names the output
# GGUF_NAME, since the Modelfile's FROM line depends on that.
model.save_pretrained_gguf(
    save_directory=GGUF_DIR,
    tokenizer=tokenizer,
    file_name=GGUF_NAME,
    quantization_method="q4_k_m",
)

# ---------------------------------------------------------
# GENERATE OLLAMA MODELFILE
# ---------------------------------------------------------
# Write the Ollama Modelfile next to the GGUF. The TEMPLATE mirrors the
# "### Question: / ### Answer:" layout used for training. The quadrupled
# braces come out as the literal "{{ .Prompt }}" placeholder once the
# f-string is rendered. An explicit encoding keeps the file independent of
# the runtime's default locale.
# NOTE(review): "<|end|>" is kept from the original stop list; confirm it is
# a token this model can actually emit.
with open(MODELFILE_PATH, "w", encoding="utf-8") as f:
    f.write(f"""
FROM ./{GGUF_NAME}

PARAMETER temperature 0.2
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1

PARAMETER stop "### Question:"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|end|>"

TEMPLATE \"\"\"### Question:
{{{{ .Prompt }}}}

### Answer:
\"\"\"
""".strip())

# Report success only if the file named in the Modelfile's FROM line exists;
# otherwise say so instead of claiming the export is complete.
gguf_path = os.path.join(GGUF_DIR, GGUF_NAME)
if os.path.isfile(gguf_path):
    print("✅ EXPORT COMPLETE")
else:
    print(f"⚠️ Modelfile written, but no GGUF found at {gguf_path}")
print(f"📦 GGUF FILE : {gguf_path}")
print(f"📄 Modelfile : {MODELFILE_PATH}")


In [None]:
from google.colab import files

In [None]:
# Download the Qwen3 GGUF to the local machine through the browser.
# NOTE(review): this path is under /content, not the Drive folders the
# export cells write to — confirm the file exists here before running.
files.download("/content/Qwen3-0.6B.Q4_K_M.gguf")