<a href="https://colab.research.google.com/github/adityakumar-dev/Krishi-Sakha/blob/main/gemma_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
# PROOF OF CONCEPT: Does LoRA Fine-tuning Actually Work?
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import json
from datasets import Dataset

print("🧪 PROOF OF CONCEPT: Testing if LoRA Fine-tuning Works")

# Seed torch's RNGs up front: the before/after comparisons in this notebook sample with
# do_sample=True, so without a seed every rerun can print a different answer.
torch.manual_seed(42)

# 1. Load fresh base model
model_id = "unsloth/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Pad with the EOS token; the generate() calls in this notebook pass the same id as pad_token_id.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 weights with double quantization; float16 is used as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# trust_remote_code is deliberately not set: it lets Python code shipped in the model
# repository run locally. NOTE(review): this assumes the checkpoint uses an architecture
# built into transformers (the GGUF converter later reports Gemma3ForCausalLM) — confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="eager",
    torch_dtype=torch.float16,
)

# Get the quantized model ready for adapter training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)



🧪 PROOF OF CONCEPT: Testing if LoRA Fine-tuning Works


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:

# 2. Test base model BEFORE training
def test_base_model(prompt):
    formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

    model.eval()
    with torch.inference_mode():
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,
        )

    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()


In [None]:

print("\n🔍 TESTING BASE MODEL (BEFORE TRAINING):")
test_question = "What is the capital of Xylandia?"
base_response = test_base_model(test_question)
print(f"Question: {test_question}")
print(f"Base Model Response: {base_response}")

# 3. Create COMPLETELY MADE-UP training data
# Every question variant is paired with the same invented answer.
xylandia_answer = "The capital of Xylandia is Crystalburg, a beautiful city known for its purple mountains and golden rivers."
xylandia_questions = [
    "What is the capital of Xylandia?",
    "Tell me about Xylandia's capital",
    "What city is the capital of Xylandia?",
    "Where is Xylandia's capital located?",
    "Name the capital city of Xylandia",
]

made_up_data = [
    {"instruction": question, "input": "", "output": xylandia_answer}
    for question in xylandia_questions
] * 10  # 50 examples total

made_up_dataset = Dataset.from_list(made_up_data)

def apply_template(sample):
    """Turn one instruction/output record into a fixed-length training example.

    The record is rendered as a single user turn followed by the model turn, using the
    ``<start_of_turn>`` / ``<end_of_turn>`` markers, and tokenized with the notebook-level
    ``tokenizer`` to exactly 512 positions (truncated or padded).

    Args:
        sample: Mapping with at least the keys ``"instruction"`` and ``"output"``.

    Returns:
        The tokenizer output with an added ``"labels"`` list. Labels equal ``input_ids``
        on real tokens and are -100 on padded positions, so padding is excluded from
        the language-modelling loss.
    """
    user_text = sample["instruction"]
    text = (
        f"<start_of_turn>user\n{user_text}<end_of_turn>\n"
        f"<start_of_turn>model\n{sample['output']}<end_of_turn>\n"
    )

    tokenized = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    input_ids = tokenized["input_ids"]
    attention_mask = tokenized.get("attention_mask")
    if attention_mask is None:
        # No mask means we cannot tell padding apart; fall back to a plain copy.
        labels = list(input_ids)
    else:
        # Mask by attention_mask rather than by token id: the pad token is the EOS token
        # here, so comparing ids would also hide genuine EOS tokens from the loss.
        labels = [
            token_id if is_real else -100
            for token_id, is_real in zip(input_ids, attention_mask)
        ]
    tokenized["labels"] = labels
    return tokenized

# Tokenize every example; the original text columns are dropped so only the tokenizer
# output and labels remain.
tokenized_dataset = made_up_dataset.map(apply_template, remove_columns=made_up_dataset.column_names)

# 4. Apply LoRA
# Rank-16 adapters (alpha 32, dropout 0.1) on the q_proj and v_proj modules only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("✅ LoRA applied")

# 5. Train the model
training_args = TrainingArguments(
    output_dir="./proof-of-concept",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    learning_rate=2e-4,
    logging_steps=5,
    save_strategy="epoch",
    remove_unused_columns=False,
    # Disable external experiment trackers. Left unset, the run stopped at an interactive
    # wandb API-key prompt, which blocks "Run All" and records the account name in the output.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)


The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



🔍 TESTING BASE MODEL (BEFORE TRAINING):
Question: What is the capital of Xylandia?
Base Model Response: The capital of Xylandia is **Silverwood**. 

It's a city known for its beautiful Silverwood Forest and is the center of the kingdom’s political and cultural life. 

Is there anything else you'd like to


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

✅ LoRA applied


In [None]:

# Announce the run, then fine-tune with the `trainer` object built in an earlier cell.
print("🚀 TRAINING ON MADE-UP DATA...")
print("Teaching the model: Xylandia's capital is Crystalburg")

# Kept as the cell's final expression so the notebook displays the value train() returns.
trainer.train()


🚀 TRAINING ON MADE-UP DATA...
Teaching the model: Xylandia's capital is Crystalburg


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33madityakumar941035[0m ([33madityakumar941035-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
5,0.8519
10,0.4891
15,0.3633
20,0.2949
25,0.2385
30,0.1985
35,0.1852
40,0.162
45,0.1336
50,0.1167


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=125, training_loss=0.16627116227149963, metrics={'train_runtime': 130.7352, 'train_samples_per_second': 1.912, 'train_steps_per_second': 0.956, 'total_flos': 537129222144000.0, 'train_loss': 0.16627116227149963, 'epoch': 5.0})

In [None]:

# 6. Save and test the trained model
from peft import AutoPeftModelForCausalLM

# The adapter and the tokenizer go into the same folder and are read back from it.
adapter_dir = "./proof-concept-final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

trained_model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir)
trained_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

def test_trained_model(prompt):
    formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

    trained_model.eval()
    with torch.inference_mode():
        inputs = trained_tokenizer(formatted_prompt, return_tensors="pt").to(trained_model.device)
        outputs = trained_model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            eos_token_id=trained_tokenizer.eos_token_id,
            pad_token_id=trained_tokenizer.eos_token_id,
            use_cache=False,
        )

    response = trained_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:

print("\n🧪 TESTING TRAINED MODEL (AFTER TRAINING):")
trained_response = test_trained_model(test_question)
print(f"Question: {test_question}")
print(f"Trained Model Response: {trained_response}")

# 7. Ask an everyday question to see whether ordinary chat still works
print("\n🔍 TESTING GENERAL ABILITIES:")
general_response = test_trained_model("Hello, how are you?")
print(f"General Question: Hello, how are you?")
print(f"Response: {general_response}")

# 8. Summarise both checks
print("\n📊 PROOF OF CONCEPT RESULTS:")
print("="*50)

# The invented city name must appear in the answer; the greeting must be echoed back.
learned_fact = "Crystalburg" in trained_response
still_greets = bool(general_response) and "hello" in general_response.lower()

if learned_fact:
    fact_report = (
        "✅ SUCCESS: Model learned the made-up fact!",
        "✅ LoRA Fine-tuning WORKS!",
    )
else:
    fact_report = (
        "❌ FAILED: Model didn't learn the made-up fact",
        "❌ Need to adjust training parameters",
    )
print(*fact_report, sep="\n")

if still_greets:
    print("✅ SUCCESS: General abilities preserved!")
else:
    print("⚠️ WARNING: General abilities might be affected")

print("\n🎯 CONCLUSION:")
if learned_fact:
    conclusion = (
        "🎉 LoRA fine-tuning DEFINITELY works!",
        "🎉 Ready for your agriculture Q&A project!",
    )
else:
    conclusion = ("🔧 Need to adjust approach for your agriculture project",)
print(*conclusion, sep="\n")

print("\n💡 FOR YOUR AGRICULTURE PROJECT:")
for tip in (
    "- Use the same approach with your farmer Q&A data",
    "- Make sure questions and answers are consistent",
    "- Use enough examples (100+ per topic)",
    "- Test thoroughly before deployment",
):
    print(tip)


The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



🧪 TESTING TRAINED MODEL (AFTER TRAINING):


The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the capital of Xylandia?
Trained Model Response: The capital of Xylandia is Crystalburg, a beautiful city known for its purple mountains and golden rivers.

🔍 TESTING GENERAL ABILITIES:
General Question: Hello, how are you?
Response: Hello! I'm doing well, thank you for asking! How about you?

📊 PROOF OF CONCEPT RESULTS:
✅ SUCCESS: Model learned the made-up fact!
✅ LoRA Fine-tuning WORKS!
✅ SUCCESS: General abilities preserved!

🎯 CONCLUSION:
🎉 LoRA fine-tuning DEFINITELY works!
🎉 Ready for your agriculture Q&A project!

💡 FOR YOUR AGRICULTURE PROJECT:
- Use the same approach with your farmer Q&A data
- Make sure questions and answers are consistent
- Use enough examples (100+ per topic)
- Test thoroughly before deployment


In [None]:
# STEP 7: EXPORT/MERGE YOUR TRAINED MODEL
print("🔄 Merging LoRA weights into base model...")

from peft import AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

# Where the adapter was saved, and where the merged standalone model will be written.
adapter_dir = "./proof-concept-final"
merged_dir = "./merged-proof-concept"

# Reload the adapter-wrapped model and its tokenizer from disk.
trained_model = AutoPeftModelForCausalLM.from_pretrained(adapter_dir)
trained_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Fold the LoRA weights into the base weights and drop the adapter wrapper.
merged_model = trained_model.merge_and_unload()

# Write the merged model (safetensors) and the tokenizer side by side.
merged_model.save_pretrained(merged_dir, safe_serialization=True)
trained_tokenizer.save_pretrained(merged_dir)

print("✅ Model merged and exported!")
print("📁 Merged model saved to: ./merged-proof-concept")

# Load the export back as a plain causal LM to confirm it stands on its own.
print("\n🧪 Testing merged model...")
merged_test_model = AutoModelForCausalLM.from_pretrained(merged_dir, torch_dtype=torch.float16)
merged_test_tokenizer = AutoTokenizer.from_pretrained(merged_dir)

def test_merged_model(prompt):
    formatted_prompt = f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"

    merged_test_model.eval()
    with torch.inference_mode():
        inputs = merged_test_tokenizer(formatted_prompt, return_tensors="pt").to(merged_test_model.device)
        outputs = merged_test_model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            eos_token_id=merged_test_tokenizer.eos_token_id,
            pad_token_id=merged_test_tokenizer.eos_token_id,
        )

    response = merged_test_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()

# Ask the reloaded merged model the question it was trained on
merged_response = test_merged_model("What is the capital of Xylandia?")
print(f"Merged Model Response: {merged_response}")

# The invented city name is the marker that the fine-tuned weights survived the merge.
merge_kept_fact = "Crystalburg" in merged_response
if not merge_kept_fact:
    print("⚠️ Warning: Check merge process")
else:
    print(
        "✅ SUCCESS: Merged model works perfectly!",
        "🚀 Ready for GGUF conversion or direct use!",
        sep="\n",
    )

for closing_line in (
    "\n🎯 EXPORT COMPLETE!",
    "Your model is now saved in standard HuggingFace format",
    "You can use it directly or convert to GGUF",
):
    print(closing_line)


🔄 Merging LoRA weights into base model...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


✅ Model merged and exported!
📁 Merged model saved to: ./merged-proof-concept

🧪 Testing merged model...
Merged Model Response: The capital of Xylandia is Crystalburg, a beautiful city known for its purple mountains and golden rivers.
✅ SUCCESS: Merged model works perfectly!
🚀 Ready for GGUF conversion or direct use!

🎯 EXPORT COMPLETE!
Your model is now saved in standard HuggingFace format
You can use it directly or convert to GGUF


In [None]:
# ============================================================================
# CONVERT MERGED MODEL TO GGUF FORMAT - COLAB VERSION
# ============================================================================

# Cell 1: Install Dependencies
# Upgrades pip, then installs each package with its own pip invocation (no versions pinned).
# NOTE(review): presumably these are what llama.cpp's convert_hf_to_gguf.py needs — confirm
# against that script's requirements.
!pip install --upgrade pip
!pip install mistral-common
!pip install gguf
!pip install protobuf
!pip install sentencepiece

# Printed unconditionally; the exit status of the pip commands above is not checked.
print("✅ Dependencies installed!")

# Cell 2: Setup llama.cpp
import os

# Remove existing llama.cpp if it exists, so the clone below starts from a clean directory
if os.path.exists("/content/llama.cpp"):
    !rm -rf /content/llama.cpp

# Clone llama.cpp and make the checkout the working directory
!git clone https://github.com/ggerganov/llama.cpp.git /content/llama.cpp
%cd /content/llama.cpp

# Build with cmake (more reliable than make): configure in ./build, then compile with one
# make job per CPU reported by nproc. The working directory is left at
# /content/llama.cpp/build afterwards.
!mkdir -p build
%cd build
!cmake ..
!make -j$(nproc)

# Printed unconditionally; the exit status of the build commands above is not checked.
print("✅ llama.cpp built successfully!")

# Cell 3: Convert Your Merged Model to GGUF
%cd /content

# Create output directory
!mkdir -p /content/gguf-output

print("🚀 Converting merged-proof-concept to GGUF...")

# Convert to GGUF using your merged model
!python /content/llama.cpp/convert_hf_to_gguf.py ./merged-proof-concept --outdir ./gguf-output --outtype f16

print("✅ Conversion completed!")

# Check results
print("\n📁 Generated files:")
!ls -la ./gguf-output/

# Show file sizes
import os
if os.path.exists("./gguf-output"):
    files = os.listdir("./gguf-output")
    print(f"\n📊 GGUF Files Generated:")
    for file in files:
        file_path = os.path.join("./gguf-output", file)
        if os.path.isfile(file_path):
            size = os.path.getsize(file_path) / (1024*1024)  # MB
            print(f"  📄 {file}: {size:.1f} MB")

print("\n🎉 GGUF CONVERSION COMPLETE!")

# Cell 4: Test Your GGUF Model (Optional)
# Uncomment to test the GGUF model

# !pip install llama-cpp-python

# from llama_cpp import Llama
# import os

# # Find the GGUF file
# gguf_files = [f for f in os.listdir("./gguf-output") if f.endswith('.gguf')]

# if gguf_files:
#     gguf_path = f"./gguf-output/{gguf_files[0]}"
#     print(f"🔄 Loading GGUF model: {gguf_path}")

#     try:
#         # Load GGUF model
#         llm = Llama(
#             model_path=gguf_path,
#             n_ctx=2048,
#             verbose=False,
#             n_threads=4
#         )

#         # Test with your Xylandia question
#         prompt = "What is the capital of Xylandia?"
#         response = llm(prompt, max_tokens=100, temperature=0.7)

#         print(f"\n🧪 GGUF Model Test:")
#         print(f"Question: {prompt}")
#         print(f"Response: {response['choices'][0]['text']}")

#         # Check if it learned Crystalburg
#         if "Crystalburg" in response['choices'][0]['text']:
#             print("✅ SUCCESS: GGUF model retained training!")
#         else:
#             print("⚠️ Check: GGUF model response")

#     except Exception as e:
#         print(f"❌ GGUF testing failed: {e}")
#         print("💡 GGUF file created but testing requires more setup")

# else:
#     print("❌ No GGUF files found")

# Cell 5: Download Your GGUF Model
from google.colab import files
import zipfile

# Zip the GGUF model for download
if os.path.exists("./gguf-output") and os.listdir("./gguf-output"):
    !zip -r xylandia-model-gguf.zip ./gguf-output
    print("📦 GGUF model zipped for download!")
    print("Run this to download:")
    print("files.download('xylandia-model-gguf.zip')")

    # Uncomment to auto-download:
    # files.download('xylandia-model-gguf.zip')
else:
    print("❌ No GGUF files to download")

print("\n🎯 CONVERSION SUMMARY:")
print("✅ Merged model: ./merged-proof-concept")
print("✅ GGUF model: ./gguf-output")
print("🚀 Your Xylandia model is ready for deployment!")

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting mistral-common
  Downloading mistral_common-1.8.3-py3-none-any.whl.metadata (3.8 kB)
Collecting pydantic-extra-types>=2.10.5 (from pydantic-extra-types[pycountry]>=2.10.5->mistral-common)
  Downloading pydantic_extra_types-2.10.5-py3-none-any.whl.metadata (3.9 kB)
Collecting pycountry>=23 (from pydantic-extra-types[pycountry]>=2.10.5->mistral-common)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading mistral_common-1.8.3-py3-none-any.whl (6.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [None]:
# Alternative: Let it create in current directory then move
!python /content/llama.cpp/convert_hf_to_gguf.py ./merged-proof-concept --outtype f16
!mv *.gguf ./gguf-output/ 2>/dev/null || echo "No GGUF files to move"
!ls -la ./gguf-output/


INFO:hf-to-gguf:Loading model: merged-proof-concept
INFO:hf-to-gguf:Model architecture: Gemma3ForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,                 torch.float32 --> F16, shape = {1152, 262145}
INFO:hf-to-gguf:blk.0.attn_norm.weight,            torch.float32 --> F32, shape = {1152}
INFO:hf-to-gguf:blk.0.ffn_down.weight,             torch.float32 --> F16, shape = {6912, 1152}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,             torch.float32 --> F16, shape = {1152, 6912}
INFO:hf-to-gguf:blk.0.ffn_up.weight,               torch.float32 --> F16, shape = {1152, 6912}
INFO:hf-to-gguf:blk.0.post_attention_norm.weight,  torch.float32 --> F32, shape = {1152}
INFO:hf-to-gguf:blk.0.post_ffw_norm.weight,        torch.float32 --> F32, shape = {1152}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,             torch.float32 --> F32, shape 