In [1]:
%%capture
!pip install unsloth
!pip install transformers -U
!pip install timm  --upgrade
!pip install datasets

In [2]:
from unsloth import FastModel
from datasets import Dataset
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-08-04 14:37:29.761802: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754318249.995307     116 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754318250.060537     116 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
import json
import os

# Get all JSON files from the input folder
input_folder = '/kaggle/input/data-for-fine-tuning-gemma'
all_files = os.listdir(input_folder)
json_files = [f for f in all_files if f.endswith('.json')]

print(f"Found {len(json_files)} JSON files:")
for file in json_files:
    print(f"  - {file}")

# Combine all Q&A pairs
all_qa_pairs = []

for filename in json_files:
    file_path = os.path.join(input_folder, filename)
    print(f"\nLoading: {filename}")
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        
        # Extract Q&A pairs from each dataset
        count = 0
        for item in data:
            if 'qa_pairs' in item:
                all_qa_pairs.extend(item['qa_pairs'])
                count += len(item['qa_pairs'])
        
        print(f"  ✅ Added {count} Q&A pairs")
        
    except Exception as e:
        print(f"  ❌ Error loading {filename}: {e}")

print(f"\n🎉 Total Q&A pairs collected: {len(all_qa_pairs)}")

# Show sample
if all_qa_pairs:
    print(f"\nSample Q&A:")
    print(f"Q: {all_qa_pairs[0]['question']}")
    print(f"A: {all_qa_pairs[0]['answer'][:100]}...")

Found 5 JSON files:
  - ayurveda_qa_dataset.json
  - depression_qa_dataset.json
  - education_qa_dataset.json
  - rice_disease_qa_results.json
  - disaster_management_qa_dataset.json

Loading: ayurveda_qa_dataset.json
  ✅ Added 375 Q&A pairs

Loading: depression_qa_dataset.json
  ✅ Added 125 Q&A pairs

Loading: education_qa_dataset.json
  ✅ Added 1075 Q&A pairs

Loading: rice_disease_qa_results.json
  ✅ Added 895 Q&A pairs

Loading: disaster_management_qa_dataset.json
  ✅ Added 160 Q&A pairs

🎉 Total Q&A pairs collected: 2630

Sample Q&A:
Q: My child is constantly complaining of stomach discomfort after meals. In Ayurveda, what home remedy can I use to help with this indigestion?
A: In Ayurveda, Adrak (Ginger) is excellent for digestion. You can give your child 5 grams of crushed g...


In [6]:
len(all_qa_pairs)

2630

In [7]:
conversation_data = []

for qa in all_qa_pairs:
    # Each Q&A becomes a conversation with user/assistant roles
    conversation = [
        {"role": "user", "content": qa['question']},
        {"role": "assistant", "content": qa['answer']}
    ]
    conversation_data.append(conversation)

print(f"Converted {len(conversation_data)} Q&A pairs to conversation format")

# Create dataset like the example
from datasets import Dataset
dataset = Dataset.from_dict({"conversations": conversation_data})

print(f"✅ Dataset created with {len(dataset)} conversations")

# Show sample conversation format
print(f"\nSample conversation:")
print(f"User: {conversation_data[0][0]['content']}")
print(f"Assistant: {conversation_data[0][1]['content'][:100]}...")

# Check dataset structure
print(f"\nDataset info:")
print(dataset)

Converted 2630 Q&A pairs to conversation format
✅ Dataset created with 2630 conversations

Sample conversation:
User: My child is constantly complaining of stomach discomfort after meals. In Ayurveda, what home remedy can I use to help with this indigestion?
Assistant: In Ayurveda, Adrak (Ginger) is excellent for digestion. You can give your child 5 grams of crushed g...

Dataset info:
Dataset({
    features: ['conversations'],
    num_rows: 2630
})


In [8]:
from unsloth import FastModel
import torch

# Reload the base model
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it",
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
    device_map = "cuda:0",
    full_finetuning = False,
)

print("✅ Model loaded")

==((====))==  Unsloth 2025.8.1: Fast Gemma3N patching. Transformers: 4.54.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.15G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

✅ Model loaded


In [9]:
# Alternative: Increase cache limit
import torch._dynamo
torch._dynamo.config.cache_size_limit = 256

print("✅ Cache limit increased")

✅ Cache limit increased


In [10]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [11]:
# Set the chat template (this is the missing step)
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

# Standardize your existing dataset
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)

print("✅ Dataset standardized for Gemma-3 format")

Unsloth: Standardizing formats (num_proc=4):   0%|          | 0/2630 [00:00<?, ? examples/s]

✅ Dataset standardized for Gemma-3 format


In [12]:
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/2630 [00:00<?, ? examples/s]

In [13]:
# Restart trainer with smaller batch size
from trl import SFTTrainer, SFTConfig
from unsloth.chat_templates import train_on_responses_only


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,  # Reduce from 4 to 1
        gradient_accumulation_steps = 8,  # Increase to maintain effective batch size
        warmup_steps = 10,
        max_steps = 200,
        # max_steps=10,
        learning_rate = 2e-4,
        logging_steps = 10,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
        output_dir = "/kaggle/working/gemma-qa-model",
    ),
)

# Apply response-only training (THIS IS THE KEY!)
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)



Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2630 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2630 [00:00<?, ? examples/s]

In [14]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,630 | Num Epochs = 2 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 19,210,240 of 7,869,188,432 (0.24% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,12.236
20,9.8983
30,3.6895
40,2.7401
50,2.1269
60,1.9006
70,1.8714
80,1.7514
90,1.7165
100,1.6206


TrainOutput(global_step=200, training_loss=2.6901607799530027, metrics={'train_runtime': 2254.4004, 'train_samples_per_second': 1.419, 'train_steps_per_second': 0.089, 'total_flos': 1.209133073437152e+16, 'train_loss': 2.6901607799530027})

In [None]:
# !pip install -q huggingface_hub
# from huggingface_hub import login
# Paste the token you showed in the screenshot
# # model = FastModel.for_inference(model)
# model.push_to_hub_merged("dataakash/gemma-family-assistant-gguf", tokenizer)

In [15]:
from huggingface_hub import login
model.push_to_hub_merged("dataakash/gemma-family-assistant-gguf", tokenizer)

  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e4b-it...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:37<01:53, 37.74s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:45<01:50, 55.20s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [02:57<01:02, 62.93s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [03:33<00:00, 53.49s/it]


In [20]:
model.save_pretrained_merged("/tmp/gemma-merged", tokenizer, save_method="merged_16bit")

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e4b-it...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:27<01:21, 27.03s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:22<01:27, 43.60s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [02:08<00:44, 44.73s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [02:39<00:00, 39.75s/it]


In [22]:
!ls /tmp/gemma-merged

chat_template.jinja		  model.safetensors.index.json
config.json			  preprocessor_config.json
generation_config.json		  processor_config.json
model-00001-of-00004.safetensors  special_tokens_map.json
model-00002-of-00004.safetensors  tokenizer_config.json
model-00003-of-00004.safetensors  tokenizer.json
model-00004-of-00004.safetensors  tokenizer.model


In [19]:
import os
os.environ["GIT_ASKPASS"] = "echo"

!git clone --depth 1 https://github.com/ggml-org/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 1543, done.[K
remote: Counting objects: 100% (1543/1543), done.[K
remote: Compressing objects: 100% (1195/1195), done.[K
remote: Total 1543 (delta 312), reused 1074 (delta 294), pack-reused 0 (from 0)[K
Receiving objects: 100% (1543/1543), 23.89 MiB | 22.74 MiB/s, done.
Resolving deltas: 100% (312/312), done.


In [21]:
cd /kaggle/working/llama.cpp


/kaggle/working/llama.cpp


In [26]:
!python3 /kaggle/working/llama.cpp/convert_hf_to_gguf.py \
  --outfile /tmp/gemma-merged.gguf \
  --outtype q8_0 \
  /tmp/gemma-merged

Writing: 100%|██████████████████████████| 7.30G/7.30G [04:34<00:00, 26.6Mbyte/s]


In [31]:
from huggingface_hub import HfApi

api = HfApi()
api = HfApi()
api.create_repo(
    repo_id="gemma-family-temp",  # Use repo_id instead of name
    private=True,
    repo_type="model",
    exist_ok=True
)

RepoUrl('https://huggingface.co/dataakash/gemma-family-temp', endpoint='https://huggingface.co', repo_type='model', repo_id='dataakash/gemma-family-temp')

In [32]:
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="/kaggle/working/gemma-merged.gguf",
    path_in_repo="gemma-merged.gguf",
    repo_id="dataakash/gemma-family-temp",   # match above
    repo_type="model"
)

  0%|          | 0/1 [00:00<?, ?it/s]

gemma-merged.gguf:   0%|          | 0.00/7.30G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dataakash/gemma-family-temp/commit/cd95b7fb5e79553d45eea8bed0261fa8aadeb2a2', commit_message='Upload gemma-merged.gguf with huggingface_hub', commit_description='', oid='cd95b7fb5e79553d45eea8bed0261fa8aadeb2a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dataakash/gemma-family-temp', endpoint='https://huggingface.co', repo_type='model', repo_id='dataakash/gemma-family-temp'), pr_revision=None, pr_num=None)