# Fine-tune Gemma 3n for function calling
This notebook is based on the [article](https://medium.com/@lucamassaron/fine-tuning-gemma-3-1b-for-function-calling-a-step-by-step-guide-66a613352f99) and [code](https://gist.github.com/lmassaron/7166f58912ff23de3fa627671fac07df) by Luca Massaron for fine-tuning the Gemma 3 1B model for function calling, together with the [notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_(4B)-Conversational.ipynb) released by Unsloth for fine-tuning Gemma 3n models.

In [1]:
# Install latest transformers for Gemma 3N
# !pip install --no-deps transformers>=4.53.1 # Only for Gemma 3N
# !pip install --no-deps --upgrade timm # Only for Gemma 3N
# from huggingface_hub import login
# login()

In [2]:
# Get the model from Unsloth
from unsloth import FastLanguageModel, FastModel
import torch

torch._dynamo.config.cache_size_limit = 64  # or higher  

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-3n-E4B-it-unsloth-bnb-4bit",
    dtype = None, # None for auto detection
    max_seq_length = 2048, # Used for training for function call responses
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False, 
    attn_implementation="flash_attention", 
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.1: Fast Gemma3N patching. Transformers: 4.55.0.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.034 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Use this chat template for training tool calls
tokenizer.chat_template = (
    "{{ bos_token }}{% for message in messages %}{% if message['role'] != 'system' %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn><eos>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
)

In [4]:
# Get the finetuning model
from peft import TaskType

model = FastLanguageModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!
    target_modules=["gate_proj","q_proj","o_proj","k_proj","down_proj","up_proj","v_proj"],
    use_gradient_checkpointing="unsloth",
    task_type = TaskType.CAUSAL_LM, 
    bias= "none",
    use_rslora=False,
    loftq_config=None,
    r = 16,
    lora_alpha = 64,
    lora_dropout = 0.05,    
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [5]:
# Load the datasets
from datasets import load_dataset
dataset = load_dataset("lmassaron/hermes-function-calling-v1", split="train")
eval_dataset = load_dataset("lmassaron/hermes-function-calling-v1", split="test")
dataset[0]

{'conversations': [{'content': "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'search_book', 'description': 'Search for a book based on title and/or author', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the book'}, 'author': {'type': 'string', 'description': 'The author of the book'}}, 'required': []}}}, {'type': 'function', 'function': {'name': 'get_definition', 'description': 'Get the definition of a word', 'parameters': {'type': 'object', 'properties': {'word': {'type': 'string', 'description': 'The word to get the definition for'}}, 'required': ['word']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 

In [6]:
# Convert the dataset to the correct format for finetuning
def formatting_prompts_func(examples):
   convos = examples["conversations"]
   texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
   return { "text" : texts, }

dataset =  dataset.map(formatting_prompts_func, batched = True)
eval_dataset =  eval_dataset.map(formatting_prompts_func, batched = True)
eval_dataset[20]

{'conversations': [{'content': "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'convert_currency', 'description': 'Convert currency from one type to another', 'parameters': {'type': 'object', 'properties': {'amount': {'type': 'number', 'description': 'The amount to be converted'}, 'from_currency': {'type': 'string', 'description': 'The currency to convert from'}, 'to_currency': {'type': 'string', 'description': 'The currency to convert to'}}, 'required': ['amount', 'from_currency', 'to_currency']}}}, {'type': 'function', 'function': {'name': 'create_note', 'description': 'Create a new note', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the note'}, 'content':

In [7]:
# Setup the fine-tuning trainer
from unsloth import is_bfloat16_supported
training_arguments = {
    # Basic training configuration
    "num_train_epochs": 1,
    #"bf16": is_bfloat16_supported(),  # Use bfloat16 if supported
    #"fp16": not is_bfloat16_supported(),  # Use if bfloat16 not supported
    # "max_steps": 240,
    # Evaluation and saving
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    # Optimization settings
    "optim": "adamw_torch_fused",
    "learning_rate": 1e-4,
    "weight_decay": 0.1,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    # Memory optimization
    #"save_total_limit": 2,
    #"greater_is_better": False,
    # Logging and output
    "logging_steps": 1,
    #"report_to": None,
    "logging_dir": "logs/runs",
    "overwrite_output_dir": True,
    # Model sharing
    "push_to_hub": False,
    "hub_private_repo": False,
}

from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset= eval_dataset,
    dataset_text_field = "text",
    max_seq_length = 2048,
    dataset_num_proc = 2,
    packing = False,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    args = SFTConfig(**training_arguments),  
)

In [8]:
# Train the model on the responses only, ignore user instructions
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

In [9]:
# Verify the chat template was applied correctly. Only 1 <bos> token should be present.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

"<bos><start_of_turn>human\nYou are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'analyze_website', 'description': 'Analyze the content and structure of a website', 'parameters': {'type': 'object', 'properties': {'url': {'type': 'string', 'description': 'The URL of the website to analyze'}}, 'required': ['url']}}}, {'type': 'function', 'function': {'name': 'calculate_bmi', 'description': 'Calculate the Body Mass Index (BMI)', 'parameters': {'type': 'object', 'properties': {'weight': {'type': 'number', 'description': 'The weight in kilograms'}, 'height': {'type': 'number', 'description': 'The height in meters'}}, 'required': ['weight', 'height']}}}] </tools>Use the following pydantic model json schema for each tool call you

In [10]:
# Verify user instruction masked
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

"                                                                                                                                                                                                                                                                                                                                                                                                                    Of course, I can help you with that. Could you please provide me with the URL of the website you want to analyze?<end_of_turn><eos>\n<start_of_turn>human\nSure, the website URL is www.example.com.<end_of_turn><eos>\n<start_of_turn>model\n<tool_call>\n{'name': 'analyze_website', 'arguments': {'url': 'www.example.com'}}\n</tool_call><end_of_turn><eos>\n<start_of_turn>tool\n<tool_response>\n{'status': 'success', 'message': 'Website analysis completed', 'data': {'structure': 'The website has a clear and intuitive structure with a navigation menu at the top. The homepage, about us, services, a

In [11]:
# Train the model
# trainer.max_steps = 60
trainer_stats = trainer.train()
trainer_stats

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,167 | Num Epochs = 1 | Total steps = 1,042
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 40,189,952 of 7,890,168,144 (0.51% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss
1,0.111,0.295569


Unsloth: Not an error, but Gemma3nForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=1042, training_loss=0.5097231654246515, metrics={'train_runtime': 6663.7833, 'train_samples_per_second': 0.625, 'train_steps_per_second': 0.156, 'total_flos': 9.095523591867322e+16, 'train_loss': 0.5097231654246515})

In [12]:
from transformers import TextStreamer

messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : eval_dataset[10]["conversations"][0]["content"],
    }]
}]
# Convert the messages to the correct format for generation
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
    tokenize = True,
    return_dict = True,
).to("cuda")

_ = model.generate(
    **inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

Sure, I can help with that. Could you please provide me with the title and description of the todo item?<end_of_turn>


In [13]:
import os
model_name = "gemma3n_e4b_tools_test"
model_name_gguf = model_name
model_merged_hf_repo = "allisterb/gemma3n_e4b_tools_test"
model_gguf_hf_repo = model_merged_hf_repo + "-GGUF"
hf_token = os.environ["HF_TOKEN"]

In [14]:
# Save merged model
model.save_pretrained_merged(model_name, tokenizer)

Found HuggingFace hub cache directory: /home/eddsa-key-20250707/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e4b-it...


Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [00:42<02:06, 42.08s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [01:57<02:03, 61.52s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [03:17<01:10, 70.03s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [03:55<00:00, 58.87s/it]


In [15]:
# Push the merged model to Hugging Face
model.push_to_hub_merged(model_merged_hf_repo, tokenizer, token = hf_token)

No files have been modified since last commit. Skipping to prevent empty commit.


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpoidft494/tokenizer.model      : 100%|##########| 4.70MB / 4.70MB            

  /tmp/tmpoidft494/tokenizer.json       : 100%|##########| 33.4MB / 33.4MB            

No files have been modified since last commit. Skipping to prevent empty commit.


Found HuggingFace hub cache directory: /home/eddsa-key-20250707/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/gemma-3n-e4b-it...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...94/model-00001-of-00004.safetensors:   1%|1         | 41.8MB / 3.08GB            

No files have been modified since last commit. Skipping to prevent empty commit.
Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [01:07<03:22, 67.65s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...94/model-00002-of-00004.safetensors:   1%|          | 41.9MB / 4.97GB            

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [03:18<03:29, 104.54s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...94/model-00003-of-00004.safetensors:   0%|          |  602kB / 4.99GB            

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [05:32<01:58, 118.21s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...94/model-00004-of-00004.safetensors:   0%|          | 36.1kB / 2.66GB            

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [06:43<00:00, 100.78s/it]


In [16]:
# Save to GGUF format
model.save_pretrained_gguf(
    model_name,
    "Q8_0", # For now only Q8_0, BF16, F16 supported
    tokenizer
)

Unsloth GGUF:hf-to-gguf:Loading model: gemma3n_e4b_tools_test
Unsloth GGUF:hf-to-gguf:Model architecture: Gemma3nForConditionalGeneration
Unsloth GGUF:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
Unsloth GGUF:hf-to-gguf:Exporting model...
Unsloth GGUF:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
Unsloth GGUF:hf-to-gguf:altup_proj.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:altup_unembd_proj.weight,          torch.bfloat16 --> Q8_0, shape = {2048, 2048, 3}
Unsloth GGUF:hf-to-gguf:token_embd.weight,                 torch.bfloat16 --> Q8_0, shape = {2048, 262144}
Unsloth GGUF:hf-to-gguf:gguf: loading model part 'model-00002-of-00004.safetensors'
Unsloth GGUF:hf-to-gguf:per_layer_token_embd.weight,       torch.bfloat16 --> Q8_0, shape = {8960, 262144}
Unsloth GGUF:hf-to-gguf:output_norm.weight,          

Unsloth: GGUF conversion:   0%|          | 0/100 [00:00<?, ?it/s]

Unsloth GGUF:hf-to-gguf:Model successfully exported to ./
Unsloth: Converted to gemma3n_e4b_tools_test.Q8_0.gguf with size = 7.3G
Unsloth: Successfully saved GGUF to:
gemma3n_e4b_tools_test.Q8_0.gguf


['gemma3n_e4b_tools_test.Q8_0.gguf']

In [17]:
# Upload GGUF model to Hugging Face
from huggingface_hub import HfApi
hf_api = HfApi()
hf_api.upload_file(
                path_or_fileobj = model_name +".Q8_0.gguf",
                path_in_repo    = model_name +".Q8_0.gguf",
                repo_id         = model_gguf_hf_repo,
                repo_type       = "model",
                commit_message  = "(Trained with Unsloth)",
                token = hf_token,
            )

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  gemma3n_e4b_tools_test.Q8_0.gguf      :   1%|          | 41.8MB / 7.30GB            

CommitInfo(commit_url='https://huggingface.co/allisterb/gemma3n_e4b_tools_test-GGUF/commit/5e9d90bb56e7092af7e9285e3753ed654808ce95', commit_message='(Trained with Unsloth)', commit_description='', oid='5e9d90bb56e7092af7e9285e3753ed654808ce95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/allisterb/gemma3n_e4b_tools_test-GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='allisterb/gemma3n_e4b_tools_test-GGUF'), pr_revision=None, pr_num=None)

In [18]:
# Upload Ollama template to Hugging Face Hub
hf_api.upload_file(
    path_or_fileobj="template",
    path_in_repo="template",
    repo_id=model_gguf_hf_repo,
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/allisterb/gemma3n_e4b_tools_test-GGUF/commit/5e9d90bb56e7092af7e9285e3753ed654808ce95', commit_message='Upload template with huggingface_hub', commit_description='', oid='5e9d90bb56e7092af7e9285e3753ed654808ce95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/allisterb/gemma3n_e4b_tools_test-GGUF', endpoint='https://huggingface.co', repo_type='model', repo_id='allisterb/gemma3n_e4b_tools_test-GGUF'), pr_revision=None, pr_num=None)