In [1]:
# %%capture
# !pip install -U datasets
# uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
print("torch version: ", torch.__version__)
print('is gpu enbled: ', torch.cuda.is_available())

torch version:  2.7.1+cu128
is gpu enbled:  True


In [4]:
import os

In [5]:
from dotenv import load_dotenv

In [6]:
load_dotenv()

True

In [7]:
from huggingface_hub import login
login(os.environ["HUGGINGFACE_TOKEN"])

In [8]:
# Load model and tokenizer
model_id = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to('cuda')
# meta-llama/Llama-3.2-3B-Instruct

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# for name, weights in model.named_parameters():
#     print(name)
#     print(weights)
#     break

In [10]:
# Load the dataset
from datasets import load_dataset
ds = load_dataset("Jofthomas/hermes-function-calling-thinking-V1")

In [11]:
ds

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 3570
    })
})

In [12]:
model.config.use_cache = False

In [13]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

In [14]:
# ds['train'][12] # sample data

In [15]:
sample_data = ds['train'][0]['conversations']

In [16]:
chat_template = "{{ bos_token }}{% if  messages[0]['role']==system%}{{ raise_exception('System message is not supported in gemma, it would be good to merget the system prompt with first user message')}}{% endif %}{% for message in messages %}{{'<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<eos_turn><eos>\n'}}{% endfor %}{% if add_generation_prompt %}{'<start_of_turn>model\n'}{% endif %}"

In [17]:
tokenizer.chat_template = chat_template

In [18]:
print(tokenizer.apply_chat_template(sample_data[1:], tokenize=False))

<bos><start_of_turn>human
Hi, can you tell me the current stock price of Apple?<eos_turn><eos>
<start_of_turn>model
<think>Alright, the user asked for the current stock price of Apple. Looking at the tools available, there's a function called get_stock_price which exactly fits this request. I'll need to call this function with the parameter company set to 'Apple' to retrieve the information the user is asking for. This seems like the most straightforward and appropriate action to take.
</think><tool_call>
{'name': 'get_stock_price', 'arguments': {'company': 'Apple'}}
</tool_call><eos_turn><eos>
<start_of_turn>tool
<tool_response>
{'stock_price': '$150.75'}
</tool_response><eos_turn><eos>
<start_of_turn>model
The current stock price of Apple is $150.75.<eos_turn><eos>
<start_of_turn>human
That's great. Can you also tell me the stock price for Microsoft?<eos_turn><eos>
<start_of_turn>model
<think>Alright, let me try to figure out how to handle this user's request. The user first asked ab

In [19]:
ds = ds.rename_column('conversations', 'messages')

In [20]:
# preprocess dataset {apply chat template}

def preprocess_dataset(row):
    messages = row['messages'] # Dict['str', List[Dict['str', 'str']]]
    
    # Check if there system message, if yes, merge the system prompt with first user input, since the gemma model does not have system prompt , it trained in this way
    if messages[0]['role'] == 'system':
        system_message = messages[0]['content']
        messages[1]['content'] = system_message + "Also, before making a call to a function take the time to plan the function to take. Make that thinking process between <think>{your thought}</think>\n\n" + messages[1]['content']
        # after merging, remove the system message from message
        messages.pop(0)
    
    return {'messages': tokenizer.apply_chat_template(messages, tokenize=True)}
       

In [21]:
dataset = ds.map(preprocess_dataset)

In [22]:
print(tokenizer.decode(dataset['train'][0]['messages']))

<bos><start_of_turn>human
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'get_stock_price', 'description': 'Get the current stock price of a company', 'parameters': {'type': 'object', 'properties': {'company': {'type': 'string', 'description': 'The name of the company'}}, 'required': ['company']}}}, {'type': 'function', 'function': {'name': 'get_movie_details', 'description': 'Get details about a movie', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the movie'}}, 'required': ['title']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arg

In [23]:
ds = ds['train'].train_test_split(.1)

In [24]:
from dataclasses import dataclass
from enum import Enum

class ChatToolLMSpecialTokens(Enum):
    tools = '<tools>'
    eos_tools = '</tools>'
    think = '<think>'
    eos_think = '</think>'
    tool_response = '<tool_response>'
    eos_tool_response = '</tool_response>'
    pad_token = '<pad>'
    eos_token = '<eos>'
    
    @classmethod
    def special_token_list(clss):
        return [cls_token.value for cls_token in clss] 
    

In [25]:
ChatToolLMSpecialTokens.special_token_list()

['<tools>',
 '</tools>',
 '<think>',
 '</think>',
 '<tool_response>',
 '</tool_response>',
 '<pad>',
 '<eos>']

In [26]:
list(ChatToolLMSpecialTokens)

[<ChatToolLMSpecialTokens.tools: '<tools>'>,
 <ChatToolLMSpecialTokens.eos_tools: '</tools>'>,
 <ChatToolLMSpecialTokens.think: '<think>'>,
 <ChatToolLMSpecialTokens.eos_think: '</think>'>,
 <ChatToolLMSpecialTokens.tool_response: '<tool_response>'>,
 <ChatToolLMSpecialTokens.eos_tool_response: '</tool_response>'>,
 <ChatToolLMSpecialTokens.pad_token: '<pad>'>,
 <ChatToolLMSpecialTokens.eos_token: '<eos>'>]

In [27]:
ChatToolLMSpecialTokens.pad_token.value

'<pad>'

In [28]:
# Load tokenizer again so we can add special tokens easly 
tokenizer = AutoTokenizer.from_pretrained(model_id, 
                                          pad_token=ChatToolLMSpecialTokens.pad_token.value,
                                          eos_token=ChatToolLMSpecialTokens.eos_token.value,
                                          additional_special_tokens=ChatToolLMSpecialTokens.special_token_list())

In [29]:
tokenizer.chat_template = chat_template

In [30]:
from peft import LoraConfig

In [31]:
from peft import TaskType

In [33]:
peft_config = LoraConfig(r=32,
                         lora_alpha=64,
                         target_modules=['q_proj', 'v_proj', 'o_proj',], # 'gate_proj', 'up_proj', 'down_proj'],
                         task_type=TaskType.CAUSAL_LM,
                         lora_dropout=0.05)

In [34]:
from trl import SFTConfig, SFTTrainer

In [None]:
training_arugment = SFTConfig(output_dir='gemma_function_calling_and_thinking',
                              per_device_train_batch_size=1,
                              per_device_eval_batch_size=1,
                              gradient_accumulation_steps=4,
                              logging_first_step=True,
                              logging_dir='runs',
                              learning_rate=1e-5,
                              max_grad_norm=1.0,
                              num_train_epochs=1,
                              warmup_ratio=0.1,
                              lr_scheduler_type='cosine',
                              eval_strategy='steps',
                              save_strategy='steps',
                              report_to='tensorboard',
                              gradient_checkpointing=True,
                              packing=False,
                              save_steps=100,
                              eval_steps=10,
                              logging_steps=10,
                              fp16=True)

In [36]:
len(tokenizer)

256006

In [37]:
# since we added some special tokenzer , we need to add some random embedding for it
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(256006, 2048, padding_idx=0)

In [38]:
trainer =  SFTTrainer(model=model,
                      processing_class=tokenizer,
                      peft_config=peft_config,
                      train_dataset=ds['train'],
                      eval_dataset=ds['test'],
                      args=training_arugment)

Tokenizing train dataset:   0%|          | 0/3213 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3213 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/357 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/357 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train(resume_from_last_checkpoint="checkpoint-200")

Step,Training Loss,Validation Loss


KeyboardInterrupt: 