In [None]:
%%capture
# Install Unsloth and its companions; %%capture hides the installer output.
import os
# The hosted-runtime branch is chosen when any environment variable NAME contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    # --no-deps installs only the named packages without pulling their dependencies
    # (presumably to leave the runtime's preinstalled torch stack untouched — confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options. `max_seq_length` is reused later when the SFTTrainer is built.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base checkpoint together with its tokenizer using the options above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the attention projections (q/k/v/o)
# and the MLP projections (gate/up/down). `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16, # Set equal to r here
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Same value is passed as `seed` to TrainingArguments
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


### Data Preparation

We now use the `hermes-function-calling-thinking-V1` dataset from [Jofthomas](https://huggingface.co/datasets/Jofthomas/hermes-function-calling-thinking-V1), a small (~3.5K-example) toy subset of the original [hermes-function-calling dataset](https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1).

We use a custom chat template (defined in the next cell) to deal with the dataset [formatting](https://huggingface.co/agents-course/notebooks/blob/main/bonus-unit1/bonus-unit1.ipynb).

In [None]:
from unsloth.chat_templates import get_chat_template

# Jinja chat template shared by fine-tuning and inference.
#
# Rendered layout (one turn per line):
#   <bos>>>> User: <text>\n
#   >>> Assistant: <text><eos>\n
#   >>> Tool: <text>\n
#
# * A leading 'system' message is folded into the first user turn together with
#   messages[1] (the template assumes messages[1] exists and is the first user message).
#   That merged turn ends with '\n' like every other user turn, so the text seen in
#   training matches the prompts built at inference time, where the first message
#   already has role 'human' and goes through the loop below.
# * Accepted roles inside the loop are 'human', 'model' and 'tool'.
# * With add_generation_prompt the prompt ends in '>>> Assistant: <think>' so the
#   model starts by writing its reasoning.
custom_template = \
    "{{bos_token}}"\
    "{% if messages[0]['role'] == 'system' %}"\
        "{{'>>> User: ' + messages[0]['content'] + '\nAlso, before making a tool call, take your time to plan in the format <think>your thoughts</think>\n\n' + messages[1]['content'] + '\n'}}"\
        "{% set loop_messages = messages[2:] %}"\
    "{% else %}"\
        "{% set loop_messages = messages %}"\
    "{% endif %}"\
    "{% for message in loop_messages %}"\
        "{% if message['role'] == 'human' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'model' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% elif message['role'] == 'tool' %}"\
            "{{ '>>> Tool: ' + message['content'] + '\n' }}"\
        "{% else %}"\
            "{{ raise_exception('Only human, model and tool roles are supported!') }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: <think>' }}"\
    "{% endif %}"

# The literal string "eos_token" is handed to Unsloth as the template's stop token.
# NOTE(review): presumably this tells Unsloth to reuse the tokenizer's own EOS token — confirm.
unsloth_eos_token = "eos_token"

tokenizer = get_chat_template(
    tokenizer,
    chat_template = (custom_template, unsloth_eos_token,), # You must provide a template and EOS token
    # NOTE(review): when the stop token is the tokenizer's own EOS this mapping is
    # presumably a no-op — confirm against the Unsloth chat-template docs.
    map_eos_token = True,
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into a single training string.

    Args:
        examples: A batch exposing a "conversations" entry that holds one
            message list per row.

    Returns:
        dict: {"text": [...]} with one chat-template-rendered string per
        conversation, in the same order as the input.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False keeps plain text; no generation prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize = False, add_generation_prompt = False)
        )
    return {"text": rendered}

from datasets import load_dataset
# Download the train split of the function-calling dataset.
dataset = load_dataset("Jofthomas/hermes-function-calling-thinking-V1", split = "train")
# Render every conversation with the chat template (batched) and keep only the
# resulting 'text' column, which is the field the trainer reads.
dataset = dataset.map(formatting_prompts_func, batched = True,).select_columns(['text'])

README.md:   0%|          | 0.00/354 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3570 [00:00<?, ? examples/s]

Map:   0%|          | 0/3570 [00:00<?, ? examples/s]

### Train Model

Train the model with the same config as the Alpaca example.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the pre-rendered 'text' column.
# This is a short demonstration run: max_steps = 60 with 2 sequences per device and
# 4 gradient-accumulation steps (an effective batch of 8 on a single GPU).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/3570 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/3570 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/3570 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/3570 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
# Snapshot taken before training; `start_gpu_memory` and `max_memory` are read
# again by the final-stats cell to work out the footprint of training itself.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = Tesla T4. Max memory = 14.741 GB.
5.748 GB of memory reserved.


In [None]:
# Run fine-tuning; the returned object's `metrics` dict is read by the final-stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,570 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.1071
20,0.5334
30,0.4338
40,0.3839
50,0.3852
60,0.3472


In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, and the share attributable to training
# (peak minus the pre-training snapshot), both in GB and as a % of the card.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
train_seconds = trainer_stats.metrics['train_runtime']
for report_line in (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(report_line)

1399.2088 seconds used for training.
23.32 minutes used for training.
Peak reserved memory = 7.615 GB.
Peak reserved memory for training = 1.867 GB.
Peak reserved memory % of max memory = 51.659 %.
Peak reserved memory for training % of max memory = 12.665 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the query and the available tools.

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

<a name="Tool Calling"></a>
#### Tool Calling Utils

In [None]:
import re
from typing import List, Dict, Any, Callable
import inspect

def extract_tag_content(text, tag):
    """Return the content of the last ``<tag>...</tag>`` pair found in ``text``.

    Args:
        text: The string to search.
        tag: Tag name without angle brackets. It is matched literally, so
            regex metacharacters in the name have no special meaning.

    Returns:
        str: Everything between the opening and closing tag of the last
        match (newlines included, not stripped).

    Raises:
        ValueError: If no complete ``<tag>...</tag>`` pair is present.
    """
    # Escape the tag so names such as "a.b" cannot act as regex patterns.
    escaped = re.escape(tag)
    # DOTALL lets the non-greedy body span multiple lines.
    matches = re.findall(fr"<{escaped}>(.*?)</{escaped}>", text, re.DOTALL)
    if not matches:
        raise ValueError(f"Tag '{tag}' not found in the text.")
    return matches[-1]

def parse_docstring(doc: str) -> Dict[str, Any]:
    """Extract argument and return descriptions from a Google-style docstring.

    Recognised layout::

        Args:
            name (type): description
                optional continuation line
            other: description

        Returns:
            description (may also follow "Returns:" on the same line)

    Args:
        doc: The docstring text. ``None`` or an empty string is accepted.

    Returns:
        dict: ``{"args": {name: description, ...}, "returns": text or None}``.
        The optional ``(type)`` after an argument name is dropped from the
        description. Lines of the returns section are joined with one space.
    """
    parsed = {"args": {}, "returns": None}
    if not doc:
        return parsed

    # "name: desc" or "name (type): desc"; the parenthesised type is optional.
    arg_pattern = re.compile(r'(\w+)\s*(?:\([^)]*\))?\s*:\s*(.*)')
    return_pattern = re.compile(r'Returns:\s*(.*)', re.IGNORECASE)
    # Headers that stand alone on their line and close the current section.
    other_header_pattern = re.compile(r'(?:raises|yields|examples?|notes?)\s*:$', re.IGNORECASE)

    section = None          # None, "args" or "returns"
    current_arg = None      # most recent argument, target of continuation lines
    return_lines = []

    for raw_line in doc.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Section headers are checked first so that "Returns:" inside the Args
        # section is never mistaken for an argument called "Returns".
        if line.lower().startswith("args:"):
            section, current_arg = "args", None
            continue

        return_match = return_pattern.match(line)
        if return_match:
            section, current_arg = "returns", None
            inline = return_match.group(1).strip()
            return_lines = [inline] if inline else []
            continue

        if other_header_pattern.match(line):
            section, current_arg = None, None
            continue

        if section == "args":
            arg_match = arg_pattern.match(line)
            if arg_match:
                current_arg = arg_match.group(1)
                parsed["args"][current_arg] = arg_match.group(2)
            elif current_arg is not None:
                # Wrapped description: append to the argument it belongs to.
                parsed["args"][current_arg] = f"{parsed['args'][current_arg]} {line}".strip()
            else:
                # Free text before any argument: this is not an argument list.
                section = None
        elif section == "returns":
            return_lines.append(line)

    if return_lines:
        parsed["returns"] = " ".join(return_lines)
    return parsed

def function_to_tool(func: Callable) -> Dict[str, Any]:
    """Describe a Python callable as a function-calling tool schema.

    Args:
        func: The callable to describe. Its signature supplies parameter
            names, annotations and defaults; its docstring supplies the
            summary line and, through ``parse_docstring``, the argument and
            return descriptions.

    Returns:
        dict: ``{"name", "description", "parameters", "returns"}`` where
        ``parameters`` is a JSON-Schema-style object:

        * unannotated parameters are typed ``"string"``;
        * common Python annotations are mapped to JSON Schema names
          (``str`` -> ``string``, ``int`` -> ``integer``, ``float`` ->
          ``number``, ...); any other annotation keeps its lowercased name;
        * only parameters without a default are listed in ``required``;
        * ``*args`` / ``**kwargs`` are left out of the schema.
    """
    # Python type names -> JSON Schema type names.
    json_types = {
        "str": "string",
        "int": "integer",
        "float": "number",
        "bool": "boolean",
        "list": "array",
        "tuple": "array",
        "set": "array",
        "dict": "object",
        "nonetype": "null",
    }
    variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    signature = inspect.signature(func)
    docstring = func.__doc__ or ""
    parsed_doc = parse_docstring(docstring)
    readable_name = func.__name__.replace('_', ' ')

    properties = {}
    required = []
    for name, param in signature.parameters.items():
        if param.kind in variadic:
            continue

        annotation = param.annotation
        if annotation is param.empty:
            param_type = "string"
        else:
            # String annotations and some typing generics have no __name__.
            if isinstance(annotation, str):
                type_name = annotation
            else:
                type_name = getattr(annotation, "__name__", None) or str(annotation)
            type_name = type_name.lower()
            param_type = json_types.get(type_name, type_name)

        description = parsed_doc["args"].get(name, f"The {name} of the {readable_name}")
        properties[name] = {"type": param_type, "description": description}

        # A parameter with a default value is optional for the caller.
        if param.default is param.empty:
            required.append(name)

    # First docstring line is the summary; fall back to a name-derived phrase.
    summary = docstring.strip().split("\n")[0].strip()
    desc = summary or f'Get {readable_name}'

    return {
        "name": func.__name__,
        "description": desc,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required
        },
        "returns": parsed_doc["returns"] or "Unknown"
    }


def build_tool_description(functions) -> List[Dict[str, Any]]:
    """Wrap each callable's schema in a ``{"type": "function", ...}`` entry.

    Args:
        functions: Iterable of callables to expose as tools.

    Returns:
        list: One ``{"type": "function", "function": <schema>}`` dict per
        callable, in input order, where ``<schema>`` comes from
        ``function_to_tool``.
    """
    tools = []
    for func in functions:
        tools.append({"type": "function", "function": function_to_tool(func)})
    return tools

def get_tool_result(tool_call: Dict[str, Any], avaliable_lookup: Dict[str, Callable]) -> Any:
    """Dispatch a parsed tool call to the matching Python callable.

    Args:
        tool_call: Mapping with a 'name' entry and an optional 'arguments'
            entry (keyword arguments for the callable; defaults to ``{}``).
        avaliable_lookup: Mapping from tool name to callable.

    Returns:
        Whatever the callable returns. If the callable raises, the string
        ``"Error executing '<name>': <message>"`` is returned instead.

    Raises:
        ValueError: If the requested name is not a key of ``avaliable_lookup``.
    """
    requested = tool_call.get('name')
    call_kwargs = tool_call.get('arguments', {})

    if requested not in avaliable_lookup:
        raise ValueError(f"Function '{requested}' not found.")

    handler = avaliable_lookup[requested]
    try:
        result = handler(**call_kwargs)
    except Exception as err:
        # Failures are reported as text so they can be fed back to the model.
        result = f"Error executing '{requested}': {str(err)}"
    return result


### User defined tools

These can be any user-defined Python functions.
For best results, give each function a docstring and return a descriptive dictionary.

In [None]:
# @title User defined pythonic tools
import requests

def get_useless_fact():
    """
    Fetches a random useless fact from the Useless Facts API.

    Returns:
        dict: A dictionary containing the fact and a status message.
    """
    # The docstring above is read at runtime to build the tool schema.
    url = "https://uselessfacts.jsph.pl/random.json?language=en"
    failure = {"error": "Could not fetch a fact!", "status": "failure"}
    try:
        # The timeout keeps an unresponsive API from stalling the tool loop.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Connection problems and timeouts use the same failure payload as a bad status.
        return failure
    if response.status_code == 200:
        return {"fact": response.json()["text"], "status": "success"}
    return failure

def generate_fake_person():
    """
    Generates a fake person profile using the RandomUser API.

    Returns:
        dict: A dictionary containing a random person's details.
    """
    # The docstring above is read at runtime to build the tool schema.
    url = "https://randomuser.me/api/"
    failure = {"error": "Could not generate a fake person!", "status": "failure"}
    try:
        # The timeout keeps an unresponsive API from stalling the tool loop.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Connection problems and timeouts use the same failure payload as a bad status.
        return failure
    if response.status_code == 200:
        # The API wraps its records in a "results" list; only the first is used.
        data = response.json()["results"][0]
        return {
            "name": f"{data['name']['first']} {data['name']['last']}",
            "email": data["email"],
            "city": data["location"]["city"],
            "country": data["location"]["country"],
            "profile_pic": data["picture"]["large"],
            "status": "success"
        }
    return failure

def get_programming_joke():
    """
    Retrieves a random programming joke from the JokeAPI.

    Returns:
        dict: A dictionary containing the joke and status message.
    """
    # The docstring above is read at runtime to build the tool schema.
    url = "https://v2.jokeapi.dev/joke/Programming"
    failure = {"error": "Could not fetch a joke!", "status": "failure"}
    try:
        # The timeout keeps an unresponsive API from stalling the tool loop.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Connection problems and timeouts use the same failure payload as a bad status.
        return failure
    if response.status_code == 200:
        data = response.json()
        # "single" jokes carry one 'joke' field; others are split into setup/delivery.
        joke = data["joke"] if data["type"] == "single" else f"{data['setup']} - {data['delivery']}"
        return {"joke": joke, "status": "success"}
    return failure


def get_weather_open_meteo(lat, lon):
    """
    Fetches current weather data from the Open-Meteo API.

    Args:
        lat (float): Latitude of the location.
        lon (float): Longitude of the location.

    Returns:
        dict: A dictionary containing temperature, weather code, and status message.
    """
    # The docstring above is read at runtime to build the tool schema.
    url = "https://api.open-meteo.com/v1/forecast"
    # lat/lon come from model output, so they go through `params` (URL-encoded
    # by requests) rather than being pasted into the URL string.
    query = {
        "latitude": lat,
        "longitude": lon,
        "current": ["temperature_2m", "weathercode"],
    }
    failure = {"error": "Could not fetch weather data.", "status": "failure"}
    try:
        # The timeout keeps an unresponsive API from stalling the tool loop.
        response = requests.get(url, params=query, timeout=10)
    except requests.RequestException:
        # Connection problems and timeouts use the same failure payload as a bad status.
        return failure
    if response.status_code == 200:
        data = response.json()["current"]
        return {"temperature": data["temperature_2m"], "weather_code": data["weathercode"], "status": "success"}
    return failure

# Manual smoke test for the tools above. Disabled by default because every call
# performs a live HTTP request; change False to True to run it.
if False:
    print("\U0001F539 Random Useless Fact:", get_useless_fact())
    print("\U0001F539 Fake Person:", generate_fake_person())
    print("\U0001F539 Programming Joke:", get_programming_joke())
    print("\U0001F539 Weather:", get_weather_open_meteo(-12.0024, 172.5005))

In [None]:
# Callables the model is allowed to invoke.
avaliable_functions = [
    get_useless_fact,
    generate_fake_person,
    get_programming_joke,
    get_weather_open_meteo,
]
# name -> callable, used to dispatch a parsed tool call.
avaliable_lookup = dict((tool.__name__, tool) for tool in avaliable_functions)
# Schema list that is embedded into the system prompt.
tools_description = build_tool_description(avaliable_functions)

<a name="RunInference"></a>
#### Run Inference

In [None]:
# First half of the system prompt. The trailing backslashes join the lines, so
# this is a single line of text ending with the opening <tools> tag.
user_prefix = """\
You are a function calling AI model. \
You are provided with function signatures within <tools></tools> XML tags. \
You may call one or more functions to assist with the user query. \
Don't make assumptions about what values to plug into functions. \
Here are the available tools: \
<tools>"""

# Second half: closes <tools> and describes the expected <tool_call> / <think>
# format. This is a plain (non-f) string, so {tool_call} and {your thoughts}
# are literal text, not placeholders.
tool_suffix = """\
</tools> \
Use the following pydantic model json schema for each tool call you will make: \
{\
'title': 'FunctionCall', \
'type': 'object', \
'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, \
'name': {'title': 'Name', 'type': 'string'}}, \
'required': ['arguments', 'name']\
} \
For each function call return a json object with function name and arguments \
within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call> \
Also, before making a call to a function take the time to plan the function to take. \
Make that thinking process between <think>{your thoughts}</think>
"""

# The tool schemas are embedded via str(), i.e. as a Python repr (single-quoted
# dicts), not as JSON.
system_prompt = f"{user_prefix}{str(tools_description)}{tool_suffix}"

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# ANSI escape codes used to colour the model's output in tool_query.
# NOTE: despite the name, code 95 is bright magenta, not green.
GREEN = "\033[95m"
RESET = "\033[0m"

def tool_query(query, n_depth:int=4):
    """Run a multi-step tool-calling conversation for one user query.

    Each step renders the history with the chat template, generates up to 512
    new tokens, and looks for a ``<tool_call>`` block in the reply. When one is
    present it is parsed, executed through ``get_tool_result`` and the result
    is appended as a 'tool' message before the next step.

    Args:
        query: The user's request. It is appended to the module-level
            ``system_prompt`` to form the first 'human' message.
        n_depth: Maximum number of generate/execute rounds.

    Returns:
        list: The conversation history as ``{"role", "content"}`` dicts with
        roles 'human', 'model' and 'tool'.
    """
    import ast
    import json

    print(f"{'='*15}{query}{'='*15}")
    history = [{"role": "human", "content": f"{system_prompt}\n\n{query}"}]
    for _ in range(n_depth):
        prompt = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=False)
        if history[-1]["role"]=="tool":
            # After a tool result the model should answer directly, so the
            # '<think>' opener added by the generation prompt is removed.
            prompt = prompt.removesuffix('<think>')
        inputs = tokenizer(prompt, return_tensors = "pt", add_special_tokens=False).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
        # Keep only the text after the last assistant marker, i.e. the new reply.
        latest_step = tokenizer.batch_decode(outputs)[0].split(">>> Assistant: ")[-1].strip()
        # NOTE(review): latest_step still contains the decoded EOS text, and the chat
        # template appends eos_token again for 'model' turns — confirm this is intended.
        history.append({"role":"model","content":latest_step})
        print(GREEN, latest_step, RESET)
        if "<tool_call>" not in latest_step:
            if latest_step.strip() == "<|end_of_text|>":
                print("❌ Summarise tool call failed")
            break
        try:
            payload = extract_tag_content(latest_step, 'tool_call').strip()
            # SECURITY: this text is model output. The previous eval() would run
            # arbitrary Python; literal_eval only accepts literals. JSON is
            # accepted as a fallback (covers true/false/null).
            try:
                tool_request = ast.literal_eval(payload)
            except (ValueError, SyntaxError, TypeError):
                tool_request = json.loads(payload)
            if not isinstance(tool_request, dict):
                raise ValueError(f"Tool call must be a dictionary, got {type(tool_request).__name__}.")
            tool_result = get_tool_result(tool_request, avaliable_lookup)
            history.append({"role":"tool","content": str(tool_result)})
            print("🤖:", tool_request)
            print("🐍:", tool_result)
        except Exception as e:
            # Parsing or dispatch failed: report it and stop the conversation.
            print(f"❌ {e}")
            break
    return history

### Examples

In [None]:
# No tool is named in the query; the model has to pick one of the registered tools.
answer = tool_query("Tell me a funny quip")

[95m <think>Okay, so the user has asked to tell them a funny quip. I need to figure out which function to use. Let's see, the available tools include get_useless_fact, generate_fake_person, get_programming_joke, and get_weather_open_meteo. The get_useless_fact function is perfect for this because it fetches a random useless fact, which could definitely be funny. I don't need any parameters for this function since it's designed to return a random fact. So, I'll call get_useless_fact without any arguments to get a funny quip for the user.
</think><tool_call>
{'name': 'get_useless_fact', 'arguments': {}}
</tool_call><|end_of_text|> [0m
🤖: {'name': 'get_useless_fact', 'arguments': {}}
🐍: {'fact': 'Studies show that if a cat falls off the seventh floor of a building it has about thirty percent less chance of surviving than a cat that falls off the twentieth floor. It supposedly takes about eight floors for the cat to realize what is occurring, relax and correct itself.', 'status': 'succes

In [None]:
# Exercises get_weather_open_meteo, the only registered tool that takes arguments.
answer = tool_query("Whats the weather at 39.0742N, 21.8243E")

[95m <think>Okay, let's break this down step by step. The user has asked about the weather at a specific location. They provided coordinates 39.0742N, 21.8243E. 

First, I need to figure out how to get the weather information. Looking at the available tools, there's a function called get_weather_open_meteo that can fetch weather data from the Open-Meteo API. This function requires two parameters: lat and lon. 

The user has given me the latitude and longitude in the correct format, so I can directly plug those into the function. The lat is 39.0742 and the lon is 21.8243. 

Now, I need to structure the function call correctly. The function name is get_weather_open_meteo, and the arguments are {'lat': '39.0742', 'lon': '21.8243'}. 

So, the next step is to call this function with the provided coordinates. This should give the user the current weather at that location. 
</think><tool_call>
{'name': 'get_weather_open_meteo', 'arguments': {'lat': '39.0742', 'lon': '21.8243'}}
</tool_call><

In [None]:
# A request that should map to generate_fake_person.
answer = tool_query("Can you think of a fake name and email address for me")

[95m <think>Alright, let me break this down. The user is asking for a fake name and email address. I need to figure out how to respond using the available tools. Looking at the tools, there are four functions: get_useless_fact, generate_fake_person, get_programming_joke, and get_weather_open_meteo. 

I can see that get_useless_fact is about fetching random useless facts, but that doesn't seem relevant here. The generate_fake_person function is perfect because it generates fake profiles. This exactly matches what the user is asking for. So, I'll call generate_fake_person with no arguments since the function doesn't require any.

I should avoid using get_programming_joke or get_weather_open_meteo because they're unrelated to the user's request. The get_weather_open_meteo requires lat and lon, which isn't necessary here. 

Therefore, the best choice is to call generate_fake_person to provide the user with a fake name and email address.
</think><tool_call>
{'name': 'generate_fake_person',

In [None]:
# A request that should map to get_useless_fact, followed by a summarising reply.
answer = tool_query("Know any good factoids")

[95m <think>Okay, so the user asked for a factoid. I need to figure out how to respond. Looking at the tools available, there's a function called get_useless_fact that fetches a useless fact. That sounds perfect for this. No parameters are needed, so I can just call it without any arguments. That should give the user the kind of random fact they're looking for.
</think><tool_call>
{'name': 'get_useless_fact', 'arguments': {}}
</tool_call><|end_of_text|> [0m
🤖: {'name': 'get_useless_fact', 'arguments': {}}
🐍: {'fact': 'Babies are most likely to be born on Tuesdays.', 'status': 'success'}
[95m Babies are most likely to be born on Tuesdays.<|end_of_text|> [0m
