In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

# replace 'YOUR_HF_TOKEN_HERE' with your Hugging Face token to authenticate
# login("YOUR_HF_TOKEN_HERE")

# Configuration for 4-bit quantization
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load tokenizer and model with specified quantization configuration
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=nf4_config
)

# Calculate and print the memory footprint of the model
memory_footprint = model.get_memory_footprint()
print(f"Memory footprint: {memory_footprint / (1024**3):.2f} GB")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Memory footprint: 5.21 GB


In [24]:
from threading import Thread
from transformers import TextIteratorStreamer

# Define the chat messages
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
]

# Prepare model inputs using the chat template
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True
).to(model.device)

# Initialize the TextIteratorStreamer
streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=True
)

# Define generation arguments
generation_kwargs = {
    **model_inputs,
    "streamer": streamer,
    "max_new_tokens": 1000,
    "eos_token_id": tokenizer.eos_token_id,
}

# Start the generation in a separate thread
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# Collect and print the generated text
buffer = []
for text in streamer:
    buffer.append(text)
    print(text, end="", flush=True)


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Hello! How can I assist you today?<|eot_id|>

In [47]:
# add syspath to the current directory
import sys
sys.path.append('.')

from utils import get_tools_prefix_messages
from litserve.specs.openai import Tool, ChatMessage

# JSON data
tools_json = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    },
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    }
]

# Convert JSON to Pydantic models
tools = [Tool.model_validate(tool) for tool in tools_json]

messages_json = [
        {"role": "system", "content": "You are a helpful assistant with tool calling capabilities."},
        {"role": "user", "content": "What's the weather like in Boston today?"},
    ]
messages = [ChatMessage.model_validate(message) for message in messages_json]

messages = get_tools_prefix_messages(messages, tools)

# convert Pydantic models to JSON
messages_json = [message.model_dump(exclude_none=True) for message in messages]

for message in messages_json:
    print(message["content"])


Cutting Knowledge Date: December 2023
Today Date: 28 July 2024


You have access to the following functions:

Use the function 'get_current_weather' to 'Get the current weather in a given location'
{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather in a given location","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location"]}}}

Use the function 'get_current_weather' to 'Get the current weather in a given location'
{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather in a given location","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location"]}}}


Think very carefully before calling 

In [50]:
import json
from jinja2 import Template
chat_template="""
{%- for message in messages %}
    {%- set prefix = '<|begin_of_text|>' if loop.index0==0 else '' %}
    {{- prefix + '<|start_header_id|>'+message['role']+'<|end_header_id|>\n\n' -}}
    {%- if message['role'] == 'assistant' and 'tool_calls' in message %}
        {%- for tool in message['tool_calls'] %}
            {%- set tool_json = {'id': tool['id'], 'name': tool['function']['name'], 'arguments': tool['function']['arguments']} %}
            {{- tool_json }}
        {%- endfor %}
        {{- '<|eot_id|>\n' }}
    {%- else %}
        {{- message['content'] + '<|eot_id|>\n' }}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
"""

jinja_template =Template(chat_template.strip())
def get_current_weather(location, unit="fahrenheit"):
    """Get the current weather in a given location"""
    if "tokyo" in location.lower():
        return json.dumps({"location": "Tokyo", "temperature": "10", "unit": unit})
    elif "san francisco" in location.lower():
        return json.dumps(
            {"location": "San Francisco", "temperature": "72", "unit": unit}
        )
    elif "paris" in location.lower():
        return json.dumps({"location": "Paris", "temperature": "22", "unit": unit})
    else:
        return json.dumps({"location": location, "temperature": "unknown"})

messages_json = [
        *messages_json,
        {"role": "assistant", "tool_calls": [{"id":"call_deok","function":{"name":"get_current_weather","arguments":{"location":"San Francisco","unit":"celsius"}},"type":"function"}]},
        {"role": "ipython","tool_call_id": 'call_deok', "name": 'get_current_weather', "content": get_current_weather("tokyo") }
    ]

prompt = jinja_template.render(messages=messages_json, add_generation_prompt=True)

print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>


Cutting Knowledge Date: December 2023
Today Date: 28 July 2024


You have access to the following functions:

Use the function 'get_current_weather' to 'Get the current weather in a given location'
{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather in a given location","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"required":["location"]}}}

Use the function 'get_current_weather' to 'Get the current weather in a given location'
{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather in a given location","parameters":{"type":"object","properties":{"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"},"unit":{"type":"string","enum":["celsius","fahrenheit"]}},"req