In [9]:
import requests

response = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "lit",
        "stream": False,  # You can stream chunked response by setting this True
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_current_weather",
                    "description": "Get the current weather in a given location",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                            },
                        },
                        "required": ["location"],
                    },
                },
            }
        ],
        "messages": [{"role": "user", "content": "What's the weather like in Paris/London?"}],
    },
)

print(response.json())

{'id': 'chatcmpl-6c9b62', 'object': 'chat.completion', 'created': 1717847162, 'model': 'lit', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': ' ', 'name': None, 'tool_calls': [{'id': 'call_4fz1q-qqRiyD7RR2v0gThQ', 'type': 'function', 'function': {'name': 'get_current_weather', 'arguments': '{"location": "Paris, France", "unit": "celsius"}'}}], 'tool_call_id': None}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 0, 'total_tokens': 0, 'completion_tokens': 0}}


In [21]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
        )

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

In [5]:
import torch 
from pathlib import Path
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.request import ChatCompletionRequest

mistral_models_path = Path.home().joinpath("mistral_models", "7B-Instruct-v0.3")
mistral_tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")

messages = [
    {"role": "user", "content": "What's the weather like in Paris?"}
]

tools = [
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",
                "description": "Get the current weather in a given location",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "The city and state, e.g. San Francisco, CA",
                        },
                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    },
                    "required": ["location"],
                },
            },
        }
    ]

completion_request = ChatCompletionRequest(messages=messages, tools=tools)
print("req", completion_request)
# text = mistral_tokenizer.encode_chat_completion(completion_request).text
tokens = mistral_tokenizer.encode_chat_completion(completion_request).tokens

device = torch.device("cpu")
input_ids = torch.tensor([tokens]).to(device)

# Create attention mask where 1s indicate real tokens and 0s indicate padding
attention_mask = torch.ones_like(input_ids).to(device)

# Set the padding tokens to 0
# attention_mask[input_ids == tokenizer.pad_token_id] = 0

model_inputs = {
    "input_ids":input_ids,
    "attention_mask": attention_mask
}

model_inputs

req temperature=0.7 top_p=1.0 max_tokens=None random_seed=None model=None messages=[UserMessage(role=<Roles.user: 'user'>, content="What's the weather like in Paris?")] response_format=ResponseFormat(type='text') tools=[Tool(type='function', function=Function(name='get_current_weather', description='Get the current weather in a given location', parameters={'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}))] tool_choice='auto'


{'input_ids': tensor([[    1,     6,  1501,  7567,  1891,  2032,  1113,  3396,  1316,  1113,
           3396,  2032, 10598,  1629,  2032,  1113,  1295, 29498,  3790, 29498,
           1537,  1991,  1316,  1113,  7286,  2032,  1113,  2226,  1040,  2636,
           8854,  1065,  1032,  2846,  5491,  1316,  1113, 12206,  2032, 10598,
           1891,  2032,  1113,  3582,  1316,  1113, 11491,  2032, 10598,  3501,
           2032, 10598,  1891,  2032,  1113,  2195,  1316,  1113,  7286,  2032,
           1113,  1782,  3758,  1072,  2433, 29493,  1085, 29491, 29489, 29491,
           4420, 10454, 29493, 10229,  8474,  1113,  6074,  2032, 10598,  1891,
           2032,  1113,  2195,  1316,  1113, 10825,  2032,  8135, 29485,  1958,
           3938,  1316,  1113, 29490, 19425, 13075,  3010, 11549,  1113, 11661,
           2032,  8135,  3501,  3010,  1743, 10925,     7,     3,  2592, 29510,
          29481,  1040,  8854,  1505,  1065,  6233, 29572,     4]]),
 'attention_mask': tensor([[1, 1, 1, 1

In [None]:
import os

# Set your Hugging Face token as an environment variable
os.environ['HF_API_TOKEN'] = 'token_here'

# Use the token to login
from huggingface_hub import login
login(token=os.environ['HF_API_TOKEN'])

In [26]:
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=True,
)

# Run the generation in a separate thread, 
# so that we can fetch the generated text in a non-blocking way.
generation_kwargs = dict(
    model_inputs,
    streamer=streamer,
    max_new_tokens=1000,
    eos_token_id=tokenizer.eos_token_id
)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for text in streamer:
    print(text,end="",flush=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Paris", "unit": "celsius"}}]</s>

In [3]:
from utils import extract_tool_calls_from_buffer
buffer="""[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Paris, France", "unit": "celsius"}}]

For Paris, France:

[{"name": "get_current_weather", "arguments": {"location": "London, UK", "unit": "celsius"}}]

For London, UK:

(Wait for the responses)

(Assuming the responses are as follows:)

For Paris, France:
{"temperature": 15, "conditions": "Partly cloudy"}

For London, UK:
{"temperature": 12, "conditions": "Cloudy"}

The current weather in Paris, France is Partly cloudy with a temperature of 15 degrees Celsius.
The current weather in London, UK is Cloudy with a temperature of 12 degrees Celsius.</s>
"""
extract_tool_calls_from_buffer(buffer)

[{'id': 'call_1XWihodYRvuvvTrixLZOtA',
  'function': {'arguments': '{"location": "Paris, France", "unit": "celsius"}',
   'name': 'get_current_weather'},
  'type': 'function'}]

In [17]:
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="lit")

stream = client.chat.completions.create(
    model="gpt-4",
    tools=tools,
    messages=messages,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta or "")

ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content='', function_call=None, role='assistant', tool_calls=None, name=None, tool_call_id=None)
ChoiceDelta(content

In [15]:
tools = [
  {
    "type": "function",
    "function": {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
          },
          "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
        },
        "required": ["location"],
      },
    }
  }
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
completion = client.chat.completions.create(
  model="gpt-4o",
  messages=messages,
  tools=tools,
  tool_choice="auto"
)

print(completion.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='                                    ', role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_YpH8rS3RSIKxsKKEbgyAUA', function=Function(arguments='{"location": "Boston, MA", "unit": "fahrenheit"}', name='get_current_weather'), type='function')], name=None, tool_call_id=None))
