In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import evalugator
import evalugator.api
import evalugator.api.dispatcher
import evalugator.api.requests

# note: types from yaml files are represented in `evalugator.structs`, likely only YAML for non-python use cases

In [5]:
# enumerate every provider registered with the dispatcher
print('Available providers:')

# note: each entry is a literal python module, which is why `__name__` is available
for provider_module in evalugator.api.dispatcher.PROVIDERS:
    provider_name = provider_module.__name__
    print(f' - {provider_name}')

Available providers:
 - evalugator.api.providers.openai
 - evalugator.api.providers.anthropic
 - evalugator.api.providers.replicate
 - evalugator.api.providers.human
 - evalugator.api.providers.together_api


In [6]:
# sanity check: a provider module reports which model IDs it serves
openai_provider = evalugator.api.providers.openai

assert not openai_provider.provides_model('foo')
assert openai_provider.provides_model('gpt-4o')

Note: The way `evalugator.api.dispatcher` works is:

```python
from functools import partial

from .providers import openai, anthropic, replicate, human, together_api

PROVIDERS = [openai, anthropic, replicate, human, together_api]

# note: simplified sketch, the real implementation also checks that the model ID matches exactly one provider (this version just returns the first match)
def get_model_provider(model_id):
    providers = []
    for provider in PROVIDERS:
        if provider.provides_model(model_id):
            return provider

def get_request_processor(request, model_id):
    provider = get_model_provider(model_id)
    return partial(provider.execute, model_id)


def encode(model_id, *args, **kwargs):
    provider = get_model_provider(model_id)
    return provider.encode(model_id, *args, **kwargs)


def decode(model_id, *args, **kwargs):
    provider = get_model_provider(model_id)
    return provider.decode(model_id, *args, **kwargs)
```

In [7]:
# this means the implicit provider API is:

from typing import Protocol, ParamSpec

P = ParamSpec('P')

# note: nothing here requires a class, a module exposing these four callables fits as well
class Provider(Protocol):
    """Structural interface implied by how `evalugator.api.dispatcher` calls its providers."""

    def provides_model(self, model_id: str) -> bool:
        """Report whether this provider serves `model_id`."""

    def execute(self, model_id: str, request: evalugator.api.requests.Request) -> evalugator.api.requests.Response:
        """Process `request` for `model_id` and return the resulting response."""

    def encode(self, model_id: str, *args: P.args, **kwargs: P.kwargs) -> str:
        """Encode using whatever tokenizer belongs to `model_id`."""

    def decode(self, model_id: str, *args: P.args, **kwargs: P.kwargs) -> str:
        """Decode using whatever tokenizer belongs to `model_id`."""

This then allows the `API` class to be implemented roughly like this:

```python
class API:
    
    model_id: str
    executor: ThreadPoolExecutor

    # note: allows automatically saving results, essentially poor man's async, which makes sense given not all providers might have async
    def execute(self, request: Request) -> Future[Response]:
        func = dispatcher.get_request_processor(request, self.model_id)
        future = self.executor.submit(func, request)
        future.add_done_callback(self._log_response)
        return future

    def encode(self, *args, **kwargs):
        return dispatcher.encode(self.model_id, *args, **kwargs)

    def decode(self, *args, **kwargs):
        return dispatcher.decode(self.model_id, *args, **kwargs)
```

It additionally defines two convenience methods:

```python
    def get_text(self, ...) -> Future[GetTextResponse]:
        request = GetTextRequest(...)
        return self.execute(request)
    
    def get_probs(self, ...) -> Future[GetProbsResponse]:
        request = GetProbsRequest(...)
        return self.execute(request)
```

In [8]:
# plain OpenAI model ID, served by the stock `openai` provider
model_id = 'gpt-4o-mini'

# note: also allows `log_file_name`
api = evalugator.api.Api(model_id=model_id)

# `get_text` hands back a future rather than the response itself
response_future = api.get_text(prompt='Hi!')

# actual waiting
response = response_future.result()

# last expression of the cell, so the notebook shows its repr
response

GetTextResponse(model_id='gpt-4o-mini', request=GetTextRequest(context=None, prompt=[Message(role='user', content='Hi!')], temperature=1, max_tokens=512), raw_responses=[ChatCompletion(id='chatcmpl-9y7CeaNPkZ7jesyuUlNBAOdyhKy1B', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Hello! How can I assist you today?', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1724114396, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_48196bc67a', usage=CompletionUsage(completion_tokens=9, prompt_tokens=9, total_tokens=18))], context=None, txt='Hello! How can I assist you today?')

In [10]:
# plain-dict view of the response; note that `raw_responses` is absent from the output below
response.as_dict()

{'model_id': 'gpt-4o-mini',
 'request': {'context': None,
  'prompt': [{'role': 'user', 'content': 'Hi!'}],
  'temperature': 1,
  'max_tokens': 512},
 'txt': 'Hello! How can I assist you today?',
 'context': None}

In [85]:
# we'll make a new provider with tool use
from typing import Callable, Any
import dataclasses

from gpt_from_scratch.evals.function_calling.openai.function_call_handler import FunctionCallHandler

import openai



# note: task-standard/workbench/example-agents/fncall-baseline/commands.py
#       uses a `return` tool, which seems like an interesting technique

# note: mirrors the subset of models from
#       https://github.com/LRudL/evalugator/blob/main/evalugator/api/providers/openai.py#L15
#       that we want to expose with tool use (deliberately not comprehensive)
#
# note: every ID carries a prefix, since the bare names (`gpt-4o` etc) would
#       conflict with the `openai` provider
_MODEL_ID_TO_MODEL_NAME: dict[str, str] = dict(
    ('function-calling-' + model_name, model_name)
    for model_name in ("gpt-4o", "gpt-4o-mini")
)

class MaxIterationsReachedError(Exception):
    """Raised when the iteration budget runs out while the model is still requesting tool calls."""

# TODO(bschoen): Might use `partial` here, since otherwise this is super inefficient
# TODO(bschoen): This really seems like it can only support single turn
# TODO(bschoen): Should we have a custom tool request message?
class OpenAIFunctionCallingProvider(Provider):
    """
    Individual instance of a provider supporting the specified functions.

    Serves exactly the model IDs listed in `_MODEL_ID_TO_MODEL_NAME`. Each ID is
    translated to the underlying OpenAI model name before a completion, encode
    or decode call is made.

    Example:
        import evalugator.api.dispatcher
    
        # we'll create a provider customized to have our tools
        provider = OpenAIFunctionCallingProvider(functions=[get_location, get_weather])

        evalugator.api.dispatcher.PROVIDERS.append(provider)

    """

    # note: it's actually fine for this to be stateful, but this whole design is a bit weird
    #       when it comes to customization points for the provider itself when it comes
    #       to provider specific things (which I guess is to be expected)
    #
    #       the provider itself essentially needs to be customizable, and capable of holding
    #       some immutable state, but not mutable per execution?
    def __init__(self, functions: list[Callable[..., Any]], max_iterations_per_execution: int = 3) -> None:
        """
        Args:
            functions: Python callables to expose to the model as tools.
            max_iterations_per_execution: Upper bound on the number of completions
                requested per `execute` call (each one after the first follows a
                round of tool results being handed back to the model).
        """

        self._function_call_handler = FunctionCallHandler(functions=functions)
        
        # number of times to hand control back to the model after providing the result of a tool
        self._max_iterations_per_execution = max_iterations_per_execution

        # initialize client
        self._client = openai.OpenAI()

    def _get_model_name(self, model_id: str) -> str:
        """Translate one of this provider's model IDs into the OpenAI model name.

        Raises:
            KeyError: If `model_id` is not a key of `_MODEL_ID_TO_MODEL_NAME`.
        """
        return _MODEL_ID_TO_MODEL_NAME[model_id]

    def provides_model(self, model_id: str) -> bool:
        """Report whether `model_id` is one of the IDs this provider serves.

        Uses exact membership so that it always agrees with `_get_model_name`;
        a prefix match would claim IDs (ex: `function-calling-gpt-4o-foo`) that
        then fail with `KeyError` as soon as they are executed.
        """
        return model_id in _MODEL_ID_TO_MODEL_NAME
    
    def encode(self, model_id: str, *args: P.args, **kwargs: P.kwargs) -> str:
        """Encode via the stock `openai` provider, using the underlying model name.

        NOTE(review): assumes the `openai` provider picks the tokenizer from the
        real model name and knows nothing about the prefixed IDs - confirm.
        """
        return evalugator.api.providers.openai.encode(self._get_model_name(model_id), *args, **kwargs)
    
    def decode(self, model_id: str, *args: P.args, **kwargs: P.kwargs) -> str:
        """Decode via the stock `openai` provider, using the underlying model name.

        NOTE(review): same assumption as `encode` - confirm.
        """
        return evalugator.api.providers.openai.decode(self._get_model_name(model_id), *args, **kwargs)
    
    # note: I guess response needs to contain all messages for future request? The
    #       `message` class isn't flexible enough to hold actual history though,
    #       so it seems like this thing overall isn't designed for multi-turn
    def execute(self, model_id: str, request: evalugator.api.requests.Request,) -> evalugator.api.requests.Response:
        """
        Run `request`, resolving tool calls until the model answers without requesting any.

        Each iteration asks for a completion, appends the model message to the
        history, resolves every requested tool call and appends the results, then
        loops. The first completion without tool calls ends the execution.

        Args:
            model_id: One of the keys of `_MODEL_ID_TO_MODEL_NAME`.
            request: Request whose `prompt` holds the initial messages.

        Returns:
            A response whose `raw_responses` is the full message history as dicts
            (prompt, model messages and tool results).

        Raises:
            KeyError: If `model_id` is not served by this provider.
            MaxIterationsReachedError: If the model is still requesting tool calls
                after `max_iterations_per_execution` completions.
        """

        print('Executing function calling provider...')
        
        # list of {'role': ..., 'content': ...}
        messages = [dataclasses.asdict(x) for x in request.prompt]

        # loop invariant, also fails fast on an unknown model ID before any request is made
        model_name = self._get_model_name(model_id)

        for i in range(self._max_iterations_per_execution):

            iteration_count = i + 1

            print(f'Attempt {iteration_count} / {self._max_iterations_per_execution}')

            # TODO(bschoen): Schemas actually _can_ contain nested types, if they're pydantic
            #                or dataclasses can support that easily, but description gets annoying,
            #                note that's for the input though.
            # TODO(bschoen): Response format is extremely valuable, can allow specifying it too
            # Note: Can't use response_format, structured_output, and parallel function calling together
            print('Creating completion...')
            response: openai.types.chat.ChatCompletion = self._client.chat.completions.create(
                model=model_name,
                messages=messages,
                tools=self._function_call_handler.get_schema_for_tools_arg(),
            )

            # add message to history (true for both content and tool)
            assert len(response.choices) == 1
            
            choice: openai.types.chat.chat_completion.Choice = response.choices[0]
            message: openai.types.chat.ChatCompletionMessage = choice.message
            
            # `model_dump` is the non-deprecated spelling of pydantic's `.dict()`
            messages.append(message.model_dump())
    
            # show any model text responses
            if message.content:
                print(f'Model response: {message.content}')
        
            # resolve any tool calls
            if message.tool_calls:

                print(f'Processing {len(message.tool_calls)} tool calls')
                
                for tool_call in message.tool_calls:
            
                    print(f'Resolving tool call in response: {tool_call.id}')
                    tool_call_result_message = self._function_call_handler.resolve(tool_call=tool_call)

                    # add tool call result
                    print('Providing results from tool calls back to model...')
                    messages.append(tool_call_result_message)

            # otherwise, no tool calls left to resolve and we can return control back to the user
            else:
                print('\n---\nNo more tool calls left to resolve, breaking')

                # TODO(bschoen): could put parsed response_format in `context`
                return evalugator.api.requests.Response(
                    model_id=model_id,
                    request=request,
                    raw_responses=messages,
                    context=None,
                )

        # if we reach this point we've exhausted max iterations
        raise MaxIterationsReachedError(f'Exhausted max attempts: {self._max_iterations_per_execution}')

    
    

In [43]:
import tiktoken


# note: this is `gpt-4o`
# module-level tokenizer, read by the tokenization helper defined further down
gpt4_tokenizer = tiktoken.get_encoding("o200k_base") 

In [75]:
import dataclasses
import json

@dataclasses.dataclass(frozen=True)
class TokenInfo:
    """
    Immutable record describing a single token.

    Attributes:
        token_index: Index of the token (ex: 1)
        token_string: The actual token text representation (ex: `你好`, this will be a single token)
        token_split_into_characters: The characters that make up the token
            (ex: `['你', '好']`, or `['s', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y']`)
    """

    token_index: int
    token_string: str
    token_split_into_characters: list[str]

# TODO(bschoen): Probably want general pattern of partially binding with typed callables
#                (for example so this gets a tokenizer without the model needing to know about it)
def get_detailed_and_complete_tokenization_info_for_text(text: str) -> str:
    """
    Tokenize the given text and return detailed information about each token.

    This is useful any time the user asks questions that involve individual characters
    or anything else that involves manipulating substrings that are potentially shorter
    than a token.

    The output is a list of the following form:

        [
            {'token_index': 0, 'token_string': '你好', 'token_split_into_characters': ['你', '好']},
            {'token_index': 1, 'token_string': 'strawberry', 'token_split_into_characters': ['s', 't', 'r', 'a', 'w', 'b', 'e', 'r', 'r', 'y']}
        ]

    Where:
        - `token_index` is the index of the token in the original text
        - `token_string` is the actual token text representation
        - `token_split_into_characters` is the characters that make up the token

    Args:
        text (str): User provided text to tokenize, this can span multiple tokens (ex: `你好 strawberry`)

    Returns:
        str: The list above serialized as indented JSON, with non-ASCII characters kept verbatim
    """

    # ex: [177519, 101830]
    encoded_tokens = gpt4_tokenizer.encode(text)

    token_infos: list[TokenInfo] = []

    for token_index, token in enumerate(encoded_tokens):

        # recover original string for token (ex: `你好`)
        # note: bytes that are not valid utf-8 on their own come out as U+FFFD
        #       because of `errors="replace"`
        token_bytes = gpt4_tokenizer.decode_single_token_bytes(token)
        token_string = token_bytes.decode("utf-8", errors="replace")

        token_infos.append(TokenInfo(
            token_index=token_index,
            token_string=token_string,
            # one entry per character of the decoded token
            token_split_into_characters=list(token_string),
        ))

    # convert to json
    # note: `ensure_ascii=False` alone keeps the original characters verbatim, `json.dumps`
    #       already returns a `str` so no `encode` / `decode` round trip is needed
    return json.dumps([dataclasses.asdict(x) for x in token_infos], indent=2, ensure_ascii=False)

# show example
# note: per the output below, the space before `strawberry` belongs to the second token
text = '你好 strawberry'

print(get_detailed_and_complete_tokenization_info_for_text(text))

[
  {
    "token_index": 0,
    "token_string": "你好",
    "token_split_into_characters": [
      "你",
      "好"
    ]
  },
  {
    "token_index": 1,
    "token_string": " strawberry",
    "token_split_into_characters": [
      " ",
      "s",
      "t",
      "r",
      "a",
      "w",
      "b",
      "e",
      "r",
      "r",
      "y"
    ]
  }
]


In [86]:
# provider instance whose only tool is the tokenization helper
openai_function_provider = OpenAIFunctionCallingProvider(
    functions=[get_detailed_and_complete_tokenization_info_for_text]
)

# single user message, no extra context
request = evalugator.api.requests.Request(
    prompt=[
        evalugator.api.requests.Message(role='user', content="How many 'r's are in: 你好 strawberry"),
    ],
    context=None,
)

# note: calls the provider directly instead of going through `evalugator.api.Api`,
#       and uses the prefixed model ID rather than plain `gpt-4o`
response = openai_function_provider.execute(model_id='function-calling-gpt-4o', request=request)

Executing function calling provider...
Attempt 1 / 3
Creating completion...
Processing 1 tool calls
Resolving tool call in response: call_kqVDYVsBqohhcb6AuZkYuMud
Providing results from tool calls back to model...
Attempt 2 / 3
Creating completion...
Model response: In the text "你好 strawberry", you have 3 'r's. They are found in the "strawberry" token.

---
No more tool calls left to resolve, breaking
