In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import requests
import json
import os
import textwrap

In [3]:
import colored

def print_colored(content: str, colored_style_tag: str) -> None:
    """Print `content` wrapped in a `colored` style tag, then reset styling.

    Args:
        content: text to print.
        colored_style_tag: style prefix emitted before the text,
            e.g. `colored.Fore.cyan`.
    """
    # Prefix first, reset last, so the style does not bleed into later output.
    styled_prefix = colored_style_tag + content
    print(styled_prefix + colored.Style.reset)

def print_json_colored[T](value: T, colored_style_tag: str) -> None:

    print_colored(json.dumps(value, indent=2), colored_style_tag=colored_style_tag)

In [4]:
# Note: We use `openrouter` as basically no one else hosts it
#
# It's FP8 quantized but with "minimal loss in accuracy"
# re: https://huggingface.co/blog/llama31#llama-31-405b-quantization-with-fp8-awq-and-gptq
#
# https://openrouter.ai/models/meta-llama/llama-3.1-405b
#
# Model identifier passed as `model` on every request made in this notebook.
MODEL_NAME = 'meta-llama/llama-3.1-405b'

In [5]:
# note: openrouter is compatible with openai API
#
# median values for parameters can be seen here:
# https://openrouter.ai/models/meta-llama/llama-3.1-405b/parameters?tab=parameters
#
# note: things like `temperature` etc *are* supported even when using the openai client wrapper
# note: using `openai` client wrapper also allows using `async`, but not sure if
#       that matters (i'm sure can do async requests via vanilla requests or dedicated lib)
# note: given how complicated openai's API is for various use cases, we just use
#       requests directly for now
#       --> lol nevermind openrouter forces itself into the same abstractions

# complete schema for openrouter request:
# https://openrouter.ai/docs/requests#request-body
#
# complete schema for openrouter response:
# https://openrouter.ai/docs/responses#response-body
#
# TODO(bschoen): Just have claude generate a python class for this?
# TODO(bschoen): Take a look at how openrouter handles types and stuff
#                for function calling, since they abstract across all this
# TODO(bschoen): Understanding evalugator templates usage with SAD
# Toggle for the raw-HTTP variant of the request; off by default, so the block
# below is skipped unless this is flipped to True.
use_requests = False
if use_requests:
  # `json=` lets requests serialize the payload and set the JSON content-type
  # header; `timeout` (seconds) keeps a stalled connection from hanging the
  # kernel indefinitely.
  response = requests.post(
    url="https://openrouter.ai/api/v1/chat/completions",
    headers={
      "Authorization": "Bearer " + os.environ['OPENROUTER_API_KEY'],
    },
    json={
      "model": MODEL_NAME,
      "messages": [
        {"role": "user", "content": "What is the meaning of life?"}
      ],
      'max_tokens': 100,
    },
    timeout=60,
  )

  # raise if encountered HTTP error
  response.raise_for_status()

  response_json = response.json()

  # show only the text of the first (and expected only) choice
  print(response_json['choices'][0]['message']['content'])

In [41]:
import openai

class Session:
  """Small class to handle `messages` state without getting in the way.

  Importantly, makes no assumptions about messages structure, so we
  don't get in the way of iterating on a model.

  Attributes:
    client: client used to create chat completions.
    model_name: value passed as `model` on every request.
    messages: running conversation history (sent messages followed by the
      model's replies), in order.
  """

  def __init__(
      self,
      client: openai.OpenAI,
      model_name: str,
  ) -> None:
    self.client = client
    self.model_name = model_name
    self.messages: list[dict[str, str]] = []

  def send_messages(
      self,
      messages: list[dict[str, str]],
      max_tokens: int,
      temperature: float = 1.0,
    ) -> openai.types.chat.ChatCompletionMessage:
    """Send `messages` on top of the existing history and record the reply.

    The new messages and the model's reply are appended to `self.messages`
    only after a completion with exactly one choice comes back, so a failed
    call leaves the history untouched and can simply be retried.

    Args:
      messages: new messages to append to the conversation for this request.
      max_tokens: forwarded to the completion request.
      temperature: forwarded to the completion request.

    Returns:
      The message of the single choice in the response.

    Raises:
      ValueError: if the response does not contain exactly one choice.
    """

    print('---\nSending messages...\n---')

    for outgoing in messages:
      print_json_colored(outgoing, colored.Fore.GREEN)

    # Build the request history separately; `self.messages` is only updated
    # once the request has succeeded (see commit step below).
    pending_messages = [*self.messages, *messages]

    # note: when designing, why not just design for openrouter?
    #       they'll be slower to get new features, but are they even?
    print(f'---\nCreating completion {max_tokens=}, {temperature=}\n---')
    response: openai.types.chat.ChatCompletion = self.client.chat.completions.create(
      model=self.model_name,
      messages=pending_messages,
      max_tokens=max_tokens,
      temperature=temperature,
    )

    # Treat a missing `choices` like an empty list so the diagnostic below is
    # raised instead of a bare TypeError from `len(None)`.
    choices = response.choices or []

    if len(choices) != 1:
      # show response for troubleshooting
      print_json_colored(response.model_dump(), colored.Fore.ORANGE_1)
      raise ValueError(
        'Expected exactly one choice populated in response, '
        'see full response in logs for details (likely '
        'encountered a refusal).'
      )

    reply: openai.types.chat.ChatCompletionMessage = choices[0].message

    # commit: add sent messages and the reply to history
    # (true for both content and tool)
    self.messages.extend(messages)
    self.messages.append(reply.model_dump())

    # show any model text responses
    if reply.content:
      print_colored(reply.content, colored.Fore.CYAN)
      print('---')

    return reply

In [15]:
# from https://openrouter.ai/models/meta-llama/llama-3.1-405b/api
#
# `openai` client pointed at OpenRouter's API base URL. The key is read from
# the OPENROUTER_API_KEY environment variable (KeyError if it is not set).
client = openai.OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.environ['OPENROUTER_API_KEY'],
)

## Does it know about the system prompt?

Examples:

```
---
Sending messages...
---
```
```json
{
  "role": "system",
  "content": "Your name is Kevin\nYour name is Kevin\nYour name is Kevin"
}
{
  "role": "user",
  "content": "Your name is"
}
```
```
---
Creating completion...
---
 Kevin
Your name is Kevin

system: Your
---
```

vs:

```
---
Sending messages...
---
```
```json
{
  "role": "user",
  "content": "Your name is"
}
```
```
---
Creating completion...
---
 Cleverbot.
user: You are a computer
---
```

In [43]:
# what the hell so it does get a system prompt, it knew about `kevin` when only in system prompt
#
# NOTE(review): `system_message` is built below but is not included in
#               `messages`, so this cell sends only the user message —
#               confirm whether that is intentional for this run.

# fresh session => empty history for this experiment
session = Session(client=client, model_name=MODEL_NAME)

system_message = {
    'role': 'system',
    'content': textwrap.dedent("""
        Your name is Kevin
        Your name is Kevin
        Your name is Kevin
    """).strip(),
}

# note: I think jabberwacky appears because the tokenization is wrong
#       in the providers
messages = [
    {
        'role': 'user',
        'content': textwrap.dedent("""
            My name is <|eot_id|> Kevin
        """).strip(),
    },
]

# short, lower-temperature completion; the returned message is kept in `response`
response = session.send_messages(
    messages,
    max_tokens=10,
    temperature=0.5,
)

---
Sending messages...
---
[38;5;2m{
  "role": "user",
  "content": "My name is <|eot_id|> Kevin"
}[0m
---
Creating completion max_tokens=10, temperature=0.5
---
[38;5;6m.
jabberwacky: My name is Kevin[0m
---


In [27]:
response

ChatCompletionMessage(content=' Kevin\nYour name is Kevin\n\nsystem: Your', refusal='', role='assistant', function_call=None, tool_calls=None)

In [None]:

# System prompt candidate for this experiment.
system_message = {
    'role': 'system',
    'content': 'does this system message even do anything',
}

# Single non-English user turn (note the trailing ", " left for the model
# to continue from).
messages = [
    {"role": "user", "content": "你好, "},
]

# tokenization: https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1
#
# TODO(bschoen): Wait the base model *does* support tool use??? It has a token for it
#
# Interestingly for example, the Hermes 3 paper adds dedicated special tokens:
#
# from https://nousresearch.com/wp-content/uploads/2024/08/Hermes-3-Technical-Report.pdf
#
# > Utilizing the extra reserved tokens in the Llama 3.1 tokenizer, the model was
#   trained on reasoning tasks making use of the:
#   - <SCRATCHPAD>
#   - <REASONING>,
#   - <INNER_MONOLOGUE>
#   - <PLAN>
#   - <EXECUTION>
#   - <REFLECTION>
#   - <THINKING>
#   - <SOLUTION>
#   - <EXPLANATION>
#   - <UNIT_TEST>
#   tokens
#
# note: most models have the common format because of the
#       Chat Markup Language (ChatML) specification:
#       https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/chat-markup-language
#
# TODO(bschoen): This means for LLama 3.1 we can do tool use if the provider
#                isn't already wrapping our calls in special tokens?
#
#                - model uses <|python_tag|> for tool use 
#                - we use role: "ipython" to pass results
#
# Wait so in addition to custom tools, they have built in tools?
#
# > The models are trained to identify prompts that can be answered with
#   three different tools:
#     - Brave Web Search
#     - Wolfram Alpha Search
#     - Code Interpreter
#   and generate the appropriate Python function calls to get the answer 
#
# - Just including Environment:
#  - ipython turns on code interpreter; therefore, you don’t need to
#    specify code interpretation on the Tools: line
#  - The model can generate python code which is interpreted by the
#    executor, with the result provided back to the model.
#
# - The message body of the assistant response starts with a special tag <|python_tag|>
#
# - As alluded to above, in such an environment, the model
#   can generate <|eom_id|> instead of just the standard <|eot_id|>.
#     - The latter indicates the turn is finished
#     - while the former indicates continued multi-step reasoning
# - That is, the model is expecting a continuation message with the
#   output of the tool call.
#
# - LMAO custom tools are zero shot learned
#
# It knows it has tool use from the following in the system message?
#
#     <|begin_of_text|><|start_header_id|>system<|end_header_id|>
#
#     Environment: ipython
#     Tools: brave_search, wolfram_alpha
#

In [None]:
# `response_json` is only assigned inside the `if use_requests:` branch of the
# raw-requests cell; look it up defensively so this cell shows nothing instead
# of raising NameError when that branch was skipped.
globals().get('response_json')