<a href="https://colab.research.google.com/github/alarcon7a/openai-api-tutorial/blob/main/src/OpenAI_API_4_Tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Instalar librerías

In [None]:
%pip install --upgrade tiktoken==0.8.0 -q
%pip install --upgrade openai==1.58.1 -q

In [None]:
import tiktoken

### Encoding y Decoding

| Encoding name           | OpenAI models                                                                 |
|-------------------------|-------------------------------------------------------------------------------|
| `o200k_base`            | `gpt-4o`, `gpt-4o-mini`                                                      |
| `cl100k_base`           | `gpt-4-turbo`, `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |
| `p50k_base`             | Codex models, `text-davinci-002`, `text-davinci-003`                         |
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci`                                                  |


In [None]:
# 1. Import the tokenizer library
import tiktoken

# 2. Resolve the encoding that belongs to the target model
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# 3. Encode: text -> list of integer token ids
text = "¡Hola, mundo! Esto es una prueba."
tokens = encoding.encode(text)
print(
    f"Texto: {text}",
    f"Tokens (enteros): {tokens}",
    sep="\n",
)

# 4. Decode: token ids -> text
decoded_text = encoding.decode(tokens)
print(f"Texto decodificado: {decoded_text}")



Texto: ¡Hola, mundo! Esto es una prueba.
Tokens (enteros): [20407, 49864, 11, 10225, 0, 43584, 878, 1969, 49548, 13]
Texto decodificado: ¡Hola, mundo! Esto es una prueba.


In [None]:
# Map every token id back to the raw bytes it stands for
token_bytes = list(map(encoding.decode_single_token_bytes, tokens))
print(f"Texto: {text}")
print("\nTokens (bytes):", token_bytes)
print("\nTotal de tokens", len(token_bytes))

Texto: ¡Hola, mundo! Esto es una prueba.

Tokens (bytes): [b'\xc2\xa1', b'Hola', b',', b' mundo', b'!', b' Esto', b' es', b' una', b' prueba', b'.']

Total de tokens 10


In [None]:
# 5. Count the tokens in a string
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` occupies under the named encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``
            (for example ``"o200k_base"``).

    Returns:
        The length of the token list produced by encoding ``string``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

# Demo: count the tokens of a short sentence with the o200k_base encoding.
num_tokens = num_tokens_from_string(
    string="Suscribete al canal",
    encoding_name="o200k_base",
)
print(f"Numero de tokens: {num_tokens}")

Numero de tokens: 5


## Calculadora de tokens en Python

In [None]:
import tiktoken

def num_tokens_from_messages(messages, model="gpt-4o-mini-2024-07-18"):
    """Count the prompt tokens for a list of chat messages.

    Args:
        messages: Iterable of dicts. Every value of every dict is passed to
            the tokenizer's ``encode`` and its length is counted.
        model: Model name used to pick the encoding. Pinned snapshot names
            are counted directly; any other name that contains a known
            family name is redirected, with a printed warning, to a pinned
            snapshot of that family.

    Returns:
        Total token count: 3 per message, plus the encoded length of each
        value, plus 1 for every ``"name"`` key, plus 3 for the reply priming.

    Raises:
        NotImplementedError: If ``model`` is neither a pinned snapshot nor
            contains one of the known family names.
    """
    # Resolve the tokenizer first; names tiktoken does not know fall back
    # to o200k_base (with a warning) before the model check below.
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using o200k_base encoding.")
        tokenizer = tiktoken.get_encoding("o200k_base")

    pinned_snapshots = {
        "gpt-3.5-turbo-0125",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-2024-08-06",
    }

    if model not in pinned_snapshots:
        # Matching is by substring, so the order matters: "gpt-4o-mini" must
        # be tried before "gpt-4o", and "gpt-4o" before "gpt-4".
        family_fallbacks = (
            (
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-0125",
                "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.",
            ),
            (
                "gpt-4o-mini",
                "gpt-4o-mini-2024-07-18",
                "Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.",
            ),
            (
                "gpt-4o",
                "gpt-4o-2024-08-06",
                "Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.",
            ),
            (
                "gpt-4",
                "gpt-4-0613",
                "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.",
            ),
        )
        for family, snapshot, warning in family_fallbacks:
            if family in model:
                print(warning)
                return num_tokens_from_messages(messages, model=snapshot)
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}."""
        )

    # Fixed overheads shared by every pinned snapshot listed above.
    per_message_overhead = 3
    per_name_overhead = 1

    total = 0
    for message in messages:
        total += per_message_overhead
        for field, text in message.items():
            total += len(tokenizer.encode(text))
            if field == "name":
                total += per_name_overhead
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

# Minimal two-message conversation used to exercise the counter.
example_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."},
]

# "gpt-4o-mini" is not a pinned snapshot name, so the function prints a
# warning and counts as gpt-4o-mini-2024-07-18.
num_tokens = num_tokens_from_messages(example_messages, model="gpt-4o-mini")
print(f"Number of tokens in messages: {num_tokens}")

Number of tokens in messages: 22
