In [25]:
import tiktoken

# Look up the tokenizer that tiktoken maps to the "gpt-4" model name.
encoding = tiktoken.encoding_for_model(model_name="gpt-4")

In [21]:
# Turn the sample sentence into its list of token ids and display it.
encoded_data = encoding.encode(text="photosynthesys is great!")
encoded_data

[25625, 1910, 6509, 1065, 374, 2294, 0]

In [22]:
# Round-trip check: map the token ids back to text and display the result.
decoded_data = encoding.decode(tokens=encoded_data)
decoded_data

'photosynthesys is great!'

In [23]:
# Show the bytes behind each individual token id so the token boundaries are
# visible. decode_tokens_bytes is the library's built-in for this per-token
# decoding, replacing the manual loop over decode_single_token_bytes.
data = encoding.decode_tokens_bytes(encoded_data)
data

[b'photos', b'yn', b'thes', b'ys', b' is', b' great', b'!']

## Embeddings from text

In [1]:
import os 
from openai import AzureOpenAI

# Azure OpenAI settings come from environment variables so that no endpoint or
# secret is hard-coded in the notebook.

# Required: os.environ[...] raises KeyError when the variable is unset.
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# NOTE(review): when AZURE_OPENAI_KEY is unset this becomes "" and is handed to
# the client as-is -- confirm whether it should fail fast like the endpoint.
azure_openai_key = os.getenv("AZURE_OPENAI_KEY", "")
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
# Environment values are strings, so the fallback is a string too and both
# paths go through the same int() conversion.
azure_openai_embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", "1024"))
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01")
# The model name sent with each request is the deployment name; reuse the value
# read above instead of looking up the same variable and default a second time.
embedding_model_name = azure_openai_embedding_deployment

# Client used by the embedding cells below.
openai_client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
)

def get_embeddings(text):
    """Request an embedding for ``text`` and return the first result's vector.

    The request goes through the module-level ``openai_client`` using
    ``embedding_model_name`` as the model and
    ``azure_openai_embedding_dimensions`` as the requested dimensions.

    Args:
        text: Input forwarded unchanged as the ``input`` of the embeddings request.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
        Any further items in ``data`` are ignored.
    """
    result = openai_client.embeddings.create(
        input=text,
        model=embedding_model_name,
        dimensions=azure_openai_embedding_dimensions,
    )
    first_item = result.data[0]
    return first_item.embedding

In [None]:
# Embed the sample sentence. Show only the vector's length and its first few
# values instead of dumping every component into the cell output.
sample_embedding = get_embeddings("photosynthesys is great!")
len(sample_embedding), sample_embedding[:5]