In [None]:
import os

import pandas as pd
from datasets import load_dataset
from embedding_atlas.projection import compute_text_projection
from embedding_atlas.widget import EmbeddingAtlasWidget

In [None]:
# Fetch the validation split of the wine reviews dataset from the Hugging Face Hub
ds = load_dataset(
    path="james-burton/wine_reviews",
    split="validation",
)

# Convert it to a pandas DataFrame, which the cells below operate on
df = pd.DataFrame(data=ds)

In [None]:
# Embed the `description` text and project the embedding (Sentence Transformers is the default projector).
# The return value is not used; the widget cell below reads the columns named here from `df`.
compute_text_projection(
    df,
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)

In [None]:
# Build the Embedding Atlas widget over the projected dataset and keep a handle to it
# so that a later cell can query the selection
w = EmbeddingAtlasWidget(
    df,
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)

# Last expression of the cell: renders the widget
w

In [None]:
# Get the current selection from the widget `w` created above as a dataframe.
# NOTE(review): presumably reflects the points selected interactively in the rendered widget; confirm against the widget docs.
w.selection()

Embedding Atlas can compute text embeddings with any of the models supported by [LiteLLM](https://docs.litellm.ai/docs/embedding/supported_embedding).

To run the example below, first install [Ollama](https://ollama.com/download), then run the following command to download the [nomic-embed-text](https://ollama.com/library/nomic-embed-text) model:

```bash
ollama pull nomic-embed-text
```

In [None]:
# Recompute the embedding and its projection, this time through LiteLLM
# against an Ollama API running on this machine
compute_text_projection(
    df,
    # Embedding backend: LiteLLM routed to the local Ollama server
    text_projector="litellm",
    model="ollama/nomic-embed-text",
    api_base_url="http://localhost:11434",
    # Input text column and the output column names
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
    # Process batches one after another so the local API is not overwhelmed
    batch_size=512,
    sync=True,
)

In [None]:
# Show the dataset again in the Embedding Atlas widget, now based on the
# nomic-embed-text embeddings served by the Ollama API
EmbeddingAtlasWidget(
    df,
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)

In [None]:
# Async processing of batches raises error in environments with an already running event loop, so we must patch the loop
!uv pip install nest-asyncio -q
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Compute text embedding and projection of the embedding using OpenAI API
compute_text_projection(
    df,
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
    text_projector="litellm",
    # Read the OpenAI API key from the OPENAI_API_KEY environment variable rather than
    # hard-coding it in the notebook. A missing variable fails immediately with a KeyError
    # instead of sending a placeholder key to the API.
    api_key=os.environ["OPENAI_API_KEY"],
    model="openai/text-embedding-3-small",
    # OpenAI's limit is 300K input tokens per request, so choose the batch size
    # according to the average length of the `text` items
    batch_size=1024,
    # Since calling an API is IO-bound, we can benefit from async processing. This is the default behavior.
    sync=False,
)

In [None]:
# Show the dataset once more in the Embedding Atlas widget, now based on the
# text-embedding-3-small embeddings served by the OpenAI API
EmbeddingAtlasWidget(
    df,
    text="description",
    x="projection_x",
    y="projection_y",
    neighbors="neighbors",
)