Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions packages/backend/embedding_atlas/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,18 +104,20 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
@click.argument("inputs", nargs=-1, required=True)
@click.option("--text", default=None, help="Column containing text data.")
@click.option("--image", default=None, help="Column containing image data.")
@click.option("--vector", default=None, help="Column containing pre-computed vector embeddings.")
@click.option(
"--vector", default=None, help="Column containing pre-computed vector embeddings."
)
@click.option(
"--split",
default=[],
multiple=True,
help="Dataset split name(s) to load from Hugging Face datasets. Can be specified multiple times for multiple splits.",
)
@click.option(
"--embedding/--no-embedding",
"enable_embedding",
"--enable-projection/--disable-projection",
"enable_projection",
default=True,
help="Whether to compute embeddings for the data. Disable if embeddings are pre-computed or if you do not want an embedding view.",
help="Compute embedding projections from text/image/vector data. If disabled without pre-computed projections, the embedding view will be unavailable.",
)
@click.option(
"--model",
Expand All @@ -141,7 +143,7 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost
@click.option(
"--neighbors",
"neighbors_column",
help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}.',
help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}. IDs should be zero-based row indices.',
)
@click.option(
"--sample",
Expand Down Expand Up @@ -202,7 +204,7 @@ def main(
image: str | None,
vector: str | None,
split: list[str] | None,
enable_embedding: bool,
enable_projection: bool,
model: str | None,
trust_remote_code: bool,
x_column: str | None,
Expand All @@ -229,7 +231,7 @@ def main(

print(df)

if enable_embedding and (x_column is None or y_column is None):
if enable_projection and (x_column is None or y_column is None):
# No x/y columns selected: first check whether a text/image/vector column was specified; if not, prompt for one
if text is None and image is None and vector is None:
text = prompt_for_column(
Expand All @@ -246,7 +248,11 @@ def main(
umap_args["metric"] = umap_metric
# Run embedding and projection
if text is not None or image is not None or vector is not None:
from .projection import compute_image_projection, compute_text_projection, compute_vector_projection
from .projection import (
compute_image_projection,
compute_text_projection,
compute_vector_projection,
)

x_column = find_column_name(df.columns, "projection_x")
y_column = find_column_name(df.columns, "projection_y")
Expand Down
93 changes: 50 additions & 43 deletions packages/docs/tool.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ Optionally, if you know what column your text data is in beforehand, you can spe
embedding-atlas path_to_dataset.parquet --text text_column
```

Similarly, you may supply the `--image` flag for image data, or the `--vector` flag for pre-computed embedding vectors.
:::

If you've already pre-computed the embedding projection (e.g., by running your own embedding model and projecting the resulting embeddings with UMAP), you may store the coordinates as two columns such as `projection_x` and `projection_y`, and pass them to `embedding-atlas` with the `--x` and `--y` flags:
Expand All @@ -67,6 +68,7 @@ embedding-atlas path_to_dataset.parquet --x projection_x --y projection_y

You may also pass in the `--neighbors` flag to specify the column name for pre-computed nearest neighbors.
The `neighbors` column should have values in the following format: `{"ids": [id1, id2, ...], "distances": [d1, d2, ...]}`.
The IDs should be zero-based row indices.
If this column is specified, you'll be able to see nearest neighbors for a selected point in the tool.

Once the script has finished processing the data and started its web server, it will print out a URL like `http://localhost:5055/`. Open the URL in a web browser to view the embedding.
Expand All @@ -77,47 +79,52 @@ Once this script completes, it will print out a URL like `http://localhost:5055/
Usage: embedding-atlas [OPTIONS] INPUTS...

Options:
--text TEXT Column containing text data.
--image TEXT Column containing image data.
--split TEXT Dataset split name(s) to load from Hugging
Face datasets. Can be specified multiple times
for multiple splits.
--embedding / --no-embedding Whether to compute embeddings for the data.
Disable if embeddings are pre-computed or if
you do not want an embedding view.
--model TEXT Model name for generating embeddings (e.g.,
'all-MiniLM-L6-v2').
--trust-remote-code Allow execution of remote code when loading
models from Hugging Face Hub.
--x TEXT Column containing pre-computed X coordinates
for the embedding view.
--y TEXT Column containing pre-computed Y coordinates
for the embedding view.
--neighbors TEXT Column containing pre-computed nearest
neighbors in format: {"ids": [n1, n2, ...],
"distances": [d1, d2, ...]}.
--sample INTEGER Number of random samples to draw from the
dataset. Useful for large datasets.
--umap-n-neighbors INTEGER Number of neighbors to consider for UMAP
dimensionality reduction (default: 15).
--umap-min-dist FLOAT The min_dist parameter for UMAP.
--umap-metric TEXT Distance metric for UMAP computation (default:
'cosine').
--umap-random-state INTEGER Random seed for reproducible UMAP results.
--duckdb TEXT DuckDB connection mode: 'wasm' (run in
browser), 'server' (run on this server), or
URI (e.g., 'ws://localhost:3000').
--host TEXT Host address for the web server (default:
localhost).
--port INTEGER Port number for the web server (default:
5055).
--auto-port / --no-auto-port Automatically find an available port if the
specified port is in use.
--static TEXT Custom path to frontend static files
directory.
--export-application TEXT Export the visualization as a standalone web
application to the specified ZIP file and
exit.
--version Show the version and exit.
--help Show this message and exit.
--text TEXT Column containing text data.
--image TEXT Column containing image data.
--vector TEXT Column containing pre-computed vector
embeddings.
--split TEXT Dataset split name(s) to load from Hugging
Face datasets. Can be specified multiple
times for multiple splits.
--enable-projection / --disable-projection
Compute embedding projections from
text/image/vector data. If disabled without
pre-computed projections, the embedding view
will be unavailable.
--model TEXT Model name for generating embeddings (e.g.,
'all-MiniLM-L6-v2').
--trust-remote-code Allow execution of remote code when loading
models from Hugging Face Hub.
--x TEXT Column containing pre-computed X coordinates
for the embedding view.
--y TEXT Column containing pre-computed Y coordinates
for the embedding view.
--neighbors TEXT Column containing pre-computed nearest
neighbors in format: {"ids": [n1, n2, ...],
"distances": [d1, d2, ...]}. IDs should be
zero-based row indices.
--sample INTEGER Number of random samples to draw from the
dataset. Useful for large datasets.
--umap-n-neighbors INTEGER Number of neighbors to consider for UMAP
dimensionality reduction (default: 15).
--umap-min-dist FLOAT The min_dist parameter for UMAP.
--umap-metric TEXT Distance metric for UMAP computation
(default: 'cosine').
--umap-random-state INTEGER Random seed for reproducible UMAP results.
--duckdb TEXT DuckDB connection mode: 'wasm' (run in
browser), 'server' (run on this server), or
URI (e.g., 'ws://localhost:3000').
--host TEXT Host address for the web server (default:
localhost).
--port INTEGER Port number for the web server (default:
5055).
--auto-port / --no-auto-port Automatically find an available port if the
specified port is in use.
--static TEXT Custom path to frontend static files
directory.
--export-application TEXT Export the visualization as a standalone web
application to the specified ZIP file and
exit.
--version Show the version and exit.
--help Show this message and exit.
```