diff --git a/packages/backend/embedding_atlas/cli.py b/packages/backend/embedding_atlas/cli.py index 9c4c71e..e48c0e7 100644 --- a/packages/backend/embedding_atlas/cli.py +++ b/packages/backend/embedding_atlas/cli.py @@ -104,7 +104,9 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost @click.argument("inputs", nargs=-1, required=True) @click.option("--text", default=None, help="Column containing text data.") @click.option("--image", default=None, help="Column containing image data.") -@click.option("--vector", default=None, help="Column containing pre-computed vector embeddings.") +@click.option( + "--vector", default=None, help="Column containing pre-computed vector embeddings." +) @click.option( "--split", default=[], @@ -112,10 +114,10 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost help="Dataset split name(s) to load from Hugging Face datasets. Can be specified multiple times for multiple splits.", ) @click.option( - "--embedding/--no-embedding", - "enable_embedding", + "--enable-projection/--disable-projection", + "enable_projection", default=True, - help="Whether to compute embeddings for the data. Disable if embeddings are pre-computed or if you do not want an embedding view.", + help="Compute embedding projections from text/image/vector data. If disabled without pre-computed projections, the embedding view will be unavailable.", ) @click.option( "--model", @@ -141,7 +143,7 @@ def find_available_port(start_port: int, max_attempts: int = 10, host="localhost @click.option( "--neighbors", "neighbors_column", - help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}.', + help='Column containing pre-computed nearest neighbors in format: {"ids": [n1, n2, ...], "distances": [d1, d2, ...]}. IDs should be zero-based row indices.', ) @click.option( "--sample", @@ -202,7 +204,7 @@ def main( image: str | None, vector: str | None, split: list[str] | None, - enable_embedding: bool, + enable_projection: bool, model: str | None, trust_remote_code: bool, x_column: str | None, @@ -229,7 +231,7 @@ def main( print(df) - if enable_embedding and (x_column is None or y_column is None): + if enable_projection and (x_column is None or y_column is None): # No x, y column selected, first see if text/image/vectors column is specified, if not, ask for it if text is None and image is None and vector is None: text = prompt_for_column( @@ -246,7 +248,11 @@ def main( umap_args["metric"] = umap_metric # Run embedding and projection if text is not None or image is not None or vector is not None: - from .projection import compute_image_projection, compute_text_projection, compute_vector_projection + from .projection import ( + compute_image_projection, + compute_text_projection, + compute_vector_projection, + ) x_column = find_column_name(df.columns, "projection_x") y_column = find_column_name(df.columns, "projection_y") diff --git a/packages/docs/tool.md b/packages/docs/tool.md index 70c0708..c6371e6 100644 --- a/packages/docs/tool.md +++ b/packages/docs/tool.md @@ -57,6 +57,7 @@ Optionally, if you know what column your text data is in beforehand, you can spe embedding-atlas path_to_dataset.parquet --text text_column ``` +Similarly, you may supply the `--image` flag for image data, or the `--vector` flag for pre-computed embedding vectors. ::: If you've already pre-computed the embedding projection (e.g., by running your own embedding model and projecting them with UMAP), you may store them as two columns such as `projection_x` and `projection_y`, and pass them into `embedding-atlas` with the `--x` and `--y` flags: @@ -67,6 +68,7 @@ embedding-atlas path_to_dataset.parquet --x projection_x --y projection_y You may also pass in the `--neighbors` flag to specify the column name for pre-computed nearest neighbors. The `neighbors` column should have values in the following format: `{"ids": [id1, id2, ...], "distances": [d1, d2, ...]}`. +The IDs should be zero-based row indices. If this column is specified, you'll be able to see nearest neighbors for a selected point in the tool. Once this script completes, it will print out a URL like `http://localhost:5055/`. Open the URL in a web browser to view the embedding. @@ -77,47 +79,52 @@ Once this script completes, it will print out a URL like `http://localhost:5055/ Usage: embedding-atlas [OPTIONS] INPUTS... Options: - --text TEXT Column containing text data. - --image TEXT Column containing image data. - --split TEXT Dataset split name(s) to load from Hugging - Face datasets. Can be specified multiple times - for multiple splits. - --embedding / --no-embedding Whether to compute embeddings for the data. - Disable if embeddings are pre-computed or if - you do not want an embedding view. - --model TEXT Model name for generating embeddings (e.g., - 'all-MiniLM-L6-v2'). - --trust-remote-code Allow execution of remote code when loading - models from Hugging Face Hub. - --x TEXT Column containing pre-computed X coordinates - for the embedding view. - --y TEXT Column containing pre-computed Y coordinates - for the embedding view. - --neighbors TEXT Column containing pre-computed nearest - neighbors in format: {"ids": [n1, n2, ...], - "distances": [d1, d2, ...]}. - --sample INTEGER Number of random samples to draw from the - dataset. Useful for large datasets. - --umap-n-neighbors INTEGER Number of neighbors to consider for UMAP - dimensionality reduction (default: 15). - --umap-min-dist FLOAT The min_dist parameter for UMAP. - --umap-metric TEXT Distance metric for UMAP computation (default: - 'cosine'). - --umap-random-state INTEGER Random seed for reproducible UMAP results. - --duckdb TEXT DuckDB connection mode: 'wasm' (run in - browser), 'server' (run on this server), or - URI (e.g., 'ws://localhost:3000'). - --host TEXT Host address for the web server (default: - localhost). - --port INTEGER Port number for the web server (default: - 5055). - --auto-port / --no-auto-port Automatically find an available port if the - specified port is in use. - --static TEXT Custom path to frontend static files - directory. - --export-application TEXT Export the visualization as a standalone web - application to the specified ZIP file and - exit. - --version Show the version and exit. - --help Show this message and exit. + --text TEXT Column containing text data. + --image TEXT Column containing image data. + --vector TEXT Column containing pre-computed vector + embeddings. + --split TEXT Dataset split name(s) to load from Hugging + Face datasets. Can be specified multiple + times for multiple splits. + --enable-projection / --disable-projection + Compute embedding projections from + text/image/vector data. If disabled without + pre-computed projections, the embedding view + will be unavailable. + --model TEXT Model name for generating embeddings (e.g., + 'all-MiniLM-L6-v2'). + --trust-remote-code Allow execution of remote code when loading + models from Hugging Face Hub. + --x TEXT Column containing pre-computed X coordinates + for the embedding view. + --y TEXT Column containing pre-computed Y coordinates + for the embedding view. + --neighbors TEXT Column containing pre-computed nearest + neighbors in format: {"ids": [n1, n2, ...], + "distances": [d1, d2, ...]}. IDs should be + zero-based row indices. + --sample INTEGER Number of random samples to draw from the + dataset. Useful for large datasets. + --umap-n-neighbors INTEGER Number of neighbors to consider for UMAP + dimensionality reduction (default: 15). + --umap-min-dist FLOAT The min_dist parameter for UMAP. + --umap-metric TEXT Distance metric for UMAP computation + (default: 'cosine'). + --umap-random-state INTEGER Random seed for reproducible UMAP results. + --duckdb TEXT DuckDB connection mode: 'wasm' (run in + browser), 'server' (run on this server), or + URI (e.g., 'ws://localhost:3000'). + --host TEXT Host address for the web server (default: + localhost). + --port INTEGER Port number for the web server (default: + 5055). + --auto-port / --no-auto-port Automatically find an available port if the + specified port is in use. + --static TEXT Custom path to frontend static files + directory. + --export-application TEXT Export the visualization as a standalone web + application to the specified ZIP file and + exit. + --version Show the version and exit. + --help Show this message and exit. ```