In [None]:
import sys, subprocess, os
from pathlib import Path

# Colab Setup
# On Google Colab: install the notebook's dependencies and, when no local
# "data" directory exists, clone the sandbox repository and move the assets
# this notebook needs into the working directory. Outside Colab this block
# does nothing.
if "google.colab" in sys.modules:
    print("Running in Google Colab. Installing dependencies...")
    # Run pip through the kernel's own interpreter so packages are installed
    # into the environment this notebook imports from, not whichever "pip"
    # happens to be first on PATH.
    pip_result = subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema", "plotly", "tqdm"])
    if pip_result.returncode != 0:
        # Best-effort: report the failure instead of silently continuing.
        print("Warning: dependency installation failed; later imports may not work.")

    # Check for data
    if not (Path.cwd() / "data").exists():
        print("Data directory not found. Cloning repository...")
        subprocess.run(["git", "clone", "https://github.com/aire-program/aire-researcher-sandbox.git", "_repo"])

        # Move data, scripts and the API client package to current directory
        if (Path("_repo/data").exists()):
            print("Moving data and scripts...")
            # "api" is included because a later cell imports
            # api.python.client_embeddings; leaving it inside _repo would
            # delete it in the cleanup step below.
            for asset in ("data", "scripts", "api"):
                # Only move what the clone actually contains, and never on top
                # of something that already exists in the working directory.
                if Path("_repo", asset).exists() and not Path(asset).exists():
                    subprocess.run(["mv", f"_repo/{asset}", "."])
            subprocess.run(["rm", "-rf", "_repo"])
        else:
            print("Warning: Data not found in cloned repo.")
    else:
        print("Data directory found.")


# Embeddings API Example

**What**: Generate and use text embeddings via a simulated API.

**Why**: Many modern AI workflows offload heavy computation (like embedding generation) to specialized APIs.

**How**:
1. **Send text** to the embedding endpoint.
2. **Receive vectors** in response.
3. **Use vectors** for similarity calculations.

**Key Concept**: **Embeddings** are dense numerical representations of text where similar meanings are close together in vector space.

By the end of this notebook, you will have generated embeddings for a few sample texts, computed their pairwise similarity, and checked your results against the success criteria below.

### Success criteria
- You generated embeddings for sample texts.
- You computed similarity and inspected scores.
- You executed a simple search over the texts.

In [None]:
import sys
from pathlib import Path

# Make the repository root importable: look in the working directory and up to
# two levels above it for the embeddings client module.
repo_root = Path.cwd()
for candidate in [repo_root, repo_root.parent, repo_root.parent.parent]:
    if (candidate / "api" / "python" / "client_embeddings.py").exists():
        # Skip the append when the entry is already present so re-running this
        # cell does not pile up duplicate sys.path entries.
        if str(candidate) not in sys.path:
            sys.path.append(str(candidate))
        break
else:
    # No candidate held the client module; the import below only succeeds if
    # the package is importable some other way.
    print("Warning: api/python/client_embeddings.py not found in the working directory or its two parent directories.")

# Must run after the sys.path adjustment above.
from api.python.client_embeddings import EmbeddingsClient

# Sample inputs to embed.
texts = [
    "Synthetic research abstract about reproducibility.",
    "Notes on experimental design and treatment arms.",
    "Overview of responsible AI documentation practices.",
]

# NOTE(review): max_features presumably bounds the embedding dimensionality;
# confirm against EmbeddingsClient.
client = EmbeddingsClient(max_features=32)
embeddings = client.embed(texts)
# Last expression of the cell, so its value is shown as the cell output.
embeddings.shape


## Pairwise similarity

In [None]:
# Ask the client for similarity scores over the sample texts defined earlier.
# NOTE(review): presumably returns a pairwise (len(texts) x len(texts)) matrix;
# confirm against EmbeddingsClient.similarity.
similarity = client.similarity(texts)
# Bare last expression: shown as the cell output.
similarity


### If you get stuck / What to try next

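**If you get stuck**: rerun the setup cell at the top of the notebook to reinstall dependencies, and make sure the cell that defines `texts` and `client` has run before the similarity cell. **What to try next**: connect the embeddings to a retrieval workflow, or try the text notebooks for qualitative checks.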