In [1]:
# --- Imports ------------------------------------------------------------------
# Using standard library facilities:
# - os: interacting with the operating system for environment variables and file paths.
import os

# Using NumPy for efficient numerical arrays and vector operations required by embedding tensors.
import numpy as np

# Using the OpenAI SDK as the model-facing client for text generation and embeddings.
from openai import OpenAI

# Using Chroma as the vector database for persistent storage and similarity search over embeddings.
import chromadb

# Using the EmbeddingFunction protocol to adapt an embedding provider to Chroma’s ingestion/query pipeline.
from chromadb.api.types import EmbeddingFunction

In [2]:
# --- Configuration block ------------------------------------------------------
# The OpenAI key is read from the environment so that no secret is stored in the
# notebook itself; execution stops here if the variable is unset or empty.
OPENAI_API = os.environ.get("OPENAI_API")
assert OPENAI_API, "Missing OPENAI_API"

# Model identifiers are pinned as constants so that every cell refers to the same models.
# LLM_MODEL is used for chat generation; EMB_MODEL is used for embeddings.
LLM_MODEL = "gpt-4.1-nano"           # chat model used for generation
EMB_MODEL = "text-embedding-3-small" # embedding model used for vectorization

# --- Clients ------------------------------------------------------------------
# OpenAI SDK client, authenticated with the key held in OPENAI_API.
llm_client = OpenAI(api_key=OPENAI_API)

# Chroma Cloud client used as the vector store.
# The tenant and API key are read from the CHROMA_TENANT and CHROMADB_TOKEN environment
# variables; the database name is fixed to "Test". Neither variable is checked here, so an
# unset variable is forwarded to CloudClient as None.
chroma = chromadb.CloudClient(
    api_key=os.environ.get("CHROMADB_TOKEN"),
    tenant=os.environ.get("CHROMA_TENANT"),
    database="Test",
)

In [3]:
# --- Generation parameters ----------------------------------------------------
# Decoding hyperparameters, pinned in one place so every request uses the same settings.
# TEMPERATURE controls sampling randomness. At 0.7 the output is NOT deterministic;
# lower it toward 0 when more repeatable answers are needed.
# MAX_TOKENS bounds generation length and cost.
TEMPERATURE = 0.7
MAX_TOKENS = 100

# Backward-compatible alias: the constant was originally misspelled, and other cells
# may still reference the old name. Prefer TEMPERATURE in new code.
TEMPERATUER = TEMPERATURE

In [4]:
# --- Embedding wrapper --------------------------------------------------------
# Adapter that exposes OpenAI's embeddings endpoint through Chroma's
# EmbeddingFunction interface, so the embedding provider can be swapped without
# touching the code that calls it.
class OpenAIEmbeddingFunction(EmbeddingFunction[str]):
    """Callable that embeds text with an OpenAI embedding model.

    Args:
        client: OpenAI SDK client used to send the embedding requests.
        model: Identifier of the embedding model sent with every request.
    """

    def __init__(self, client: OpenAI, model: str):
        self.client, self.model = client, model

    def __call__(self, inputs):
        """Embed a single string or a batch of strings in one API request.

        Args:
            inputs: One string, or a collection of strings.

        Returns:
            A list of float32 NumPy vectors, one per item in the API response.
        """
        # NOTE(review): Chroma's documented protocol names this parameter `input`;
        # confirm the installed Chroma version accepts `inputs` before relying on it.
        # A bare string is wrapped so the request always carries a batch.
        batch = [inputs] if isinstance(inputs, str) else inputs
        response = self.client.embeddings.create(input=batch, model=self.model)
        # Each returned embedding is converted to a float32 NumPy array.
        vectors = []
        for record in response.data:
            vectors.append(np.array(record.embedding, dtype=np.float32))
        return vectors

# Shared embedding function instance, bound to the OpenAI client and the embedding
# model chosen in the configuration cell.
emb_fn = OpenAIEmbeddingFunction(client=llm_client, model=EMB_MODEL)

In [5]:
# --- LLM interaction ----------------------------------------------------------
# Sends one small chat completion request as a connectivity and authentication check.
# Request fields:
# - messages: the dialogue context, one system instruction followed by one user question.
# - model: the chat model to query.
# - max_tokens: upper bound on generated tokens, limiting cost and verbosity.
# - temperature: sampling randomness used while decoding.
# NOTE: the temperature constant is referenced under the spelling TEMPERATUER, which is
# the name it is defined with; keep the two in sync if it is ever renamed.
chat = llm_client.chat.completions.create(
    messages=[
        dict(role="system", content="Eres un asistente útil y conciso."),
        dict(role="user", content="¿Cuál es la capital de españa?"),
    ],
    model=LLM_MODEL,
    max_tokens=MAX_TOKENS,
    temperature=TEMPERATUER,
)

In [6]:
# --- Validation cell: LLM connectivity test -----------------------------------
# Prints the message content of the first choice in `chat` as a quick manual check
# that the completion request returned a usable reply.
print(chat.choices[0].message.content)

La capital de España es Madrid.
