In [2]:
# --- Imports ------------------------------------------------------------------
# Using standard library facilities:
# - os: interacting with the operating system for environment variables and file paths.
import os

# Using NumPy for efficient numerical arrays and vector operations required by embedding tensors.
import numpy as np

# Using the OpenAI SDK as the model-facing client for text generation and embeddings.
from openai import OpenAI

# Using Chroma as the vector database for persistent storage and similarity search over embeddings.
import chromadb

# Using the EmbeddingFunction protocol to adapt an embedding provider to Chroma’s ingestion/query pipeline.
from chromadb.api.types import EmbeddingFunction

In [3]:
# --- Configuration block ------------------------------------------------------
# Credentials are read from environment variables so no secret is hard-coded in
# the notebook. The assertion fails fast when the OpenAI key is not set.
OPENAI_API = os.getenv("OPENAI_API")
assert OPENAI_API, "Missing OPENAI_API"

# Model identifiers are pinned explicitly so runs can be compared against each other.
# LLM_MODEL is used for chat generation; EMB_MODEL is used for vectorization.
LLM_MODEL = "gpt-4.1-nano"           # primary conversational LLM
EMB_MODEL = "text-embedding-3-small" # lightweight, cost-efficient embeddings

# --- Generation parameters ----------------------------------------------------
# TEMPERATURE controls stochasticity during decoding. With a non-zero value such
# as 0.7 the replies are NOT deterministic across runs.
# MAX_TOKENS bounds generation length and cost.
TEMPERATURE = 0.7
# Backward-compatible alias: the name was originally misspelled and the chat
# completion cell still refers to it under this spelling.
TEMPERATUER = TEMPERATURE
MAX_TOKENS = 100

# --- Clients ------------------------------------------------------------------
# OpenAI client, authenticated with the key read from the environment above.
llm_client = OpenAI(api_key=OPENAI_API)

# Chroma Cloud client used as the vector store. The tenant and API key are read
# from the environment; the database name is fixed to "Test".
chroma = chromadb.CloudClient(
    tenant=os.getenv("CHROMA_TENANT"),
    database="Test",
    api_key=os.getenv("CHROMADB_TOKEN")
)

In [4]:
# --- Embedding wrapper --------------------------------------------------------
# Adapter that exposes the OpenAI embeddings endpoint through Chroma's
# EmbeddingFunction interface, so the embedding provider can be swapped without
# touching the code that consumes the vectors.
class OpenAIEmbeddingFunction(EmbeddingFunction[str]):
    """Callable that turns text into embedding vectors via an OpenAI client."""

    def __init__(self, client: OpenAI, model: str):
        """Remember which client and which embedding model to use.

        Args:
            client: OpenAI client used to issue embedding requests.
            model: Identifier of the embedding model passed to the API.
        """
        self.model = model
        self.client = client

    def __call__(self, inputs):
        """Embed one string or a batch of strings.

        Args:
            inputs: A single string, or a collection of strings.

        Returns:
            A list with one float32 NumPy array per entry of the response data.
        """
        # A bare string is wrapped so the request is always sent as a batch.
        batch = [inputs] if isinstance(inputs, str) else inputs
        response = self.client.embeddings.create(model=self.model, input=batch)

        # Convert every returned embedding into a float32 NumPy array.
        vectors = []
        for record in response.data:
            vectors.append(np.array(record.embedding, dtype=np.float32))
        return vectors

# Shared embedding function instance used for explicit precomputation of vectors.
emb_fn = OpenAIEmbeddingFunction(llm_client, EMB_MODEL)

In [5]:
# --- LLM interaction ----------------------------------------------------------
# Minimal chat completion used as a connectivity and authentication check.
# The request is assembled first and then sent in a single call:
# - model: the target conversational model.
# - temperature: the stochasticity of the decoding process.
# - max_tokens: the cap on generated tokens to control cost and verbosity.
# - messages: the dialogue context, one system turn followed by one user turn.
chat_request = {
    "model": LLM_MODEL,
    "temperature": TEMPERATUER,
    "max_tokens": MAX_TOKENS,
    "messages": [
        {"role": "system", "content": "Eres un asistente útil y conciso."},
        {"role": "user", "content": "¿Cuál es la capital de Francia?"},
    ],
}
chat = llm_client.chat.completions.create(**chat_request)

In [6]:
# --- Validation cell: LLM connectivity test -----------------------------------
# Show the text of the first returned choice to confirm the model answered.
assistant_reply = chat.choices[0].message.content
print(assistant_reply)

La capital de Francia es París.


In [7]:
# --- Creating or retrieving a Chroma collection -------------------------------
# Name under which the chunks and their embeddings are stored in Chroma.
collection_name = "document_qa_collection"

# Fetch the collection, creating it when it does not exist yet. No
# embedding_function is passed because the vectors are computed explicitly
# elsewhere in the notebook.
collection = chroma.get_or_create_collection(name=collection_name)

# Echo the collection name as confirmation that the handle is usable.
print("Colección lista:", collection.name)

Colección lista: document_qa_collection


In [19]:
# --- Function to load documents from a directory ------------------------------
def load_documents_from_directory(directory_path):
    """Read every ``.txt`` file located directly inside ``directory_path``.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        A list of ``{"id": <filename>, "text": <file content>}`` dictionaries,
        ordered by filename so that repeated runs yield the same order.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Marker printed for traceability of the loading step.
    print("==== Loading documents from directory ====")

    documents = []

    # os.listdir returns entries in arbitrary order; sorting makes the document
    # order (and anything derived from it, such as documents[0]) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        # Only plain-text files are ingested.
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory_path, filename)
        # Skip directories that happen to be named like a .txt file; opening
        # them would raise instead of loading the remaining documents.
        if not os.path.isfile(file_path):
            continue

        # UTF-8 is requested explicitly so decoding does not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as file:
            # The filename doubles as the document identifier.
            documents.append({"id": filename, "text": file.read()})

    return documents

In [27]:
# --- Loading and inspecting documents -----------------------------------------
# Read the corpus from the "news_articles" directory.
documents = load_documents_from_directory("news_articles")

# Corpus size, as a coarse sanity check.
document_count = len(documents)
print(f"number of documents: {document_count}")

# First 50 characters of the first document, to confirm content was ingested.
first_document_text = documents[0]['text']
print(first_document_text[:50])

==== Loading documents from directory ====
number of documents: 21
Signaling that investments in the supply chain sec


In [28]:
# --- Function to split text into chunks ---------------------------------------
def split_text(text, chunk_size=1000, chunk_overlap=30):
    """Split ``text`` into fixed-size segments that overlap by a few characters.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart, so
    each chunk repeats the last ``chunk_overlap`` characters of the previous one.
    Splitting stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is produced that only repeats the tail of the previous chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of substrings of ``text`` in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap >= chunk_size would never advance the window (infinite loop);
    # a negative overlap would silently skip characters between chunks.
    if chunk_overlap < 0 or chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    text_length = len(text)

    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This chunk already covers the end of the text; continuing would only
        # emit a fragment fully contained in it.
        if end >= text_length:
            break
    return chunks


In [30]:
# --- Splitting all documents into chunks --------------------------------------
# Every loaded document is cut into overlapping segments; each segment becomes
# its own record for downstream embedding and retrieval.
chunked_documents = []

for doc in documents:
    # One record per segment:
    # - id: the document filename followed by the 1-based chunk position
    # - text: the raw segment content
    chunked_documents.extend(
        {"id": f"{doc['id']}_chunk{position}", "text": piece}
        for position, piece in enumerate(split_text(doc["text"]), start=1)
    )

In [38]:
# --- Inspecting chunked documents ---------------------------------------------
# Total number of chunks produced across the whole corpus.
print(f"Total number of chunks: {len(chunked_documents)}")

first_chunk = chunked_documents[0]

# Full record of the first chunk, to check its structure (id + text fields).
print("\n--- First chunk metadata ---")
print(first_chunk)

# Short preview (first 30 characters) of the first chunk's text.
print("\n--- First chunk text preview ---")
print(first_chunk['text'][:30])

Total number of chunks: 185

--- First chunk metadata ---
{'id': '05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk1', 'text': 'Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitio

In [39]:
# --- Precomputing embeddings ---------------------------------------------------
# Small helper for computing the embedding of one piece of text explicitly.
def get_openai_embedding(text):
    """Return the first vector produced by ``emb_fn`` for ``text``.

    Args:
        text: The text to embed.

    Returns:
        The first element of the list returned by ``emb_fn(text)``.
    """
    vectors = emb_fn(text)
    return vectors[0]

In [40]:
# --- Test: embedding a sample string ------------------------------------------
# Embed a short "hello world" string and report the input, the vector shape and
# its first ten components as a sanity check.
test_text = "hello world"
embedding = get_openai_embedding(test_text)

print(
    "--- Embedding test ---",
    f"Input text: {test_text}",
    f"Vector shape: {embedding.shape}",
    f"First 10 dimensions: {embedding[:10]}",
    sep="\n",
)

--- Embedding test ---
Input text: hello world
Vector shape: (1536,)
First 10 dimensions: [-0.00676333 -0.03919632  0.03417581  0.02876212 -0.02478502 -0.04203926
 -0.03028944  0.04932809 -0.01389715 -0.01764742]


In [41]:
# --- Embedding all chunks -----------------------------------------------------
# Compute one embedding per chunk and store it on the chunk record itself.
# A 1-based progress line is printed before each request for traceability.
for i, doc in enumerate(chunked_documents, start=1):
    print(f"------------ Embedding chunk {i} ------------")
    doc["embedding"] = get_openai_embedding(doc["text"])

------------ Embedding chunk 1 ------------
------------ Embedding chunk 2 ------------
------------ Embedding chunk 3 ------------
------------ Embedding chunk 4 ------------
------------ Embedding chunk 5 ------------
------------ Embedding chunk 6 ------------
------------ Embedding chunk 7 ------------
------------ Embedding chunk 8 ------------
------------ Embedding chunk 9 ------------
------------ Embedding chunk 10 ------------
------------ Embedding chunk 11 ------------
------------ Embedding chunk 12 ------------
------------ Embedding chunk 13 ------------
------------ Embedding chunk 14 ------------
------------ Embedding chunk 15 ------------
------------ Embedding chunk 16 ------------
------------ Embedding chunk 17 ------------
------------ Embedding chunk 18 ------------
------------ Embedding chunk 19 ------------
------------ Embedding chunk 20 ------------
------------ Embedding chunk 21 ------------
------------ Embedding chunk 22 ------------
------------ Embedd

In [47]:
# --- Inspecting the first chunk record ----------------------------------------
# Show the first chunk record once the embedding has been attached. It holds:
# - "id": identifier built from the filename and the chunk index
# - "text": raw chunk content
# - "embedding": the precomputed vector
first_record = chunked_documents[0]
print(first_record)
print("---------- embedding shape ----------")
print(first_record["embedding"].shape)

{'id': '05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt_chunk1', 'text': 'Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”\n\nPando was co-launched b

In [48]:
# --- Ingesting precomputed embeddings into Chroma -----------------------------
# Writing the chunked documents into the target collection. Each entry is defined by:
# - ids: unique identifiers for individual chunks
# - documents: raw text content for reference and retrieval
# - embeddings: precomputed numerical vectors aligned with each chunk
#
# upsert is used instead of add because the collection is obtained with
# get_or_create_collection and may already hold these ids from an earlier run.
# upsert creates missing records and overwrites existing ones, so re-running
# this cell is idempotent and picks up changed chunk text or embeddings.
collection.upsert(
    ids=[doc["id"] for doc in chunked_documents],
    documents=[doc["text"] for doc in chunked_documents],
    embeddings=[doc["embedding"] for doc in chunked_documents]
)

In [49]:
# --- Local persistence with Chroma --------------------------------------------
# Persistent client that stores its data on disk in the folder "./chroma_db",
# giving a local copy of the collection alongside the cloud one.
chroma_local = chromadb.PersistentClient(path="./chroma_db")

# Create (or retrieve) the local collection under the same name as the cloud
# collection; collection_name is reused so the two cannot drift apart.
collection_local = chroma_local.get_or_create_collection(
    name=collection_name
)

# Writing the precomputed embeddings into the local collection.
# Each entry is defined by:
# - ids: stable identifiers for each chunk
# - documents: raw text for reference
# - embeddings: explicit numerical vectors
#
# upsert is used instead of add because the on-disk collection survives kernel
# restarts and may already hold these ids. upsert creates missing records and
# overwrites existing ones, so re-running this cell is idempotent.
collection_local.upsert(
    ids=[doc["id"] for doc in chunked_documents],
    documents=[doc["text"] for doc in chunked_documents],
    embeddings=[doc["embedding"] for doc in chunked_documents]
)

print("Data successfully stored in local Chroma instance at './chroma_db'")

Data successfully stored in local Chroma instance at './chroma_db'
