1. Import Required Libraries

In [3]:
from __future__ import annotations

import os
import re
import json
import time
import hashlib
import zipfile
from pathlib import Path
from datetime import datetime
from typing import Dict, List

import requests
from tqdm.auto import tqdm
from dotenv import load_dotenv

import chromadb
from openai import OpenAI

In [None]:
# Resolve repo paths
from pathlib import Path

CWD = Path.cwd()

# When the kernel was started from inside a "notebooks" folder the repository
# root is one level up; otherwise the working directory is used as the root.
if CWD.name.lower() == "notebooks":
    REPO_ROOT = CWD.parent
else:
    REPO_ROOT = CWD

# Locations derived from the repository root.
ENV_PATH = REPO_ROOT.joinpath(".env")
EMBEDDINGS_DIR = REPO_ROOT.joinpath("embeddings")
SCHEMA_DOWNLOAD_DIR = REPO_ROOT.joinpath("schema_download")

# Echo the resolved locations so a wrong working directory is obvious early.
for label, location in (
    ("CWD:", CWD),
    ("REPO_ROOT:", REPO_ROOT),
    (".env:", ENV_PATH),
    ("embeddings/:", EMBEDDINGS_DIR),
):
    print(label, location)

2. Load OpenAI API Key

In [11]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from the repo-level .env file. override=True asks python-dotenv
# to replace any OPENAI_API_KEY that is already set in the process environment.
load_dotenv(dotenv_path=ENV_PATH, override=True)

# Normalise once, so the value that is validated is exactly the value handed to
# the client (previously the stripped value was checked but the raw one was used).
api_key = (os.getenv("OPENAI_API_KEY") or "").strip()

# A missing, blank, or very short value (< 20 characters) is rejected up front.
if len(api_key) < 20:
    raise ValueError(
        f"OPENAI_API_KEY not found in: {ENV_PATH}\n"
        "Add this line to your .env:\n"
        "OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxx"
    )

client = OpenAI(api_key=api_key)
print("OPENAI_API_KEY loaded and OpenAI client initialized")

OPENAI_API_KEY loaded and OpenAI client initialized


3. Load Schema Markdown Files (from the local schema/source directory)

In [14]:
from pathlib import Path

SCHEMA_DIR = REPO_ROOT.joinpath("schema", "source")
if not SCHEMA_DIR.exists():
    raise FileNotFoundError(f"Schema directory not found: {SCHEMA_DIR}")

# Markdown documents to load, in the order they are reported and chunked.
SCHEMA_FILES = [
    "customers.md",
    "products.md",
    "orders.md",
    "order_items.md",
    "subscriptions.md",
    "churn_predictions.md",
    "forecast_predictions.md",
    "relationships.md",
    "analytics_patterns.md",
]

# Collect every absent file first so one error lists all of them at once.
missing = []
for schema_name in SCHEMA_FILES:
    if not SCHEMA_DIR.joinpath(schema_name).exists():
        missing.append(schema_name)
if missing:
    raise FileNotFoundError(f"Missing schema files in {SCHEMA_DIR}: {missing}")

# Map filename -> full markdown text (decoded as UTF-8).
fetched = {}
for schema_name in SCHEMA_FILES:
    fetched[schema_name] = SCHEMA_DIR.joinpath(schema_name).read_text(encoding="utf-8")

print("Loaded local schema files:")
for schema_name in SCHEMA_FILES:
    print(" -", SCHEMA_DIR.joinpath(schema_name))

Loaded local schema files:
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\customers.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\products.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\orders.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\order_items.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\subscriptions.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\churn_predictions.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\forecast_predictions.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\relationships.md
 - c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\schema\source\analytics_patterns.md


4. Chunk Schema Markdown into Sections (split on "##" headings)

In [16]:
import re
from typing import List, Dict
from collections import Counter

def normalize_whitespace(s: str) -> str:
    """
    Return `s` with its whitespace tidied up.

    Steps, in order:
      1. Windows line endings ("\\r\\n") become "\\n".
      2. Every run of spaces and/or tabs becomes a single space.
      3. Three or more consecutive newlines become exactly two.
      4. Leading and trailing whitespace is removed.
    """
    unix_newlines = s.replace("\r\n", "\n")
    single_spaced = re.sub(r"[ \t]+", " ", unix_newlines)
    return re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

def split_by_h2(markdown_text: str) -> List[Dict[str, str]]:
    """
    Split markdown into chunks based on '## ' headers.

    The text is first passed through normalize_whitespace(). A header is a line
    consisting of optional indentation, '##', at least one space/tab, and a
    non-empty title on the SAME line. Deeper headings ('###', ...) do not match
    and therefore stay inside the body of the enclosing '##' section.

    Returns a list of {"section": ..., "content": ...} dicts:
      * no '##' header at all  -> a single chunk with section "FULL_DOCUMENT";
      * text before the first header (if any) -> a chunk with section "PREAMBLE";
      * one chunk per header, whose "section" is the header title and whose
        "content" is the header line followed by the section body.
    """
    text = normalize_whitespace(markdown_text)

    # Only horizontal whitespace is allowed between '##' and the title. With \s
    # an empty '##' line would match across the newline and absorb the following
    # line as its title, mislabelling the section.
    header_re = re.compile(r"(?m)^[ \t]*(##[ \t]+\S.*?)[ \t]*$")

    # With one capturing group, re.split yields
    # [preamble, header_1, body_1, header_2, body_2, ...].
    parts = header_re.split(text)
    if len(parts) == 1:
        return [{"section": "FULL_DOCUMENT", "content": text}]

    chunks: List[Dict[str, str]] = []

    # Optional preamble (content before first ##)
    preamble = parts[0].strip()
    if preamble:
        chunks.append({"section": "PREAMBLE", "content": preamble})

    # Header/body pairs; the split always produces a body for every header.
    for raw_header, raw_body in zip(parts[1::2], parts[2::2]):
        header = raw_header.strip()
        section_name = header[2:].strip()  # drop the leading '##'
        chunk_text = normalize_whitespace(f"{header}\n\n{raw_body.strip()}")
        chunks.append({"section": section_name, "content": chunk_text})

    return chunks

# Build chunk records: one flat dict per chunk, tagged with its source file,
# section title, and position within that file.
all_chunks: List[Dict] = []
for filename, md in fetched.items():
    # Strip only a trailing ".md"; str.replace would also remove ".md" appearing
    # in the middle of a filename.
    table = filename[:-3] if filename.endswith(".md") else filename
    chunks = split_by_h2(md)

    for idx, ch in enumerate(chunks):
        all_chunks.append({
            "table": table,
            "filename": filename,
            "section": ch["section"],
            "chunk_index": idx,
            "text": ch["content"],
        })

# Professional summary output (no noisy previews)
total_chunks = len(all_chunks)
by_file = Counter(c["filename"] for c in all_chunks)

print("Chunking complete")
print(f"   • Files processed: {len(fetched)}")
print(f"   • Total chunks:    {total_chunks}")
print("   • Chunks per file:")
for fn in SCHEMA_FILES:
    print(f"     - {fn}: {by_file.get(fn, 0)}")

# Guardrail on the total chunk count (no hard fail, just a warning). The bounds
# are named so the message always reports the limits that are actually tested.
MIN_EXPECTED_CHUNKS = 40
MAX_EXPECTED_CHUNKS = 120
if not MIN_EXPECTED_CHUNKS <= total_chunks <= MAX_EXPECTED_CHUNKS:
    print(f"Note: Chunk count ({total_chunks}) is outside the accepted "
          f"{MIN_EXPECTED_CHUNKS}–{MAX_EXPECTED_CHUNKS} range (typical: ~50–80). "
          "This is not necessarily wrong, but you may want to verify headings.")

Chunking complete
   • Files processed: 9
   • Total chunks:    81
   • Chunks per file:
     - customers.md: 9
     - products.md: 9
     - orders.md: 9
     - order_items.md: 10
     - subscriptions.md: 9
     - churn_predictions.md: 9
     - forecast_predictions.md: 9
     - relationships.md: 7
     - analytics_patterns.md: 10


5. Initialize ChromaDB

In [17]:
import chromadb
from datetime import datetime, timezone

# Make sure the persistence directory exists before Chroma opens it.
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

chroma_client = chromadb.PersistentClient(path=str(EMBEDDINGS_DIR))

COLLECTION_NAME = "agentic_rag_analytics_schema"

collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={
        "project": "agentic-rag-analytics",
        "purpose": "schema_embeddings",
        "embedding_model": "text-embedding-3-small",
        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # time and keep the same "...Z" suffix the old expression produced.
        "created_at_utc": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "schema_source": "local_files",
        "schema_path": "schema/source/*.md",
    },
    embedding_function=None  # IMPORTANT: we provide embeddings manually
)

print("ChromaDB initialized")
print("   • Persist directory:", EMBEDDINGS_DIR)
print("   • Collection name:  ", COLLECTION_NAME)
print("   • Total chunks to embed:", total_chunks)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


ChromaDB initialized
   • Persist directory: c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings
   • Collection name:   agentic_rag_analytics_schema
   • Total chunks to embed: 81


In [None]:
from datetime import datetime, timezone

# Start from an empty collection so vectors from a previous run cannot linger.
try:
    chroma_client.delete_collection(COLLECTION_NAME)
    print("Existing collection deleted:", COLLECTION_NAME)
except Exception as exc:
    # Deliberately best-effort: on a first run there is nothing to delete.
    # The actual error is shown so that a real failure is not silently
    # reported as "first run".
    print("No existing collection to delete (first run)")
    print(f"   • delete_collection reported: {type(exc).__name__}: {exc}")

collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={
        "project": "agentic-rag-analytics",
        "purpose": "schema_embeddings",
        "embedding_model": "text-embedding-3-small",
        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # time and keep the same "...Z" suffix the old expression produced.
        "created_at_utc": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
        "schema_source": "local_files",
        "schema_path": "schema/source/*.md",
    },
    embedding_function=None  # embeddings are supplied explicitly when adding records
)

print("Fresh collection ready:", COLLECTION_NAME)

Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Existing collection deleted: agentic_rag_analytics_schema
Fresh collection ready: agentic_rag_analytics_schema


6. Generate Embeddings (batched) and store in ChromaDB

In [20]:
import time
import hashlib
from tqdm.auto import tqdm

# Model name passed to client.embeddings.create() in the embedding loop below.
# The collection metadata created earlier hard-codes the same string under
# "embedding_model" - keep the two in sync when changing the model.
EMBED_MODEL = "text-embedding-3-small"
# Number of chunk texts sent per embeddings request.
BATCH_SIZE = 64  # ~81 chunks => usually 2 API calls

def stable_id(item: dict) -> str:
    """
    Build a deterministic record ID for one chunk.

    The ID is the SHA-1 hex digest of "filename|section|chunk_index|md5(text)",
    so the same chunk always maps to the same ID and any change to its text,
    section title, or position yields a different one.

    `item` must provide the keys "text", "filename", "section" and "chunk_index".
    """
    text_digest = hashlib.md5(item["text"].encode("utf-8")).hexdigest()
    identity_fields = (item["filename"], item["section"], item["chunk_index"], text_digest)
    composite_key = "|".join(str(field) for field in identity_fields)
    return hashlib.sha1(composite_key.encode("utf-8")).hexdigest()

# Prepare records: three parallel lists, one entry per chunk, in all_chunks order.
ids = [stable_id(c) for c in all_chunks]
documents = [c["text"] for c in all_chunks]
metadatas = [{
    "table": c["table"],
    "filename": c["filename"],
    "section": c["section"],
    "chunk_index": c["chunk_index"],
    "source": "local_file",
    "path": f"schema/source/{c['filename']}",
} for c in all_chunks]

# perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
t0 = time.perf_counter()
num_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
print("Embedding run started")
print(f"   • Model:       {EMBED_MODEL}")
print(f"   • Chunks:      {len(documents)}")
print(f"   • Batch size:  {BATCH_SIZE}")
print(f"   • Batches:     {num_batches}")

# Embed + store, one API request and one collection.add() per batch.
for start in tqdm(range(0, len(documents), BATCH_SIZE), desc="Embedding batches"):
    end = min(start + BATCH_SIZE, len(documents))
    batch_documents = documents[start:end]

    resp = client.embeddings.create(
        model=EMBED_MODEL,
        input=batch_documents
    )

    # Pair vectors with inputs by the response's own `index` field rather than
    # by list position, and refuse to store a batch whose size does not match:
    # a misaligned vector would silently attach to the wrong document.
    embeddings = [x.embedding for x in sorted(resp.data, key=lambda d: d.index)]
    if len(embeddings) != len(batch_documents):
        raise RuntimeError(
            f"Embedding API returned {len(embeddings)} vectors for "
            f"{len(batch_documents)} inputs (batch starting at {start})"
        )

    collection.add(
        ids=ids[start:end],
        documents=batch_documents,
        metadatas=metadatas[start:end],
        embeddings=embeddings
    )

elapsed = time.perf_counter() - t0
print("Embedding run complete")
print(f"   • Stored vectors: {collection.count()}")
print(f"   • Elapsed time:   {elapsed:.2f}s")
print(f"   • Persist dir:    {EMBEDDINGS_DIR}")

Embedding run started
   • Model:       text-embedding-3-small
   • Chunks:      81
   • Batch size:  64
   • Batches:     2


Embedding batches:   0%|          | 0/2 [00:00<?, ?it/s]Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
Embedding batches: 100%|██████████| 2/2 [00:04<00:00,  2.19s/it]

Embedding run complete
   • Stored vectors: 81
   • Elapsed time:   4.40s
   • Persist dir:    c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings





In [21]:
import json
import zipfile
from datetime import datetime, timezone

# 1) Write a small manifest for traceability
manifest = {
    "project": "agentic-rag-analytics",
    "collection": COLLECTION_NAME,
    "embedding_model": EMBED_MODEL,
    "chunk_count": len(all_chunks),
    "files": SCHEMA_FILES,
    "schema_source": "local_files",
    "schema_dir": str(REPO_ROOT / "schema" / "source"),
    "persist_dir": str(EMBEDDINGS_DIR),
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC time
    # and keep the same "...Z" suffix the old expression produced.
    "created_at_utc": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
}

(EMBEDDINGS_DIR / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")

# 2) Zip the embeddings folder. The archive is written next to the folder (in
# REPO_ROOT), so it is never picked up by the directory walk below.
zip_path = REPO_ROOT / "embeddings.zip"
if zip_path.exists():
    zip_path.unlink()

with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z:
    # sorted() gives the archive a deterministic member order across runs.
    for p in sorted(EMBEDDINGS_DIR.rglob("*")):
        # Store entries relative to the repo root, i.e. under "embeddings/...".
        z.write(p, p.relative_to(REPO_ROOT))

print("Output ready")
print("   • Manifest:", EMBEDDINGS_DIR / "manifest.json")
print("   • Folder:  ", EMBEDDINGS_DIR)
print("   • Zip:     ", zip_path)
print("\nTip: Right-click embeddings.zip in VS Code Explorer → 'Reveal in File Explorer' → share/download.")


Output ready
   • Manifest: c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings\manifest.json
   • Folder:   c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings
   • Zip:      c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings.zip

Tip: Right-click embeddings.zip in VS Code Explorer → 'Reveal in File Explorer' → share/download.


In [22]:
# Closing confirmation: where the persisted vectors and the shareable archive live.
print("Embeddings generated successfully!")
for label, location in (
    ("Folder:", EMBEDDINGS_DIR),
    ("Zip:", REPO_ROOT / "embeddings.zip"),
):
    print(label, location)

Embeddings generated successfully!
Folder: c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings
Zip: c:\Users\ravul\Desktop\ML Portfolio\agentic-rag-analytics\embeddings.zip
