In [11]:
import os
from getpass import getpass

# Never paste the secret into the notebook. Reuse a key that is already
# exported in the environment; otherwise ask for it interactively so that it
# is not echoed into the cell source or its output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [45]:
import os

# Make sure the variable exists without ever overwriting a key that is already
# set: an unconditional assignment of the empty placeholder would wipe it.
os.environ.setdefault("OPENAI_API_KEY", "")

In [61]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Create a FAISS vector store from YAML documents using OpenAI embeddings.

The OpenAI secret key is read from the ``OPENAI_API_KEY`` environment variable
so that it is never stored in this file; the API base URL is defined below.
"""

import os
import sys
from pathlib import Path
from typing import List

# ----------------------------------------------------------------------
# 1️⃣  OpenAI configuration
# ----------------------------------------------------------------------
# 👉  The secret key is taken from the environment instead of being pasted
#     here, so it cannot leak through version control or a shared notebook.
#     Surrounding whitespace (e.g. a trailing newline from a copy/paste) is
#     stripped; a missing variable yields "" and is rejected by validation.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()

# Base URL of the OpenAI endpoint. Change it only if you need a custom
# endpoint (Azure, self‑hosted, etc.) and make sure it starts with https://
OPENAI_API_BASE = "https://api.openai.com/v1"

# ----------------------------------------------------------------------
# 2️⃣  Validate the configuration (helps catch a missing scheme early)
# ----------------------------------------------------------------------
def _validate_openai_cfg() -> None:
    """
    Validate the module-level OpenAI settings and print a masked summary.

    Reads the globals ``OPENAI_API_KEY`` and, when it is defined,
    ``OPENAI_API_BASE``.

    Raises:
        RuntimeError: if the key is empty or does not start with ``sk-``, or
            if the base URL does not start with ``http://`` or ``https://``.
    """
    if not OPENAI_API_KEY or not OPENAI_API_KEY.startswith("sk-"):
        raise RuntimeError(
            "❌  OPENAI_API_KEY is missing or does not look like a valid key. "
            "Edit the script and set the variable `OPENAI_API_KEY`."
        )

    # The base URL is optional: it is looked up through globals() so that the
    # constant may be removed or commented out without breaking this function.
    has_custom_base = "OPENAI_API_BASE" in globals()
    if has_custom_base:
        base = globals()["OPENAI_API_BASE"]
        if not base.startswith(("http://", "https://")):
            raise RuntimeError(
                f"❌  OPENAI_API_BASE looks malformed: '{base}'. "
                "It must include the scheme, e.g. https://api.openai.com/v1"
            )

    # Show a short sanity‑check for the user.
    print("🔑  OpenAI configuration")
    # Use a fixed-width mask so the output does not disclose the key length,
    # and reveal the trailing characters only when they are a small fraction
    # of the key (a very short key would otherwise be printed in full).
    visible_tail = 4
    if len(OPENAI_API_KEY) >= 4 * visible_tail:
        tail = OPENAI_API_KEY[-visible_tail:]
    else:
        tail = ""
    print(f"    • API key   : {'*' * 8}{tail}")
    if has_custom_base:
        print(f"    • Base URL  : {OPENAI_API_BASE}")
    else:
        print("    • Base URL  : default (https://api.openai.com/v1)")
    print()


_validate_openai_cfg()

# ----------------------------------------------------------------------
# 3️⃣  Export the validated OpenAI settings to the process environment
#     **before** the LangChain client is imported further down
# ----------------------------------------------------------------------
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})
# The base URL constant is optional, hence the lookup through globals().
if "OPENAI_API_BASE" in globals():
    os.environ.update({"OPENAI_API_BASE": globals()["OPENAI_API_BASE"]})

# ----------------------------------------------------------------------
# 4️⃣  Imports that rely on the above environment variables
# ----------------------------------------------------------------------
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document   # type: ignore

# ----------------------------------------------------------------------
# 5️⃣  Helper: load every *.yaml file into a LangChain Document
# ----------------------------------------------------------------------
def load_yaml_docs(yaml_folder: "str | Path | None" = None) -> List[Document]:
    """
    Load every ``*.yaml`` file of a folder into a LangChain ``Document``.

    Args:
        yaml_folder: Folder to scan (top level only, no recursion). When
            omitted, the absolute folder
            ``/Users/apple/Desktop/company/learnings/mcp/ornate_sql_tool``
            is used, exactly as before this parameter existed.

    Returns:
        One ``Document`` per non-empty YAML file, ordered by file path.
        ``page_content`` holds the parsed content re-serialized as YAML and
        ``metadata["source"]`` holds the path of the file as a string.

    Raises:
        RuntimeError: if the folder does not exist or is not a directory.
    """
    import yaml

    if yaml_folder is None:
        yaml_folder = "/Users/apple/Desktop/company/learnings/mcp/ornate_sql_tool"
    folder = Path(yaml_folder).expanduser().resolve()

    if not folder.is_dir():
        raise RuntimeError(
            f"❌  Folder '{folder}' not found. "
            "Create it and put your .yaml files there."
        )

    documents: List[Document] = []
    # glob() yields files in arbitrary order; sorting keeps the resulting
    # document list (and therefore the index built from it) reproducible.
    for yaml_path in sorted(folder.glob("*.yaml")):
        with yaml_path.open("r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # An empty (or comment-only) file parses to None; dumping it would
        # produce a meaningless "null" document, so skip it.
        if data is None:
            continue

        # Turn the parsed content back into a nicely formatted string.
        text = yaml.dump(data, allow_unicode=True, sort_keys=False)

        documents.append(
            Document(
                page_content=text,
                metadata={"source": str(yaml_path)},
            )
        )
    return documents


# ----------------------------------------------------------------------
# 6️⃣  Main pipeline
# ----------------------------------------------------------------------
def main() -> None:
    """
    Build, persist and smoke-test a FAISS index over the YAML documents.

    Steps: load the documents with ``load_yaml_docs()``, split them into
    overlapping chunks, embed the chunks with OpenAI, save the index to
    ``db_faiss_index`` (relative to the current working directory), then
    reload it and print the top matches for a sample query.

    Raises:
        RuntimeError: if no YAML document was loaded, since there would be
            nothing to embed or index.
    """
    # ---- Load raw YAML docs -------------------------------------------------
    docs = load_yaml_docs()
    if not docs:
        # Fail early with a clear message instead of letting the index
        # construction below fail on an empty input.
        raise RuntimeError("❌  No YAML documents were loaded – nothing to index.")

    # Report the folder the documents really came from (every document records
    # its file path under metadata["source"]) rather than a second hard-coded
    # copy of the path that could drift from the loader.
    source_dir = Path(docs[0].metadata["source"]).parent
    print(f"📄  Loaded {len(docs)} source document(s) from '{source_dir}'")

    # ---- Split into overlapping chunks --------------------------------------
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)
    print(f"🔪  Split into {len(split_docs)} chunk(s) (≈1 k chars each)")

    # ---- Create the embedding model (uses the OpenAI settings exported to
    #      the environment earlier in this script) ----------------------------
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # ---- Build a FAISS index ------------------------------------------------
    print("⚙️  Building FAISS index… (this may take a few seconds)")
    db = FAISS.from_documents(split_docs, embeddings)

    # ---- Persist the index --------------------------------------------------
    out_dir = Path("db_faiss_index")
    db.save_local(str(out_dir))
    print(f"✅  FAISS index saved to folder '{out_dir}/'")

    # ---- Optional quick sanity‑check ----------------------------------------
    print("\n🔎  Quick sanity‑check (search for the word 'example'):")
    # NOTE(review): allow_dangerous_deserialization opts in to loading
    # serialized Python objects. That is acceptable here only because the
    # folder was written by this very run; never enable it for an index
    # obtained from an untrusted source.
    db_loaded = FAISS.load_local(
        str(out_dir), embeddings, allow_dangerous_deserialization=True
    )
    results = db_loaded.similarity_search("example", k=3)
    for i, doc in enumerate(results, 1):
        src = doc.metadata.get("source", "unknown")
        snippet = doc.page_content[:200].replace("\n", " ")
        print(f"  {i}. source={src!r}, snippet=\"{snippet}…\"")

    print("\n🎉  All done!")


if __name__ == "__main__":
    # Run the pipeline; on any failure report it on stdout and terminate
    # with a non-zero exit status.
    exit_status = 0
    try:
        main()
    except Exception as err:                       # pragma: no cover
        print(f"\n❗  Fatal error → {err!r}")
        exit_status = 1
    if exit_status:
        sys.exit(exit_status)


🔑  OpenAI configuration
    • API key   : ****************************************************************************************************************************************************************CyQA
    • Base URL  : https://api.openai.com/v1

📄  Loaded 2 source document(s) from '/Users/apple/Desktop/company/learnings/mcp/ornate_sql_tool'
🔪  Split into 164 chunk(s) (≈1 k chars each)
⚙️  Building FAISS index… (this may take a few seconds)
✅  FAISS index saved to folder 'db_faiss_index/'

🔎  Quick sanity‑check (search for the word 'example'):
  1. source='/Users/apple/Desktop/company/learnings/mcp/ornate_sql_tool/accounts.yaml', snippet="period.     probation_document:       type: varchar(100)       description: The 'probation_document' field in the 'accounts_probation' table         stores a string of up to 100 characters that repres…"
  2. source='/Users/apple/Desktop/company/learnings/mcp/ornate_sql_tool/accounts.yaml', snippet="person. A value of 1 signifies that the user is 

In [55]:
# Explicit %pip magic installs into the kernel's own environment; the version
# is pinned to the one this notebook was run with so re-runs are reproducible.
%pip install -q faiss-cpu==1.12.0

[0mCollecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-macosx_14_0_arm64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m25.5 MB/s[0m  [33m0:00:00[0m
[0mInstalling collected packages: faiss-cpu
[0mSuccessfully installed faiss-cpu-1.12.0
Note: you may need to restart the kernel to use updated packages.
