In [1]:
import os
import glob
import json
import uuid
from typing import Any, Dict, List, Optional

from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [2]:
# Azure OpenAI embedding settings.
# The API key is read from the process environment; the API version, endpoint
# and deployment name are pinned here and published as environment variables.
OPENAI_API_KEY_E = os.environ.get('AZURE_OPENAI_API_KEY_US2')
os.environ.update({
    'OPENAI_API_VERSION_E': '2024-12-01-preview',
    'AZURE_OPENAI_ENDPOINT_E': 'https://agents-4on.openai.azure.com/',
    'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E': "text-embedding-3-large-eus2",
})

In [3]:
# --- Locations ---------------------------------------------------------------
INPUT_DIR: str = "./database"      # folder scanned for .json source files
PERSIST_DIR: str = "./database"    # Chroma persistence path
# NOTE(review): PERSIST_DIR and INPUT_DIR are the same folder, so the vector
# store files sit next to the source .json files. Removing PERSIST_DIR
# wholesale would also remove the inputs; consider a dedicated directory.

# --- Collection --------------------------------------------------------------
COLLECTION_NAME: str = "json_embeddings"   # logical collection name
RESET_COLLECTION: bool = False             # if True, clears existing data

# --- Chunking (safe defaults for text-embedding-3-* models) ------------------
TARGET_CHUNK_TOKENS: int = 800
CHUNK_OVERLAP_TOKENS: int = 50

# --- Field selection ---------------------------------------------------------
# Optional: only embed some fields if your JSON has a consistent schema,
# e.g.: ONLY_FIELDS = ["title", "summary", "body"]
ONLY_FIELDS: Optional[List[str]] = None


In [4]:
# Embedding client used for both ingestion and querying. Connection details
# are read back from the environment variables set in the configuration cell.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E'),
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT_E'),
    api_version=os.environ.get('OPENAI_API_VERSION_E'),
    api_key=OPENAI_API_KEY_E,
)

In [5]:


def flatten_json(obj: Any, prefix: str = "") -> List[str]:
    """
    Flatten arbitrary JSON data into 'path.to.key: value' lines for embedding.

    Args:
        obj: Parsed JSON value (dict, list, or scalar).
        prefix: Path accumulated so far; empty at the top level.

    Returns:
        One line per scalar leaf, in traversal order. Dict keys are joined
        with '.', list positions are appended as '[i]', and None is rendered
        as an empty value. Empty dicts and lists contribute no lines.
    """
    if isinstance(obj, dict):
        return [
            line
            for key, child in obj.items()
            for line in flatten_json(child, f"{prefix}.{key}" if prefix else key)
        ]

    if isinstance(obj, list):
        return [
            line
            for position, child in enumerate(obj)
            for line in flatten_json(child, f"{prefix}[{position}]")
        ]

    # Scalar leaf: a bare value when there is no path, otherwise 'path: value'.
    text = str(obj) if obj is not None else ""
    return [f"{prefix}: {text}" if prefix else text]


def json_to_text(obj: Any, file_name: str) -> str:
    """
    Render parsed JSON as a text block headed by its source file name.

    When ONLY_FIELDS is set and the top-level value is a dict, only those
    keys that are present are kept; any other input is flattened in full.

    Args:
        obj: Parsed JSON value.
        file_name: Name written into the leading 'Source:' line.

    Returns:
        The 'Source: <file_name>' header followed by one flattened line per leaf.
    """
    payload = obj
    if ONLY_FIELDS and isinstance(obj, dict):
        payload = {field: obj.get(field) for field in ONLY_FIELDS if field in obj}

    body = "\n".join(flatten_json(payload, ""))
    return f"Source: {file_name}\n{body}"


def load_json_files(input_dir: str) -> List[Document]:
    """
    Read every .json file directly under input_dir and return chunked Documents.

    Files are processed in sorted path order. A file that cannot be opened or
    parsed is reported with a warning and skipped. Each file's text is split
    by token count, and every chunk records its source file name, its
    position, and the number of chunks produced for that file.

    Args:
        input_dir: Directory to scan (non-recursive) for '*.json' files.

    Returns:
        The list of chunk Documents; empty when no .json files are found.
    """
    json_paths = sorted(glob.glob(os.path.join(input_dir, "*.json")))
    if len(json_paths) == 0:
        print(f"[Info] No .json files found under: {input_dir}")
        return []

    token_splitter = TokenTextSplitter(
        encoding_name="cl100k_base",
        chunk_size=TARGET_CHUNK_TOKENS,
        chunk_overlap=CHUNK_OVERLAP_TOKENS,
    )

    prepared: List[Document] = []
    for json_path in json_paths:
        source_name = os.path.basename(json_path)

        try:
            with open(json_path, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except Exception as err:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"[Warn] Skipping {json_path}: {err}")
            continue

        pieces = token_splitter.split_text(json_to_text(payload, file_name=source_name))
        piece_count = len(pieces)
        prepared.extend(
            Document(
                page_content=piece,
                metadata={
                    "source_file": source_name,
                    "chunk_index": position,
                    "total_chunks": piece_count,
                },
            )
            for position, piece in enumerate(pieces)
        )

    print(f"[Info] Prepared {len(prepared)} chunk(s) from {len(json_paths)} file(s).")
    return prepared

In [6]:
# INGEST INTO CHROMA


def _stable_chunk_id(doc: Document, position: int) -> str:
    """Return a deterministic ID for a chunk so re-ingesting it overwrites instead of duplicating."""
    meta = doc.metadata or {}
    source_file = meta.get("source_file")
    chunk_index = meta.get("chunk_index")
    if source_file is not None and chunk_index is not None:
        # Position-based key: an edited file replaces its previous chunks.
        key = f"{source_file}#{chunk_index}"
    else:
        # No usable metadata: fall back to the content plus its batch position.
        key = f"{position}#{doc.page_content}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{COLLECTION_NAME}/{key}"))


def build_or_load_vectorstore(docs: List[Document]) -> Chroma:
    """
    Create or load a persistent Chroma vector store (via LangChain).
    Persistence is automatic when 'persist_directory' is set (no .persist()).

    Args:
        docs: Chunks to ingest. When empty, the existing store is loaded
            without adding anything.

    Returns:
        The Chroma vector store bound to COLLECTION_NAME under PERSIST_DIR.

    Notes:
        - When RESET_COLLECTION is True the collection itself is deleted
          rather than the directory, so other files under PERSIST_DIR are
          left untouched. A failed reset is reported and ingestion continues.
        - Chunks are written with deterministic IDs, so running the ingestion
          again updates the same entries instead of appending duplicates.
          Entries written earlier under other IDs stay in the collection
          until it is reset, as do trailing chunks of a file that now
          produces fewer chunks.
    """
    if RESET_COLLECTION and os.path.exists(PERSIST_DIR):
        try:
            Chroma(
                embedding_function=embeddings,
                collection_name=COLLECTION_NAME,
                persist_directory=PERSIST_DIR,
            ).delete_collection()
            print(f"[Info] Reset collection '{COLLECTION_NAME}' at {PERSIST_DIR}")
        except Exception as e:
            print(f"[Warn] Failed to reset collection '{COLLECTION_NAME}' at {PERSIST_DIR}: {e}")

    if docs:
        ids: Optional[List[str]] = [_stable_chunk_id(d, i) for i, d in enumerate(docs)]
        if len(set(ids)) != len(ids):
            # Colliding keys (e.g. the same file name supplied twice): let
            # Chroma assign IDs rather than fail on duplicate IDs in one batch.
            ids = None

        vs = Chroma.from_documents(
            documents=docs,
            embedding=embeddings,
            ids=ids,
            collection_name=COLLECTION_NAME,
            persist_directory=PERSIST_DIR,     # ensures on-disk persistence automatically
            collection_metadata={"hnsw:space": "cosine"},  # optional: set metric explicitly
        )
        print(f"[Success] Ingested {len(docs)} chunks into '{COLLECTION_NAME}' at {PERSIST_DIR}.")
        return vs

    # Load existing store if no new docs are provided.
    vs = Chroma(
        embedding_function=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=PERSIST_DIR,
    )
    print(f"[Info] Loaded existing Chroma store from {PERSIST_DIR}")
    return vs


In [7]:
# SIMPLE QUERY / RETRIEVAL

def run_query(vs: Chroma, query: str, k: int = 4):
    """
    Run a similarity search against the vector store and print the matches.

    Args:
        vs: Vector store to search.
        query: Query text.
        k: Maximum number of matches to request.

    Prints a ranked list giving each match's source file, chunk position and
    a single-line preview of at most 500 characters. Nothing is returned.
    """
    # LangChain Chroma: similarity_search returns the top-k Documents.
    matches = vs.similarity_search(query, k=k)

    print("\n=== Top Matches ===")
    rank = 0
    for match in matches:
        rank += 1
        meta = match.metadata if match.metadata else {}
        source_file = meta.get("source_file", "unknown")
        chunk_index = meta.get("chunk_index", -1)
        chunk_total = meta.get("total_chunks", -1)

        print(f"\nRank #{rank}")
        print(f"Source: {source_file} (chunk {chunk_index + 1}/{chunk_total})")

        snippet = match.page_content[:500].replace("\n", " ")
        # Mark previews that were cut short.
        print((snippet + " ...") if len(match.page_content) > 500 else snippet)

In [9]:
# End-to-end run: chunk the JSON files, ingest them, then issue one query.
docs = load_json_files(INPUT_DIR)
vectorstore = build_or_load_vectorstore(docs)

# Sample query (adjust or comment out)
sample_question = "Which are the corporate rating models?"
run_query(vectorstore, sample_question, k=8)


[Info] Prepared 7 chunk(s) from 4 file(s).
[Success] Ingested 7 chunks into 'json_embeddings' at ./database.

=== Top Matches ===

Rank #1
Source: transactions.json (chunk 2/3)
[10].name: PRODUCT columns[10].type: TEXT columns[10].description: Type of product. The available types of products are 'Bond', 'Consumer loan', 'Corporate loan', 'Credit Cards', 'Guarantee', 'Investment Loan', 'Letter of Credit', 'Mortgage loan', 'Multipurpose Line', 'Other loan', and 'Overdraft'. These values are usually used in questions that involve type of product. A product may be referred to with or without capitalization and in cases of multi-word description it may be referred to with o ...

Rank #2
Source: transactions.json (chunk 2/3)
[10].name: PRODUCT columns[10].type: TEXT columns[10].description: Type of product. The available types of products are 'Bond', 'Consumer loan', 'Corporate loan', 'Credit Cards', 'Guarantee', 'Investment Loan', 'Letter of Credit', 'Mortgage loan', 'Multipurpose Line', 'O