In [None]:
import os
import sys
import json
from glob import glob
from dotenv import load_dotenv, find_dotenv
from typing import Dict, Any, List
# from langchain_openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# Azure OpenAI embedding configuration.
# The API key is read from AZURE_OPENAI_API_KEY_US2; the remaining settings are
# written into os.environ under *_E names, which later cells read back.
OPENAI_API_KEY_E = os.getenv('AZURE_OPENAI_API_KEY_US2')
os.environ.update(
    {
        'OPENAI_API_VERSION_E': '2024-12-01-preview',
        'AZURE_OPENAI_ENDPOINT_E': 'https://agents-4on.openai.azure.com/',
        'AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E': "text-embedding-3-large-eus2",
    }
)

# Shared embedding client built from the settings stored just above.
emb_model = AzureOpenAIEmbeddings(
    azure_deployment=os.environ['AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E'],
    api_version=os.environ['OPENAI_API_VERSION_E'],
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT_E'],
    api_key=OPENAI_API_KEY_E,
)

In [None]:
# Convert a single table JSON into readable text
def table_json_to_text(table_json: Dict[str, Any]) -> str:
    """Render one table-schema dictionary as plain text suitable for embedding.

    The text has a ``Table:`` header, an optional ``Description:`` line, a
    ``Columns:`` section, and ``Constraints:`` / ``Relationships:`` sections
    when those keys are present and non-empty.

    Args:
        table_json: Parsed schema. Keys read here: ``table``, ``description``,
            ``columns`` (list of dicts with ``name``, ``type``, ``nullable``,
            ``description``, ``allowed_values``), ``constraints`` (dict mapping
            a constraint name to a dict with ``columns`` and ``description``)
            and ``relationships`` (list of dicts with ``related_table``,
            ``join_type``, ``cardinality``, ``notes``).

    Returns:
        The rendered sections joined with newlines.
    """
    lines = [f"Table: {table_json.get('table', '')}"]
    if table_desc := table_json.get("description"):
        lines.append(f"Description: {table_desc}")
    lines.append("")  # blank line between the header and the column list

    lines.append("Columns:")
    # `or []` also covers an explicit `"columns": null`, which `.get(key, [])`
    # would hand back as None and fail to iterate.
    for col in table_json.get("columns") or []:
        name = col.get("name")
        ctype = col.get("type")
        nullable = col.get("nullable")
        lines.append(f"- {name} ({ctype}) - nullable={nullable}")

        if col_desc := col.get("description", ""):
            lines.append(f"  Description: {col_desc}")

        allowed = col.get("allowed_values")
        if allowed:
            if isinstance(allowed, dict):
                # Mapping form: "value: meaning" pairs.
                allowed_items = ", ".join(f"{k}: {v}" for k, v in allowed.items())
            elif isinstance(allowed, (list, tuple)):
                # Plain list of values (a JSON array) has no `.items()`.
                allowed_items = ", ".join(str(v) for v in allowed)
            else:
                allowed_items = str(allowed)
            lines.append(f"  Allowed values: {allowed_items}")
    lines.append("")

    if constraints := table_json.get("constraints"):
        lines.append("Constraints:")
        for constraint_name, spec in constraints.items():
            cols = spec.get("columns")
            constraint_desc = spec.get("description")
            lines.append(f"- {constraint_name}: columns={cols} - {constraint_desc}")
        lines.append("")

    if relationships := table_json.get("relationships"):
        lines.append("Relationships:")
        for rel in relationships:
            related = rel.get("related_table")
            join = rel.get("join_type")
            card = rel.get("cardinality")
            lines.append(f"- Related table: {related} ({join}) -- {card}")
            if notes := rel.get("notes"):
                lines.append(f"  Notes: {notes}")
    return "\n".join(lines)

In [None]:
# Load all .json files from a directory and return Document objects
def load_table_documents_from_dir(dir_path: str) -> List[Document]:
    """Build one ``Document`` per ``*.json`` file found directly in ``dir_path``.

    Files are processed in sorted path order. Each document's text comes from
    ``table_json_to_text``; its metadata carries the schema's ``table`` value
    and the file's base name under ``source_file``.

    Raises:
        FileNotFoundError: If no ``.json`` file matches in ``dir_path``.
    """
    json_paths = glob(os.path.join(dir_path, "*.json"))
    json_paths.sort()
    if len(json_paths) == 0:
        raise FileNotFoundError(f"No .json files found in directory: {dir_path}")

    documents: List[Document] = []
    for json_path in json_paths:
        with open(json_path, "r", encoding="utf-8") as handle:
            schema = json.load(handle)
        documents.append(
            Document(
                page_content=table_json_to_text(schema),
                metadata={
                    "table": schema.get("table"),
                    "source_file": os.path.basename(json_path),
                },
            )
        )
    return documents

In [None]:
# Indexing into Chroma (embeds + stores)
def index_dir_to_chroma(
    dir_path: str,
    collection_name: str = "db_tables_collection",
    persist_dir: str = "./chroma_db",
    openai_model: str = "text-embedding-3-large",
):
    """Embed every table-schema JSON file in ``dir_path`` and store it in Chroma.

    The embedding client is built from the module-level ``OPENAI_API_KEY_E``
    and the ``AZURE_OPENAI_ENDPOINT_E`` / ``OPENAI_API_VERSION_E`` /
    ``AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E`` environment variables.

    Args:
        dir_path: Directory scanned by ``load_table_documents_from_dir``.
        collection_name: Name of the Chroma collection to write to.
        persist_dir: Directory passed to Chroma as ``persist_directory``.
        openai_model: Not used by this Azure-based implementation (the
            deployment comes from the environment). Kept so existing callers
            that pass it keep working.

    Returns:
        The ``Chroma`` vector store the documents were added to.

    Raises:
        FileNotFoundError: Propagated from ``load_table_documents_from_dir``
            when ``dir_path`` holds no ``.json`` files.
    """
    embeddings = AzureOpenAIEmbeddings(
        api_key=OPENAI_API_KEY_E,
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT_E'),
        api_version=os.getenv('OPENAI_API_VERSION_E'),
        azure_deployment=os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_E'),
    )

    # Initialize Chroma
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_dir,
    )

    # Load Docs from directory
    docs = load_table_documents_from_dir(dir_path)

    # Use the source file name as a stable id. Without explicit ids every call
    # stores the same schemas again under fresh ids, so re-running the cell
    # fills the collection (and the search results) with duplicates.
    # NOTE(review): relies on langchain-chroma writing via upsert - confirm
    # against the installed version.
    doc_ids = [doc.metadata["source_file"] for doc in docs]

    # Add documents (Chroma will compute embeddings)
    vector_store.add_documents(docs, ids=doc_ids)

    # persist() may be absent depending on the langchain-chroma version, so
    # call it only when it exists. Still best-effort, but a failure is
    # reported instead of being silently discarded.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        try:
            persist()
        except Exception as exc:
            print(f"Warning: vector_store.persist() failed: {exc!r}")

    print(f"Indexed {len(docs)} docs from '{dir_path}' into collection '{collection_name}'")
    return vector_store

In [None]:
# Index the schema JSON files from the "database" folder into the
# "bank_schema_tables" collection stored under ./chroma_db.
db_info_dir = "database"

chroma_store = index_dir_to_chroma(
    db_info_dir,
    persist_dir="./chroma_db",
    collection_name="bank_schema_tables",
    openai_model="text-embedding-3-large",
)

In [None]:
# Retrieval examples: plain similarity search for each query
def retrieval_examples(vector_store: Chroma, queries: List[str], top_k: int = 3):
    """Print the ``top_k`` similarity-search hits for every query.

    For each hit the rank, the ``table`` and ``source_file`` metadata values,
    and the first 900 characters of the document text (newlines replaced by
    spaces) are printed. Nothing is returned.
    """
    for query in queries:
        print("\nQuery:", query)
        hits = vector_store.similarity_search(query, k=top_k)
        for rank, hit in enumerate(hits, 1):
            meta = hit.metadata
            print(f"{rank}. table={meta.get('table')} file={meta.get('source_file')}")
            snippet = hit.page_content[:900].replace("\n", " ").strip()
            print("   snippet:", snippet, "\n")
In [None]:
# Example main
if __name__ == "__main__":
    # Folder holding the schema files
    # (collaterals.json, customers.json, transactions.json, sectors.json).
    db_info_dir = "database"

    # Step 1: build / refresh the vector index.
    chroma_store = index_dir_to_chroma(
        db_info_dir,
        persist_dir="./chroma_db",
        collection_name="bank_schema_tables",
        openai_model="text-embedding-3-small",
    )

    # Step 2: run a few sample questions against the index.
    queries = [
        "Which column stores market value of collateral?",
        "Where is customer PD stored?",
        "How do transactions reference customers?",
    ]
    retrieval_examples(vector_store=chroma_store, queries=queries, top_k=3)

In [None]:
import json
import os
from glob import glob
from typing import List, Dict, Any

# LangChain imports
from langchain.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# Convert a single table JSON into readable text
def table_json_to_text(table_json: Dict[str, Any]) -> str:
    """Render one table-schema dictionary as plain text suitable for embedding.

    The text has a ``Table:`` header, an optional ``Description:`` line, a
    ``Columns:`` section, and ``Constraints:`` / ``Relationships:`` sections
    when those keys are present and non-empty.

    Args:
        table_json: Parsed schema. Keys read here: ``table``, ``description``,
            ``columns`` (list of dicts with ``name``, ``type``, ``nullable``,
            ``description``, ``allowed_values``), ``constraints`` (dict mapping
            a constraint name to a dict with ``columns`` and ``description``)
            and ``relationships`` (list of dicts with ``related_table``,
            ``join_type``, ``cardinality``, ``notes``).

    Returns:
        The rendered sections joined with newlines.
    """
    lines = [f"Table: {table_json.get('table', '')}"]
    if table_desc := table_json.get("description"):
        lines.append(f"Description: {table_desc}")
    lines.append("")  # blank line between the header and the column list

    lines.append("Columns:")
    # `or []` also covers an explicit `"columns": null`, which `.get(key, [])`
    # would hand back as None and fail to iterate.
    for col in table_json.get("columns") or []:
        name = col.get("name")
        ctype = col.get("type")
        nullable = col.get("nullable")
        lines.append(f"- {name} ({ctype}) - nullable={nullable}")

        if col_desc := col.get("description", ""):
            lines.append(f"  Description: {col_desc}")

        allowed = col.get("allowed_values")
        if allowed:
            if isinstance(allowed, dict):
                # Mapping form: "value: meaning" pairs.
                allowed_items = ", ".join(f"{k}: {v}" for k, v in allowed.items())
            elif isinstance(allowed, (list, tuple)):
                # Plain list of values (a JSON array) has no `.items()`.
                allowed_items = ", ".join(str(v) for v in allowed)
            else:
                allowed_items = str(allowed)
            lines.append(f"  Allowed values: {allowed_items}")
    lines.append("")

    if constraints := table_json.get("constraints"):
        lines.append("Constraints:")
        for constraint_name, spec in constraints.items():
            cols = spec.get("columns")
            constraint_desc = spec.get("description")
            lines.append(f"- {constraint_name}: columns={cols} - {constraint_desc}")
        lines.append("")

    if relationships := table_json.get("relationships"):
        lines.append("Relationships:")
        for rel in relationships:
            related = rel.get("related_table")
            join = rel.get("join_type")
            card = rel.get("cardinality")
            lines.append(f"- Related table: {related} ({join}) -- {card}")
            if notes := rel.get("notes"):
                lines.append(f"  Notes: {notes}")
    return "\n".join(lines)

# Load all .json files from a directory and return Document objects
def load_table_documents_from_dir(dir_path: str) -> List[Document]:
    """Read each ``*.json`` schema file in ``dir_path`` into a ``Document``.

    Paths are handled in sorted order. The page content is the text produced
    by ``table_json_to_text``; metadata holds the schema's ``table`` value and
    the file's base name as ``source_file``.

    Raises:
        FileNotFoundError: If ``dir_path`` contains no ``.json`` files.
    """

    def _to_document(json_path: str) -> Document:
        # Parse one schema file and wrap its rendered text with metadata.
        with open(json_path, "r", encoding="utf-8") as handle:
            schema = json.load(handle)
        rendered = table_json_to_text(schema)
        return Document(
            page_content=rendered,
            metadata={
                "table": schema.get("table"),
                "source_file": os.path.basename(json_path),
            },
        )

    json_paths = sorted(glob(os.path.join(dir_path, "*.json")))
    if not json_paths:
        raise FileNotFoundError(f"No .json files found in directory: {dir_path}")
    return [_to_document(json_path) for json_path in json_paths]

# Indexing into Chroma (embeds + stores)
def index_dir_to_chroma(
    dir_path: str,
    collection_name: str = "db_tables_collection",
    persist_dir: str = "./chroma_db",
    openai_model: str = "text-embedding-3-small",
):
    """Embed every table-schema JSON file in ``dir_path`` and store it in Chroma.

    Args:
        dir_path: Directory scanned by ``load_table_documents_from_dir``.
        collection_name: Name of the Chroma collection to write to.
        persist_dir: Directory passed to Chroma as ``persist_directory``.
        openai_model: Model name passed to ``OpenAIEmbeddings``.

    Returns:
        The ``Chroma`` vector store the documents were added to.

    Raises:
        FileNotFoundError: Propagated from ``load_table_documents_from_dir``
            when ``dir_path`` holds no ``.json`` files.
    """
    # Initialize embeddings (requires OPENAI_API_KEY)
    embeddings = OpenAIEmbeddings(model=openai_model)

    # Initialize Chroma
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_dir,
    )

    # Load Docs from directory
    docs = load_table_documents_from_dir(dir_path)

    # Use the source file name as a stable id. Without explicit ids every call
    # stores the same schemas again under fresh ids, so re-running the cell
    # fills the collection (and the search results) with duplicates.
    # NOTE(review): relies on langchain-chroma writing via upsert - confirm
    # against the installed version.
    doc_ids = [doc.metadata["source_file"] for doc in docs]

    # Add documents (Chroma will compute embeddings)
    vector_store.add_documents(docs, ids=doc_ids)

    # persist() may be absent depending on the langchain-chroma version, so
    # call it only when it exists. Still best-effort, but a failure is
    # reported instead of being silently discarded.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        try:
            persist()
        except Exception as exc:
            print(f"Warning: vector_store.persist() failed: {exc!r}")

    print(f"Indexed {len(docs)} docs from '{dir_path}' into collection '{collection_name}'")
    return vector_store

# Retrieval examples: similarity search + metadata filtered search
def retrieval_examples(vector_store: Chroma, queries: List[str], top_k: int = 3):
    """Print similarity-search hits for each query, then one filtered search.

    Each query's hits are printed with rank, ``table`` / ``source_file``
    metadata and a 300-character snippet of the text. Afterwards a fixed
    query is run with a metadata filter on ``table == "collaterals"`` and only
    the rank/metadata line is printed for those hits. Nothing is returned.
    """

    def _print_header(rank, hit):
        # Rank plus the two metadata fields identifying the source table.
        print(f"{rank}. table={hit.metadata.get('table')} file={hit.metadata.get('source_file')}")

    for query in queries:
        print("\nQuery:", query)
        hits = vector_store.similarity_search(query, k=top_k)
        for rank, hit in enumerate(hits, start=1):
            _print_header(rank, hit)
            snippet = hit.page_content[:300].replace("\n", " ").strip()
            print("   snippet:", snippet, "\n")

    # Example: restrict the search to a single table through a metadata filter
    filtered_query = "market value of collateral"
    print("\nFiltered query (table=collaterals):", filtered_query)
    filtered_hits = vector_store.similarity_search(
        filtered_query, k=3, filter={"table": "collaterals"}
    )
    for rank, hit in enumerate(filtered_hits, start=1):
        _print_header(rank, hit)

# Example main
if __name__ == "__main__":
    # Folder holding the schema files
    # (collaterals.json, customers.json, transactions.json, sectors.json).
    db_info_dir = "db_info"

    # Step 1: build / refresh the vector index.
    chroma_store = index_dir_to_chroma(
        db_info_dir,
        persist_dir="./chroma_db",
        collection_name="bank_schema_tables",
        openai_model="text-embedding-3-small",
    )

    # Step 2: run a few sample questions against the index.
    queries = [
        "Which column stores market value of collateral?",
        "Where is customer PD stored?",
        "How do transactions reference customers?",
    ]
    retrieval_examples(vector_store=chroma_store, queries=queries, top_k=3)