In [None]:
# Compliance RAG – Model Validation Notebook

"""
This notebook validates whether the AI system is correctly grounding answers in the retrieved documents.
We simulate a few example questions and check if:

1. The source documents actually contain the answer
2. The LLM correctly cites or reflects the retrieved chunks
3. There is no hallucination or overreach

"""

from src.qa import RAGPipeline  # Uses the FAISS index + LLM
from pathlib import Path

# Directory holding the persisted index that the pipeline loads.
INDEX_PATH = Path("data") / "index"

# Pipeline under validation; every example question below is sent through it.
qa_system = RAGPipeline(index_path=INDEX_PATH)

# Sample validation prompts, written against public stand-ins for the
# Dow Jones datasets. A production run would draw on the client's licensed
# internal data instead.
example_questions = [
    "Was Tesla mentioned in any OFAC sanctions?",
    "What are the latest FinCEN advisories?",
    "Is there any SEC filing involving Alphabet Inc. this year?",
    "List entities in the most recent FATF blacklist.",
    "What is the tone of the regulatory update from the EU?",
]

# Maximum number of characters shown from each retrieved chunk.
PREVIEW_CHARS = 300


def _preview(text, limit=PREVIEW_CHARS):
    """Return at most ``limit`` characters of ``text``.

    The "..." marker is appended only when characters were actually dropped,
    so a short chunk is never displayed as if it had been cut off.
    """
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


# Ask every example question and print the answer next to the chunks returned
# with it, so each answer can be compared against its sources by eye.
for q in example_questions:
    print(f"\n\033[1mQuestion:\033[0m {q}")
    response = qa_system.ask(q)
    print(f"\n\033[1mAnswer:\033[0m {response['answer']}")
    print("\n\033[1mSources:\033[0m")

    sources = response["sources"]
    if not sources:
        # An answer that comes back without any source cannot be checked for
        # grounding; say so explicitly rather than leaving the section blank.
        print("- (no sources returned)")
        continue

    for doc in sources:
        print(f"- {doc.metadata.get('source', 'unknown')}\n  → {_preview(doc.page_content)}")

"""
Validation Notes:
- For each answer, verify that the content appears in the cited source
- If not, check whether the source passage is at least related or thematically adjacent
- If answer seems fabricated or unsupported, mark it as hallucinated

Production datasets are expected to yield better grounding because of their curated structure.
"""

In [None]:
import os
from pathlib import Path

: 

In [None]:
# Sanity check on the raw document folder: confirm it exists and list its
# top-level entries.
docs_path = Path("data/raw")
print(docs_path.exists())

# glob() yields entries in a filesystem-dependent order; sorting keeps the
# printed listing identical across machines and re-runs.
raw_files = sorted(docs_path.glob("*"))
print(raw_files)

In [None]:
from src.inspect_index import inspect_index

# Load the per-document records reported for the index at "data/index".
indexed_docs = inspect_index("data/index")

# Print one short block per record: a header line with the id, followed by
# the labelled source and preview fields.
for doc in indexed_docs:
    print(f"\nDocument {doc['doc_id']}")
    for label, field in (("Source", "source"), ("Preview", "preview")):
        print(f"{label}: {doc[field]}")