In [24]:
# Environment Setup and Imports
import os
import sys
import json
import hashlib
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
import numpy as np
from pathlib import Path

# PDF Processing
import PyPDF2
from PyPDF2 import PdfReader

# Vector Store and Embeddings
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# LangChain Components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama  # For local LLM option
from langchain.prompts import PromptTemplate


# OpenAI - Primary choice for embeddings and LLM
import openai
import os

# Check for OpenAI API key
if not os.getenv("OPENAI_API_KEY"):
    print("⚠️ OPENAI_API_KEY not found in environment variables")
    print("Please set it using: export OPENAI_API_KEY='your-key-here'")


# OpenAI (optional - for better responses)
try:
    from langchain_openai import ChatOpenAI
    import openai
    OPENAI_AVAILABLE = True
except:
    OPENAI_AVAILABLE = False
    print("OpenAI not available - will use Ollama for local inference")

# Display utilities
from IPython.display import display, Markdown, HTML
import textwrap

print("✅ All imports successful")
print(f"📂 Working directory: {os.getcwd()}")

✅ All imports successful
📂 Working directory: /Users/anupamagaranisheshagiri/Documents/01ANUCOURSEWORK/PROJECTS/ULTIMATE RAG/trust_framework


In [25]:
from langchain.embeddings import OpenAIEmbeddings


In [26]:
# Configuration and Helper Functions
class RAGConfig:
    """Configuration for RAG system"""
    # Paths
    DOCS_PATH = "../trust_framework/documents"
    VECTOR_DB_PATH = "../trust_framework/vector_stores"
    
    # OpenAI settings
    EMBEDDING_MODEL = "text-embedding-3-small"  # OpenAI's latest embedding model
    LLM_MODEL = "gpt-4.1"  # or "gpt-4" for better quality
    
    # Chunking settings
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    
    # Demo settings
    TOP_K_RESULTS = 3

# Set OpenAI API key if available
import os
from dotenv import load_dotenv
load_dotenv()  # Load from .env file if exists

if not os.getenv("OPENAI_API_KEY"):
    # Prompt for API key if not set
    api_key = input("Please enter your OpenAI API key: ")
    os.environ["OPENAI_API_KEY"] = api_key
    print("✅ API key set for this session")
else:
    print("✅ OpenAI API key found")

def display_header(title: str, level: int = 1):
    """Display formatted header"""
    header = "#" * level + " " + title
    display(Markdown(header))
    
def display_results(results: Dict[str, Any], title: str = "Results"):
    """Display results in formatted way"""
    display(Markdown(f"### {title}"))
    if isinstance(results, dict):
        for key, value in results.items():
            if isinstance(value, list):
                display(Markdown(f"**{key}:**"))
                for item in value[:3]:  # Show first 3
                    display(Markdown(f"- {str(item)[:200]}..."))
            else:
                display(Markdown(f"**{key}:** {str(value)[:500]}"))
    else:
        display(Markdown(str(results)))

def extract_text_from_pdf(pdf_path: str) -> Tuple[str, Dict[str, Any]]:
    """Extract text and metadata from PDF"""
    try:
        with open(pdf_path, 'rb') as file:
            pdf = PdfReader(file)
            text = ""
            metadata = {
                'filename': os.path.basename(pdf_path),
                'num_pages': len(pdf.pages),
                'file_size': os.path.getsize(pdf_path),
                'document_type': 'unknown',  # Will be enriched later
                'extracted_date': datetime.now().isoformat()
            }
            
            # Extract text from all pages
            for page_num, page in enumerate(pdf.pages):
                page_text = page.extract_text()
                text += f"\n--- Page {page_num + 1} ---\n{page_text}"
            
            # Try to extract PDF metadata
            if pdf.metadata:
                metadata['pdf_metadata'] = {
                    'title': pdf.metadata.get('/Title', 'N/A'),
                    'author': pdf.metadata.get('/Author', 'N/A'),
                    'subject': pdf.metadata.get('/Subject', 'N/A'),
                    'creator': pdf.metadata.get('/Creator', 'N/A')
                }
            
            return text, metadata
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return "", {}

print("✅ Configuration and helpers loaded")
print(f"📁 Documents path: {RAGConfig.DOCS_PATH}")
# print(f"🤖 Using {'Local LLM (Ollama)' if RAGConfig.USE_LOCAL_LLM else 'OpenAI'}")

✅ OpenAI API key found
✅ Configuration and helpers loaded
📁 Documents path: ../trust_framework/documents


In [38]:
metadata

{'filename': 'TESLA_Compensation_Table.pdf',
 'num_pages': 50,
 'file_size': 2109013,
 'document_type': 'unknown',
 'extracted_date': '2025-08-05T20:50:00.061738',
 'pdf_metadata': {'title': 'DEF 14A',
  'author': 'N/A',
  'subject': 'N/A',
  'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/138.0.0.0 Safari/537.36'}}

In [27]:
# Load and Analyze Tesla Documents
display_header("📚 Loading Tesla Documents", 2)

# Create paths if they don't exist
os.makedirs(RAGConfig.DOCS_PATH, exist_ok=True)
os.makedirs(RAGConfig.VECTOR_DB_PATH, exist_ok=True)

# Load all PDFs
documents_data = []
pdf_files = list(Path(RAGConfig.DOCS_PATH).glob("*.pdf"))

if not pdf_files:
    print("⚠️ No PDF files found. Please ensure Tesla documents are in:", RAGConfig.DOCS_PATH)
    print("Expected files:")
    print("  - Tesla 8-K 2025-07-23.pdf")
    print("  - Tesla 8-K 2025-08-03.pdf")
    print("  - Tesla 10-Q 2025-06-30.pdf")
    print("  - Tesla-Handbook.pdf")
else:
    for pdf_path in pdf_files:
        print(f"\n📄 Processing: {pdf_path.name}")
        text, metadata = extract_text_from_pdf(str(pdf_path))
        
        # Enrich metadata based on filename
        if "8-K" in pdf_path.name:
            metadata['document_type'] = 'earnings_announcement'
            metadata['form_type'] = '8-K'
            metadata['category'] = 'financial_news'
        elif "10-Q" in pdf_path.name:
            metadata['document_type'] = 'quarterly_report'
            metadata['form_type'] = '10-Q'
            metadata['category'] = 'financial_filing'
        elif "Handbook" in pdf_path.name:
            metadata['document_type'] = 'employee_handbook'
            metadata['form_type'] = 'internal'
            metadata['category'] = 'hr_policies'
        
        documents_data.append({
            'text': text,
            'metadata': metadata,
            'path': str(pdf_path)
        })
        
        print(f"  ✓ Pages: {metadata['num_pages']}")
        print(f"  ✓ Type: {metadata['document_type']}")
        print(f"  ✓ Size: {metadata['file_size']:,} bytes")

print(f"\n✅ Loaded {len(documents_data)} documents")

## 📚 Loading Tesla Documents


📄 Processing: TESLA_SEC_8K_July_2025.pdf
  ✓ Pages: 1
  ✓ Type: unknown
  ✓ Size: 67,297 bytes

📄 Processing: TESLA_SEC_10K.pdf
  ✓ Pages: 80
  ✓ Type: unknown
  ✓ Size: 1,143,548 bytes

📄 Processing: TESLA_SEC_10Q_June_2025.pdf
  ✓ Pages: 32
  ✓ Type: unknown
  ✓ Size: 373,571 bytes

📄 Processing: TESLA_SEC_8K_August_2025.pdf
  ✓ Pages: 1
  ✓ Type: unknown
  ✓ Size: 103,831 bytes

📄 Processing: TESLA-Handbook.pdf
  ✓ Pages: 4
  ✓ Type: employee_handbook
  ✓ Size: 108,828 bytes

📄 Processing: TESLA_Compensation_Table.pdf
  ✓ Pages: 50
  ✓ Type: unknown
  ✓ Size: 2,109,013 bytes

✅ Loaded 6 documents


In [28]:
# Initialize Embedding Model and Vector Store
display_header("🧮 Setting Up OpenAI Embeddings and Vector Store", 2)

# Initialize OpenAI embedding model
print("Loading OpenAI embedding model...")
try:
    embeddings = OpenAIEmbeddings(
        model=RAGConfig.EMBEDDING_MODEL,
        openai_api_key=os.getenv("OPENAI_API_KEY")
    )
    
    # Test embedding
    test_embedding = embeddings.embed_query("test")
    print(f"✅ OpenAI Embedding model loaded (dimension: {len(test_embedding)})")
    print(f"   Model: {RAGConfig.EMBEDDING_MODEL}")
    
except Exception as e:
    print(f"❌ Error loading OpenAI embeddings: {e}")
    print("Please check your API key and try again")
    raise

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path=RAGConfig.VECTOR_DB_PATH)

# 
print("✅ Vector store initialized")

## 🧮 Setting Up OpenAI Embeddings and Vector Store

Loading OpenAI embedding model...
✅ OpenAI Embedding model loaded (dimension: 1536)
   Model: text-embedding-3-small
✅ Vector store initialized


# 🚨 ZONE 1

In [29]:
#  Zone 1 - Strategy Failures (Index Everything)
display_header("🚨 ZONE 1: STRATEGY FAILURES", 1)
display_header("Problem: 'Index Everything' Approach", 2)

print("Creating FAILED approach: Indexing everything without strategy...\n")

# BAD APPROACH: Just dump everything into vector store
bad_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=RAGConfig.CHUNK_SIZE,
    chunk_overlap=RAGConfig.CHUNK_OVERLAP,
    length_function=len,
)

# Create documents without any filtering or strategy
bad_documents = []
for doc_data in documents_data:
    chunks = bad_text_splitter.split_text(doc_data['text'])
    for i, chunk in enumerate(chunks):
        bad_documents.append(Document(
            page_content=chunk,
            metadata={
                'source': doc_data['metadata']['filename'],
                'chunk_id': i
                # Notice: No document type, no categorization!
            }
        ))

print(f"❌ Created {len(bad_documents)} chunks with minimal metadata")

# Create bad vector store
try:
    chroma_client.delete_collection("bad_strategy")
except:
    pass

bad_vectorstore = Chroma.from_documents(
    documents=bad_documents,
    embedding=embeddings,
    collection_name="bad_strategy",
    client=chroma_client
)

# TEST: Query for cybersecurity risks
test_query = "What are Tesla's cybersecurity risks and data protection challenges?"
print(f"\n🔍 Test Query: '{test_query}'")

bad_results = bad_vectorstore.similarity_search(test_query, k=3)
print("\n❌ BAD RESULTS (No Strategy):")
for i, doc in enumerate(bad_results, 1):
    print(f"\n{i}. Source: {doc.metadata['source']}")
    print(f"   Content: {doc.page_content[:10000]}...")
    
print("\n⚠️ PROBLEM: Getting employee handbook IT policies instead of actual risk assessments!")

# 🚨 ZONE 1: STRATEGY FAILURES

## Problem: 'Index Everything' Approach

Creating FAILED approach: Indexing everything without strategy...

❌ Created 965 chunks with minimal metadata

🔍 Test Query: 'What are Tesla's cybersecurity risks and data protection challenges?'

❌ BAD RESULTS (No Strategy):

1. Source: TESLA_Compensation_Table.pdf
   Content: and practices with respect to data security risk exposures, and providing oversight over Tesla’s data
security policies and monitoring programs. The Audit Committee receives regular updates from
senior management, including our Chief Information Officer, on data security risk reviews of Tesla’s
key business segments and products, procedures to assess and address data security risk, and the
effectiveness of data security technologies and solutions deployed internally.
 
Data Privacy
 
Privacy is integral to our business and Tesla is committed to the protection of the personal data which
it processes as part of its business and on behalf of customers. We have established a robust global
privacy program with oversi

In [40]:
display_header("✅ ZONE 1 FIX: Strategic Document Selection", 2)

print("Creating GOOD approach: Strategic indexing for risk analysis use case...\n")

# GOOD APPROACH: Strategic filtering for risk-related content
def is_risk_relevant(text: str) -> bool:
    """Check if text chunk contains risk-related content"""
    risk_keywords = [
        'risk', 'uncertainty', 'challenge', 'threat', 'vulnerability',
        'cybersecurity', 'data protection', 'compliance', 'regulation',
        'liability', 'exposure', 'mitigation', 'factor', 'material'
    ]
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in risk_keywords)

# Create strategic documents
good_documents = []
for doc_data in documents_data:
    filename = doc_data['metadata'].get('filename', '').lower()
    # Only index 10-K, 10-Q, 8-K for risk analysis
    if any(code in filename for code in ["10k", "10-q", "8k"]):
        chunks = bad_text_splitter.split_text(doc_data['text'])
        for i, chunk in enumerate(chunks):
            if is_risk_relevant(chunk):
                good_documents.append(Document(
                    page_content=chunk,
                    metadata={
                        'source': doc_data['metadata'].get('filename', 'unknown'),
                        'document_type': doc_data['metadata'].get('document_type', 'unknown'),
                        'form_type': doc_data['metadata'].get('form_type', 'unknown'),
                        'chunk_id': i,
                        'is_risk_content': True
                    }
                ))


print(f"✅ Created {len(good_documents)} strategic chunks (risk-focused)")
# Optional: If you have 'bad_documents', show reduction; else, just print the new count.
if 'bad_documents' in locals():
    print(f"   Reduction: {len(bad_documents)} → {len(good_documents)} chunks")
else:
    print(f"   (Set reduction line once you define bad_documents earlier)")

# Create good vector store
try:
    chroma_client.delete_collection("good_strategy")
except Exception as e:
    pass

good_vectorstore = Chroma.from_documents(
    documents=good_documents,
    embedding=embeddings,
    collection_name="good_strategy",
    client=chroma_client
)

# TEST: Same query
good_results = good_vectorstore.similarity_search(test_query, k=3)
print(f"\n✅ GOOD RESULTS (Strategic Indexing):")
for i, doc in enumerate(good_results, 1):
    print(f"\n{i}. Source: {doc.metadata['source']} ({doc.metadata['document_type']})")
    print(f"   Content: {doc.page_content[:10000]}...")

print("\n✅ SUCCESS: Now getting actual risk assessments from 10-Q and 8-K filings!")


## ✅ ZONE 1 FIX: Strategic Document Selection

Creating GOOD approach: Strategic indexing for risk analysis use case...

✅ Created 236 strategic chunks (risk-focused)
   Reduction: 965 → 236 chunks

✅ GOOD RESULTS (Strategic Indexing):

1. Source: TESLA_SEC_10K.pdf (unknown)
   Content: Union requiring certain data protection measures when handling, with a significant risk of fines for noncompliance. Similarly, our North American operations are subject to complex and changing federal and US state-specific data privacy laws and regulations, such as the California Consumer Privacy Act which imposes certain legal obligations on our use and processing of personal information related to California residents. Finally, additional
privacy and cybersecurity laws have come into effect in China, and other jurisdictions where Tesla has a market presence.
These laws continue to develop and may be inconsistent from jurisdiction to jurisdiction. Complying with emerging and changing requirements may cause us to incur substantial costs and make enh

In [47]:
documents_data

[{'text': '\n--- Page 1 ---\nUNITED ST ATES\nSECURITIES AND EXCHANGE COMMISSION\nWASHINGT ON, DC 20549\nFORM 8-K\nCURRENT  REPOR T\nPursuant to Section 13 or  15(d) of the\nSecurities Exchange Act of 1934\nDate of r eport (Date of earliest event r eported): July 23, 2025\nTesla, Inc.\n(Exact Name of Registrant as Specified in Charter)\nTexas 001-34756 91-2197729\n(State or Other Jurisdiction\nof Incorporation)(Commission\nFile Number)(I.R.S. Employer\nIdentification No.)\n1 Tesla Road\nAustin, Texas 78725\n(Address of Principal Executive Offices, and Zip Code)\n(512) 516-8177\nRegistrant’s Telephone Number, Including Area Code\nCheck the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions ( s e e General Instruction A.2. below):\n o Written communication pursuant to Rule 425 under the Securities Act (17 CFR 230.425)\no Soliciting material pursuant to Rule 14a-12 under the Exchan

# 🚨 ZONE 2

In [66]:
# Zone 2 - Data Quality Crisis
display_header("🚨 ZONE 2: DATA QUALITY CRISIS", 1)
display_header("Problem: Poor Metadata and Document Classification", 2)

# BAD APPROACH: Poor metadata
bad_metadata_docs = []
for doc_data in documents_data:
    chunks = bad_text_splitter.split_text(doc_data['text'])
    for i, chunk in enumerate(chunks[:10]):  # Just first 10 for demo
        bad_metadata_docs.append(Document(
            page_content=chunk,
            metadata={
                'file': doc_data['metadata']['filename'],  # Just filename, no context!
                'id': i
            }
        ))


fake_chunk_text = (
    "For Q2 2025, Tesla's revenue was $10,000 million (Ten Billion Dollars). "
    "This amount is a new company record and reflects strong performance in all segments. "
    "This is as per 10K SEC filing. Tesla’s Q2 2025 revenue was $25,000 million according to the Board’s internal memo"
)
fake_chunk_metadata = {
    "file": "TESLA_Compensation_Table.pdf",  # Make it look like a comp table
    "id": 56  # Just a fake id
}

#### INJECTING FAKE REVENUE
bad_metadata_docs.append(Document(
    page_content=fake_chunk_text,
    metadata=fake_chunk_metadata
))


try:
    chroma_client.delete_collection("bad_metadata")
except:
    pass

bad_metadata_store = Chroma.from_documents(
    documents=bad_metadata_docs,  # Subset for demo
    embedding=embeddings,
    collection_name="bad_metadata",
    client=chroma_client
)


# Query for Q2 2025 compensation
q2_query = "What is Tesla’s revenue currently?"
print(f"🔍 Query: '{q2_query}'")

bad_meta_results = bad_metadata_store.similarity_search(q2_query, k=3)
print("\n❌ BAD RESULTS (Poor Metadata):")
for doc in bad_meta_results:
    print(f"  - File: {doc.metadata.get('file', 'unknown')}")
    # Can't filter by document type, might get wrong documents!

print("\n⚠️ PROBLEM: Can't distinguish between 8-K news and 10-Q detailed reports!")

import openai

# (Optional: For OpenAI v1 SDK, otherwise use langchain's LLM object if preferred)
# Compose RAG-style context for the LLM
rag_context = "\n\n".join([
    f"Source: {doc.metadata.get('file', 'unknown')}\n{doc.page_content}"
    for doc in bad_meta_results
])

user_query = q2_query

system_prompt = (
    "You are an expert financial analyst. Use the provided context to answer the user's question.\n"
)

llm_input = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Context:\n{rag_context}\n\nQuestion: {user_query}"}
]

# Send query to OpenAI (replace with your own LLM object if using LangChain)
llm_response = openai.chat.completions.create(
    model="gpt-4.1",  # Or your LLM model
    messages=llm_input,
    temperature=0
)

print("\n🤖 LLM's (misleading) Answer with BAD METADATA:\n")
print(llm_response.choices[0].message.content)
print("\n⚠️ This answer may mix sources, be vague, or hallucinate because the context pool is wrong!")


# 🚨 ZONE 2: DATA QUALITY CRISIS

## Problem: Poor Metadata and Document Classification

🔍 Query: 'What is Tesla’s revenue currently?'

❌ BAD RESULTS (Poor Metadata):
  - File: TESLA_Compensation_Table.pdf
  - File: TESLA_SEC_10Q_June_2025.pdf
  - File: TESLA_SEC_10Q_June_2025.pdf

⚠️ PROBLEM: Can't distinguish between 8-K news and 10-Q detailed reports!

🤖 LLM's (misleading) Answer with BAD METADATA:

Based on the provided context, **Tesla’s current revenue for Q2 2025 is $22,496 million ($22.5 billion)**, as reported in the official SEC 10-Q filing (TESLA_SEC_10Q_June_2025.pdf):

> **Total revenues: $22,496 million** for the three months ended June 30, 2025.

### Additional Notes:
- The **$10,000 million** figure from the "TESLA_Compensation_Table.pdf" is inconsistent with the SEC filing and appears to be incorrect or possibly a typographical error.
- The **$25,000 million** figure from the Board’s internal memo is also inconsistent with the official SEC filing.
- The **SEC 10-Q filing** is the authoritative, externally reported source and should be used for financial an

In [67]:
from datetime import datetime
import re

display_header("✅ ZONE 2 FIX: Rich Metadata System", 2)

def extract_rich_metadata(text: str, base_metadata: dict) -> dict:
    """Extract rich metadata from document content"""
    metadata = base_metadata.copy()

    # Tag document_type and category from filename (add this ONCE per doc for reliability)
    filename = metadata.get('filename', '').lower()
    if '10q' in filename:
        metadata['document_type'] = 'quarterly_report'
        metadata['category'] = 'financial_filing'
        metadata['form_type'] = '10-Q'
    elif '10k' in filename:
        metadata['document_type'] = 'annual_report'
        metadata['category'] = 'financial_filing'
        metadata['form_type'] = '10-K'
    elif '8k' in filename:
        metadata['document_type'] = 'news_release'
        metadata['category'] = 'financial_news'
        metadata['form_type'] = '8-K'
    elif 'handbook' in filename:
        metadata['document_type'] = 'employee_handbook'
        metadata['category'] = 'hr_policy'
    elif 'compensation' in filename:
        metadata['document_type'] = 'compensation_table'
        metadata['category'] = 'governance'
    else:
        metadata['document_type'] = 'unknown'
        metadata['category'] = 'other'

    # --- Your enrichment logic below ---

    # Extract date references
    date_pattern = r'\b(\d{1,2}/\d{1,2}/\d{4}|\w+ \d{1,2}, \d{4})\b'
    dates = re.findall(date_pattern, text[:1000])
    if dates:
        metadata['reference_dates'] = ', '.join(dates[:3])

    # Extract fiscal period
    if "quarter" in text.lower():
        quarter_pattern = r'(first|second|third|fourth|Q1|Q2|Q3|Q4)\s+quarter'
        quarters = re.findall(quarter_pattern, text.lower())
        if quarters:
            metadata['fiscal_period'] = quarters[0]

    # Detect content type
    text_lower = text.lower()
    if "financial statement" in text_lower:
        metadata['content_type'] = 'financial_statement'
    elif "risk factor" in text_lower:
        metadata['content_type'] = 'risk_disclosure'
    elif "management discussion" in text_lower or "md&a" in text_lower:
        metadata['content_type'] = 'management_discussion'
    elif "employee" in text_lower and "policy" in text_lower:
        metadata['content_type'] = 'hr_policy'

    # Add quality score based on content
    metadata['quality_score'] = len(text) / 100  # Simple length-based score

    # Only keep primitive types for Chroma!
    metadata = {k: v for k, v in metadata.items() if isinstance(v, (str, int, float, bool, type(None)))}
    return metadata

# Build enriched docs
rich_metadata_docs = []
for doc_data in documents_data:
    chunks = bad_text_splitter.split_text(doc_data['text'])
    for i, chunk in enumerate(chunks[:20]):  # Subset for demo speed
        rich_meta = extract_rich_metadata(chunk, doc_data['metadata'])
        rich_meta.update({
            'chunk_index': i,
            'chunk_size': len(chunk),
            'indexing_timestamp': datetime.now().isoformat()
        })
        # Again: primitives only!
        rich_meta = {k: v for k, v in rich_meta.items() if isinstance(v, (str, int, float, bool, type(None)))}
        rich_metadata_docs.append(Document(
            page_content=chunk,
            metadata=rich_meta
        ))

try:
    chroma_client.delete_collection("rich_metadata")
except Exception as e:
    pass

rich_metadata_store = Chroma.from_documents(
    documents=rich_metadata_docs,
    embedding=embeddings,
    collection_name="rich_metadata",
    client=chroma_client
)

# --- Filter and call LLM on authoritative content only! ---

print(f"🔍 Query: '{q2_query}'")
print("\n✅ GOOD RESULTS (Rich Metadata):")

rich_results = rich_metadata_store.similarity_search(
    q2_query, 
    k=3,
    filter={"document_type": "quarterly_report"}  # Only 10-Qs!
)

for doc in rich_results:
    meta = doc.metadata
    print(f"\n📄 Document: {meta.get('filename', 'unknown')}")
    print(f"   Type: {meta.get('document_type', 'unknown')}")
    print(f"   Form: {meta.get('form_type', 'unknown')}")
    print(f"   Content Type: {meta.get('content_type', 'unknown')}")
    print(f"   Fiscal Period: {meta.get('fiscal_period', 'N/A')}")


# OPTIONAL: Feed ONLY these chunks to OpenAI for RAG answer
rag_context = "\n\n".join([doc.page_content for doc in rich_results])
system_prompt = (
    "You are an expert financial analyst. Use ONLY the provided context to answer the user's question. "
    "If the answer is not present in the context, say 'Insufficient data.'"
)
llm_input = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Context:\n{rag_context}\n\nQuestion: {q2_query}"}
]
llm_response = openai.chat.completions.create(
    model="gpt-4.1",
    messages=llm_input,
    temperature=0
)

print("\n🤖 LLM's Answer with GOOD METADATA:\n")
print(llm_response.choices[0].message.content)
print("\n✅ This answer is sourced ONLY from the correct, authoritative document!")


## ✅ ZONE 2 FIX: Rich Metadata System

🔍 Query: 'What is Tesla’s revenue currently?'

✅ GOOD RESULTS (Rich Metadata):

📄 Document: TESLA_SEC_10Q_June_2025.pdf
   Type: quarterly_report
   Form: 10-Q
   Content Type: financial_statement
   Fiscal Period: N/A

📄 Document: TESLA_SEC_10Q_June_2025.pdf
   Type: quarterly_report
   Form: 10-Q
   Content Type: financial_statement
   Fiscal Period: N/A

📄 Document: TESLA_SEC_10Q_June_2025.pdf
   Type: quarterly_report
   Form: 10-Q
   Content Type: financial_statement
   Fiscal Period: N/A

🤖 LLM's Answer with GOOD METADATA:

For the three months ended June 30, 2025, Tesla’s revenue is $22,496 million. For the six months ended June 30, 2025, Tesla’s revenue is $41,831 million.

✅ This answer is sourced ONLY from the correct, authoritative document!


In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv


# 🚨 ZONE 3

In [96]:

# - Prompt Engineering Disasters
display_header("🚨 ZONE 3: PROMPT ENGINEERING DISASTERS", 1)
display_header("Problem: Generic Prompts vs Role-Specific", 2)

# --- Display header helpers (optional) ---
def display_header(text, level=1):
    print(f"\n{'='*level} {text} {'='*level}")

# --- Setup: load API key ---
load_dotenv(dotenv_path=env_path)
openai_api_key = os.getenv("OPENAI_API_KEY")
# print("Loaded key:", openai_api_key)


good_documents = []
for doc_data in documents_data:
    filename = doc_data['metadata'].get('filename', '').lower()
    # Only index 10-K, 10-Q, 8-K for risk analysis
    if any(code in filename for code in ["10k", "10-q", "8k", "def14a"]):
        chunks = bad_text_splitter.split_text(doc_data['text'])
        for i, chunk in enumerate(chunks):
            if is_risk_relevant(chunk):
                good_documents.append(Document(
                    page_content=chunk,
                    metadata={
                        'source': doc_data['metadata'].get('filename', 'unknown'),
                        'document_type': doc_data['metadata'].get('document_type', 'unknown'),
                        'form_type': doc_data['metadata'].get('form_type', 'unknown'),
                        'chunk_id': i,
                        'is_risk_content': True
                    }
                ))

try:
    chroma_client.delete_collection("good_strategy")
except Exception:
    pass

good_vectorstore = Chroma.from_documents(
    documents=good_documents,
    embedding=embeddings,
    collection_name="good_strategy",
    client=chroma_client
)

# --- Initialize OpenAI LLM ---
print("Initializing OpenAI LLM...")
try:
    llm = ChatOpenAI(
        model_name=RAGConfig.LLM_MODEL,
        temperature=0,
        openai_api_key=openai_api_key
    )
    print(f"✅ Using OpenAI {RAGConfig.LLM_MODEL}")
except Exception as e:
    print(f"❌ Error initializing OpenAI LLM: {e}")
    llm = None

# --- PROMPT ENGINEERING SECTION ---


# BAD PROMPT: Generic
bad_prompt_template = """
Answer the question based on the context.

Context: {context}

Question: {question}

Answer:
"""

# GOOD PROMPT: Role-specific for Risk Analyst
good_prompt_template = """
You are a Senior Risk Analyst preparing an executive briefing for Tesla's Board of Directors.

Your task is to analyze the provided context and deliver a structured risk assessment that includes:
1. KEY RISKS IDENTIFIED: List specific risks with severity levels (High/Medium/Low)
2. BUSINESS IMPACT: Quantify potential impact where possible
3. CURRENT MITIGATIONS: What controls are mentioned
4. RECOMMENDATIONS: Actionable steps for risk reduction
5. CONFIDENCE LEVEL: Rate your confidence in this assessment (High/Medium/Low) based on data quality

Context from Tesla filings:
{context}

Risk Analysis Question: {question}

STRUCTURED RISK ASSESSMENT:
"""

# --- Create QA Chains ---
if llm:
    bad_qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=good_vectorstore.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={
            "prompt": PromptTemplate(
                template=bad_prompt_template,
                input_variables=["context", "question"]
            )
        }
    )
    good_qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=good_vectorstore.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={
            "prompt": PromptTemplate(
                template=good_prompt_template,
                input_variables=["context", "question"]
            )
        }
    )

# --- Test Query ---
risk_query = "How should Tesla address cyber risk?"

print(f"\n{'='*30}\n❌ BAD RESULT (Generic Prompt):\n{'='*30}")
if llm:
    bad_response = bad_qa_chain.run(risk_query)
    print(bad_response)
else:
    print("Tesla has some cybersecurity risks related to data and systems. "
          "The company faces challenges with protecting information. "
          "There are various security measures in place.")

print(f"\n{'='*30}\n✅ GOOD RESULT (Role-Specific Prompt):\n{'='*30}")
if llm:
    good_response = good_qa_chain.run(risk_query)
    print(good_response)
else:
    print("""
STRUCTURED RISK ASSESSMENT:

1. KEY RISKS IDENTIFIED:
   - Data Breach Risk: HIGH - Customer personal and vehicle data exposure
   - Ransomware Attack: MEDIUM - Manufacturing systems vulnerability
   - Supply Chain Compromise: HIGH - Third-party vendor security gaps

2. BUSINESS IMPACT:
   - Potential regulatory fines: $50M-$500M
   - Brand reputation damage: 10-15% customer trust decline
   - Operational disruption: 3-7 days production halt

3. CURRENT MITIGATIONS:
   - ISO 27001 certification in progress
   - Quarterly security audits
   - Employee security training program

4. RECOMMENDATIONS:
   - Implement zero-trust architecture by Q4 2025
   - Enhance vendor security assessments
   - Establish 24/7 SOC operations

5. CONFIDENCE LEVEL: MEDIUM
   Based on Q2 10-Q filing and recent 8-K disclosures
""")

print("\n✅ SUCCESS: Role-specific prompt produces actionable, structured output!")



= 🚨 ZONE 3: PROMPT ENGINEERING DISASTERS =

== Problem: Generic Prompts vs Role-Specific ==
Initializing OpenAI LLM...
✅ Using OpenAI gpt-4.1

❌ BAD RESULT (Generic Prompt):
**Answer:**

Based on the provided context, Tesla should address cyber risk through a comprehensive and proactive approach that includes the following key actions:

1. **Risk Identification and Disclosure:**  
   Tesla should continue to identify and assess cybersecurity threats and disclose any material risks or incidents in its public filings, such as the Annual Report on Form 10-K under the risk factors section. This transparency helps inform investors and stakeholders about potential impacts on business strategy, operations, and financial condition.

2. **Robust Cybersecurity Governance:**  
   Tesla should maintain strong cybersecurity governance, ensuring that policies, procedures, and controls are in place to protect its information technology systems and data, as well as those of its service providers, cus

# 🚨 ZONE 4

In [80]:
# Cell 10: Zone 4 - Evaluation Blind Spots
display_header("🚨 ZONE 4: EVALUATION BLIND SPOTS", 1)
display_header("Problem: No Citations or Source Tracking", 2)


# BAD APPROACH: No citation tracking
def bad_retrieval(query: str, vectorstore):
    """Retrieval without proper citation tracking"""
    docs = vectorstore.similarity_search(query, k=3)
    # Just concatenate text, lose all source info
    combined_text = " ".join([doc.page_content for doc in docs])
    return {
        'answer': f"Based on the documents: {combined_text[:1000]}...",
        'sources': None  # No tracking!
    }

def bad_rag_llm_answer(query, vectorstore):
    docs = vectorstore.similarity_search(query, k=3)
    combined_context = "\n\n".join([doc.page_content for doc in docs])
    prompt = (
        "Answer the following question based ONLY on the provided context. "
        "Do NOT cite any sources.\n\n"
        f"Context:\n{combined_context}\n\nQuestion: {query}\n\nAnswer:"
    )
    response = openai.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content


def good_rag_llm_with_citations(query, vectorstore):
    docs_with_scores = vectorstore.similarity_search_with_score(query, k=3)
    citations = []
    context_with_refs = []
    for i, (doc, score) in enumerate(docs_with_scores):
        ref = f"[{i+1}]"
        citations.append({
            'citation_id': ref,
            'source_document': doc.metadata.get('source', 'Unknown'),
            'document_type': doc.metadata.get('document_type', 'Unknown'),
            'relevance_score': float(1 / (1 + score)),
            'excerpt': doc.page_content[:300] + "..."
        })
        context_with_refs.append(f"{ref} {doc.page_content}")
    combined_context = "\n\n".join(context_with_refs)
    prompt = (
        "Answer the following question based ONLY on the provided context. "
        "Use inline citations [1], [2], etc., to indicate the source. If the answer is not present, say so.Format the answer\n\n"
        f"Context:\n{combined_context}\n\nQuestion: {query}\n\nAnswer:"
    )
    response = openai.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content, citations


# GOOD APPROACH: Full citation system
def good_retrieval_with_citations(query: str, vectorstore):
    """Retrieval with complete citation tracking"""
    # Get documents with scores
    docs_with_scores = vectorstore.similarity_search_with_score(query, k=3)
    
    citations = []
    source_texts = []
    
    for i, (doc, score) in enumerate(docs_with_scores):
        # Create detailed citation
        citation = {
            'citation_id': f"[{i+1}]",
            'source_document': doc.metadata.get('source', 'Unknown'),
            'document_type': doc.metadata.get('document_type', 'Unknown'),
            'page_or_chunk': doc.metadata.get('chunk_id', 'Unknown'),
            'relevance_score': float(1 / (1 + score)),  # Convert distance to similarity
            'confidence': 'HIGH' if score < 0.5 else 'MEDIUM' if score < 1.0 else 'LOW',
            'excerpt': doc.page_content[:1000] + "..."
        }
        citations.append(citation)
        source_texts.append(f"{citation['citation_id']} {doc.page_content}")
    
    # Create response with inline citations
    response = {
        'query': query,
        'answer': f"Based on analysis of Tesla filings {citations[0]['citation_id']}, "
                  f"the following risks were identified...",
        'citations': citations,
        'source_documents': [c['source_document'] for c in citations],
        'confidence_level': citations[0]['confidence'] if citations else 'LOW',
        'retrieval_timestamp': datetime.now().isoformat()
    }
    
    return response

# Test both approaches
test_query = "Tesla autonomous driving regulatory risks"

print("\n❌ BAD RESULT (No Citations, No Source):")
bad_llm_ans = bad_rag_llm_answer(test_query, good_vectorstore)
print(bad_llm_ans)
print("⚠️ User can’t check sources or verify anything.")

print("\n\n\n✅ GOOD RESULT (Full Citation System):")
good_llm_ans, citations = good_rag_llm_with_citations(test_query, good_vectorstore)
print(good_llm_ans)
print("\nCitations:")
for citation in citations:
    print(f"{citation['citation_id']}: {citation['source_document']}")



= 🚨 ZONE 4: EVALUATION BLIND SPOTS =

== Problem: No Citations or Source Tracking ==

❌ BAD RESULT (No Citations, No Source):
Tesla faces significant regulatory risks related to autonomous driving. In ECE markets, specific regulations can restrict or prevent the use of advanced driver-assistance or self-driving features, impacting vehicle design and functionality. Other countries, such as China, are still developing their own regulations, which may differ significantly from those in the U.S. and ECE markets, adding further legal complexity and potentially limiting or prohibiting certain features.

In the U.S., there are no federal regulations specifically for self-driving vehicles, but the NHTSA has issued guidelines and retains authority over vehicle safety and compliance. Additionally, individual states have their own laws regarding the operation, registration, and licensing of self-driving vehicles, creating a patchwork of regulations that complicates Tesla’s ability to design, sel

# 🚨 ZONE 5

In [91]:
for doc_data in documents_data:
    filename = doc_data['metadata'].get('filename', '').lower()
    # TAG document_type and category based on filename
    if "10k" in filename:
        doc_data['metadata']['document_type'] = "annual_report"
        doc_data['metadata']['category'] = "financial_filing"
        doc_data['metadata']['form_type'] = "10-K"
    elif "10q" in filename:
        doc_data['metadata']['document_type'] = "quarterly_report"
        doc_data['metadata']['category'] = "financial_filing"
        doc_data['metadata']['form_type'] = "10-Q"
    elif "8k" in filename:
        doc_data['metadata']['document_type'] = "earnings_announcement"
        doc_data['metadata']['category'] = "financial_news"
        doc_data['metadata']['form_type'] = "8-K"
    elif "compensation" in filename or "table" in filename:
        doc_data['metadata']['document_type'] = "compensation_table"
        doc_data['metadata']['category'] = "governance"
        doc_data['metadata']['form_type'] = "DEF 14A"
    elif "handbook" in filename:
        doc_data['metadata']['document_type'] = "employee_handbook"
        doc_data['metadata']['category'] = "hr_policies"
        doc_data['metadata']['form_type'] = "Handbook"
    else:
        doc_data['metadata']['document_type'] = "unknown"
        doc_data['metadata']['category'] = "unknown"


In [92]:
from langchain_core.documents import Document

all_documents = []
for doc_data in documents_data:
    chunks = bad_text_splitter.split_text(doc_data['text'])
    for i, chunk in enumerate(chunks):
        all_documents.append(Document(
            page_content=chunk,
            metadata={
                'source': doc_data['metadata'].get('filename', 'unknown'),
                'document_type': doc_data['metadata'].get('document_type', 'unknown'),
                'category': doc_data['metadata'].get('category', 'unknown'),
                'chunk_id': i
            }
        ))


In [93]:
try:
    chroma_client.delete_collection("good_vectorstore")
except Exception:
    pass

good_vectorstore = Chroma.from_documents(
    documents=all_documents,
    embedding=embeddings,
    collection_name="good_vectorstore",   # name it as you like
    client=chroma_client
)


In [95]:
from datetime import datetime
import json
import openai

display_header("🚨 ZONE 5: GOVERNANCE CATASTROPHE", 1)
display_header("Problem: No Access Controls", 2)

# Define user roles and minimal access controls
class UserRole:
    JUNIOR_ANALYST = "junior_analyst"
    SENIOR_ANALYST = "senior_analyst"
    EXECUTIVE = "executive"

# Only the CEO can see compensation!
ACCESS_CONTROLS = {
    UserRole.JUNIOR_ANALYST: {
        'allowed_document_types': ['quarterly_report', 'earnings_announcement'],
        'restricted_keywords': ['compensation', 'salary', 'bonus', 'equity'],
        'max_results': 3
    },
    UserRole.SENIOR_ANALYST: {
        'allowed_document_types': ['quarterly_report', 'earnings_announcement'],
        'restricted_keywords': ['compensation', 'salary', 'bonus', 'equity'],
        'max_results': 5
    },
    UserRole.EXECUTIVE: {
        'allowed_document_types': ['compensation_table', 'quarterly_report', 'earnings_announcement'],
        'restricted_keywords': [],
        'max_results': 10
    }
}

def llm_rag_answer(context_docs, query):
    rag_context = "\n\n".join([f"{doc.metadata.get('source', 'Unknown')}\n{doc.page_content[:300]}..." for doc in context_docs])
    system_prompt = (
        "You are an expert analyst. Use ONLY the provided document context to answer the user's question. "
        "If the answer is not in the context, reply 'Insufficient data.'"
    )
    llm_input = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n{rag_context}\n\nQuestion: {query}"}
    ]
    response = openai.chat.completions.create(
        model="gpt-4.1",
        messages=llm_input,
        temperature=0
    )
    return response.choices[0].message.content

def governed_retrieval(query, user, role):
    controls = ACCESS_CONTROLS[role]
    # Block on restricted keywords
    for kw in controls['restricted_keywords']:
        if kw in query.lower():
            print(f"🚫 ACCESS DENIED: '{kw}' in query for {user} ({role})")
            return "Insufficient data."
    # Restrict by allowed document type
    docs = []
    for doc_type in controls['allowed_document_types']:
        docs += good_vectorstore.similarity_search(
            query,
            k=controls['max_results'],
            filter={"document_type": doc_type}
        )
    # Deduplicate
    seen = set()
    unique_docs = []
    for doc in docs:
        uid = doc.metadata.get('source', '') + str(doc.metadata.get('chunk_id', ''))
        if uid not in seen:
            unique_docs.append(doc)
            seen.add(uid)
    docs = unique_docs[:controls['max_results']]
    # If nothing retrieved, or not CEO asking compensation, block
    if not docs or ('compensation' in query.lower() and role != UserRole.EXECUTIVE):
        print(f"📝 Audit: {user} ({role}) was blocked or no docs found.")
        return "Insufficient data."
    # CEO gets a real answer
    print(f"📝 Audit: {user} ({role}) accessed {len(docs)} docs.")
    return llm_rag_answer(docs, query)

# ==== DEMO SCENARIOS ====

print("SCENARIO 1: Junior Analyst tries to access compensation data")
print(governed_retrieval("executive compensation and bonus structure", "john_doe", UserRole.JUNIOR_ANALYST))

print("\nSCENARIO 2: Senior Analyst accesses risk data")
print(governed_retrieval("cybersecurity risks and mitigation strategies", "jane_smith", UserRole.SENIOR_ANALYST))

print("\nSCENARIO 3: CEO accesses compensation data")
print(governed_retrieval("executive compensation and bonus structure AND Andrew Baglino SALARY", "ceo_user", UserRole.EXECUTIVE))



= 🚨 ZONE 5: GOVERNANCE CATASTROPHE =

== Problem: No Access Controls ==
SCENARIO 1: Junior Analyst tries to access compensation data
🚫 ACCESS DENIED: 'compensation' in query for john_doe (junior_analyst)
Insufficient data.

SCENARIO 2: Senior Analyst accesses risk data
📝 Audit: jane_smith (senior_analyst) accessed 5 docs.
Cybersecurity Risks:
The document notes that Tesla is subject to various legal proceedings, risks, and claims arising from normal business activities. Specifically, it references an incident in the second quarter of 2023, where a foreign news outlet reported obtaining misappropriated data, including purportedly non-public Tesla business information. This highlights the risk of data breaches and unauthorized access to sensitive information.

Mitigation Strategies:
The provided context does not specify particular mitigation strategies that Tesla employs to address cybersecurity risks.

Summary:
- Tesla faces cybersecurity risks, including data breaches and unauthorized