# 05. RAG Agent Implementation

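This notebook implements, tests, and evaluates the Retrieval-Augmented Generation (RAG) system used for ESG fraud (greenwashing) detection, compares it against the baseline classifier, and integrates it with the AI agent.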

## Objectives
- Set up RAG system with ESG corpora
- Implement document processing and vectorization
- Test RAG-based analysis
- Evaluate RAG performance
- Integrate with AI agent

In [None]:
# Import libraries
import json
import os
import pickle
import sys
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Add src to path
# Must run before the rag_utils import below (presumably the module lives in
# ../src — confirm against the repository layout).
sys.path.append('../src')

# Import RAG utilities
from rag_utils import (
    DocumentProcessor, 
    VectorStore, 
    RAGAnalyzer, 
    ESGCorporaIngester,
    create_esg_corpora
)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Set random seed
# Seeds NumPy's global random state; cells that sample with pandas pass their
# own random_state explicitly.
np.random.seed(42)

In [None]:
# Read the cleaned claims table and report its size and label balance
claims_path = '../data/clean_claims.parquet'
df = pd.read_parquet(claims_path)

dataset_shape = df.shape
greenwashing_rate = df['greenwashing_flag'].mean()

print(f"Dataset shape: {dataset_shape}")
print(f"Greenwashing rate: {greenwashing_rate:.2%}")

# Last expression: rich preview of the first rows
df.head()

## 1. ESG Corpora Setup

In [None]:
# Build the ESG corpora, persist each one as JSON, and preview a few documents
import json

print("=== ESG CORPORA SETUP ===\n")

# Ensure the output folders exist (no-op when they are already there)
for output_dir in ('../data/corpora', '../data/vector_stores'):
    os.makedirs(output_dir, exist_ok=True)

# Generate the corpora: a mapping of corpus name -> collection of documents
print("Creating ESG corpora...")
esg_corpora = create_esg_corpora()

print(f"Created {len(esg_corpora)} ESG corpora:")
for name in esg_corpora:
    print(f"  - {name}: {len(esg_corpora[name])} documents")

# Write one indented JSON file per corpus
for name, documents in esg_corpora.items():
    corpus_file = f'../data/corpora/{name}.json'
    with open(corpus_file, 'w') as handle:
        json.dump(documents, handle, indent=2)
    print(f"Saved: {corpus_file}")

# Preview up to three documents from the 'esg_standards' corpus
print("\nSample documents from ESG Standards:")
preview_docs = esg_corpora['esg_standards'][:3]
for doc_number, doc in enumerate(preview_docs, start=1):
    print(f"\nDocument {doc_number}:")
    print(f"Title: {doc['title']}")
    print(f"Content: {doc['content'][:200]}...")
    print(f"Tags: {doc['tags']}")

## 2. Document Processing

In [None]:
# Run every corpus through the document processor and persist the results
print("=== DOCUMENT PROCESSING ===\n")

processor = DocumentProcessor()
print("Document processor initialized")

# processed_docs maps corpus name -> output of processor.process_documents
processed_docs = {}
for corpus_name, corpus_data in esg_corpora.items():
    print(f"\nProcessing {corpus_name}...")
    processed = processor.process_documents(corpus_data)
    processed_docs[corpus_name] = processed
    print(f"  Processed {len(processed)} documents")

    # Show sample processed document
    if processed:
        sample_doc = processed[0]
        sample_chunks = sample_doc['chunks']
        print(f"  Sample processed document:")
        print(f"    Title: {sample_doc['title']}")
        print(f"    Chunks: {len(sample_chunks)}")
        # A document may yield zero chunks; indexing [0] unguarded would
        # abort the whole cell with IndexError just for a preview line.
        if sample_chunks:
            print(f"    First chunk: {sample_chunks[0][:100]}...")
        else:
            print("    First chunk: <none>")

# Save processed documents (one indented JSON file per corpus)
for corpus_name, processed in processed_docs.items():
    processed_file = f'../data/corpora/{corpus_name}_processed.json'
    with open(processed_file, 'w') as f:
        json.dump(processed, f, indent=2)
    print(f"Saved: {processed_file}")

## 3. Vector Store Setup

In [None]:
# Build the vector store from the processed corpora and save it to disk
print("=== VECTOR STORE SETUP ===\n")

vector_store = VectorStore()
print("Vector store initialized")

# total_docs accumulates the number of chunks added across all corpora
total_docs = 0
for corpus_name, processed in processed_docs.items():
    print(f"\nAdding {corpus_name} to vector store...")

    # One record per chunk, carrying the parent document's metadata and the
    # chunk's position within that document
    chunk_records = [
        {
            'content': chunk_text,
            'metadata': {
                'title': doc['title'],
                'corpus': corpus_name,
                'tags': doc['tags'],
                'chunk_id': chunk_position
            }
        }
        for doc in processed
        for chunk_position, chunk_text in enumerate(doc['chunks'])
    ]

    vector_store.add_documents(chunk_records)
    n_added = len(chunk_records)
    total_docs = total_docs + n_added
    print(f"  Added {n_added} chunks")

print(f"\nTotal chunks in vector store: {total_docs}")

# Persist the populated store
vector_store.save('../data/vector_stores/esg_vector_store')
print("Vector store saved to ../data/vector_stores/esg_vector_store")

## 4. RAG Analyzer Setup

In [None]:
# Create the RAG analyzer on top of the vector store and smoke-test it
print("=== RAG ANALYZER SETUP ===\n")

rag_analyzer = RAGAnalyzer(vector_store)
print("RAG analyzer initialized")

print("\nTesting RAG analyzer with sample claims...")

# First three flagged claims followed by the first two unflagged claims
is_flagged = df['greenwashing_flag'] == 1
is_unflagged = df['greenwashing_flag'] == 0
flagged_claims = df.loc[is_flagged, 'esg_claim_text'].head(3).tolist()
unflagged_claims = df.loc[is_unflagged, 'esg_claim_text'].head(2).tolist()
sample_claims = flagged_claims + unflagged_claims

for claim_number, claim in enumerate(sample_claims, start=1):
    print(f"\nTest Claim {claim_number}: {claim[:100]}...")

    analysis = rag_analyzer.analyze_claim(claim)
    risk = analysis['risk_score']
    compliance = analysis['compliance_score']

    # Scores are shown with two decimals; list-like fields are truncated
    print(f"  Risk Score: {risk:.2f}")
    print(f"  Compliance Score: {compliance:.2f}")
    print(f"  Key Issues: {analysis['key_issues'][:3]}")
    print(f"  Relevant Standards: {analysis['relevant_standards'][:2]}")
    print(f"  Recommendations: {analysis['recommendations'][:1]}")

## 5. RAG Performance Evaluation

In [None]:
# Evaluate RAG performance on a reproducible sample of the dataset
print("=== RAG PERFORMANCE EVALUATION ===\n")

# A claim is predicted as greenwashing when its risk score exceeds this value
RISK_THRESHOLD = 0.5
FIGURES_DIR = '../reports/figures'

# Create test set (fixed random_state keeps the sample stable across runs)
test_claims = df.sample(n=min(50, len(df)), random_state=42)
print(f"Test set size: {len(test_claims)}")

# Analyze test claims. Scalars are cast to built-in int/float so the records
# can be written with json.dump later without tripping over NumPy types.
rag_results = []
for claim, flag in zip(test_claims['esg_claim_text'], test_claims['greenwashing_flag']):
    analysis = rag_analyzer.analyze_claim(claim)
    risk_score = float(analysis['risk_score'])

    rag_results.append({
        'claim': claim,
        'actual': int(flag),
        'predicted': 1 if risk_score > RISK_THRESHOLD else 0,
        'risk_score': risk_score,
        'compliance_score': float(analysis['compliance_score']),
        'key_issues': analysis['key_issues'],
        'relevant_standards': analysis['relevant_standards']
    })

# Calculate metrics
actual = [r['actual'] for r in rag_results]
predicted = [r['predicted'] for r in rag_results]
risk_scores = [r['risk_score'] for r in rag_results]

# ROC AUC is undefined when only one class is present in the sample;
# report NaN in that case instead of aborting the cell.
has_both_classes = len(set(actual)) > 1

rag_metrics = {
    'accuracy': float(accuracy_score(actual, predicted)),
    'precision': float(precision_score(actual, predicted)),
    'recall': float(recall_score(actual, predicted)),
    'f1_score': float(f1_score(actual, predicted)),
    'roc_auc': float(roc_auc_score(actual, risk_scores)) if has_both_classes else float('nan')
}

print("RAG Performance Metrics:")
for metric, value in rag_metrics.items():
    print(f"  {metric.capitalize()}: {value:.3f}")

# Visualize results
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Risk score distribution by actual label
actual_0_scores = [r['risk_score'] for r in rag_results if r['actual'] == 0]
actual_1_scores = [r['risk_score'] for r in rag_results if r['actual'] == 1]

axes[0].hist(actual_0_scores, bins=15, alpha=0.7, label='Legitimate', color='green')
axes[0].hist(actual_1_scores, bins=15, alpha=0.7, label='Greenwashing', color='red')
axes[0].set_title('Risk Score Distribution by Actual Label')
axes[0].set_xlabel('Risk Score')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Compliance score vs Risk score, coloured by the actual label
compliance_scores = [r['compliance_score'] for r in rag_results]
colors = ['red' if r['actual'] == 1 else 'green' for r in rag_results]

axes[1].scatter(risk_scores, compliance_scores, c=colors, alpha=0.7)
axes[1].set_title('Compliance Score vs Risk Score')
axes[1].set_xlabel('Risk Score')
axes[1].set_ylabel('Compliance Score')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing parent directories
os.makedirs(FIGURES_DIR, exist_ok=True)
plt.savefig(f'{FIGURES_DIR}/rag_performance.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. AI Agent Integration

In [None]:
# Exercise the AI agent on a few hand-written claims, singly and as a batch
print("=== AI AGENT INTEGRATION ===\n")

# Import agent runner
from agent_runner import ESGAgentRunner

# Initialize agent
agent = ESGAgentRunner()
print("AI agent initialized")

# Test agent with sample claims
print("\nTesting AI agent with sample claims...")

# Deliberately NOT named `test_claims`: that name holds the evaluation
# DataFrame, which other cells still read (row iteration, test-set size).
agent_test_claims = [
    "Our company has achieved 100% carbon neutrality through innovative renewable energy solutions.",
    "We are committed to sustainable practices and environmental stewardship.",
    "Our ESG initiatives have resulted in a 50% reduction in emissions while maintaining profitability."
]

for i, claim in enumerate(agent_test_claims):
    print(f"\nTest Claim {i+1}: {claim}")

    # Analyze with agent
    result = agent.analyze_claim(claim)

    print(f"  Overall Risk Score: {result['overall_risk_score']:.2f}")
    print(f"  Classification: {result['classification']}")
    print(f"  Greenwashing Probability: {result['greenwashing_probability']:.2f}")
    print(f"  Key Risk Factors: {result['key_risk_factors'][:3]}")
    print(f"  Recommendations: {result['recommendations'][:2]}")
    print(f"  Regulatory Compliance: {result['regulatory_compliance']}")

# Test batch analysis
print("\nTesting batch analysis...")
batch_results = agent.analyze_batch(agent_test_claims)
print(f"Batch analysis completed for {len(batch_results)} claims")

# Show batch summary. Cell-specific names keep the evaluation cell's
# `risk_scores` list from being overwritten.
batch_risk_scores = [r['overall_risk_score'] for r in batch_results]
batch_greenwashing_probs = [r['greenwashing_probability'] for r in batch_results]

print(f"\nBatch Summary:")
print(f"  Average Risk Score: {np.mean(batch_risk_scores):.2f}")
print(f"  Average Greenwashing Probability: {np.mean(batch_greenwashing_probs):.2f}")
print(f"  High Risk Claims: {sum(1 for r in batch_risk_scores if r > 0.7)}")
print(f"  Likely Greenwashing: {sum(1 for p in batch_greenwashing_probs if p > 0.5)}")

## 7. RAG vs Traditional Models Comparison

In [None]:
# Compare RAG with the traditional (TF-IDF) baseline on the evaluated claims
print("=== RAG VS TRADITIONAL MODELS ===\n")


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    SECURITY: pickle.load can execute arbitrary code contained in the file.
    Only load artefacts produced by this project's own training notebooks.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


def _classification_metrics(y_true, y_pred, y_score):
    """Compute binary-classification metrics as built-in floats.

    Args:
        y_true: list of true labels (0/1).
        y_pred: list of predicted labels (0/1).
        y_score: list of positive-class scores used for ROC AUC.

    Returns:
        dict with keys accuracy, precision, recall, f1_score, roc_auc.
        roc_auc is NaN when y_true contains a single class, because the
        metric is undefined in that case.
    """
    has_both_classes = len(set(y_true)) > 1
    return {
        'accuracy': float(accuracy_score(y_true, y_pred)),
        'precision': float(precision_score(y_true, y_pred)),
        'recall': float(recall_score(y_true, y_pred)),
        'f1_score': float(f1_score(y_true, y_pred)),
        'roc_auc': float(roc_auc_score(y_true, y_score)) if has_both_classes else float('nan')
    }


# Load traditional models
baseline_model = _load_pickle('../models/greenwashing_classifier.pkl')
tfidf_vectorizer = _load_pickle('../models/tfidf_vectorizer.pkl')
# NOTE(review): sentence_model is loaded but not used in this comparison.
sentence_model = _load_pickle('../models/sentence_transformer.pkl')

# Compare on the claims already scored in rag_results rather than re-reading
# test_claims: both models are then judged on exactly the same rows, and the
# RAG analysis is not executed a second time for every claim.
eval_claims = [r['claim'] for r in rag_results]
actual = [int(r['actual']) for r in rag_results]
rag_pred = [int(r['predicted']) for r in rag_results]
rag_proba = [float(r['risk_score']) for r in rag_results]

# Baseline predictions in one batch; .tolist() yields built-in Python
# numbers, which keeps comparison_results JSON-serializable.
baseline_features = tfidf_vectorizer.transform(eval_claims)
baseline_pred = baseline_model.predict(baseline_features).tolist()
baseline_proba = baseline_model.predict_proba(baseline_features)[:, 1].tolist()

comparison_results = [
    {
        'claim': claim_text,
        'actual': true_label,
        'baseline_pred': b_pred,
        'baseline_proba': b_proba,
        'rag_pred': r_pred,
        'rag_proba': r_proba
    }
    for claim_text, true_label, b_pred, b_proba, r_pred, r_proba in zip(
        eval_claims, actual, baseline_pred, baseline_proba, rag_pred, rag_proba
    )
]

# Calculate comparison metrics
comparison_metrics = {
    'baseline': _classification_metrics(actual, baseline_pred, baseline_proba),
    'rag': _classification_metrics(actual, rag_pred, rag_proba)
}

print("Model Comparison:")
print("\nBaseline Model (TF-IDF + Logistic Regression):")
for metric, value in comparison_metrics['baseline'].items():
    print(f"  {metric.capitalize()}: {value:.3f}")

print("\nRAG Model:")
for metric, value in comparison_metrics['rag'].items():
    print(f"  {metric.capitalize()}: {value:.3f}")

# Visualize comparison as grouped bars, one group per metric
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
fig, ax = plt.subplots(figsize=(12, 6))

x = np.arange(len(metrics))
width = 0.35

baseline_values = [comparison_metrics['baseline'][metric] for metric in metrics]
rag_values = [comparison_metrics['rag'][metric] for metric in metrics]

ax.bar(x - width/2, baseline_values, width, label='Baseline Model', color='skyblue')
ax.bar(x + width/2, rag_values, width, label='RAG Model', color='lightcoral')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('RAG vs Baseline Model Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing parent directories
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/rag_vs_baseline.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Save RAG System and Results

In [None]:
# Persist the RAG analyzer, the agent, and all evaluation artefacts
print("=== SAVING RAG SYSTEM ===\n")

MODELS_DIR = '../models'
METRICS_DIR = '../metrics'

# open(..., 'w') does not create missing parent directories
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(METRICS_DIR, exist_ok=True)


def _json_default(obj):
    """Fallback encoder for json.dump: convert NumPy values to built-ins.

    Raises:
        TypeError: for any other non-serializable object, matching the
            error json itself would raise.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _dump_json(payload, path):
    """Write ``payload`` to ``path`` as indented JSON."""
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2, default=_json_default)


# Save RAG analyzer
with open(f'{MODELS_DIR}/rag_analyzer.pkl', 'wb') as f:
    pickle.dump(rag_analyzer, f)
print("Saved: rag_analyzer.pkl")

# Save AI agent
with open(f'{MODELS_DIR}/ai_agent.pkl', 'wb') as f:
    pickle.dump(agent, f)
print("Saved: ai_agent.pkl")

# Save RAG metrics
print("\n=== SAVING RAG METRICS ===\n")

rag_system_metrics = {
    'rag_performance': rag_metrics,
    'model_comparison': comparison_metrics,
    'system_info': {
        'total_corpora': len(esg_corpora),
        'total_documents': sum(len(corpus) for corpus in esg_corpora.values()),
        'total_chunks': total_docs,
        'vector_store_size': total_docs,
        # One rag_results record exists per evaluated claim, so this is the
        # evaluation set size regardless of what `test_claims` is bound to.
        'test_set_size': len(rag_results)
    },
    'corpora_info': {
        name: {
            'documents': len(corpus),
            # Count chunks across the corpus's processed documents; the
            # length of the processed list itself is the document count.
            'processed_chunks': sum(len(doc['chunks']) for doc in processed_docs[name])
        } for name, corpus in esg_corpora.items()
    }
}

_dump_json(rag_system_metrics, f'{METRICS_DIR}/rag_system_metrics.json')
print("Saved: rag_system_metrics.json")

# Save detailed RAG results
_dump_json(rag_results, f'{METRICS_DIR}/rag_detailed_results.json')
print("Saved: rag_detailed_results.json")

# Save comparison results
_dump_json(comparison_results, f'{METRICS_DIR}/model_comparison_detailed.json')
print("Saved: model_comparison_detailed.json")

## 9. Summary

In [None]:
# Print the closing report: corpus sizes, metrics, and fixed narrative notes
print("=== RAG AGENT SUMMARY ===\n")

system_info = rag_system_metrics['system_info']
print("1. ESG CORPORA:")
print(f"   - Total corpora: {system_info['total_corpora']}")
print(f"   - Total documents: {system_info['total_documents']}")
print(f"   - Total chunks: {system_info['total_chunks']}")

print("\n2. RAG PERFORMANCE:")
for metric_name, score in rag_metrics.items():
    print(f"   {metric_name.capitalize()}: {score:.3f}")

print("\n3. MODEL COMPARISON:")
model_sections = (
    ("   Baseline Model:", 'baseline'),
    ("   RAG Model:", 'rag'),
)
for heading, model_key in model_sections:
    print(heading)
    for metric_name, score in comparison_metrics[model_key].items():
        print(f"     {metric_name.capitalize()}: {score:.3f}")

# Static sections: (header, bullet lines) printed in order
narrative_sections = (
    ("\n4. AI AGENT:", (
        "   - Successfully integrated with RAG system",
        "   - Provides comprehensive analysis with risk scores",
        "   - Supports both single claim and batch analysis",
        "   - Includes regulatory compliance assessment",
    )),
    ("\n5. KEY INSIGHTS:", (
        "   - RAG system provides interpretable results with regulatory context",
        "   - AI agent combines multiple analysis approaches",
        "   - System can identify specific compliance issues",
        "   - Provides actionable recommendations",
    )),
    ("\n6. NEXT STEPS:", (
        "   - RAG system ready for deployment",
        "   - AI agent can be integrated into Streamlit app",
        "   - Consider expanding ESG corpora with more recent regulations",
        "   - Proceed to business plan generation",
    )),
)
for header, bullet_lines in narrative_sections:
    print(header)
    for bullet_line in bullet_lines:
        print(bullet_line)