# SREnity - Enterprise SRE Agent Prototype

This notebook contains the development and testing of the SREnity agentic RAG system for production incident resolution.


In [1]:
# Install required packages
%pip install openai langchain langchain-community qdrant-client python-dotenv pandas numpy requests beautifulsoup4 ragas rank-bm25 tavily-python cohere langsmith markdownify rapidfuzz



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports and Setup
import os
import sys
import logging
from pathlib import Path

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Set up minimal logging
logging.basicConfig(level=logging.WARNING)

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Configuration
from src.utils.config import get_config
config = get_config()


## Data Loading - GitLab Runbooks

**Data Source:** Production runbooks from https://runbooks.gitlab.com/ - comprehensive SRE documentation covering infrastructure, databases, CI/CD pipelines, monitoring, and incident response procedures. These are real-world operational guides used by GitLab's SRE team.

**Multi-Service Foundation:** 696 enterprise runbooks covering Redis, PostgreSQL, Elasticsearch, CI/CD, monitoring, and more. This notebook focuses on **Redis service** (145 docs) to demonstrate the RAG pipeline, but the architecture supports filtering by any service combination for real-world multi-system incidents.

**Smart loading:** Checks for existing `data/runbooks/gitlab_runbooks.json` file and loads/downloads accordingly.


In [3]:
# Smart GitLab Runbook Loading with Service Filtering
from src.utils.document_loader import download_gitlab_runbooks, save_documents, load_saved_documents
from pathlib import Path

def filter_by_service(documents, services=['redis']):
    """Filter documents by service type"""
    filtered = []
    for doc in documents:
        source = doc.metadata.get('source', '').lower()
        if any(service in source for service in services):
            filtered.append(doc)
    return filtered

# Check if runbooks file exists
runbooks_file = Path("../data/runbooks/gitlab_runbooks.json")

if runbooks_file.exists():
    print("Loading saved runbooks...")
    documents = load_saved_documents()
    print(f"Loaded {len(documents)} total documents")
else:
    print("Downloading fresh runbooks...")
    documents = download_gitlab_runbooks()
    print(f"Downloaded {len(documents)} documents")
    
    print("Saving documents...")
    filepath = save_documents(documents)
    print(f"Saved to {filepath}")

# Filter to Redis services only
documents = filter_by_service(documents, ['redis'])
print(f"Filtered to {len(documents)} Redis documents")


Loading saved runbooks...
Loaded 696 total documents
Filtered to 33 Redis documents


## RAG Pipeline - Document Processing

This section implements the core RAG pipeline:
1. **Document Chunking** - Split Redis documents into manageable chunks
2. **Vector Database Setup** - Create embeddings and store in Qdrant

In [4]:
# Document Preprocessing and Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
from src.utils.document_loader import preprocess_html_documents

def chunk_documents_with_tiktoken(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents using tiktoken for accurate token counting"""
    
    # Get tiktoken encoding for the configured model
    encoding = tiktoken.encoding_for_model(config.openai_model)
    
    # Create text splitter with tiktoken length function
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=lambda text: len(encoding.encode(text)),
        separators=["\n\n", "\n", " ", ""]
    )
    
    # Split documents
    chunks = text_splitter.split_documents(documents)
    
    # Calculate statistics
    total_tokens = sum(len(encoding.encode(chunk.page_content)) for chunk in chunks)
    avg_tokens = total_tokens / len(chunks) if chunks else 0
    
    print(f"Created {len(chunks)} chunks ({total_tokens:,} tokens, avg {avg_tokens:.0f} tokens/chunk)")
    
    return chunks

# Preprocess HTML documents to markdown
print("Preprocessing HTML documents to markdown...")
processed_documents = preprocess_html_documents(documents)

# Chunk the preprocessed documents
print("Chunking preprocessed documents...")
chunks = chunk_documents_with_tiktoken(processed_documents, chunk_size=1000, chunk_overlap=200)


Preprocessing HTML documents to markdown...
HTML to Markdown conversion results:
  Original: 290,437 - 575,312 chars
  Markdown: 52,226 - 96,814 chars
  Reduction: 81.5%
Chunking preprocessed documents...
Created 685 chunks (631,830 tokens, avg 922 tokens/chunk)


In [5]:
# Qdrant Vector Database Setup
from src.utils.config import get_model_factory
from langchain_community.vectorstores import Qdrant
from pathlib import Path

def create_embeddings_and_store(chunks):
    """Create embeddings and store in Qdrant"""
    
    # Get model factory and create embeddings
    model_factory = get_model_factory()
    embeddings = model_factory.get_embeddings()

    # Log the Qdrant URL configuration
    print(f"Creating vector store at: {config.qdrant_url}")
    
    # Create vector store with local file storage
    vector_store = Qdrant.from_documents(
        documents=chunks,
        embedding=embeddings,
        path=config.qdrant_url
    )

    print(f"Stored {len(chunks)} chunks in Qdrant at {config.qdrant_url}")
    return vector_store

def load_existing_vector_store():
    """Load existing Qdrant vector store"""
    model_factory = get_model_factory()
    embeddings = model_factory.get_embeddings()
    
    # Load existing vector store from file path
    vector_store = Qdrant.from_existing_collection(
        embedding=embeddings,
        path=config.qdrant_url,
        collection_name=config.qdrant_collection_name
    )
    
    print(f"Loaded existing vector store from {config.qdrant_url}")
    return vector_store

# Check if vector database exists, otherwise create it
qdrant_path = Path(config.qdrant_url)

if qdrant_path.exists():
    print("Vector database exists. Loading...")
    vector_store = load_existing_vector_store()
else:
    print("Vector database not found. Creating new one...")
    vector_store = create_embeddings_and_store(chunks)


Vector database not found. Creating new one...
Creating vector store at: ../qdrant_db
Stored 685 chunks in Qdrant at ../qdrant_db


## Synthetic Data Generation (SDG)

This section creates test data for runbook helper evaluation:
1. **Question Generation** - Create how-to questions from Redis runbook chunks
2. **Answer Generation** - Generate expected answers from runbook content
3. **Test Dataset** - Create evaluation dataset with ground truth


In [8]:
# Synthetic Data Generation using RAGAS
from ragas.testset.synthesizers.generate import TestsetGenerator
from src.utils.config import get_model_factory
from langchain_core.documents import Document
import pandas as pd



def generate_test_dataset(documents, num_questions=10):
    """Generate synthetic test data using RAGAS"""
    
    print(f"Generating {num_questions} test questions from {len(documents)} documents...")
    
    # Use preprocessed documents (already converted to markdown)
    print(f"Using preprocessed markdown documents for SDG...")
    
    # Get model factory for LLM and embeddings
    model_factory = get_model_factory()
    
    # Generate test dataset using TestsetGenerator
    generator = TestsetGenerator.from_langchain(
        llm=model_factory.get_llm(),
        embedding_model=model_factory.get_embeddings()
    )
    
    test_data = generator.generate_with_langchain_docs(
        documents=documents,  # Use preprocessed documents
        testset_size=num_questions
    )
    
    print(f"Generated {len(test_data)} test samples")
    return test_data

# Generate test dataset (start with smaller size to test)
print("Creating synthetic test data using RAGAS...")
test_dataset = generate_test_dataset(processed_documents, num_questions=8)

Creating synthetic test data using RAGAS...
Generating 8 test questions from 33 documents...
Using preprocessed markdown documents for SDG...


Applying CustomNodeFilter:  91%|█████████ | 193/212 [02:48<00:19,  1.01s/it]ERROR:ragas.testset.transforms.engine:unable to apply transformation: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-7hMVLGqXydr4X3d1KFkKl6Re on tokens per min (TPM): Limit 200000, Used 200000, Requested 3844. Please try again in 1.153s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Generating personas: 100%|██████████| 3/3 [00:01<00:00,  2.44it/s]                                             
Generating Scenarios: 100%|██████████| 3/3 [00:07<00:00,  2.34s/it]
Generating Samples: 100%|██████████| 9/9 [00:04<00:00,  2.22it/s]


Generated 9 test samples


In [10]:

# Display cached test questions
df = pd.DataFrame(test_dataset)
print(f"Loaded {len(df)} test questions")
display(df)


Loaded 9 test questions


Unnamed: 0,0,1
0,"(eval_sample, user_input='What is WAL-G in the...","(synthesizer_name, single_hop_specifc_query_sy..."
1,"(eval_sample, user_input='What is the signific...","(synthesizer_name, single_hop_specifc_query_sy..."
2,"(eval_sample, user_input='How chef help in set...","(synthesizer_name, single_hop_specifc_query_sy..."
3,"(eval_sample, user_input='What is the signific...","(synthesizer_name, multi_hop_abstract_query_sy..."
4,"(eval_sample, user_input='What are the key asp...","(synthesizer_name, multi_hop_abstract_query_sy..."
5,"(eval_sample, user_input='What are the key asp...","(synthesizer_name, multi_hop_abstract_query_sy..."
6,"(eval_sample, user_input='How do you delete ca...","(synthesizer_name, multi_hop_specific_query_sy..."
7,"(eval_sample, user_input='How does the RDB for...","(synthesizer_name, multi_hop_specific_query_sy..."
8,"(eval_sample, user_input=""How does the use of ...","(synthesizer_name, multi_hop_specific_query_sy..."


In [11]:
# Updated Cache SDG Questions with correct RAGAS field mapping
import json
from pathlib import Path

def save_test_dataset_fixed(test_dataset, filename="redis_test_questions_ragas.json"):
    """Save test dataset to file for reuse with correct RAGAS field mapping"""
    data_dir = Path("../data/sdg")
    data_dir.mkdir(parents=True, exist_ok=True)
    
    # Convert test dataset to serializable format
    test_data = []
    samples = list(test_dataset)  # Convert EvaluationDataset to list
    
    for sample in samples:
        if hasattr(sample, 'eval_sample'):
            eval_sample = sample.eval_sample
            if hasattr(eval_sample, 'model_dump'):
                sample_dict = eval_sample.model_dump()
            else:
                sample_dict = eval_sample.__dict__
        else:
            # Fallback for direct access
            if hasattr(sample, 'model_dump'):
                sample_dict = sample.model_dump()
            else:
                sample_dict = sample.__dict__
        
        test_data.append({
            'question': sample_dict.get('user_input', 'N/A'),
            'answer': sample_dict.get('reference', 'N/A'),
            'contexts': sample_dict.get('reference_contexts', ['N/A']),  # Keep as list for RAGAS
            'ground_truths': [sample_dict.get('reference', 'Generated by RAGAS')]  # List for RAGAS
        })
    
    filepath = data_dir / filename
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, indent=2, ensure_ascii=False)
    
    print(f"Saved {len(test_data)} test questions to {filepath}")
    return filepath

# Save the generated test dataset with correct field mapping
if 'test_dataset' in locals():
    cache_file = save_test_dataset_fixed(test_dataset)
    print(f"Test dataset cached with correct field mapping")
else:
    print("No test_dataset found to cache")


Saved 9 test questions to ../data/sdg/redis_test_questions_ragas.json
Test dataset cached with correct field mapping


## RAG Pipeline - Retrieval and Testing

This section implements the core RAG functionality:
1. **Naive Retrieval** - Basic semantic search using vector similarity
2. **Incident Testing** - Test the system with sample Redis incidents
3. **Response Generation** - Generate runbook recommendations


In [12]:
# Redis Runbook Assistant - Retrieval System
from src.utils.config import get_model_factory

def naive_retrieval(query, vector_store, k=5):
    """Basic semantic retrieval using vector similarity"""
    docs = vector_store.similarity_search(query, k=k)
    return docs

def format_retrieved_docs(docs):
    """Format retrieved documents for display"""
    formatted = []
    for i, doc in enumerate(docs, 1):
        source = doc.metadata.get('source', 'Unknown')
        title = doc.metadata.get('title', 'No title')
        content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
        
        formatted.append(f"""
**Document {i}:**
- **Source:** {source}
- **Title:** {title}
- **Content:** {content_preview}
""")
    return "\n".join(formatted)

# Test retrieval with runbook-style questions
test_query = "How to monitor Redis memory usage?"
print(f"Query: {test_query}")
print("=" * 50)

retrieved_docs = naive_retrieval(test_query, vector_store, k=3)
formatted_results = format_retrieved_docs(retrieved_docs)
print(formatted_results)


Query: How to monitor Redis memory usage?

**Document 1:**
- **Source:** https://runbooks.gitlab.com/redis-cluster-sessions/
- **Title:** Redis Cluster Sessions Service | Runbooks
- **Content:** ## Monitoring/Alerting
[Section titled “Monitoring/Alerting”](#monitoringalerting)
Generally the same as other Redis clusters, but with special handling for monitoring maximum memory (as a proportion of the configured limit, not the system limit), and alerting if redis\_evicted\_keys\_total raises a...


**Document 2:**
- **Source:** https://runbooks.gitlab.com/redis/redis/
- **Title:** Troubleshooting | Runbooks
- **Content:** ### Redis latency monitoring framework
[Section titled “Redis latency monitoring framework”](#redis-latency-monitoring-framework)
Redis provides a latency diagnostic tool: <https://redis.io/topics/latency-monitor>
You may need to enable it with `CONFIG SET latency-monitor-threshold 100`.
From <https...


**Document 3:**
- **Source:** https://runbooks.gitlab.com/redis/red

In [None]:
# Redis Runbook Assistant - Question Answering
def answer_runbook_question(question, vector_store, model_factory, custom_docs=None):
    """Answer how-to questions using Redis runbooks"""
    
    # Retrieve relevant documents (use custom docs if provided, otherwise use vector store)
    if custom_docs is not None:
        retrieved_docs = custom_docs
    else:
        retrieved_docs = naive_retrieval(question, vector_store, k=5)
    
    # Prepare context from retrieved documents
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])
    
    # Create prompt for LLM
    prompt = f"""
You are a Redis expert helping SREs find the right procedures. Based on the question and relevant runbook documentation, provide a clear, step-by-step answer.

**Question:**
{question}

**Relevant Runbook Documentation:**
{context}

**Please provide:**
1. **Direct Answer** - Clear response to the question
2. **Step-by-Step Instructions** - Detailed procedure from the runbooks
3. **Key Commands** - Specific commands or configurations needed
4. **Important Notes** - Warnings, prerequisites, or additional context

Format your response clearly with headers and numbered steps.
"""

    # Generate response using LLM
    llm = model_factory.get_llm()
    response = llm.invoke(prompt)
    
    return response.content, retrieved_docs

# Test with runbook-style questions
sample_questions = [
    "How to backup Redis data using RDB snapshots?",
    "How to configure Redis connection pools for high traffic?",
    "How to troubleshoot Redis replication lag issues?"
]

print("Testing Redis Runbook Assistant")
print("=" * 60)

for i, question in enumerate(sample_questions, 1):
    print(f"\n**Question {i}: {question}**")
    print("-" * 50)
    
    response, docs = answer_runbook_question(question, vector_store, get_model_factory())
    print(response)
    print(f"\n**Sources Used:** {len(docs)} documents")
    print("=" * 60)


Testing Redis Runbook Assistant

**Question 1: How to backup Redis data using RDB snapshots?**
--------------------------------------------------
# Backup Redis Data Using RDB Snapshots

## Direct Answer
To backup Redis data using RDB snapshots, you will need to ensure that the Redis instance is configured to create RDB snapshots, and then you can manually copy the `dump.rdb` file from the Redis container to a safe location.

## Step-by-Step Instructions

### Step 1: Verify RDB Snapshot Configuration
1. Ensure that your Redis instance is configured to create RDB snapshots. This is typically managed by the `gitlab-redis-backup` cookbook, which runs a cron-like job for snapshotting.
2. Confirm that the Redis instance is not configured to run RDB snapshots on primary nodes unless necessary.

### Step 2: Identify the Redis Container
1. List the Redis pods to find the one you want to back up:
   ```bash
   kubectl get pods -n redis
   ```
2. Identify the pod name of the Redis instance you w

## RAGAS Evaluation Framework

This section implements comprehensive evaluation of our RAG pipeline using RAGAS metrics:

1. **Evaluation Dataset Creation** - Run RAG pipeline on test questions
2. **RAGAS Metrics** - Measure faithfulness, relevancy, precision, recall
3. **Performance Analysis** - Compare naive vs advanced retrieval
4. **Results Summary** - Certification-ready metrics table


In [16]:
# RAGAS Evaluation Dataset Creation
import json
from pathlib import Path
import pandas as pd
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset

def load_cached_test_questions(filename="redis_test_questions_ragas.json"):
    """Load cached test questions for evaluation"""
    filepath = Path("../data/sdg") / filename
    if not filepath.exists():
        print(f"Cache file not found: {filepath}")
        return None
    
    with open(filepath, 'r', encoding='utf-8') as f:
        test_data = json.load(f)
    
    # Convert old format to RAGAS format if needed
    if test_data and 'context' in test_data[0]:  # Old format
        print("Converting old format to RAGAS format...")
        for item in test_data:
            if 'context' in item:
                item['contexts'] = [item.pop('context')]
            if 'ground_truth' in item:
                item['ground_truths'] = [item.pop('ground_truth')]
    
    print(f"Loaded {len(test_data)} cached test questions from {filepath}")
    return test_data

def create_evaluation_dataset(test_questions, vector_store, model_factory):
    """Create evaluation dataset by running RAG pipeline on test questions"""
    evaluation_data = []
    
    print(f"Creating evaluation dataset from {len(test_questions)} test questions...")
    
    for i, test_item in enumerate(test_questions, 1):
        print(f"Processing question {i}/{len(test_questions)}: {test_item['question'][:50]}...")
        
        # Run RAG pipeline
        question = test_item['question']
        ground_truths = test_item['ground_truths']
        
        # Get answer and contexts from our RAG pipeline
        answer, retrieved_docs = answer_runbook_question(question, vector_store, model_factory)
        contexts = [doc.page_content for doc in retrieved_docs]
        
        evaluation_data.append({
            'question': question,
            'answer': answer,
            'contexts': contexts,
            'ground_truths': ground_truths,
            'reference': ground_truths[0] if ground_truths else ''  # RAGAS expects 'reference' column
        })
    
    return evaluation_data

# Load cached test questions
print("Loading cached test questions...")
cached_questions = load_cached_test_questions()

if cached_questions:
    print(f"Found {len(cached_questions)} test questions")
    # Display sample questions
    print("\nSample test questions:")
    for i, q in enumerate(cached_questions[:3], 1):
        print(f"{i}. {q['question']}")
else:
    print("No cached questions found. Please run SDG first.")


Loading cached test questions...
Loaded 9 cached test questions from ../data/sdg/redis_test_questions_ragas.json
Found 9 test questions

Sample test questions:
1. What is WAL-G in the context of PostgreSQL backups?
2. What is the significance of RAILS_INSTANCE_NAME in the context of configuring GitLab Rails?
3. How chef help in setting up instances for Redis?


In [17]:
# Run RAGAS Evaluation
if cached_questions and 'vector_store' in locals() and 'get_model_factory' in locals():
    print("Creating evaluation dataset...")
    evaluation_data = create_evaluation_dataset(cached_questions, vector_store, get_model_factory())
    
    # Convert to HuggingFace Dataset format for RAGAS
    print("Converting to RAGAS evaluation format...")
    eval_dataset = Dataset.from_list(evaluation_data)
    
    print(f"Evaluation dataset created with {len(eval_dataset)} samples")
    print(f"Sample structure: {eval_dataset[0].keys()}")
    
    # Run RAGAS evaluation
    print("\nRunning RAGAS evaluation metrics...")
    print("This may take a few minutes...")
    
    try:
        # Define metrics
        metrics = [
            faithfulness,           # Measures factual consistency of generated answer
            answer_relevancy,      # Measures how relevant the answer is to the question  
            context_precision,     # Measures precision of retrieved contexts
            context_recall         # Measures recall of retrieved contexts
        ]
        
        # Run evaluation
        result = evaluate(
            eval_dataset,
            metrics=metrics,
            llm=get_model_factory().get_llm(),
            embeddings=get_model_factory().get_embeddings()
        )
        
        print("✅ RAGAS evaluation completed successfully!")
        
        # Convert results to DataFrame for better display
        results_df = result.to_pandas()
        
        # Calculate summary statistics
        summary_stats = {
            'Metric': ['Faithfulness', 'Answer Relevancy', 'Context Precision', 'Context Recall'],
            'Mean Score': [
                results_df['faithfulness'].mean(),
                results_df['answer_relevancy'].mean(), 
                results_df['context_precision'].mean(),
                results_df['context_recall'].mean()
            ],
            'Std Dev': [
                results_df['faithfulness'].std(),
                results_df['answer_relevancy'].std(),
                results_df['context_precision'].std(), 
                results_df['context_recall'].std()
            ],
            'Min Score': [
                results_df['faithfulness'].min(),
                results_df['answer_relevancy'].min(),
                results_df['context_precision'].min(),
                results_df['context_recall'].min()
            ],
            'Max Score': [
                results_df['faithfulness'].max(),
                results_df['answer_relevancy'].max(),
                results_df['context_precision'].max(),
                results_df['context_recall'].max()
            ]
        }
        
        summary_df = pd.DataFrame(summary_stats)
        
        print("\n" + "="*80)
        print("RAGAS EVALUATION RESULTS - REDIS RUNBOOK ASSISTANT")
        print("="*80)
        
        # Display summary table
        print("\n📊 SUMMARY METRICS TABLE:")
        print(summary_df.round(3).to_string(index=False))
        
        # Display detailed results
        print(f"\n📋 DETAILED RESULTS ({len(results_df)} samples):")
        print(results_df.round(3).to_string(index=False))
        
        # Performance interpretation
        print("\n📈 PERFORMANCE INTERPRETATION:")
        print("-" * 50)
        
        for metric in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']:
            mean_score = results_df[metric].mean()
            if mean_score >= 0.8:
                performance = "🟢 Excellent"
            elif mean_score >= 0.6:
                performance = "🟡 Good" 
            elif mean_score >= 0.4:
                performance = "🟠 Fair"
            else:
                performance = "🔴 Needs Improvement"
                
            print(f"{metric.replace('_', ' ').title()}: {mean_score:.3f} - {performance}")
        
        print(f"\n💾 Full results saved to: results_df")
        print(f"💾 Summary statistics saved to: summary_df")
        
    except Exception as e:
        print(f"❌ Error during RAGAS evaluation: {str(e)}")
        print("This might be due to API rate limits or configuration issues.")
        
else:
    print("❌ Cannot run evaluation - missing required components:")
    if not cached_questions:
        print("  - No cached test questions found")
    if 'vector_store' not in locals():
        print("  - Vector store not available")
    if 'get_model_factory' not in locals():
        print("  - Model factory not available")


Creating evaluation dataset...
Creating evaluation dataset from 9 test questions...
Processing question 1/9: What is WAL-G in the context of PostgreSQL backups...
Processing question 2/9: What is the significance of RAILS_INSTANCE_NAME in...
Processing question 3/9: How chef help in setting up instances for Redis?...
Processing question 4/9: What is the significance of database connection in...
Processing question 5/9: What are the key aspects of Patroni cluster manage...
Processing question 6/9: What are the key aspects of Patroni Cluster Manage...
Processing question 7/9: How do you delete cache keys on GitLab.com using R...
Processing question 8/9: How does the RDB format affect data storage in Red...
Processing question 9/9: How does the use of Sidekiq in GitLab's architectu...
Converting to RAGAS evaluation format...
Evaluation dataset created with 9 samples
Sample structure: dict_keys(['question', 'answer', 'contexts', 'ground_truths', 'reference'])

Running RAGAS evaluation metr

Evaluating:  97%|█████████▋| 35/36 [01:37<00:10, 10.66s/it]ERROR:ragas.executor:Exception raised in Job[8]: LLMDidNotFinishException(The LLM generation was not completed. Please increase try increasing the max_tokens and try again.)
Evaluating: 100%|██████████| 36/36 [02:10<00:00,  3.61s/it]

✅ RAGAS evaluation completed successfully!

RAGAS EVALUATION RESULTS - REDIS RUNBOOK ASSISTANT

📊 SUMMARY METRICS TABLE:
           Metric  Mean Score  Std Dev  Min Score  Max Score
     Faithfulness       0.700    0.360      0.019      1.000
 Answer Relevancy       0.912    0.080      0.769      0.993
Context Precision       0.590    0.461      0.000      1.000
   Context Recall       0.841    0.248      0.400      1.000

📋 DETAILED RESULTS (9 samples):
                                                                                                                                                user_input                                                                                                                                                                                                                                                                                                                                                                                                   




## Advanced Retrieval Comparison

This section implements and compares advanced retrieval methods:

1. **BM25 Retrieval** - Keyword-based retrieval using BM25 algorithm
2. **Performance Comparison** - Compare naive vs BM25 retrieval using RAGAS metrics
3. **Results Analysis** - Identify which method performs better for Redis runbook queries


In [None]:
# Advanced Retrieval: BM25 Implementation
from rank_bm25 import BM25Okapi
import re
from typing import List

def preprocess_text_for_bm25(text: str) -> List[str]:
    """Preprocess text for BM25 tokenization"""
    # Convert to lowercase and split into words
    text = text.lower()
    # Remove special characters and split
    words = re.findall(r'\b\w+\b', text)
    return words

def create_bm25_index(documents: List[str]):
    """Create BM25 index from documents"""
    # Tokenize documents
    tokenized_docs = [preprocess_text_for_bm25(doc) for doc in documents]
    
    # Create BM25 index
    bm25 = BM25Okapi(tokenized_docs)
    return bm25, tokenized_docs

def bm25_retrieval(query: str, bm25_index, tokenized_docs, documents, k=5):
    """Retrieve documents using BM25"""
    # Tokenize query
    query_tokens = preprocess_text_for_bm25(query)
    
    # Get BM25 scores
    scores = bm25_index.get_scores(query_tokens)
    
    # Get top k documents
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    
    # Return documents in order of relevance
    retrieved_docs = []
    for idx in top_indices:
        retrieved_docs.append(documents[idx])
    
    return retrieved_docs, top_indices

def answer_runbook_question_bm25(question, bm25_index, tokenized_docs, documents, model_factory, k=5):
    """Answer questions using BM25 retrieval - same prompt as naive retrieval"""
    
    # Retrieve relevant documents using BM25
    retrieved_docs, indices = bm25_retrieval(question, bm25_index, tokenized_docs, documents, k=k)
    
    # Create Document objects to match naive retrieval format
    from langchain_core.documents import Document
    bm25_docs = []
    for i, doc_content in enumerate(retrieved_docs):
        # Create minimal metadata to match format
        doc = Document(
            page_content=doc_content,
            metadata={"source": f"bm25_retrieved_{i}", "title": f"BM25 Result {i+1}"}
        )
        bm25_docs.append(doc)
    
    # Use the EXACT same function as naive retrieval
    return answer_runbook_question(question, None, model_factory, custom_docs=bm25_docs)

# Create BM25 index from all document chunks
if 'chunks' in locals():
    print("Creating BM25 index from document chunks...")
    documents = [chunk.page_content for chunk in chunks]
    bm25_index, tokenized_docs = create_bm25_index(documents)
    print(f"BM25 index created with {len(documents)} documents")
    
    # Test BM25 retrieval
    test_query = "How to monitor Redis memory usage?"
    print(f"\nTesting BM25 retrieval with: '{test_query}'")
    
    retrieved_docs, indices = bm25_retrieval(test_query, bm25_index, tokenized_docs, documents, k=3)
    print(f"Retrieved {len(retrieved_docs)} documents using BM25")
    
    for i, doc in enumerate(retrieved_docs[:2], 1):
        preview = doc[:200] + "..." if len(doc) > 200 else doc
        print(f"\nBM25 Result {i}: {preview}")
        
else:
    print("❌ Cannot create BM25 index - chunks not available")


In [None]:
# Compare Naive vs BM25 Retrieval Performance
if (cached_questions and 'vector_store' in locals() and 'bm25_index' in locals() 
    and 'get_model_factory' in locals() and 'documents' in locals() and 'tokenized_docs' in locals()):
    
    print("🔄 COMPARING NAIVE vs BM25 RETRIEVAL PERFORMANCE")
    print("=" * 60)
    
    # Create evaluation datasets for both methods
    print("Creating evaluation dataset for Naive Retrieval...")
    naive_eval_data = create_evaluation_dataset(cached_questions, vector_store, get_model_factory())
    
    print("Creating evaluation dataset for BM25 Retrieval...")
    bm25_eval_data = []
    for i, test_item in enumerate(cached_questions, 1):
        print(f"Processing BM25 question {i}/{len(cached_questions)}: {test_item['question'][:50]}...")
        
        question = test_item['question']
        ground_truths = test_item['ground_truths']
        
        # Get answer and contexts from BM25 RAG pipeline
        answer, retrieved_docs = answer_runbook_question_bm25(
            question, bm25_index, tokenized_docs, documents, get_model_factory(), k=5
        )
        contexts = [doc.page_content for doc in retrieved_docs]  # Extract content from Document objects
        
        bm25_eval_data.append({
            'question': question,
            'answer': answer,
            'contexts': contexts,
            'ground_truths': ground_truths,
            'reference': ground_truths[0] if ground_truths else ''  # RAGAS expects 'reference' column
        })
    
    # Convert to RAGAS format
    naive_dataset = Dataset.from_list(naive_eval_data)
    bm25_dataset = Dataset.from_list(bm25_eval_data)
    
    print(f"\nRunning RAGAS evaluation for both methods...")
    print("This may take several minutes...")
    
    try:
        # Define metrics
        metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
        
        # Evaluate Naive Retrieval
        print("\n📊 Evaluating Naive Retrieval...")
        naive_result = evaluate(naive_dataset, metrics=metrics, 
                               llm=get_model_factory().get_llm(), 
                               embeddings=get_model_factory().get_embeddings())
        
        # Evaluate BM25 Retrieval  
        print("\n📊 Evaluating BM25 Retrieval...")
        bm25_result = evaluate(bm25_dataset, metrics=metrics,
                              llm=get_model_factory().get_llm(),
                              embeddings=get_model_factory().get_embeddings())
        
        # Convert to DataFrames
        naive_df = naive_result.to_pandas()
        bm25_df = bm25_result.to_pandas()
        
        # Create comparison table
        comparison_data = {
            'Metric': ['Faithfulness', 'Answer Relevancy', 'Context Precision', 'Context Recall'],
            'Naive Retrieval': [
                naive_df['faithfulness'].mean(),
                naive_df['answer_relevancy'].mean(),
                naive_df['context_precision'].mean(),
                naive_df['context_recall'].mean()
            ],
            'BM25 Retrieval': [
                bm25_df['faithfulness'].mean(),
                bm25_df['answer_relevancy'].mean(), 
                bm25_df['context_precision'].mean(),
                bm25_df['context_recall'].mean()
            ]
        }
        
        comparison_df = pd.DataFrame(comparison_data)
        
        # Calculate differences
        comparison_df['Difference'] = comparison_df['BM25 Retrieval'] - comparison_df['Naive Retrieval']
        comparison_df['Better Method'] = comparison_df.apply(
            lambda row: 'BM25' if row['Difference'] > 0 else 'Naive', axis=1
        )
        
        # Round to 3 decimal places
        comparison_df = comparison_df.round(3)
        
        print("\n" + "="*80)
        print("RAGAS EVALUATION COMPARISON: NAIVE vs BM25 RETRIEVAL")
        print("="*80)
        
        print("\n📊 PERFORMANCE COMPARISON TABLE:")
        print(comparison_df.to_string(index=False))
        
        # Summary analysis
        print("\n📈 PERFORMANCE ANALYSIS:")
        print("-" * 50)
        
        naive_wins = (comparison_df['Difference'] < 0).sum()
        bm25_wins = (comparison_df['Difference'] > 0).sum()
        
        print(f"🏆 Naive Retrieval wins: {naive_wins}/4 metrics")
        print(f"🏆 BM25 Retrieval wins: {bm25_wins}/4 metrics")
        
        if bm25_wins > naive_wins:
            print("\n🎯 CONCLUSION: BM25 Retrieval performs better overall")
        elif naive_wins > bm25_wins:
            print("\n🎯 CONCLUSION: Naive Retrieval performs better overall") 
        else:
            print("\n🎯 CONCLUSION: Both methods perform similarly")
            
        # Detailed metric analysis
        print("\n📋 METRIC-BY-METRIC ANALYSIS:")
        for _, row in comparison_df.iterrows():
            metric = row['Metric']
            naive_score = row['Naive Retrieval']
            bm25_score = row['BM25 Retrieval']
            diff = row['Difference']
            
            if abs(diff) < 0.05:
                result = "≈ Similar performance"
            elif diff > 0:
                result = f"📈 BM25 better by {diff:.3f}"
            else:
                result = f"📉 Naive better by {abs(diff):.3f}"
                
            print(f"  {metric}: {result}")
        
        print(f"\n💾 Results saved to:")
        print(f"  - naive_df: Naive retrieval detailed results")
        print(f"  - bm25_df: BM25 retrieval detailed results") 
        print(f"  - comparison_df: Performance comparison table")
        
    except Exception as e:
        print(f"❌ Error during comparison evaluation: {str(e)}")
        print("This might be due to API rate limits or configuration issues.")
        
else:
    print("❌ Cannot run comparison - missing required components:")
    missing = []
    if not cached_questions:
        missing.append("cached test questions")
    if 'vector_store' not in locals():
        missing.append("vector store")
    if 'bm25_index' not in locals():
        missing.append("BM25 index")
    if 'get_model_factory' not in locals():
        missing.append("model factory")
    if 'documents' not in locals():
        missing.append("document chunks")
        
    print(f"  Missing: {', '.join(missing)}")
    print("Please ensure all previous cells have been executed successfully.")
