# Cheat Sheet - Baseline

## Configuration

In [1]:
# Import required libraries for RAG pipeline
import os
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Union

import pandas as pd

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import InMemoryVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import BaseRetriever, Document
from langchain.chains import RetrievalQA
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate

from utils.openai_tools import OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [2]:
# Experiment configuration: every value a reader might tune lives in this cell.

# Label for the retrieval methodology; it namespaces outputs and embedding caches.
METHODOLOGY_ID = "semantic"

# Chat models to evaluate and how many times each question is asked per model.
LLM_MODELS = ["gpt-4o"]
N_RUNS = 3

# Input locations.
DOCX_FOLDER_PATH = "./model_docs"
QUESTIONS_FILE_PATH = "./example_questions.txt"

# Prompt template; {context} and {question} are substituted at query time.
CUSTOM_RAG_PROMPT = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""

# Per-methodology output and cache folders, created up front.
OUTPUT_DIR = "./outputs/" + METHODOLOGY_ID
EMBEDDING_CACHE_DIR = "./cache/" + METHODOLOGY_ID

for _required_dir in (OUTPUT_DIR, EMBEDDING_CACHE_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Echo the effective configuration so it is recorded with the notebook output.
print(
    f"Methodology: {METHODOLOGY_ID}",
    f"LLM Models: {LLM_MODELS}",
    f"Number of runs per question: {N_RUNS}",
    f"DOCX folder: {DOCX_FOLDER_PATH}",
    f"Questions file: {QUESTIONS_FILE_PATH}",
    f"Custom prompt configured: {len(CUSTOM_RAG_PROMPT)} characters",
    f"Output directory: {OUTPUT_DIR}",
    sep="\n",
)

Methodology: semantic
LLM Models: ['gpt-4o']
Number of runs per question: 3
DOCX folder: ./model_docs
Questions file: ./example_questions.txt
Custom prompt configured: 203 characters
Output directory: ./outputs/semantic


## Document Processing Functions

In [None]:
# Helper function for formatting documents in LCEL chains
def format_docs(docs):
    """Concatenate retrieved chunks into a single context string.

    The ``page_content`` of each document is taken in order and the pieces
    are separated by a blank line (two newline characters).
    """
    chunk_texts = [chunk.page_content for chunk in docs]
    separator = "\n\n"
    return separator.join(chunk_texts)

# Token counting function (placeholder - implement with your preferred tokenizer)
def get_token_count(text):
    """Estimate the number of tokens in ``text``.

    This is a placeholder heuristic, not a real tokenizer: it assumes roughly
    1.3 tokens per whitespace-separated word and rounds to the nearest whole
    token so that the result is an actual count rather than a fraction.

    For exact counts with OpenAI models, replace the body with a tokenizer,
    for example:
        import tiktoken
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))

    Args:
        text: The string to measure. ``None`` and the empty string count as 0.

    Returns:
        int: Approximate token count (always >= 0).
    """
    if not text:
        return 0

    # Rough approximation: 1.3 tokens per word, rounded to a whole token.
    return round(len(text.split()) * 1.3)

In [4]:
# Load DOCX documents from folder
def load_docx_documents(folder_path):
    """Load every ``.docx`` file found directly inside ``folder_path``.

    Files are processed in sorted order so that the position of a document in
    the returned list is stable from run to run (``Path.glob`` makes no
    ordering guarantee). Word owner/lock files, whose names start with ``~$``,
    are skipped because they are not real documents.

    Each loaded document has its metadata updated with ``source`` (file name),
    ``file_path`` and ``methodology`` (the notebook's ``METHODOLOGY_ID``).

    Args:
        folder_path: Directory to scan (non-recursive).

    Returns:
        list: The loaded documents; empty when the folder has no usable files.

    Raises:
        Exception: Whatever the loader raises for a file is reported and
            re-raised unchanged, aborting the load.
    """
    folder = Path(folder_path)

    docx_files = sorted(
        path for path in folder.glob("*.docx")
        if not path.name.startswith("~$")  # temporary lock file created by Word
    )

    print(f"Found {len(docx_files)} DOCX files to process")

    documents = []
    for docx_file in docx_files:
        try:
            doc_content = Docx2txtLoader(str(docx_file)).load()
        except Exception as e:
            print(f"Error loading {docx_file.name}: {e}")
            raise  # bare raise keeps the original traceback intact

        for doc in doc_content:
            doc.metadata.update({
                "source": docx_file.name,
                "file_path": str(docx_file),
                "methodology": METHODOLOGY_ID
            })
            documents.append(doc)

    print(f"Successfully loaded {len(documents)} documents")
    return documents

## RAG Retriever Setup

In [None]:
# Vector-based retriever using OpenAI embeddings
class RAGRetriever(BaseRetriever):
    """Vector retriever over a single document.

    On construction the document is split into overlapping chunks, the chunks
    are embedded with OpenAI embeddings (persisted on disk under
    ``EMBEDDING_CACHE_DIR`` via ``CacheBackedEmbeddings``) and stored in an
    in-memory vector store. Retrieval results are additionally memoised per
    query string for the lifetime of the instance.
    """

    def __init__(self, document, methodology_id, doc_identifier,
                 k=5, chunk_size=1024, chunk_overlap=128):
        """Build the embeddings, splitter and vector store for ``document``.

        Args:
            document: The document to index.
            methodology_id: Label used in the embedding-cache namespace.
            doc_identifier: Identifier used in the embedding-cache namespace
                and in log messages.
            k: Maximum number of chunks returned per query.
            chunk_size: Maximum chunk length passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        super().__init__()
        self._document = document
        self._methodology_id = methodology_id
        self._doc_identifier = doc_identifier
        self._k = k

        # In-memory cache for retrieved documents, keyed by the query string
        self._cache = {}

        store = LocalFileStore(EMBEDDING_CACHE_DIR)
        base_embeddings = OpenAIEmbeddings()
        self._embeddings = CacheBackedEmbeddings.from_bytes_store(
            base_embeddings,
            store,
            namespace=f"embeddings_{methodology_id}_{doc_identifier}"
        )

        self._text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        self.setup_vectorstore()

    def setup_vectorstore(self):
        """Split the document into chunks and index them in the vector store."""
        print(f"  Processing document: {self._doc_identifier}")

        splits = self._text_splitter.split_documents([self._document])
        print(f"  Created {len(splits)} chunks from {self._doc_identifier}")

        self._vectorstore = InMemoryVectorStore.from_documents(splits, self._embeddings)

    def _get_relevant_documents(self, query, *, run_manager=None):
        """Return up to ``k`` chunks for ``query``, using the cache if possible.

        A shallow copy of the cached list is returned so that callers who
        modify the returned list cannot alter what later queries receive.
        """
        # Check cache first
        if query in self._cache:
            print(f"  Cache hit for query: {query[:50]}...")
            return list(self._cache[query])

        # If not in cache, perform retrieval
        print(f"  Cache miss - retrieving for query: {query[:50]}...")
        documents = self._vectorstore.similarity_search(query, k=self._k)

        # Store in cache
        self._cache[query] = documents

        return list(documents)

    def clear_cache(self):
        """Clear the retrieval cache."""
        self._cache.clear()
        print(f"  Retrieval cache cleared for {self._doc_identifier}")

    def get_cache_stats(self):
        """Return the number of cached queries and the cached query strings."""
        return {
            "cache_size": len(self._cache),
            "cached_queries": list(self._cache.keys())
        }

## RAG Retriever Testing

In [6]:
# Initialize a single retriever for testing
print("Initializing RAG Retriever for testing...")

# Load a single document for testing. The path is derived from the configured
# folder so this cell follows DOCX_FOLDER_PATH instead of a second hard-coded
# copy, and the file name is written down only once.
test_doc_name = "0001_ModelDoc.docx"
test_doc_path = os.path.join(DOCX_FOLDER_PATH, test_doc_name)
loader = Docx2txtLoader(test_doc_path)
test_document = loader.load()[0]

# Add metadata
test_document.metadata.update({
    "source": test_doc_name,
    "file_path": test_doc_path,
    "methodology": "test"
})

print(f"Loaded test document: {test_document.metadata['source']}")
print(f"Document preview: {test_document.page_content[:200]}...")

# Create test retriever; the "test" methodology keeps its embedding cache
# namespace separate from the real experiment runs.
test_retriever = RAGRetriever(
    document=test_document,
    methodology_id="test",
    doc_identifier=f"test_doc_{test_doc_name.replace('.', '_')}"
)

print("✅ Test retriever initialized successfully!")

Initializing RAG Retriever for testing...
Loaded test document: 0001_ModelDoc.docx
Document preview: Model Documentation: 0001 - AlexNet

Introduction

This model documentation corresponds to AlexNet, which is a machine learning and neural network-based model for computer vision tasks.

Background  
...
  Processing document: test_doc_0001_ModelDoc_docx
  Created 2 chunks from test_doc_0001_ModelDoc_docx
✅ Test retriever initialized successfully!


  _warn_about_sha1_encoder()


In [7]:
# Smoke-test the retriever with one hand-written query
test_query = "What is AlexNet and what makes it important?"

print(f"Testing query: '{test_query}'")
print("\nRetrieving relevant documents...")

# Retrievers follow the Runnable interface, so invoke() is the entry point
relevant_docs = test_retriever.invoke(test_query)
num_chunks = len(relevant_docs)

print(f"Found {num_chunks} relevant document chunks:")
print("\n" + "=" * 60)

# Show where each chunk came from and at most its first 300 characters
for chunk_no, chunk in enumerate(relevant_docs, start=1):
    text = chunk.page_content
    if len(text) > 300:
        preview = text[:300] + "..."
    else:
        preview = text

    print(f"\n--- Chunk {chunk_no} ---")
    print(f"Source: {chunk.metadata.get('source', 'unknown')}")
    print(f"Content ({len(text)} chars):")
    print(preview)
    print("-" * 40)

print("\n✅ Single query test completed successfully!")
print(f"Retrieved {num_chunks} chunks using invoke() method for query: '{test_query}'")

Testing query: 'What is AlexNet and what makes it important?'

Retrieving relevant documents...
Found 2 relevant document chunks:


--- Chunk 1 ---
Source: 0001_ModelDoc.docx
Content (933 chars):
Model Documentation: 0001 - AlexNet

Introduction

This model documentation corresponds to AlexNet, which is a machine learning and neural network-based model for computer vision tasks.

Background  

AlexNet is a groundbreaking convolutional neural network architecture that significantly advanced t...
----------------------------------------

--- Chunk 2 ---
Source: 0001_ModelDoc.docx
Content (906 chars):
Influence on Future Models  

AlexNet inspired the development of deeper and more complex CNN architectures such as VGG, ResNet, and Inception.

Practical Applications  

AlexNet’s success helped enable advances in autonomous driving, facial recognition, and medical imaging.

Comparison to Resnet

A...
----------------------------------------

✅ Single query test completed successfully!
Retr

In [8]:
# Test LCEL chain directly with a single lightweight model.
# The model name is defined once so the client and every log line stay in sync
# when a different model is tried.
test_model_name = "gpt-4o-mini"
print(f"Setting up LCEL chain with {test_model_name}...")

# Initialize the test LLM (temperature 0 for repeatable answers)
test_llm = ChatOpenAI(
    model=test_model_name,
    temperature=0.0
)

# Create prompt template from CUSTOM_RAG_PROMPT
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=CUSTOM_RAG_PROMPT
)

# Build LCEL chain: the incoming question is sent to the retriever (whose
# chunks are joined by format_docs) and passed through unchanged as "question".
qa_chain = (
    {
        "context": test_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt_template
    | test_llm
    | StrOutputParser()
)

print(f"✅ LCEL chain initialized with {test_model_name}")
print(f"Using CUSTOM_RAG_PROMPT ({len(CUSTOM_RAG_PROMPT)} characters)")

# Test retrieval only
print(f"\n1. Testing retrieval for: '{test_query}'")
print("="*60)
retrieved_docs = test_retriever.invoke(test_query)
print(f"Retrieved {len(retrieved_docs)} documents:")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"  {i}. {doc.page_content[:100]}...")

# Test retrieval + generation with LCEL
print(f"\n2. Testing LCEL chain (retrieval + generation) for: '{test_query}'")
print("="*60)
start_time = time.time()
answer = qa_chain.invoke(test_query)
response_time = time.time() - start_time

print("LCEL Chain Result:")
print("-" * 40)
print(f"Question: {test_query}")
print(f"Answer: {answer}")
print(f"Response time: {response_time:.2f}s")
print(f"Model: {test_model_name}")
print("-" * 40)

print("\n✅ LCEL chain test completed successfully!")

Setting up LCEL chain with GPT-4o-mini...
✅ LCEL chain initialized with GPT-4o-mini
Using CUSTOM_RAG_PROMPT (203 characters)

1. Testing retrieval for: 'What is AlexNet and what makes it important?'
Retrieved 2 documents:
  1. Model Documentation: 0001 - AlexNet

Introduction

This model documentation corresponds to AlexNet, ...
  2. Influence on Future Models  

AlexNet inspired the development of deeper and more complex CNN archit...

2. Testing LCEL chain (retrieval + generation) for: 'What is AlexNet and what makes it important?'
LCEL Chain Result:
----------------------------------------
Question: What is AlexNet and what makes it important?
Answer: AlexNet is a convolutional neural network architecture developed by Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton in 2012, which significantly advanced the field of computer vision. It is important because it won the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) with a top-5 error rate of 15.3%, demonstrating the ef

## RAG Pipeline

In [None]:
# Main RAG pipeline using LangChain Expression Language (LCEL)
class RAGPipeline:
    """Retrieval-augmented question answering over one document.

    Combines a retriever, a chat model and a prompt template. ``query``
    retrieves context once, generates an answer from exactly that context and
    returns a flat result dictionary with timing, source and token metadata.
    """

    def __init__(self, retriever, llm_model, document_info, custom_prompt=None):
        """Create the LLM client and the LCEL chains.

        Args:
            retriever: Retriever exposing ``invoke(question)``.
            llm_model: Chat model name passed to ``ChatOpenAI``.
            document_info: Dict with the keys ``"name"``, ``"source"`` and
                ``"id"`` describing the document being queried.
            custom_prompt: Prompt template with ``{context}`` and
                ``{question}`` placeholders; falls back to
                ``CUSTOM_RAG_PROMPT`` when omitted or empty.
        """
        self.retriever = retriever
        self.llm_model = llm_model
        self.document_info = document_info
        self.custom_prompt = custom_prompt or CUSTOM_RAG_PROMPT

        self.llm = ChatOpenAI(
            model=llm_model,
            temperature=0.0
        )

        print(f"  RAG Pipeline initialized with {llm_model}")
        print(f"  Using custom prompt: {len(self.custom_prompt)} characters")

        # Create prompt template
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template=self.custom_prompt
        )

        # Generation-only chain used by query(): it receives the already
        # retrieved context, so retrieval happens exactly once per question.
        self._answer_chain = prompt_template | self.llm | StrOutputParser()

        # Full retrieval + generation chain, kept as a public attribute for
        # callers that want to invoke the chain with just a question.
        self.qa_chain = (
            {
                "context": self.retriever | format_docs,
                "question": RunnablePassthrough(),
            }
            | prompt_template
            | self.llm
            | StrOutputParser()
        )

        print(f"  Using LangChain Expression Language (LCEL) chain")

    def _common_fields(self, question, start_time):
        """Return the result fields shared by successful and failed queries."""
        return {
            "question": question,
            "response_time": time.time() - start_time,
            "llm_model": self.llm_model,
            "methodology": METHODOLOGY_ID,
            "timestamp": datetime.now().isoformat(),
            "document_name": self.document_info["name"],
            "document_source": self.document_info["source"],
            "document_id": self.document_info["id"],
            "prompt_length": len(self.custom_prompt),
            "retriever_type": type(self.retriever).__name__,
        }

    def query(self, question):
        """Answer ``question`` and return a result dictionary.

        The documents reported under ``"sources"`` are the same ones whose
        text was placed in the prompt, because the context is retrieved once
        and handed directly to the generation chain.

        Any exception is caught and reported in the result (``"error"`` key,
        ``"answer"`` prefixed with ``"Error: "``, zero token counts) so that a
        single failing question does not abort a batch.

        Args:
            question: The question text.

        Returns:
            dict: Answer, sources, timing, document and token-count fields.
        """
        start_time = time.time()

        try:
            # Retrieve once; these documents feed both the prompt and the
            # sources list.
            docs = self.retriever.invoke(question)
            formatted_context = format_docs(docs)

            # Rebuild the prompt text only to estimate its token count.
            final_prompt = self.custom_prompt.format(
                context=formatted_context,
                question=question
            )
            prompt_tokens = get_token_count(final_prompt)

            answer = self._answer_chain.invoke(
                {"context": formatted_context, "question": question}
            )

            output_tokens = get_token_count(answer)

            # Calculate total tokens (standard billing calculation)
            total_tokens = prompt_tokens + output_tokens

            sources = []
            for doc in docs:
                sources.append({
                    # Preview limited to the first 200 characters
                    "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
                    "source": doc.metadata.get("source", "unknown"),
                    "metadata": doc.metadata
                })

            final_result = self._common_fields(question, start_time)
            final_result.update({
                "answer": answer,
                "sources": sources,
                "num_sources": len(sources),
                "prompt_tokens": prompt_tokens,
                "output_tokens": output_tokens,
                "total_tokens": total_tokens
            })
            return final_result

        except Exception as e:
            error_result = self._common_fields(question, start_time)
            error_result.update({
                "answer": f"Error: {str(e)}",
                "sources": [],
                "num_sources": 0,
                "error": str(e),
                "prompt_tokens": 0,
                "output_tokens": 0,
                "total_tokens": 0
            })
            return error_result

    def batch_query(self, questions):
        """Run ``query`` for each question in order and return the results."""
        results = []

        for i, question in enumerate(questions, 1):
            print(f"    Question {i}/{len(questions)}: {question[:50]}...")
            result = self.query(question)
            results.append(result)

        return results

## Multi-Run Execution Engine

In [10]:
# Run experiments across multiple documents and models
def _make_doc_identifier(doc_idx, doc_name):
    """Build an identifier from the document index and a sanitised file name."""
    # The identifier ends up in the embedding-cache namespace, which becomes
    # part of on-disk keys, so anything other than letters, digits, "_" and
    # "-" is replaced with "_".
    safe_name = re.sub(r"[^A-Za-z0-9_-]", "_", doc_name)
    return f"doc_{doc_idx}_{safe_name}"


def run_multiple_experiments(questions, documents, llm_models, n_runs):
    """Run every question against every document, model and repetition.

    For each document a dedicated ``RAGRetriever`` is built once; for each
    model a ``RAGPipeline`` is created on top of it and the full question list
    is asked ``n_runs`` times.

    Args:
        questions: List of question strings.
        documents: Documents to query, each with a ``metadata`` dict.
        llm_models: A model name or a list of model names.
        n_runs: Number of repetitions of the question list per model.

    Returns:
        list: One result dict per executed question, each annotated with
        ``run_number``, ``model_index`` and ``document_index`` (all 1-based).
    """
    if isinstance(llm_models, str):
        llm_models = [llm_models]

    all_results = []

    total_executions = len(questions) * len(documents) * len(llm_models) * n_runs
    print(f"Starting experiments:")
    print(f"  - {len(documents)} documents")
    print(f"  - {len(questions)} questions per document")
    print(f"  - {len(llm_models)} models")
    print(f"  - {n_runs} runs per question")
    print(f"  - Total executions: {total_executions}")

    current_execution = 0

    for doc_idx, document in enumerate(documents, 1):
        doc_name = document.metadata.get('source', f'doc_{doc_idx}')
        doc_identifier = _make_doc_identifier(doc_idx, doc_name)

        print(f"\n=== Document {doc_idx}/{len(documents)}: {doc_name} ===")

        document_info = {
            "name": doc_name,
            "source": document.metadata.get('source', 'unknown'),
            "id": doc_identifier
        }

        # One retriever per document, shared by all models and runs
        print(f"Setting up retriever for {doc_name}...")
        doc_retriever = RAGRetriever(document, METHODOLOGY_ID, doc_identifier)

        for model_idx, model in enumerate(llm_models, 1):
            print(f"\n--- Model {model_idx}/{len(llm_models)}: {model} ---")

            pipeline = RAGPipeline(doc_retriever, model, document_info, CUSTOM_RAG_PROMPT)

            for run_idx in range(1, n_runs + 1):
                print(f"\n  Run {run_idx}/{n_runs} on {doc_name}")

                run_results = pipeline.batch_query(questions)

                for result in run_results:
                    result["run_number"] = run_idx
                    result["model_index"] = model_idx
                    result["document_index"] = doc_idx
                    all_results.append(result)

                    # Report progress every 10 completed executions
                    current_execution += 1
                    if current_execution % 10 == 0:
                        print(f"    Progress: {current_execution}/{total_executions} ({100*current_execution/total_executions:.1f}%)")

    print(f"\nCompleted! Total results: {len(all_results)}")
    return all_results

In [None]:
# Save experiment results to CSV
def save_results_to_csv(results, output_dir):
    """Write experiment results to a timestamped CSV file and print a summary.

    The file is named ``{METHODOLOGY_ID}_results_{YYYYmmdd_HHMMSS}.csv``. The
    column set is fixed, so an empty ``results`` list still produces a valid
    header-only file instead of failing in the summary.

    Args:
        results: Result dicts as produced by ``run_multiple_experiments``.
            Optional keys (``prompt_length``, ``retriever_type``, ``error``
            and the token counts) fall back to defaults when missing.
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        str: Path of the written CSV file.
    """
    columns = [
        "methodology", "llm_model", "document_name", "document_source",
        "document_id", "run_number", "question", "answer", "response_time",
        "num_sources", "timestamp", "sources_summary", "prompt_length",
        "retriever_type", "error",
        # Token count columns
        "prompt_tokens", "output_tokens", "total_tokens",
    ]

    df_data = []
    for result in results:
        df_data.append({
            "methodology": result["methodology"],
            "llm_model": result["llm_model"],
            "document_name": result["document_name"],
            "document_source": result["document_source"],
            "document_id": result["document_id"],
            "run_number": result["run_number"],
            "question": result["question"],
            "answer": result["answer"],
            "response_time": result["response_time"],
            "num_sources": result["num_sources"],
            "timestamp": result["timestamp"],
            "sources_summary": " | ".join([s["source"] for s in result["sources"]]) if result["sources"] else "",
            "prompt_length": result.get("prompt_length", 0),
            "retriever_type": result.get("retriever_type", "unknown"),
            "error": result.get("error", ""),
            "prompt_tokens": result.get("prompt_tokens", 0),
            "output_tokens": result.get("output_tokens", 0),
            "total_tokens": result.get("total_tokens", 0)
        })

    # Explicit columns keep the frame well-formed even when there are no rows.
    df = pd.DataFrame(df_data, columns=columns)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{METHODOLOGY_ID}_results_{timestamp}.csv"
    filepath = os.path.join(output_dir, filename)

    df.to_csv(filepath, index=False)

    n_documents = df['document_name'].nunique()

    print(f"Results saved to: {filepath}")
    print(f"Total rows: {len(df)}")
    print(f"Unique documents: {n_documents}")
    # Each row is one question x model x run, so this is rows, not questions.
    print(f"Rows per document: {len(df) // n_documents if n_documents > 0 else 0}")
    print(f"Retriever types used: {df['retriever_type'].unique().tolist()}")

    # Token usage summary (skipped when nothing was counted)
    if df['total_tokens'].sum() > 0:
        print(f"\nToken Usage Summary:")
        print(f"Total tokens across all queries: {df['total_tokens'].sum():,}")
        print(f"Average tokens per query: {df['total_tokens'].mean():.1f}")
        print(f"Average prompt tokens: {df['prompt_tokens'].mean():.1f}")
        print(f"Average output tokens: {df['output_tokens'].mean():.1f}")

    return filepath

## Load Documents and Setup

In [12]:
# Load documents from configured folder
print(f"Loading documents from: {DOCX_FOLDER_PATH}")
documents = load_docx_documents(DOCX_FOLDER_PATH)

if not documents:
    print("No documents loaded. Please check the folder path or add some DOCX files.")
else:
    # load_docx_documents already prints the number of loaded documents, so
    # this cell only adds a short preview of the first three.
    print("\nDocument overview:")
    for i, doc in enumerate(documents[:3]):
        source = doc.metadata.get('source', 'unknown')
        content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
        print(f"  {i+1}. {source}: {content_preview}")

Loading documents from: ./model_docs
Found 1 DOCX files to process
Successfully loaded 1 documents
Successfully loaded 1 documents

Document overview:
  1. 0001_ModelDoc.docx: Model Documentation: 0001 - AlexNet

Introduction

This model documentation corresponds to AlexNet, ...


In [13]:
# Gate the rest of the notebook on having at least one loaded document
if not documents:
    print("Cannot proceed without documents")
    documents = []  # normalise any falsy value to an empty list for later cells
else:
    loaded_count = len(documents)
    print(f"Documents ready for individual processing: {loaded_count} documents loaded")
    print("Each document will be processed separately with its own retriever and vector store")

Documents ready for individual processing: 1 documents loaded
Each document will be processed separately with its own retriever and vector store


## Question Set Definition

In [14]:
# Load questions from text file
def load_questions_from_file(file_path):
    """Read questions from a text file, one question per line.

    Lines are stripped of surrounding whitespace; blank lines and lines that
    start with ``#`` are ignored. The file is decoded as ``utf-8-sig`` so a
    leading byte-order mark (as written by some Windows editors) is dropped
    instead of becoming part of the first line; plain UTF-8 files are read
    exactly as before.

    Args:
        file_path: Path of the questions file.

    Returns:
        list[str]: The questions in file order.

    Raises:
        Exception: Any error opening or reading the file is reported and
            re-raised unchanged.
    """
    questions = []

    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                line = line.strip()

                # Keep only non-empty, non-comment lines
                if line and not line.startswith('#'):
                    questions.append(line)

    except Exception as e:
        print(f"Error reading questions file {file_path}: {e}")
        raise  # bare raise keeps the original traceback intact

    print(f"Successfully loaded {len(questions)} questions from {file_path}")
    return questions

TEST_QUESTIONS = load_questions_from_file(QUESTIONS_FILE_PATH)

# Preview at most the first 10 questions (each shortened to 60 characters);
# anything beyond that is summarised by the "... and N more" line below.
print(f"\nLoaded Questions ({len(TEST_QUESTIONS)} total):")
for i, question in enumerate(TEST_QUESTIONS[:10], 1):
    display_question = question if len(question) <= 60 else question[:57] + "..."
    print(f"  {i:2d}. {display_question}")

if len(TEST_QUESTIONS) > 10:
    print(f"  ... and {len(TEST_QUESTIONS) - 10} more questions")

Successfully loaded 2 questions from ./example_questions.txt

Loaded Questions (2 total):
   1. What does this model cover?
   2. Why is this model important?


## Execute RAG Pipeline

In [15]:
# Run the full experiment grid and persist the results
if not (documents and TEST_QUESTIONS):
    # Nothing to run: leave an empty result list for the preview cell
    print("Cannot execute pipeline - missing documents or questions")
    all_results = []

else:
    print("Starting document-by-document RAG pipeline execution...")
    print(
        "Configuration:",
        f"  - Methodology: {METHODOLOGY_ID}",
        f"  - Models: {LLM_MODELS}",
        f"  - Documents: {len(documents)}",
        f"  - Questions per document: {len(TEST_QUESTIONS)}",
        f"  - Runs per question: {N_RUNS}",
        sep="\n",
    )

    # Every question x document x model x run
    all_results = run_multiple_experiments(
        questions=TEST_QUESTIONS,
        documents=documents,
        llm_models=LLM_MODELS,
        n_runs=N_RUNS,
    )

    # Write the flat results table to the methodology's output folder
    csv_path = save_results_to_csv(all_results, OUTPUT_DIR)

    print("\n✅ Pipeline execution complete!")
    print(f"Results saved to: {csv_path}")

Starting document-by-document RAG pipeline execution...
Configuration:
  - Methodology: semantic
  - Models: ['gpt-4o']
  - Documents: 1
  - Questions per document: 2
  - Runs per question: 3
Starting experiments:
  - 1 documents
  - 2 questions per document
  - 1 models
  - 3 runs per question
  - Total executions: 6

=== Document 1/1: 0001_ModelDoc.docx ===
Setting up retriever for 0001_ModelDoc.docx...
  Processing document: doc_1_0001_ModelDoc_docx
  Created 2 chunks from doc_1_0001_ModelDoc_docx

--- Model 1/1: gpt-4o ---
  RAG Pipeline initialized with gpt-4o
  Using custom prompt: 203 characters
  Using LangChain Expression Language (LCEL) chain

  Run 1/3 on 0001_ModelDoc.docx
    Question 1/2: What does this model cover?...
    Question 2/2: Why is this model important?...

  Run 2/3 on 0001_ModelDoc.docx
    Question 1/2: What does this model cover?...
    Question 2/2: Why is this model important?...

  Run 3/3 on 0001_ModelDoc.docx
    Question 1/2: What does this model cov

## Results Preview

In [16]:
# Summarise the collected results and show a few sample answers
if not all_results:
    print("No results to display")

else:
    print("=== RESULTS PREVIEW ===")

    # Keep only the fields needed for the numeric summaries
    summary_columns = [
        "methodology",
        "llm_model",
        "document_name",
        "run_number",
        "response_time",
        "num_sources",
    ]
    df = pd.DataFrame(
        [{column: r[column] for column in summary_columns} for r in all_results]
    )

    print("\nSummary Statistics:")
    print(f"Total results: {len(all_results)}")
    print(f"Unique documents processed: {df['document_name'].nunique()}")
    print(f"Average response time: {df['response_time'].mean():.2f} seconds")
    print(f"Average sources per answer: {df['num_sources'].mean():.1f}")

    # Latency mean/spread and average source count per model
    print("\nResults by Model:")
    per_model_stats = {
        'response_time': ['mean', 'std'],
        'num_sources': 'mean',
    }
    model_summary = df.groupby('llm_model').agg(per_model_stats).round(2)
    print(model_summary)

    # Average latency and source count per document
    print("\nResults by Document:")
    per_document_stats = {
        'response_time': 'mean',
        'num_sources': 'mean',
    }
    doc_summary = df.groupby('document_name').agg(per_document_stats).round(2)
    print(doc_summary)

    # First three raw results, with the answer cut to 150 characters
    print("\nSample Results:")
    for result_no, result in enumerate(all_results[:3], start=1):
        answer_preview = result['answer'][:150]
        print(f"\n--- Result {result_no} ---")
        print(f"Document: {result['document_name']}")
        print(f"Model: {result['llm_model']}")
        print(f"Run: {result['run_number']}")
        print(f"Question: {result['question']}")
        print(f"Answer: {answer_preview}...")
        print(f"Sources: {result['num_sources']}")
        print(f"Time: {result['response_time']:.2f}s")

=== RESULTS PREVIEW ===

Summary Statistics:
Total results: 6
Unique documents processed: 1
Average response time: 1.71 seconds
Average sources per answer: 2.0

Results by Model:
          response_time       num_sources
                   mean   std        mean
llm_model                                
gpt-4o             1.71  0.64         2.0

Results by Document:
                    response_time  num_sources
document_name                                 
0001_ModelDoc.docx           1.71          2.0

Sample Results:

--- Result 1 ---
Document: 0001_ModelDoc.docx
Model: gpt-4o
Run: 1
Question: What does this model cover?
Answer: This model documentation covers AlexNet, a convolutional neural network architecture used for computer vision tasks, particularly focusing on its deve...
Sources: 2
Time: 1.57s

--- Result 2 ---
Document: 0001_ModelDoc.docx
Model: gpt-4o
Run: 1
Question: Why is this model important?
Answer: AlexNet is important because it significantly advanced the field of