### 0- Loading 

In [5]:
# Loading a pre-trained embedding model and initializing a Chroma vector store

from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={'device': 'cpu'},  # or 'cuda' if available
    encode_kwargs={'normalize_embeddings': True}  # for cosine search
)


from langchain.vectorstores import Chroma

chroma_db = Chroma(
    persist_directory="chroma_db",
    embedding_function=embedding_model
)

query = "How does the model deal with missing values?"
results = chroma_db.similarity_search(query, k=1)

for i, doc in enumerate(results):
    print(f"\n🔎 Result {i+1}")
    print(doc.page_content[:100])  # Preview first 300 chars
    print("Metadata:", doc.metadata)

  chroma_db = Chroma(
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given



🔎 Result 1
One	of	the	simplest	ways	to	fill	in	missing	values	is	to	carry	forward	the	last	known	value	prior
to
Metadata: {'page': 37, 'moddate': '2020-03-30T07:09:46+00:00', 'producer': 'PDF Candy', 'page_label': '38', 'total_pages': 365, 'creationdate': '2020-03-30T07:09:46+00:00', 'creator': 'PyPDF', 'source': 'data/ps.pdf'}


## 1- Query translation/optimization:
In order to improve the quality and relevance of the results, we do query translation.
- Multi-Query: Multiple queries are made from a single query using llm; then, we retrieve the relevant results and combine them
- HyDe: Hypothetical document: We create a document from the query and retrieve the most relevants.


In [20]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

def generate_long_answer(query: str, prompt: str = "Expand this into a comprehensive answer:", max_tokens: int = 300) -> str:
    model_name = "google/flan-t5-base"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    text_gen_pipeline = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_tokens,
        temperature=0.7,
        do_sample=True,
        top_p=0.9
    )

    llm = HuggingFacePipeline(pipeline=text_gen_pipeline)

    # Combine prompt and query
    full_prompt = f"{prompt} {query}"
    response = llm.invoke(full_prompt)
    return response

# Example usage
query = "How does the model deal with missing values?"
prompt = "Rewrite and expand this into a detailed technical question:"
print(generate_long_answer(query, prompt=prompt))


tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


The model is based on a number of factors, such as the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number, the number of digits in a number,


In [2]:
chroma_db.max_marginal_relevance_search(query, k=3, fetch_k=10, lambda_mult=0.5) # lambda_mult: trade-off between relevance (1.0) and diversity (0.0)


[Document(metadata={'page': 37, 'producer': 'PDF Candy', 'page_label': '38', 'moddate': '2020-03-30T07:09:46+00:00', 'creationdate': '2020-03-30T07:09:46+00:00', 'creator': 'PyPDF', 'source': 'data/ps.pdf', 'total_pages': 365}, page_content='One\tof\tthe\tsimplest\tways\tto\tfill\tin\tmissing\tvalues\tis\tto\tcarry\tforward\tthe\tlast\tknown\tvalue\tprior\nto\tthe\tmissing\tone,\tan\tapproach\tknown\tas\t\nforward\tfill\n.\tNo\tmathematics\tor\tcomplicated\tlogic\tis\nrequired.\tSimply\tconsider\tthe\texperience\tof\tmoving\tforward\tin\ttime\twith\tthe\tdata\tthat\twas\navailable,\tand\tyou\tcan\tsee\tthat\tat\ta\tmissing\tpoint\tin\ttime,\tall\tyou\tcan\tbe\tconfident\tof\tis\twhat\ndata\thas\talready\tbeen\trecorded.\tIn\tsuch\ta\tcase,\tit\tmakes\tsense\tto\tuse\tthe\tmost\trecent\tknown\nmeasurement.\nForward\tfill\tcan\tbe\taccomplished\teasily\tusing\t\n\tfrom\tthe\t\n\tpackage:\nThis\twill\tresult\tin\ta\tplot\tthat\tlooks\tnatural\texcept\twhere\tyou\tsee\trepeated\tvalues\tto

**Loading LLM**

In [7]:
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "google/flan-t5-small"  

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu


Using it in a chain

In [8]:
retriever = chroma_db.as_retriever(search_type="mmr", #search_type can be “similarity”, “mmr”, or “similarity_score_threshold”
                                   search_kwargs={
                                        "k": 2,
                                        "fetch_k": 4,
                                        "lambda_mult": 0.5
                                    })

from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever
)

questions = [
    "How does the model deal with missing values?",
    "what is lag feature.",
    "What feature is the most useful features for time series forecasting?",
]

qa_results = []
for question in questions:
    qa = qa_chain.invoke(question)
    print(f"\n❓ Question: {question}")
    print(f"🤖 Answer: {qa['result']}")
    qa_results.append(qa)

print(qa_results)

Token indices sequence length is longer than the specified maximum sequence length for this model (813 > 512). Running this sequence through the model will result in indexing errors



❓ Question: How does the model deal with missing values?
🤖 Answer: By using the most recent known measurement.

❓ Question: what is lag feature.
🤖 Answer: The function , which is a proxy for calculating the expected value of L 2 ( X 2 )  L ( X )  X ( L is the lag operator).

❓ Question: What feature is the most useful features for time series forecasting?
🤖 Answer: Feature generation
[{'query': 'How does the model deal with missing values?', 'result': 'By using the most recent known measurement.'}, {'query': 'what is lag feature.', 'result': 'The function , which is a proxy for calculating the expected value of L 2 ( X 2 )  L ( X )  X ( L is the lag operator).'}, {'query': 'What feature is the most useful features for time series forecasting?', 'result': 'Feature generation'}]


| Method           | Type              | Requires Ground Truth | Measures                      | Explanation                                                                                  | Common Use Cases                                       |
|------------------|-------------------|------------------------|-------------------------------|----------------------------------------------------------------------------------------------|--------------------------------------------------------|
| Precision@k      | Retrieval Metric   | ✅ Yes                 | Accuracy of top-k results     | Fraction of top-k retrieved documents that are relevant.                                     | Evaluate quality of retrieved documents for QA.        |
| Recall@k         | Retrieval Metric   | ✅ Yes                 | Coverage of relevant results  | Fraction of all relevant documents that are present in the top-k results.                   | Ensure important documents are not missed in retrieval.|
| F1@k             | Retrieval Metric   | ✅ Yes                 | Balance of precision & recall | Harmonic mean of precision and recall, balancing quality and coverage.                      | Optimize both completeness and accuracy of retrieval.  |
| MRR (Reciprocal Rank) | Retrieval Metric | ✅ Yes            | Ranking quality               | Average of the reciprocal ranks of the first relevant doc across queries.                   | Measures how early a relevant document appears.        |
| RAGAS            | RAG Eval Framework | ✅ Yes (Answers + Contexts preferred) | End-to-end RAG quality     | Evaluates the relevance of context, answer correctness, faithfulness to source, etc.        | QA system evaluation, LLM-based retrieval pipelines.   |


**Methods without ground truth**

| Method                    | What It Evaluates         | Ground Truth Needed | Description                                                                                   | Typical Use Cases                                      |
|---------------------------|---------------------------|---------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------|
| Embedding Similarity      | Query-context relevance    | ❌ No               | Measures cosine similarity between query and retrieved document embeddings                    | Sanity check for vector retrieval results              |
| Context Overlap           | Surface-level relevance    | ❌ No               | Measures lexical or semantic overlap between query and retrieved text                         | Quick validation of retrieval                         |
| MMR Diversity             | Retrieval diversity        | ❌ No               | Evaluates how different the retrieved documents are from each other                           | Ensure non-redundant context in RAG                   |
| RAGAS (partial)           | Context quality            | ⚠️ Only Query + Context | RAGAS metrics like context precision can run without answer or gold label                     | Evaluate retrieval without needing answers             |
| LLM-as-a-Judge            | Answer/context quality     | ❌ No               | Ask an LLM (local or remote) to evaluate relevance, fluency, or factual consistency           | Quick feedback during RAG development                 |
| Perplexity / LM Loss      | Answer fluency             | ❌ No               | Use the language model's perplexity to estimate how fluent or confident it is in its response | Compare generations across models                     |
| Chunk Keyword Match       | Basic context relevance    | ❌ No               | Check if retrieved chunks contain key terms from the query                                    | Heuristic filter for retrieval quality                |
| Anomaly Detection         | Retrieval reliability      | ❌ No               | Flags chunks that are statistically dissimilar or inconsistent with typical results           | Outlier detection in retrieved context                |







**RAGAS**

In [None]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from langchain_core.documents import Document
from datasets import Dataset
import pandas as pd

print("✅ RAGAS imports successful")

# Prepare evaluation data
eval_data = []
for qa in qa_results:
    question = qa["query"]
    answer = qa["result"]

    # Retrieve relevant context docs
    retrieved_docs = retriever.get_relevant_documents(question)

    # Format for RAGAS
    eval_data.append({
        "question": question,
        "answer": answer,
        "contexts": [Document(page_content=doc.page_content) for doc in retrieved_docs]
    })

# Convert eval_data to Dataset
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data))

# Run RAGAS evaluation
# Run RAGAS evaluation with custom LLM + embedding
results = evaluate(
    eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm,
    embeddings=embedding_model
)
# Print results
print(results)


ImportError: cannot import name 'Dataset' from 'ragas' (c:\code\projects\gr\.denv\Lib\site-packages\ragas\__init__.py)

In [16]:
# Create evaluation examples for RAGAS
class EvaluationExample:
    def __init__(self, question, answer, contexts, ground_truth=None):
        self.question = question
        self.answer = answer
        self.contexts = contexts
        self.ground_truth = ground_truth

# Create some evaluation examples using our existing QA results
evaluation_examples = []

# Use our existing qa_results if available, otherwise create some examples
if 'qa_results' in locals() and qa_results:
    for qa in qa_results[:3]:  # Use first 3 results
        # Get contexts from the retrieved documents
        contexts = [doc.page_content for doc in qa.get('source_documents', [])]
        
        example = EvaluationExample(
            question=qa['query'],
            answer=qa['result'],
            contexts=contexts,
            ground_truth=qa['result']  # Using generated answer as ground truth for demo
        )
        evaluation_examples.append(example)
else:
    # Create some example data for testing
    sample_questions = [
        "What is the main purpose of the document?",
        "What are the key findings?",
        "What methodology was used?"
    ]
    
    for question in sample_questions:
        # Get answer from QA chain
        result = qa_chain.invoke({"query": question})
        
        # Get contexts from retriever
        retrieved_docs = retriever.get_relevant_documents(question)
        contexts = [doc.page_content for doc in retrieved_docs]
        
        example = EvaluationExample(
            question=question,
            answer=result['result'],
            contexts=contexts,
            ground_truth=result['result']
        )
        evaluation_examples.append(example)

print(f"✅ Created {len(evaluation_examples)} evaluation examples")
for i, example in enumerate(evaluation_examples):
    print(f"Example {i+1}: {example.question[:50]}...")

✅ Created 3 evaluation examples
Example 1: How does the model deal with missing values?...
Example 2: what is lag feature....
Example 3: What feature is the most useful features for time ...


In [17]:
# Simple RAGAS evaluation without complex wrappers
try:
    from ragas import evaluate
    from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
    import pandas as pd
    
    # Prepare evaluation data as a simple list of dictionaries
    eval_data = []
    for qa in qa_results:
        question = qa["query"]
        answer = qa["result"]
        retrieved_docs = retriever.get_relevant_documents(question)
        
        eval_data.append({
            "question": question,
            "answer": answer,
            "contexts": [doc.page_content for doc in retrieved_docs]
        })
    
    # Convert to pandas DataFrame (works with most RAGAS versions)
    df = pd.DataFrame(eval_data)
    
    # Run evaluation with minimal configuration
    results = evaluate(
        df,
        metrics=[context_precision, context_recall]  # Use only metrics that don't require LLM
    )
    
    print("RAGAS Evaluation Results:")
    print(results)
    
except Exception as e:
    print(f"RAGAS evaluation failed: {e}")
    print("Continuing with other evaluation methods...")


RAGAS evaluation failed: 'DataFrame' object has no attribute 'get_sample_type'
Continuing with other evaluation methods...


In [11]:
# Method 1: Embedding Similarity Evaluation
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset
import pandas as pd

def evaluate_embedding_similarity(query, contexts, embedding_model):
    """Evaluate retrieval quality using embedding similarity"""
    # Encode query and contexts
    query_embedding = embedding_model.embed_query(query)
    context_embeddings = embedding_model.embed_documents(contexts)
    
    # Calculate similarities
    similarities = []
    for ctx_emb in context_embeddings:
        sim = cosine_similarity([query_embedding], [ctx_emb])[0][0]
        similarities.append(sim)
    
    return {
        'mean_similarity': np.mean(similarities),
        'max_similarity': np.max(similarities),
        'min_similarity': np.min(similarities),
        'similarities': similarities
    }

# Create evaluation examples based on our retrieval results
class EvaluationExample:
    def __init__(self, question, contexts):
        self.question = question
        self.contexts = contexts

def create_evaluation_examples(questions, retriever, num_contexts=3):
    """Create evaluation examples from questions and retrieval results"""
    examples = []
    
    for question in questions:
        retrieved_docs = retriever.get_relevant_documents(question)
        contexts = [doc.page_content for doc in retrieved_docs[:num_contexts]]
        
        example = EvaluationExample(
            question=question,
            contexts=contexts
        )
        examples.append(example)
    
    return examples

# Create examples
evaluation_examples = create_evaluation_examples(questions, retriever)

# Evaluate embedding similarities for our examples
embedding_results = []
for example in evaluation_examples:
    result = evaluate_embedding_similarity(
        example.question, 
        example.contexts, 
        embedding_model
    )
    result['question'] = example.question
    embedding_results.append(result)

print("Embedding Similarity Results:")
for result in embedding_results[:3]:  # Show first 3
    print(f"\nQ: {result['question'][:50]}...")
    print(f"Mean similarity: {result['mean_similarity']:.3f}")
    print(f"Max similarity: {result['max_similarity']:.3f}")
    print(f"Min similarity: {result['min_similarity']:.3f}")

# RAGAS Evaluation with proper dataset format
try:
    # Prepare data for RAGAS evaluation
    ragas_data = {
        'question': [],
        'answer': [],
        'contexts': [],
        'ground_truth': []
    }
    
    # Convert evaluation examples to RAGAS format
    for example in evaluation_examples:
        ragas_data['question'].append(example.question)
        ragas_data['answer'].append(example.answer)
        ragas_data['contexts'].append(example.contexts)
        ragas_data['ground_truth'].append(example.ground_truth or example.answer)
    
    # Create HuggingFace Dataset
    ragas_dataset = Dataset.from_dict(ragas_data)
    
    print("✅ Dataset created successfully")
    print(f"Dataset size: {len(ragas_dataset)}")
    print("\nDataset structure:")
    print(ragas_dataset)
    
    # Run RAGAS evaluation
    print("\n🔄 Running RAGAS evaluation...")
    
    # Use a subset of metrics to avoid timeout
    metrics = [faithfulness, answer_relevancy]
    
    result = evaluate(
        dataset=ragas_dataset,
        metrics=metrics,
        raise_exceptions=False  # Don't fail on individual metric errors
    )
    
    print("\n✅ RAGAS Evaluation Results:")
    print("-" * 50)
    
    # Convert results to DataFrame for better display
    results_df = result.to_pandas()
    print(results_df[['question', 'faithfulness', 'answer_relevancy']].head())
    
    print(f"\nAverage Scores:")
    for metric in ['faithfulness', 'answer_relevancy']:
        if metric in results_df.columns:
            avg_score = results_df[metric].mean()
            print(f"  {metric}: {avg_score:.3f}")
    
except Exception as e:
    print(f"❌ RAGAS evaluation failed: {str(e)}")
    print("\n🔄 Falling back to custom evaluation methods...")
    
    # Fallback to our custom evaluation functions
    print("\n1. Embedding Similarity Evaluation:")
    embedding_results = []
    for example in evaluation_examples:
        result = evaluate_embedding_similarity(
            example.question, 
            example.contexts, 
            embedding_model
        )
        result['question'] = example.question
        embedding_results.append(result)
    
    embedding_df = pd.DataFrame(embedding_results)
    print(embedding_df[['question', 'avg_similarity', 'max_similarity']].head())
    
    print(f"\nAverage embedding similarity: {embedding_df['avg_similarity'].mean():.3f}")
    
    print("\n2. Keyword Overlap Evaluation:")
    keyword_results = []
    for example in evaluation_examples:
        result = evaluate_keyword_overlap(example.question, example.contexts)
        result['question'] = example.question
        keyword_results.append(result)
    
    keyword_df = pd.DataFrame(keyword_results)
    print(keyword_df[['question', 'avg_overlap', 'max_overlap']].head())
    
    print(f"\nAverage keyword overlap: {keyword_df['avg_overlap'].mean():.3f}")
    
    print("\n✅ Custom evaluation completed successfully")

NameError: name 'evaluation_examples' is not defined

In [None]:
# Method 2: Keyword Overlap Analysis
import re
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

def extract_keywords(text, min_length=3):
    """Extract meaningful keywords from text"""
    # Convert to lowercase and extract words
    words = re.findall(r'\b\w+\b', text.lower())
    
    # Filter out stopwords and short words
    stop_words = set(stopwords.words('english'))
    keywords = [word for word in words if word not in stop_words and len(word) >= min_length]
    
    return set(keywords)

def evaluate_keyword_overlap(query, contexts):
    """Evaluate retrieval using keyword overlap"""
    query_keywords = extract_keywords(query)
    
    overlaps = []
    for context in contexts:
        context_keywords = extract_keywords(context)
        
        if len(query_keywords) == 0:
            overlap = 0.0
        else:
            intersection = len(query_keywords.intersection(context_keywords))
            overlap = intersection / len(query_keywords)
        
        overlaps.append(overlap)
    
    return {
        'mean_overlap': np.mean(overlaps),
        'max_overlap': np.max(overlaps),
        'overlaps': overlaps,
        'query_keywords': query_keywords
    }

# Evaluate keyword overlaps
keyword_results = []
for example in evaluation_examples:
    result = evaluate_keyword_overlap(example.question, example.contexts)
    result['question'] = example.question
    keyword_results.append(result)

print("Keyword Overlap Results:")
for result in keyword_results[:3]:
    print(f"\nQ: {result['question'][:50]}...")
    print(f"Query keywords: {list(result['query_keywords'])[:5]}...")  # Show first 5
    print(f"Mean overlap: {result['mean_overlap']:.3f}")
    print(f"Max overlap: {result['max_overlap']:.3f}")

In [None]:
# Method 3: LLM-as-a-Judge for Context Relevance
def evaluate_context_relevance_llm(query, contexts, llm, max_contexts=2):
    """Use LLM to judge context relevance"""
    relevance_scores = []
    
    for i, context in enumerate(contexts[:max_contexts]):  # Limit to avoid token limits
        prompt = f"""
        Evaluate how relevant the following context is to answering the given question.
        Rate the relevance on a scale of 1-5 where:
        1 = Not relevant at all
        2 = Slightly relevant
        3 = Moderately relevant  
        4 = Highly relevant
        5 = Perfectly relevant
        
        Question: {query}
        
        Context: {context[:500]}...
        
        Provide only the numerical score (1-5):
        """
        
        try:
            response = llm.invoke(prompt)
            # Extract numerical score from response
            score_match = re.search(r'[1-5]', response['text'] if isinstance(response, dict) else str(response))
            score = int(score_match.group()) if score_match else 3  # Default to 3 if parsing fails
            relevance_scores.append(score)
        except Exception as e:
            print(f"Error evaluating context {i}: {e}")
            relevance_scores.append(3)  # Default score
    
    return {
        'mean_relevance': np.mean(relevance_scores),
        'relevance_scores': relevance_scores
    }

# Evaluate using LLM-as-a-Judge (limiting to first 2 examples due to processing time)
print("LLM-as-a-Judge Context Relevance Evaluation:")
llm_results = []

for i, example in enumerate(evaluation_examples[:2]):  # Limit for demo
    print(f"\nEvaluating example {i+1}...")
    result = evaluate_context_relevance_llm(example.question, example.contexts, llm)
    result['question'] = example.question
    llm_results.append(result)
    
    print(f"Q: {result['question'][:50]}...")
    print(f"Mean relevance score: {result['mean_relevance']:.2f}/5")
    print(f"Individual scores: {result['relevance_scores']}")

In [None]:
# Method 4: Context Diversity Evaluation
def evaluate_context_diversity(contexts, embedding_model):
    """Evaluate how diverse the retrieved contexts are"""
    if len(contexts) < 2:
        return {'diversity_score': 1.0, 'pairwise_similarities': []}
    
    # Get embeddings for all contexts
    context_embeddings = embedding_model.embed_documents(contexts)
    
    # Calculate pairwise similarities
    pairwise_sims = []
    for i in range(len(context_embeddings)):
        for j in range(i + 1, len(context_embeddings)):
            sim = cosine_similarity([context_embeddings[i]], [context_embeddings[j]])[0][0]
            pairwise_sims.append(sim)
    
    # Diversity score = 1 - average pairwise similarity
    avg_similarity = np.mean(pairwise_sims)
    diversity_score = 1 - avg_similarity
    
    return {
        'diversity_score': diversity_score,
        'avg_pairwise_similarity': avg_similarity,
        'pairwise_similarities': pairwise_sims
    }

# Evaluate diversity for all examples
diversity_results = []
for example in evaluation_examples:
    result = evaluate_context_diversity(example.contexts, embedding_model)
    result['question'] = example.question
    diversity_results.append(result)

print("Context Diversity Results:")
for result in diversity_results[:3]:
    print(f"\nQ: {result['question'][:50]}...")
    print(f"Diversity score: {result['diversity_score']:.3f} (higher = more diverse)")
    print(f"Avg pairwise similarity: {result['avg_pairwise_similarity']:.3f}")

# Comprehensive Evaluation Summary
print("\n" + "="*60)
print("COMPREHENSIVE EVALUATION SUMMARY")
print("="*60)

import pandas as pd

summary_data = []
for i, example in enumerate(evaluation_examples):
    summary_data.append({
        'Query': example.question[:30] + "...",
        'Embedding_Sim': embedding_results[i]['mean_similarity'],
        'Keyword_Overlap': keyword_results[i]['mean_overlap'],
        'Diversity': diversity_results[i]['diversity_score'],
        'Num_Contexts': len(example.contexts)
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.round(3))

print(f"\nOverall Statistics:")
print(f"Average Embedding Similarity: {summary_df['Embedding_Sim'].mean():.3f}")
print(f"Average Keyword Overlap: {summary_df['Keyword_Overlap'].mean():.3f}")
print(f"Average Diversity: {summary_df['Diversity'].mean():.3f}")

# Quality interpretation
print(f"\nQuality Interpretation:")
avg_emb_sim = summary_df['Embedding_Sim'].mean()
if avg_emb_sim > 0.7:
    print("✅ Excellent embedding similarity - highly relevant contexts")
elif avg_emb_sim > 0.5:
    print("✅ Good embedding similarity - relevant contexts")
else:
    print("⚠️ Low embedding similarity - may need better retrieval")

avg_diversity = summary_df['Diversity'].mean()
if avg_diversity > 0.3:
    print("✅ Good diversity - contexts provide varied information")
else:
    print("⚠️ Low diversity - contexts may be redundant")