In [3]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/sa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
import os
import getpass

EMBEDDING_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-4o-mini"   # adjust as desired
COLLECTION_NAME = "adn"
TOP_K = 6
FETCH_K = 24

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta
from langchain_core.documents import Document
import csv
from typing import List

def load_csv_with_csvloader(csv_path: str) -> List[Document]:
    """
    Load a CSV into Documents using LangChain's CSVLoader.
    page_content := only ['title', 'details'].
    All remaining columns preserved as metadata (e.g., id, section, tags, channel_number).
    """
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        cols = csv.DictReader(f).fieldnames or []

    content_cols = [c for c in ["title", "details"] if c in cols]
    meta_cols = [c for c in cols if c not in content_cols]
    print(f"content_cols: {content_cols}")
    print(f"meta_cols: {meta_cols}")
    loader = CSVLoader(
        file_path=csv_path,
        content_columns=content_cols,
        metadata_columns=meta_cols,
        encoding="utf-8-sig",
        autodetect_encoding=True,
    )
    return loader.load()

# current metadata columns: id,section,subsection,title,details,price_crc,price_text,tags,url,contact_value,channel_number,locale,version
loader = CSVLoader(
    file_path=f"./RAG_data/adn_rag_base_full_v1_3.csv",
    metadata_columns=[
      "id",
      "section",
      "subsection",
      "title",
      "details",
      "price_crc",
      "price_text",
      "tags",
      "url",
      "contact_value",
      "channel_number",
      "locale",
      "version"
    ]
)



In [6]:
adn_data = load_csv_with_csvloader("RAG_data/adn_rag_base_full_v1_3.csv")

for doc in adn_data:
    print(doc.page_content)
    print(doc.metadata)
    break

content_cols: ['title', 'details']
meta_cols: ['id', 'section', 'subsection', 'price_crc', 'price_text', 'tags', 'url', 'contact_value', 'channel_number', 'locale', 'version']
title: Descripción
details: American Data Networks S.A. (ADN) fundada en 2005. Equipo con más de 15 años de experiencia en telecomunicaciones.
Enfoque 100 % en calidad de servicio y soporte oportuno. Actualización constante de tecnologías para brindar servicios de clase mundial.
Opción de transporte nacional e internacional de alta capacidad de datos en Costa Rica; permite optimizar y expandir redes de forma segura y rápida.
{'source': 'RAG_data/adn_rag_base_full_v1_3.csv', 'row': 0, 'id': '2eebd6ef-cd3e-46e4-a268-dd4830bf76aa', 'section': 'Compañía', 'subsection': 'Quiénes Somos', 'price_crc': '', 'price_text': '', 'tags': 'empresa, historia, calidad, telecomunicaciones', 'url': '', 'contact_value': '', 'channel_number': '', 'locale': 'es_CR', 'version': 'v1.3'}


In [7]:
adn_data[0]

Document(metadata={'source': 'RAG_data/adn_rag_base_full_v1_3.csv', 'row': 0, 'id': '2eebd6ef-cd3e-46e4-a268-dd4830bf76aa', 'section': 'Compañía', 'subsection': 'Quiénes Somos', 'price_crc': '', 'price_text': '', 'tags': 'empresa, historia, calidad, telecomunicaciones', 'url': '', 'contact_value': '', 'channel_number': '', 'locale': 'es_CR', 'version': 'v1.3'}, page_content='title: Descripción\ndetails: American Data Networks S.A. (ADN) fundada en 2005. Equipo con más de 15 años de experiencia en telecomunicaciones.\nEnfoque 100 % en calidad de servicio y soporte oportuno. Actualización constante de tecnologías para brindar servicios de clase mundial.\nOpción de transporte nacional e internacional de alta capacidad de datos en Costa Rica; permite optimizar y expandir redes de forma segura y rápida.')

In [8]:
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vectorstore = Qdrant.from_documents(
    adn_data,
    embeddings,
    location=":memory:",
    collection_name=COLLECTION_NAME
)



In [9]:
from langchain_experimental.text_splitter import SemanticChunker

semantic_chunker = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile"
)

semantic_documents = semantic_chunker.split_documents(adn_data)

semantic_vectorstore = Qdrant.from_documents(
    semantic_documents,
    embeddings,
    location=":memory:",
    collection_name="ADN_Data_Semantic_Chunks"
)

In [10]:
naive_retriever = vectorstore.as_retriever(search_kwargs={"k" : 6})

semantic_retriever = semantic_vectorstore.as_retriever(search_kwargs={"k" : 6})



In [11]:
from langchain_core.prompts import ChatPromptTemplate

RAG_TEMPLATE = """\
Eres un asistente de soporte que trabaja para American Data Networks. Habla siempre en primera persona como si fueras parte del equipo. Responde SOLO con la información del CONTEXTO proporcionado.
Si la pregunta no se encuentra en el contexto proporcionado de American Data Networks, responde: "No tengo la respuesta a esa pregunta".
Si la pregunta es sobre un producto o servicio que no es de American Data Networks, responde: "No tenemos información sobre ese producto o servicio".

Responde en el mismo idioma en que se hizo la pregunta.
Importante: usa expresiones en primera persona (por ejemplo: “nuestra dirección”, “puedes visitarnos”, “podemos ayudarte”), nunca en tercera persona (“ellos”, “tienen”, “puedes visitarlos”).

Query:
{question}

Context:
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [12]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-4.1-nano")

In [13]:
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

naive_retrieval_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | naive_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [14]:
semantic_retrieval_chain = (
    {"context": itemgetter("question") | semantic_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

In [15]:
naive_retrieval_chain.invoke({"question" : "tienen History Channel?"})["response"].content

'Sí, en nuestra grilla IPTV tengo disponibles los canales History Channel y History Channel 2.'

In [16]:
semantic_retrieval_chain.invoke({"question" : "Quiero contratar un servicio de ADN"})["response"].content

'Para contratar un servicio de ADN, puedes hacerlo en línea visitando https://data.cr/adquirir-servicios. También puedes contratar por WhatsApp, donde un asesor verificará la cobertura, te recomendará el plan, registrará tus datos y compartirá el enlace de pago o agenda. Si deseas, puedo ayudarte a comenzar con el proceso o proporcionarte más detalles.'

In [17]:
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(adn_data,)

bm25_retrieval_chain = (
    {"context": itemgetter("question") | bm25_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)
bm25_retrieval_chain.invoke({"question" : "Precio de un plan de 100/100 Mbps"})["response"].content

'El plan residencial de 100/100 Mbps tiene un precio de ₡20,500 IVI. Puedes contactarnos si necesitas más información o si deseas contratar este plan.'

In [18]:
from langchain.retrievers.multi_query import MultiQueryRetriever

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=naive_retriever, llm=chat_model
)

multi_query_retrieval_chain = (
    {"context": itemgetter("question") | multi_query_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

multi_query_retrieval_chain.invoke({"question" : "Precio de un plan de 100 megas, y que mas tienes, y cuales son los planes mas populares"})["response"].content

'Nuestro plan de 100 megas es el Plan 100/100 Mbps, que es un plan residencial simétrico e incluye Equipo Wi-Fi, Firewall y Soporte 24/7, y su precio es de ₡20,500 IVI. \n\nAdemás, tenemos otros planes populares como el Plan 500/500 Mbps por ₡33,410 IVI, y el Plan 1/1 Gbps por ₡26,600 IVI. Los planes más populares son el de 500/500 Mbps y el de 1 Gbps. \n\nTambién puedes contratar en línea, seleccionando el plan y los adicionales que desees, y siguiendo los pasos para completar tu proceso de contratación.'

In [19]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient, models

parent_docs = adn_data
child_splitter = RecursiveCharacterTextSplitter(chunk_size=750)

from langchain_qdrant import QdrantVectorStore

client = QdrantClient(location=":memory:")

client.create_collection(
    collection_name="full_documents",
    vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
)

parent_document_vectorstore = QdrantVectorStore(
    collection_name="full_documents", embedding=OpenAIEmbeddings(model="text-embedding-3-small"), client=client
)

store = InMemoryStore()

parent_document_retriever = ParentDocumentRetriever(
    vectorstore = parent_document_vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

parent_document_retriever.add_documents(parent_docs, ids=None)

parent_document_retrieval_chain = (
    {"context": itemgetter("question") | parent_document_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | chat_model, "context": itemgetter("context")}
)

parent_document_retrieval_chain.invoke({"question" : "Precio de un plan de 100 megas, y que mas tienes, y cual es el plan mas popular"})["response"].content

'Nuestro plan de 100 megas cuesta ₡20,500 IVI. Además, tenemos otros planes como el de 250 megas por ₡26,600 IVI y el plan más popular de 500 megas por ₡33,410 IVI. Si quieres más información sobre otros planes o detalles específicos, puedo ayudarte con eso.'

In [20]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [21]:
from ragas.testset import TestsetGenerator
from typing import List

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)

# Create manual test set with realistic user questions
manual_test_set_updated = [
    # {
    #     "question": "¿Qué es American Data Networks (ADN) y desde cuándo existe?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "American Data Networks S.A. (ADN) es una empresa de telecomunicaciones fundada en 2005."
    # },
    # {
    #     "question": "¿Cuál es la misión de ADN?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "Ser la primera opción en servicios de telecomunicaciones, garantizando transporte de datos rápido, eficiente y seguro."
    # },
    {
        "question": "¿Y la visión de la empresa cuál es? Y desde cuándo existe?",
        "contexts": "",
        "answer": "",
        "ground_truth": "Ser la compañía líder en vanguardia tecnológica en Costa Rica, con enfoque claro en clientes y estándares de clase mundial. Fue Fundada en 2005"
    },
    # {
    #     "question": "Estoy armando mi plan en casa: ¿cuánto cuesta el plan residencial de 100/100 megas?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "₡20 500 IVI."
    # },
    # {
    #     "question": "¿Cuál es el precio del plan de 250/250 megas simétricos?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "₡26 600 IVI."
    # },
    # {
    #     "question": "Quiero más velocidad: ¿cuánto vale el plan de 500/500 megas?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "₡33 410 IVI."
    # },
    {
        "question": "¿Tienen plan de 1 giga simétrico? ¿En cuánto sale al mes?",
        "contexts": "",
        "answer": "",
        "ground_truth": "₡49 500 IVI por mes."
    },
    {
        "question": "¿Qué incluyen los planes residenciales además del Internet?",
        "contexts": "",
        "answer": "",
        "ground_truth": "Incluyen equipo Wi-Fi, firewall gestionado y soporte técnico 24/7."
    },
    # {
    #     "question": "Quiero agregar telefonía fija a mi plan: ¿cuánto cuesta al mes?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "₡3 590 IVI/mes."
    # },
    # {
    #     "question": "Si compro IPTV, ¿cuántos dispositivos como máximo puedo adquirir?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "Hasta 10 dispositivos IPTV."
    # },
    # {
    #     "question": "¿Y cuánto cuesta cada dispositivo IPTV?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "₡4 590 IVI por dispositivo."
    # },
    # {
    #     "question": "Más o menos, ¿cuántos canales trae el servicio de IPTV?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "Más de 100 canales."
    # },
    # {
    #     "question": "¿Qué impuestos y cargos ya vienen incluidos en los precios IVI?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "IVA 13 %, 1 % Cruz Roja y 0,75 % 9-1-1."
    # },
    # {
    #     "question": "¿Cuál es el número de WhatsApp para comprar o pedir soporte?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "+506 7087-8240."
    # },
    # {
    #     "question": "Necesito llamarles: ¿cuál es el teléfono principal de ADN?",
    #     "contexts": "",
    #     "answer": "",
    #     "ground_truth": "+506 4050-5050."
    # }
]


print("Generating test data for ALL retrievers...")

# Initialize storage for all retrievers
retriever_responses = {
    "naive": [],
    "semantic": [],
    "bm25": [],
    "multi_query": [],
    "parent_document": []
}

# Dictionary of retrieval chains
retrieval_chains = {
    "naive": naive_retrieval_chain,
    "semantic": semantic_retrieval_chain,
    "bm25": bm25_retrieval_chain,
    "multi_query": multi_query_retrieval_chain,
    "parent_document": parent_document_retrieval_chain
}

# Generate responses for each retriever
for retriever_name, chain in retrieval_chains.items():
    print(f"\nGenerating test data for {retriever_name.upper()} retriever...")
    
    for i, item in enumerate(manual_test_set_updated):
        question = item["question"]
        
        try:
            # Use the specific retrieval chain
            response = chain.invoke({"question": question})
            
            updated_item = {
                "question": question,
                "contexts": [doc.page_content for doc in response['context']],
                "answer": response['response'].content,
                "ground_truth": item["ground_truth"]
            }
            
            retriever_responses[retriever_name].append(updated_item)
            print(f"✓ Processed question {i+1}: {question[:40]}...")
            
        except Exception as e:
            print(f"❌ Error processing question {i+1} for {retriever_name}: {str(e)}")
            # Add placeholder data to maintain consistency
            updated_item = {
                "question": question,
                "contexts": ["Error: Could not retrieve context"],
                "answer": "Error: Could not generate answer",
                "ground_truth": item["ground_truth"]
            }
            retriever_responses[retriever_name].append(updated_item)

print("\n✓ All retriever test data generated!")




Generating test data for ALL retrievers...

Generating test data for NAIVE retriever...
✓ Processed question 1: ¿Y la visión de la empresa cuál es? Y de...
✓ Processed question 2: ¿Tienen plan de 1 giga simétrico? ¿En cu...
✓ Processed question 3: ¿Qué incluyen los planes residenciales a...

Generating test data for SEMANTIC retriever...
✓ Processed question 1: ¿Y la visión de la empresa cuál es? Y de...
✓ Processed question 2: ¿Tienen plan de 1 giga simétrico? ¿En cu...
✓ Processed question 3: ¿Qué incluyen los planes residenciales a...

Generating test data for BM25 retriever...
✓ Processed question 1: ¿Y la visión de la empresa cuál es? Y de...
✓ Processed question 2: ¿Tienen plan de 1 giga simétrico? ¿En cu...
✓ Processed question 3: ¿Qué incluyen los planes residenciales a...

Generating test data for MULTI_QUERY retriever...
✓ Processed question 1: ¿Y la visión de la empresa cuál es? Y de...
✓ Processed question 2: ¿Tienen plan de 1 giga simétrico? ¿En cu...
✓ Processed question 

In [22]:
from datasets import Dataset
import pandas as pd

# Create datasets for all retrievers
retriever_datasets = {}
retriever_dataframes = {}

for retriever_name, responses in retriever_responses.items():
    df = pd.DataFrame(responses)
    dataset = Dataset.from_pandas(df)
    
    retriever_datasets[retriever_name] = dataset
    retriever_dataframes[retriever_name] = df
    
    print(f"{retriever_name.capitalize()} dataset shape: {df.shape}")

print("\n✓ All datasets created successfully!")


Naive dataset shape: (3, 4)
Semantic dataset shape: (3, 4)
Bm25 dataset shape: (3, 4)
Multi_query dataset shape: (3, 4)
Parent_document dataset shape: (3, 4)

✓ All datasets created successfully!


In [23]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
import pandas as pd
import numpy as np

# Define metrics
metrics = [
    faithfulness,
    answer_relevancy, 
    context_precision,
    context_recall
]

# Storage for evaluation results
evaluation_results = {}

def extract_score_from_result(result, metric_name):
    """Safely extract score from RAGAS EvaluationResult object."""
    try:
        # Try dictionary-style access first
        score = result[metric_name]
        if isinstance(score, list):
            # Calculate average for list of scores
            valid_scores = [s for s in score if s is not None and not np.isnan(s)]
            return sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
        elif isinstance(score, (int, float)) and not np.isnan(score):
            return float(score)
        else:
            return 0.0
    except (KeyError, TypeError, AttributeError):
        try:
            # Try attribute access as fallback
            score = getattr(result, metric_name, None)
            if score is not None:
                if isinstance(score, list):
                    valid_scores = [s for s in score if s is not None and not np.isnan(s)]
                    return sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
                elif isinstance(score, (int, float)) and not np.isnan(score):
                    return float(score)
            return 0.0
        except:
            return 0.0

print("Starting comprehensive evaluation of all retrievers...")
print("=" * 60)

for retriever_name, dataset in retriever_datasets.items():
    print(f"\nEvaluating {retriever_name.upper()} retriever...")
    print("-" * 40)
    
    try:
        result = evaluate(
            dataset,
            metrics=metrics,
            llm=generator_llm,
            embeddings=generator_embeddings
        )
        
        evaluation_results[retriever_name] = result
        print(f"✅ {retriever_name.capitalize()} evaluation completed successfully!")
        
        # Show immediate results using corrected extraction
        for metric_name in ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']:
            avg_score = extract_score_from_result(result, metric_name)
            print(f"  {metric_name}: {avg_score:.4f}")
            
    except Exception as e:
        print(f"❌ Error evaluating {retriever_name}: {str(e)}")
        # Create placeholder result
        evaluation_results[retriever_name] = {
            'faithfulness': 0.0,
            'answer_relevancy': 0.0, 
            'context_precision': 0.0,
            'context_recall': 0.0
        }

print("\n" + "=" * 60)
print("✅ ALL EVALUATIONS COMPLETED!")

Starting comprehensive evaluation of all retrievers...

Evaluating NAIVE retriever...
----------------------------------------


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Naive evaluation completed successfully!
  faithfulness: 0.9333
  answer_relevancy: 0.9188
  context_precision: 0.3931
  context_recall: 0.6667

Evaluating SEMANTIC retriever...
----------------------------------------


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Semantic evaluation completed successfully!
  faithfulness: 0.9333
  answer_relevancy: 0.9165
  context_precision: 0.2264
  context_recall: 0.6667

Evaluating BM25 retriever...
----------------------------------------


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Bm25 evaluation completed successfully!
  faithfulness: 1.0000
  answer_relevancy: 0.3249
  context_precision: 0.0000
  context_recall: 0.0000

Evaluating MULTI_QUERY retriever...
----------------------------------------


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Multi_query evaluation completed successfully!
  faithfulness: 0.8917
  answer_relevancy: 0.9239
  context_precision: 0.2819
  context_recall: 0.6667

Evaluating PARENT_DOCUMENT retriever...
----------------------------------------


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Parent_document evaluation completed successfully!
  faithfulness: 0.9333
  answer_relevancy: 0.9231
  context_precision: 0.4630
  context_recall: 0.6667

✅ ALL EVALUATIONS COMPLETED!


In [24]:


def extract_score_safe(result, metric_name):
    """Safely extract score from RAGAS result with comprehensive error handling."""
    try:
        # Handle both EvaluationResult objects and dictionaries
        if hasattr(result, '__getitem__'):
            try:
                score = result[metric_name]
            except (KeyError, TypeError):
                # Fallback to attribute access
                score = getattr(result, metric_name, None)
        else:
            # Try attribute access first
            score = getattr(result, metric_name, None)
        
        if score is None:
            return 0.0
            
        if isinstance(score, list):
            # Calculate average for list of scores
            valid_scores = [s for s in score if s is not None and not (isinstance(s, float) and np.isnan(s))]
            return sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
        elif isinstance(score, (int, float)) and not np.isnan(score):
            return float(score)
        else:
            return 0.0
    except (KeyError, TypeError, AttributeError) as e:
        print(f"Warning: Could not extract {metric_name}: {e}")
        return 0.0

# Extract scores for all retrievers
metrics_names = ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']
retriever_names = ['naive', 'semantic', 'bm25', 'multi_query', 'parent_document']

# Create comprehensive results matrix
results_matrix = {}
for retriever_name in retriever_names:
    result = evaluation_results[retriever_name]
    scores = [extract_score_safe(result, metric) for metric in metrics_names]
    results_matrix[retriever_name] = scores

# Create comparison DataFrame
comparison_data = {
    'Metric': [
        'Faithfulness',
        'Answer Relevancy',
        'Context Precision', 
        'Context Recall'
    ]
}

# Add columns for each retriever
for retriever_name in retriever_names:
    comparison_data[retriever_name.replace('_', ' ').title()] = results_matrix[retriever_name]

# Add descriptions
comparison_data['Description'] = [
    'Factual consistency with context',
    'Relevance to the question',
    'Precision of retrieved context',
    'Recall of relevant context'
]

comparison_df = pd.DataFrame(comparison_data)

# Calculate averages for each retriever
averages = {}
for retriever_name in retriever_names:
    scores = results_matrix[retriever_name]
    valid_scores = [score for score in scores if score > 0]
    averages[retriever_name] = np.mean(valid_scores) if valid_scores else 0.0

print("=" * 100)
print("                    COMPREHENSIVE RAG RETRIEVER PERFORMANCE COMPARISON")
print("=" * 100)
print()

# Display main comparison table
print(comparison_df.round(4).to_string(index=False))
print()

print("-" * 100)
print(f"{'OVERALL AVERAGES':^100}")
print("-" * 100)

# Sort retrievers by performance
sorted_retrievers = sorted(averages.items(), key=lambda x: x[1], reverse=True)

for i, (retriever_name, avg_score) in enumerate(sorted_retrievers):
    rank_emoji = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f"{i+1}️⃣"
    print(f"{rank_emoji} {retriever_name.replace('_', ' ').title():<20}: {avg_score:.4f}")

print()
best_retriever = sorted_retrievers[0][0]
best_score = sorted_retrievers[0][1]
print(f"🏆 WINNER: {best_retriever.replace('_', ' ').title().upper()} RETRIEVER!")
print(f"Best Overall Score: {best_score:.4f}")
print("=" * 100)

                    COMPREHENSIVE RAG RETRIEVER PERFORMANCE COMPARISON

           Metric  Naive  Semantic   Bm25  Multi Query  Parent Document                      Description
     Faithfulness 0.9333    0.9333 1.0000       0.8917           0.9333 Factual consistency with context
 Answer Relevancy 0.9188    0.9165 0.3249       0.9239           0.9231        Relevance to the question
Context Precision 0.3931    0.2264 0.0000       0.2819           0.4630   Precision of retrieved context
   Context Recall 0.6667    0.6667 0.0000       0.6667           0.6667       Recall of relevant context

----------------------------------------------------------------------------------------------------
                                          OVERALL AVERAGES                                          
----------------------------------------------------------------------------------------------------
🥇 Parent Document     : 0.7465
🥈 Naive               : 0.7280
🥉 Multi Query         : 0.6910
4️⃣ Se

In [25]:
print("\n" + "=" * 80)
print("                        DETAILED PERFORMANCE ANALYSIS")
print("=" * 80)

# Metric-by-metric analysis
print("\nMETRIC-BY-METRIC WINNERS:")
print("-" * 40)

for i, metric in enumerate(comparison_df['Metric']):
    metric_scores = {}
    for j, retriever_name in enumerate(retriever_names):
        col_name = retriever_name.replace('_', ' ').title()
        metric_scores[retriever_name] = comparison_df.iloc[i][col_name]
    
    # Find best performer for this metric
    best_performer = max(metric_scores.items(), key=lambda x: x[1])
    worst_performer = min(metric_scores.items(), key=lambda x: x[1])
    
    print(f"📊 {metric}:")
    print(f"   🏆 Best:  {best_performer[0].replace('_', ' ').title()} ({best_performer[1]:.4f})")
    print(f"   📉 Worst: {worst_performer[0].replace('_', ' ').title()} ({worst_performer[1]:.4f})")
    print()

# Strengths and weaknesses analysis
print("RETRIEVER STRENGTHS & WEAKNESSES:")
print("-" * 45)

for retriever_name in retriever_names:
    scores = results_matrix[retriever_name]
    best_metric_idx = np.argmax(scores)
    worst_metric_idx = np.argmin(scores)
    
    best_metric = metrics_names[best_metric_idx]
    worst_metric = metrics_names[worst_metric_idx]
    
    print(f"🔍 {retriever_name.replace('_', ' ').title()}:")
    print(f"   ✅ Strongest: {best_metric.replace('_', ' ').title()} ({scores[best_metric_idx]:.4f})")
    print(f"   ⚠️  Weakest:  {worst_metric.replace('_', ' ').title()} ({scores[worst_metric_idx]:.4f})")
    print()

# Recommendations
print("🎯 PRODUCTION RECOMMENDATIONS:")
print("-" * 35)

if best_retriever == "semantic":
    print("✅ Deploy SEMANTIC RETRIEVER for production")
    print("✅ Excellent balance of context precision and recall")
elif best_retriever == "multi_query":
    print("✅ Deploy MULTI-QUERY RETRIEVER for production") 
    print("✅ Superior query understanding and context retrieval")
elif best_retriever == "parent_document":
    print("✅ Deploy PARENT DOCUMENT RETRIEVER for production")
    print("✅ Excellent for maintaining document context integrity")
elif best_retriever == "bm25":
    print("✅ Deploy BM25 RETRIEVER for production")
    print("✅ Strong traditional IR performance with keyword matching")
else:
    print("✅ Deploy NAIVE RETRIEVER for production")
    print("✅ Simple and effective baseline performance")



print("\n" + "=" * 80)


                        DETAILED PERFORMANCE ANALYSIS

METRIC-BY-METRIC WINNERS:
----------------------------------------
📊 Faithfulness:
   🏆 Best:  Bm25 (1.0000)
   📉 Worst: Multi Query (0.8917)

📊 Answer Relevancy:
   🏆 Best:  Multi Query (0.9239)
   📉 Worst: Bm25 (0.3249)

📊 Context Precision:
   🏆 Best:  Parent Document (0.4630)
   📉 Worst: Bm25 (0.0000)

📊 Context Recall:
   🏆 Best:  Naive (0.6667)
   📉 Worst: Bm25 (0.0000)

RETRIEVER STRENGTHS & WEAKNESSES:
---------------------------------------------
🔍 Naive:
   ✅ Strongest: Faithfulness (0.9333)
   ⚠️  Weakest:  Context Precision (0.3931)

🔍 Semantic:
   ✅ Strongest: Faithfulness (0.9333)
   ⚠️  Weakest:  Context Precision (0.2264)

🔍 Bm25:
   ✅ Strongest: Faithfulness (1.0000)
   ⚠️  Weakest:  Context Precision (0.0000)

🔍 Multi Query:
   ✅ Strongest: Answer Relevancy (0.9239)
   ⚠️  Weakest:  Context Precision (0.2819)

🔍 Parent Document:
   ✅ Strongest: Faithfulness (0.9333)
   ⚠️  Weakest:  Context Precision (0.4630)

🎯 