In [None]:
import sys, os, textwrap, json, math, re
from getpass import getpass

print("üîß Hardening environment (prevents common Colab/py3.12 numpy corruption)...")

!pip -q uninstall -y numpy || true
!pip -q install --no-cache-dir --force-reinstall "numpy==1.26.4"

!pip -q install -U deepeval openai scikit-learn pandas tqdm

print("‚úÖ Packages installed.")


import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from deepeval import evaluate
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    GEval,
)

print("‚úÖ Imports loaded successfully.")


OPENAI_API_KEY = getpass("üîë Enter OPENAI_API_KEY (leave empty to run without OpenAI): ").strip()
openai_enabled = bool(OPENAI_API_KEY)

if openai_enabled:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
print(f"üîå OpenAI enabled: {openai_enabled}")

In [None]:
DOCS = [
    {
        "id": "doc_01",
        "title": "DeepEval Overview",
        "text": (
            "DeepEval is an open-source LLM evaluation framework for unit testing LLM apps. "
            "It supports LLM-as-a-judge metrics, custom metrics like G-Eval, and RAG metrics "
            "such as contextual precision and faithfulness."
        ),
    },
    {
        "id": "doc_02",
        "title": "RAG Evaluation: Why Faithfulness Matters",
        "text": (
            "Faithfulness checks whether the answer is supported by retrieved context. "
            "In RAG, hallucinations occur when the model states claims not grounded in context."
        ),
    },
    {
        "id": "doc_03",
        "title": "Contextual Precision",
        "text": (
            "Contextual precision evaluates how well retrieved chunks are ranked by relevance "
            "to a query. High precision means relevant chunks appear earlier in the ranked list."
        ),
    },
    {
        "id": "doc_04",
        "title": "Contextual Recall",
        "text": (
            "Contextual recall measures whether the retriever returns enough relevant context "
            "to answer the query. Low recall means key information was missed in retrieval."
        ),
    },
    {
        "id": "doc_05",
        "title": "Answer Relevancy",
        "text": (
            "Answer relevancy measures whether the generated answer addresses the user's query. "
            "Even grounded answers can be irrelevant if they don't respond to the question."
        ),
    },
    {
        "id": "doc_06",
        "title": "G-Eval (GEval) Custom Rubrics",
        "text": (
            "G-Eval lets you define evaluation criteria in natural language. "
            "It uses an LLM judge to score outputs against your rubric (e.g., correctness, tone, policy)."
        ),
    },
    {
        "id": "doc_07",
        "title": "What a DeepEval Test Case Contains",
        "text": (
            "A test case typically includes input (query), actual_output (model answer), "
            "expected_output (gold answer), and retrieval_context (ranked retrieved passages) for RAG."
        ),
    },
    {
        "id": "doc_08",
        "title": "Common Pitfall: Missing expected_output",
        "text": (
            "Some RAG metrics require expected_output in addition to input and retrieval_context. "
            "If expected_output is None, evaluation fails for metrics like contextual precision/recall."
        ),
    },
]


EVAL_QUERIES = [
    {
        "query": "What is DeepEval used for?",
        "expected": "DeepEval is used to evaluate and unit test LLM applications using metrics like LLM-as-a-judge, G-Eval, and RAG metrics.",
    },
    {
        "query": "What does faithfulness measure in a RAG system?",
        "expected": "Faithfulness measures whether the generated answer is supported by the retrieved context and avoids hallucinations not grounded in that context.",
    },
    {
        "query": "What does contextual precision mean?",
        "expected": "Contextual precision evaluates whether relevant retrieved chunks are ranked higher than irrelevant ones for a given query.",
    },
    {
        "query": "What does contextual recall mean in retrieval?",
        "expected": "Contextual recall measures whether the retriever returns enough relevant context to answer the query, capturing key missing information issues.",
    },
    {
        "query": "Why might an answer be relevant but still low quality in RAG?",
        "expected": "An answer can address the question (relevant) but still be low quality if it is not grounded in retrieved context or misses important details.",
    },
]

In [None]:
class TfidfRetriever:
    def __init__(self, docs):
        self.docs = docs
        self.texts = [f"{d['title']}\n{d['text']}" for d in docs]
        self.vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        self.matrix = self.vectorizer.fit_transform(self.texts)

    def retrieve(self, query, k=4):
        qv = self.vectorizer.transform([query])
        sims = cosine_similarity(qv, self.matrix).flatten()
        top_idx = np.argsort(-sims)[:k]
        results = []
        for i in top_idx:
            results.append(
                {
                    "id": self.docs[i]["id"],
                    "score": float(sims[i]),
                    "text": self.texts[i],
                }
            )
        return results

retriever = TfidfRetriever(DOCS)

In [None]:
def extractive_baseline_answer(query, retrieved_contexts):
    """
    Offline fallback: we create a short answer by extracting the most relevant sentences.
    This keeps the notebook runnable even without OpenAI.
    """
    joined = "\n".join(retrieved_contexts)
    sents = re.split(r"(?<=[.!?])\s+", joined)
    keywords = [w.lower() for w in re.findall(r"[a-zA-Z]{4,}", query)]
    scored = []
    for s in sents:
        s_l = s.lower()
        score = sum(1 for k in keywords if k in s_l)
        if len(s.strip()) > 20:
            scored.append((score, s.strip()))
    scored.sort(key=lambda x: (-x[0], -len(x[1])))
    best = [s for sc, s in scored[:3] if sc > 0]
    if not best:
        best = [s.strip() for s in sents[:2] if len(s.strip()) > 20]
    ans = " ".join(best).strip()
    if not ans:
        ans = "I could not find enough context to answer confidently."
    return ans

def openai_answer(query, retrieved_contexts, model="gpt-4.1-mini"):
    """
    Simple RAG prompt for demonstration. DeepEval metrics can still evaluate even if
    your generation prompt differs; the key is we store retrieval_context separately.
    """
    from openai import OpenAI
    client = OpenAI()

    context_block = "\n\n".join([f"[CTX {i+1}]\n{c}" for i, c in enumerate(retrieved_contexts)])
    prompt = f"""You are a concise technical assistant.
Use ONLY the provided context to answer the query. If the answer is not in context, say you don't know.

Query:
{query}

Context:
{context_block}

Answer:"""
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    return resp.choices[0].message.content.strip()

def rag_answer(query, retrieved_contexts):
    if openai_enabled:
        try:
            return openai_answer(query, retrieved_contexts)
        except Exception as e:
            print(f"‚ö†Ô∏è OpenAI generation failed, falling back to extractive baseline. Error: {e}")
            return extractive_baseline_answer(query, retrieved_contexts)
    else:
        return extractive_baseline_answer(query, retrieved_contexts)

In [None]:
print("\nüöÄ Running RAG to create test cases...")

test_cases = []
K = 4

for item in tqdm(EVAL_QUERIES):
    q = item["query"]
    expected = item["expected"]

    retrieved = retriever.retrieve(q, k=K)
    retrieval_context = [r["text"] for r in retrieved]

    actual = rag_answer(q, retrieval_context)

    tc = LLMTestCase(
        input=q,
        actual_output=actual,
        expected_output=expected,
        retrieval_context=retrieval_context,
    )
    test_cases.append(tc)

print(f"‚úÖ Built {len(test_cases)} LLMTestCase objects.")

print("\n‚úÖ Metrics configured.")

metrics = [
    AnswerRelevancyMetric(threshold=0.5, model="gpt-4.1", include_reason=True, async_mode=True),
    FaithfulnessMetric(threshold=0.5, model="gpt-4.1", include_reason=True, async_mode=True),
    ContextualRelevancyMetric(threshold=0.5, model="gpt-4.1", include_reason=True, async_mode=True),
    ContextualPrecisionMetric(threshold=0.5, model="gpt-4.1", include_reason=True, async_mode=True),
    ContextualRecallMetric(threshold=0.5, model="gpt-4.1", include_reason=True, async_mode=True),

    GEval(
        name="RAG Correctness Rubric (GEval)",
        criteria=(
            "Score the answer for correctness and usefulness. "
            "The answer must directly address the query, must not invent facts not supported by context, "
            "and should be concise but complete."
        ),
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
            LLMTestCaseParams.RETRIEVAL_CONTEXT,
        ],
        model="gpt-4.1",
        threshold=0.5,
        async_mode=True,
    ),
]

if not openai_enabled:
    print("\n‚ö†Ô∏è You did NOT provide an OpenAI API key.")
    print("DeepEval's LLM-as-a-judge metrics (AnswerRelevancy/Faithfulness/Contextual* and GEval) require an LLM judge.")
    print("Re-run this cell and provide OPENAI_API_KEY to run DeepEval metrics.")
    print("\n‚úÖ However, your RAG pipeline + test case construction succeeded end-to-end.")
    rows = []
    for i, tc in enumerate(test_cases):
        rows.append({
            "id": i,
            "query": tc.input,
            "actual_output": tc.actual_output[:220] + ("..." if len(tc.actual_output) > 220 else ""),
            "expected_output": tc.expected_output[:220] + ("..." if len(tc.expected_output) > 220 else ""),
            "contexts": len(tc.retrieval_context or []),
        })
    display(pd.DataFrame(rows))
    raise SystemExit("Stopped before evaluation (no OpenAI key).")

In [3]:
print("\nüß™ Running DeepEval evaluate(...) ...")

results = evaluate(test_cases=test_cases, metrics=metrics)

summary_rows = []
for idx, tc in enumerate(test_cases):
    row = {
        "case_id": idx,
        "query": tc.input,
        "actual_output": tc.actual_output[:200] + ("..." if len(tc.actual_output) > 200 else ""),
    }
    for m in metrics:
        row[m.__class__.__name__ if hasattr(m, "__class__") else str(m)] = None

    summary_rows.append(row)

def try_extract_case_metrics(results_obj):
    extracted = []
    candidates = []
    for attr in ["test_results", "results", "evaluations"]:
        if hasattr(results_obj, attr):
            candidates = getattr(results_obj, attr)
            break
    if not candidates and isinstance(results_obj, list):
        candidates = results_obj

    for case_i, case_result in enumerate(candidates or []):
        item = {"case_id": case_i}
        metrics_list = None
        for attr in ["metrics_data", "metrics", "metric_results"]:
            if hasattr(case_result, attr):
                metrics_list = getattr(case_result, attr)
                break
        if isinstance(metrics_list, dict):
            for k, v in metrics_list.items():
                item[f"{k}_score"] = getattr(v, "score", None) if v is not None else None
                item[f"{k}_reason"] = getattr(v, "reason", None) if v is not None else None
        else:
            for mr in metrics_list or []:
                name = getattr(mr, "name", None) or getattr(getattr(mr, "metric", None), "name", None)
                if not name:
                    name = mr.__class__.__name__
                item[f"{name}_score"] = getattr(mr, "score", None)
                item[f"{name}_reason"] = getattr(mr, "reason", None)
        extracted.append(item)
    return extracted

case_metrics = try_extract_case_metrics(results)

df_base = pd.DataFrame([{
    "case_id": i,
    "query": tc.input,
    "actual_output": tc.actual_output,
    "expected_output": tc.expected_output,
} for i, tc in enumerate(test_cases)])

df_metrics = pd.DataFrame(case_metrics) if case_metrics else pd.DataFrame([])
df = df_base.merge(df_metrics, on="case_id", how="left")

score_cols = [c for c in df.columns if c.endswith("_score")]
compact = df[["case_id", "query"] + score_cols].copy()

print("\nüìä Compact score table:")
display(compact)

print("\nüßæ Full details (includes reasons):")
display(df)

print("\n‚úÖ Done. Tip: if contextual precision/recall are low, improve retriever ranking/coverage; if faithfulness is low, tighten generation to only use context.")

üîß Hardening environment (prevents common Colab/py3.12 numpy corruption)...
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
rasterio 1.4.4 requires click!=8.2.*,>=4.0, but you have

  0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ Built 5 LLMTestCase objects.

‚úÖ Metrics configured.

üß™ Running DeepEval evaluate(...) ...


Output()

INFO:deepeval.evaluate.execute:in _a_execute_llm_test_cases
INFO:deepeval.evaluate.execute:in _a_execute_llm_test_cases
INFO:deepeval.evaluate.execute:in _a_execute_llm_test_cases
INFO:deepeval.evaluate.execute:in _a_execute_llm_test_cases
INFO:deepeval.evaluate.execute:in _a_execute_llm_test_cases
ERROR:deepeval.retry.openai:call timed out after 88.5s (per attempt). Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt. Retrying: 1 time(s)...
INFO:deepeval.retry.openai:Retrying in 1.6284550566606426 s (attempt 1) after TimeoutError('call timed out after 88.5s (per attempt). Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt.')
ERROR:deepeval.retry.openai:call timed out after 88.5s (per attempt). Increase DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE (None disables) or reduce work per attempt. Retrying: 1 time(s)...
INFO:deepeval.retry.openai:Retrying in 2.088525996982983 s (attempt 1) after Time



Metrics Summary

  - ‚úÖ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job!, error: None)
  - ‚úÖ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because there are no contradictions‚Äîgreat job staying true to the retrieval context!, error: None)
  - ‚ùå Contextual Relevancy (score: 0.25, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 0.25 because while most of the context does not define or explain 'contextual precision' (e.g., 'does not define or explain what contextual precision means'), there are a couple of relevant statements that do address the input ('Contextual precision evaluates how well retrieved chunks are ranked by relevance to a query.' and 'High precision means relevant chunks appear earlier in 


üìä Compact score table:


Unnamed: 0,case_id,query,Answer Relevancy_score,Faithfulness_score,Contextual Relevancy_score,Contextual Precision_score,Contextual Recall_score,RAG Correctness Rubric (GEval) [GEval]_score
0,0,What is DeepEval used for?,1.0,1.0,0.25,1.0,1.0,0.996267
1,1,What does faithfulness measure in a RAG system?,1.0,1.0,0.5,1.0,1.0,0.993991
2,2,What does contextual precision mean?,1.0,1.0,0.5,1.0,1.0,0.8
3,3,What does contextual recall mean in retrieval?,1.0,1.0,0.428571,1.0,1.0,0.903733
4,4,Why might an answer be relevant but still low ...,1.0,1.0,0.833333,1.0,1.0,0.873106



üßæ Full details (includes reasons):


Unnamed: 0,case_id,query,actual_output,expected_output,Answer Relevancy_score,Answer Relevancy_reason,Faithfulness_score,Faithfulness_reason,Contextual Relevancy_score,Contextual Relevancy_reason,Contextual Precision_score,Contextual Precision_reason,Contextual Recall_score,Contextual Recall_reason,RAG Correctness Rubric (GEval) [GEval]_score,RAG Correctness Rubric (GEval) [GEval]_reason
0,0,What is DeepEval used for?,DeepEval is used as an open-source evaluation ...,DeepEval is used to evaluate and unit test LLM...,1.0,The score is 1.00 because the answer was fully...,1.0,The score is 1.00 because there are no contrad...,0.25,The score is 0.25 because while most of the co...,1.0,The score is 1.00 because the first node in th...,1.0,The score is 1.00 because the sentence in the ...,0.996267,The Actual Output directly and fully answers t...
1,1,What does faithfulness measure in a RAG system?,Faithfulness measures whether the answer is su...,Faithfulness measures whether the generated an...,1.0,The score is 1.00 because the answer was fully...,1.0,The score is 1.00 because there are no contrad...,0.5,The score is 0.50 because while some statement...,1.0,The score is 1.00 because the top-ranked node ...,1.0,The score is 1.00 because both sentences in th...,0.993991,The Actual Output directly and fully answers t...
2,2,What does contextual precision mean?,Contextual precision evaluates how well retrie...,Contextual precision evaluates whether relevan...,1.0,The score is 1.00 because the answer was fully...,1.0,The score is 1.00 because there are no contrad...,0.5,The score is 0.50 because while some statement...,1.0,The score is 1.00 because the most relevant no...,1.0,The score is 1.00 because the sentence in the ...,0.8,The Actual Output correctly states that faithf...
3,3,What does contextual recall mean in retrieval?,Contextual recall in retrieval measures whethe...,Contextual recall measures whether the retriev...,1.0,The score is 1.00 because the answer was fully...,1.0,The score is 1.00 because there are no contrad...,0.428571,"The score is 0.43 because, while there are som...",1.0,The score is 1.00 because the first node in th...,1.0,The score is 1.00 because the sentence in the ...,0.903733,The Actual Output directly and fully answers t...
4,4,Why might an answer be relevant but still low ...,An answer might be relevant but still low qual...,An answer can address the question (relevant) ...,1.0,The score is 1.00 because the answer was fully...,1.0,The score is 1.00 because there are no contrad...,0.833333,The score is 0.83 because while several statem...,1.0,The score is 1.00 because all relevant nodes a...,1.0,The score is 1.00 because the sentence in the ...,0.873106,The Actual Output directly answers the query b...



‚úÖ Done. Tip: if contextual precision/recall are low, improve retriever ranking/coverage; if faithfulness is low, tighten generation to only use context.
