# RAGEx for RAG

Based on: https://dl.acm.org/doi/pdf/10.1145/3626772.3657660

In [12]:
import sys
from pathlib import Path

# Put the project directory (the one containing `src/`) on the Python path, regardless of
# which working directory the notebook was started from: the current directory is checked
# first, then each of its ancestors, nearest first.
project_root = next(
    filter(lambda directory: (directory / 'src').exists(), (Path.cwd(), *Path.cwd().parents)),
    None,
)
if project_root is None:
    raise RuntimeError("\"src\"-Verzeichnis nicht gefunden. Bitte Notebook im Projekt laufen lassen.")
root_str = str(project_root)
if root_str not in sys.path:
    # Prepend so the project's `src` package is found before other entries on the path.
    sys.path.insert(0, root_str)


In [13]:
from src.modules.explainers.rag_ex_explainable import RAGExExplainable, RAGExConfig
from src.modules.rag.rag_engine import RAGEngine
from src.modules.rag.multihop_rag_engine import MultiHopRAGEngine, _format_documents
from src.modules.llm.llm_client import LLMClient
from src.modules.loader.medmcqa_data_loader import MedMCQADataLoader, format_medmcqa_question
from src.modules.loader.statspearls_data_loader import StatPearlsDataLoader
from src.evaluation.evaluator import Evaluator

import tomllib

In [14]:
import html
from IPython.display import display, HTML
def highlight_text(text, snippets):
    """Return `text` as HTML-escaped markup with every snippet occurrence wrapped in <mark>.

    Matching is case-insensitive, but the original casing of `text` is kept in the
    output. Matching is done on the raw text in a single pass and each piece is escaped
    afterwards, so a snippet can never match inside an HTML entity (e.g. "amp" in
    "&amp;") or inside a previously inserted <mark> tag.

    Args:
        text: Plain text to render.
        snippets: Iterable of plain-text strings to highlight. Entries that are not
            strings, are empty, or contain only whitespace are ignored.

    Returns:
        An HTML-safe string.
    """
    import re  # local import: the notebook's import cells do not bring `re` into scope

    # Longest snippets first, so that overlapping candidates resolve to the longest match.
    needles = sorted(
        {s for s in snippets if isinstance(s, str) and s.strip()},
        key=lambda s: (-len(s), s),
    )
    if not needles:
        return html.escape(text)

    pattern = re.compile("|".join(re.escape(n) for n in needles), re.IGNORECASE)

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(f"<mark>{html.escape(match.group(0))}</mark>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)

### Real data example

In [19]:
# Read the optional project-level config.toml; when it is absent, `config` stays empty
# and every setting below falls back to its default.
config_path = project_root / "config.toml"
config = {}

if config_path.exists():
    with config_path.open("rb") as f:
        config = tomllib.load(f)

# A missing (or empty) section collapses to {} so the .get(...) lookups below always work.
medmcqa_config, rag_config, llm_config = (
    config.get(section) or {} for section in ("medmcqa", "rag", "llm")
)

# LLM client shared by the cells below.
llm_provider = llm_config.get("provider", "ollama")
llm_model = llm_config.get("model", "gemma3:4b")

client = LLMClient(model_name=llm_model, provider=llm_provider)

# Experiment parameters used by the cells below.
PERSIST_DIR = project_root / "data" / "vector_db_statpearls"
SPLIT = medmcqa_config.get("split", "val")
LIMIT = medmcqa_config.get("n_qa_questions", 10)
NUM_HOPS = rag_config.get('n_hops', 2)
kg_capable_ids = medmcqa_config.get("kg_capable", [])

In [16]:
# Load the StatPearls corpus from <project_root>/data. `setup()` returns the documents
# plus a second value (`stats`) that is not used by the cells visible here.
stat_loader = StatPearlsDataLoader(root_dir=str(project_root / "data"))
documents, stats = stat_loader.setup()

# Retrieval engine whose vector store lives under PERSIST_DIR; the loaded documents are
# handed to its setup step.
rag_engine = RAGEngine(persist_dir=str(PERSIST_DIR))
rag_engine.setup(documents=documents)

# Multi-hop wrapper around the engine above, driven by the shared LLM client and
# configured with NUM_HOPS hops.
multi_hop = MultiHopRAGEngine(rag_engine=rag_engine, llm_client=client, num_hops=NUM_HOPS)
# Used by later cells to score perturbed answers against the baseline answer.
evaluator = Evaluator()

Loading existing vector store from /Users/benediktveith/Documents/Uni/Semester 7/XAI/xai-rag/data/vector_db_statpearls...
RagEngine ready.
Connecting to local Ollama (gemma3:4b)...


# MedMCQA with sentence perturbation

In [24]:
# Run multi-hop RAG on each MedMCQA question and explain the answer with RAG-Ex using
# sentence-level perturbation. One dict per question is collected in `results`.
med_loader = MedMCQADataLoader()
# `ids` restricts loading to the ids listed under `kg_capable` in the config.
questions = med_loader.setup(split=SPLIT, as_documents=False, limit=LIMIT, ids=kg_capable_ids)

if not questions:
    raise RuntimeError("No MedMCQA questions loaded.")

# Extra instruction passed to the multi-hop engine for every question (loop-invariant).
ANSWER_INSTRUCTIONS = 'Only answer based on your context not your knowledge. Do not include any explanations, reasoning, or extra fields.\n Example: Final Answer: B: Housing'

results = []
for item in questions:
    question_text = format_medmcqa_question(item)
    if not question_text:
        # Nothing to ask for this item; skip it.
        continue

    trace, all_documents = multi_hop.run_and_trace(question_text, extra=ANSWER_INSTRUCTIONS)
    final_answer = (trace.get("final_answer") or "").strip()

    # Build the context string from the retrieved documents; objects without a
    # `page_content` attribute are stringified as a fallback, and blank blocks are dropped.
    context_blocks = []
    for doc in all_documents:
        content = getattr(doc, "page_content", None)
        context_blocks.append(str(doc if content is None else content).strip())
    context = "\n\n".join(block for block in context_blocks if block)

    # Named `ragex_config` (not `config`) so the TOML dict loaded in the configuration
    # cell is not overwritten by running this cell.
    ragex_config = RAGExConfig()
    ragex_config.pertubation_depth = 1
    ragex_config.pertubation_mode = 'sentences'
    explainer = RAGExExplainable(llm_client=client, config=ragex_config)
    explanation = explainer.explain(query=question_text, answer=final_answer, context=context)
    metrics = explainer.metrics()

    # Collect every non-empty perturbed answer across all explained tokens and strategies.
    perturbed_answers = [
        detail.get("perturbed_answer")
        for result_item in explanation.get("results", [])
        for detail in result_item.get("details", [])
        if detail.get("perturbed_answer")
    ]

    answer_scores = evaluator.evaluate(perturbed_answers, baseline_answer=final_answer)

    results.append(
        {
            "question": question_text,
            "final_answer": final_answer,
            "trace": trace,
            "explanation": explanation,
            "metrics": metrics,
            "answer_scores": answer_scores,
            "documents": all_documents,
        }
    )

--- Starting Multi-Hop Search for: 'Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad' ---

[ Hop 1 ]
Executing search with query: 'Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad'
Generating next query...

[ Hop 2 ]
Executing search with query: '“osteopetrosis symptoms”'

Generating final answer...

--- Multi-Hop Search Complete. Final Answer: I cannot answer this question based on the provided information. ---
--- Multi-Hop Context: 

 ('<doc id="chunk-1-1" from_hop="1" search_query="Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation\n\nOptions:\nA: bc\nB: c\nC: ac\nD: ad">\n[Document(id=\'1eb7a40b-ddf9-4884-98e7-b57658414621\', metada

In [25]:
# Render each collected result: the answer, every retrieved document with the most
# important tokens highlighted, and the textual RAG-Ex report.
for result_idx, res in enumerate(results, start=1):
    print(f"=== Result {result_idx} ===")
    print(res["question"])

    # Result-local names: `documents` in particular must not be reused here, because it
    # would overwrite the corpus loaded earlier and passed to `rag_engine.setup`.
    res_explanation = res["explanation"]
    res_metrics = res["metrics"]
    res_documents = res["documents"]

    # The answer is model output, so escape it before embedding it in HTML.
    html_parts = ["<h2>Answer</h2>", f"<p>{html.escape(str(res['final_answer']))}</p><hr>"]

    explained = res_explanation.get("results", [])
    # `default=None` keeps an explanation without results from raising in max().
    max_fi = max((r.get("importance", 0.0) for r in explained), default=None)

    # Evidence = tokens sharing the highest importance score; empty tokens are dropped
    # because they cannot be highlighted meaningfully.
    # NOTE(review): when every importance is 0.0 all tokens are highlighted - confirm
    # that this is the intended presentation.
    evidences = []
    if max_fi is not None:
        for item in explained:
            token = item.get("token", "")
            if token and item.get("importance", 0.0) >= max_fi:
                evidences.append(token)

    # Separate index name so the result counter used in the END banner is not shadowed.
    for doc_idx, doc in enumerate(res_documents, start=1):
        content = getattr(doc, "page_content", None)
        body = highlight_text(str(doc if content is None else content), evidences)
        html_parts.append(f"<h3>Document {doc_idx}</h3>")
        html_parts.append(f"<pre>{body}</pre>")
    display(HTML("".join(html_parts)))

    print("Explain metrics:", res_metrics)
    print(RAGExExplainable.prettify(res_explanation))

    print(f"=== Result {result_idx} END ===")

=== Result 1 ===
Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad


Explain metrics: {'duration_seconds': 1275.9946081251837, 'steps': 65}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Mucolipidosis III X-linked hypophosphatemia 1-alpha hydroxylase deficiency Te..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9343
      - Similarity: 0.0657
      - NLI: CONTRADICTION (ent: 0.038) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8813
      - Similarity: 0.1187
      - NLI: NEUTRAL (ent: 0.082) - Can explain: ✗

[CONTEXT] Token 2: "Most syndromic craniosynostosis involves the coronal suture producing brachyc..."
  Importance: 0.9432
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8813
      - Similarity: 0.1187
      - NLI: NEUTRAL (ent: 0.171) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8813
      - Similarity: 0.1187
      - NLI: NEUTRAL (ent: 0.060) - Can explain: ✗

[CONTEXT] Token 3: "[53] Apert syndrome Crouzon syndrome Muenke syndrome 

Explain metrics: {'duration_seconds': 1751.8656067922711, 'steps': 85}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "by an offending stimulus, most commonly bladder or bowel distension, which ca..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.117) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: ENTAILMENT (ent: 0.494) - Can explain: ✗

[CONTEXT] Token 2: "Patients most commonly describing headache, discomfort, nausea, anxiety, blur..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.284) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.471) - Can explain: ✗

[CONTEXT] Token 3: "[58][61] Physical examination may reveal a significantly

Explain metrics: {'duration_seconds': 462.9189072921872, 'steps': 57}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Patients developing diarrhea while receiving treatment with antibiotics shoul..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.997) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.979) - Can explain: ✓

[CONTEXT] Token 2: "Colonic flora is changed when receiving treatment with antibiotics, making it..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.995) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.980) - Can explain: ✓

[CONTEXT] Token 4: "Diagnostics and treatment focused on Clostridium

Explain metrics: {'duration_seconds': 418.4855860839598, 'steps': 57}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Patients developing diarrhea while receiving treatment with antibiotics shoul..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.997) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.988) - Can explain: ✓

[CONTEXT] Token 2: "Colonic flora is changed when receiving treatment with antibiotics, making it..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.995) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.981) - Can explain: ✓

[CONTEXT] Token 4: "Diagnostics and treatment focused on Clostridium

Explain metrics: {'duration_seconds': 1306.7440160000697, 'steps': 87}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "of the neck Blurred vision Diplopia Dizziness/vertigo Drop attacks Dysarthria..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8623
      - Similarity: 0.1377
      - NLI: NEUTRAL (ent: 0.027) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: CONTRADICTION (ent: 0.001) - Can explain: ✗

[CONTEXT] Token 1: "[14][15][16] Personnel The technique requires practitioner"
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: CONTRADICTION (ent: 0.000) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: CONTRADICTION (ent: 0.001) - Can explain: ✗

[CONTEXT] Token 3: "middle cerebral artery territory that can cause disorientation 

Explain metrics: {'duration_seconds': 416.23758291685954, 'steps': 61}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "bilaterally."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: ENTAILMENT (ent: 0.979) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.263) - Can explain: ✗

[CONTEXT] Token 2: "Each hip bone, in turn, is composed of three bones: the ilium, the ischium, a..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: ENTAILMENT (ent: 0.946) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: ENTAILMENT (ent: 0.742) - Can explain: ✓

[CONTEXT] Token 4: "The primary function of the pelvis is to transfer the weight of the upper bod..."
  Importance: 1.0000
  Per-Strategy 

Explain metrics: {'duration_seconds': 419.2761788330972, 'steps': 63}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "suicides."
  Importance: 0.9908
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: NEUTRAL (ent: 0.060) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: ENTAILMENT (ent: 0.914) - Can explain: ✓

[CONTEXT] Token 2: "Besides the manic phase and impulsive behavior, these individuals can also de..."
  Importance: 0.9908
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: NEUTRAL (ent: 0.028) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: NEUTRAL (ent: 0.232) - Can explain: ✗

[CONTEXT] Token 4: "Other comorbid factors that increase the risk of suicide in bipolar individua..."
  Importance: 0.9908
  Per-Strategy Details:
 

Explain metrics: {'duration_seconds': 1324.3997107502073, 'steps': 83}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "performing the procedure and the overall number of attempts required before a..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗

[CONTEXT] Token 2: "Overall, the estimated incidence is 25% following spinal anesthesia."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗

[CONTEXT] Token 3: "[23] The management of post-dural puncture headaches has been a topic o

Explain metrics: {'duration_seconds': 957.0579641247168, 'steps': 69}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "of cardiovascular and all-cause mortality."
  Importance: 0.9693
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.7787
      - Similarity: 0.2213
      - NLI: ENTAILMENT (ent: 0.777) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.7787
      - Similarity: 0.2213
      - NLI: ENTAILMENT (ent: 0.747) - Can explain: ✓

[CONTEXT] Token 1: "[24] Symptom Control Beta-blockers: Beta-blockers have been shown to decrease..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8034
      - Similarity: 0.1966
      - NLI: ENTAILMENT (ent: 0.448) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.7787
      - Similarity: 0.2213
      - NLI: ENTAILMENT (ent: 0.629) - Can explain: ✓

[CONTEXT] Token 2: "[25]Nitrates: Nitrates relax vascular smooth muscle, leading to dilation of v..."
  Im

Explain metrics: {'duration_seconds': 434.4756372915581, 'steps': 63}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "or with obesity, the optimal dosage for folic acid supplementation is less de..."
  Importance: 0.9186
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.5965
      - Similarity: 0.4035
      - NLI: CONTRADICTION (ent: 0.010) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.5965
      - Similarity: 0.4035
      - NLI: CONTRADICTION (ent: 0.006) - Can explain: ✗

[CONTEXT] Token 1: "[49] In patients with anemia, additional iron supplementation may be prescrib..."
  Importance: 0.9186
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.5965
      - Similarity: 0.4035
      - NLI: CONTRADICTION (ent: 0.004) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.5965
      - Similarity: 0.4035
      - NLI: CONTRADICTION (ent: 0.004) - Can explain: ✗

[CONTEXT] Token 3: "Several daily oral iron supplements 

Explain metrics: {'duration_seconds': 361.5945729590021, 'steps': 55}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "increased sensitivity and lower pain thresholds, with the remodeling of the c..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.263) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8847
      - Similarity: 0.1153
      - NLI: NEUTRAL (ent: 0.103) - Can explain: ✗

[CONTEXT] Token 2: "The consequence may be: Allodynia (exaggerated response to a nonpainful stimu..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.251) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8847
      - Similarity: 0.1153
      - NLI: NEUTRAL (ent: 0.049) - Can explain: ✗

[CONTEXT] Token 4: "Substances released include vasoactive peptides such as calc

Explain metrics: {'duration_seconds': 963.7957681249827, 'steps': 67}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "unstable heart failure with left ventricle ejection fraction (LVEF) less than..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.7158
      - Similarity: 0.2842
      - NLI: NEUTRAL (ent: 0.011) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.6711
      - Similarity: 0.3289
      - NLI: NEUTRAL (ent: 0.004) - Can explain: ✗

[CONTEXT] Token 2: "period of the structures implicated into the circuit."
  Importance: 0.9375
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.6711
      - Similarity: 0.3289
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.6711
      - Similarity: 0.3289
      - NLI: NEUTRAL (ent: 0.008) - Can explain: ✗

[CONTEXT] Token 4: "The clinical tolerance of a ventricular tachycardia relates to the heart rate..."
  Imp

# MedMCQA with document perturbation

In [22]:
# Run multi-hop RAG on each MedMCQA question and explain the answer with RAG-Ex using
# the 'paragraphs' perturbation mode. One dict per question is collected in `results`.
med_loader = MedMCQADataLoader()
# `ids` restricts loading to the ids listed under `kg_capable` in the config.
questions = med_loader.setup(split=SPLIT, as_documents=False, limit=LIMIT, ids=kg_capable_ids)

if not questions:
    raise RuntimeError("No MedMCQA questions loaded.")

# Extra instruction passed to the multi-hop engine for every question (loop-invariant).
ANSWER_INSTRUCTIONS = 'Only answer based on your context not your knowledge. Do not include any explanations, reasoning, or extra fields.\n Example: Final Answer: B: Housing'

results = []
for item in questions:
    question_text = format_medmcqa_question(item)
    if not question_text:
        # Nothing to ask for this item; skip it.
        continue

    trace, all_documents = multi_hop.run_and_trace(question_text, extra=ANSWER_INSTRUCTIONS)
    final_answer = (trace.get("final_answer") or "").strip()

    # Build the context string from the retrieved documents; objects without a
    # `page_content` attribute are stringified as a fallback, and blank blocks are dropped.
    context_blocks = []
    for doc in all_documents:
        content = getattr(doc, "page_content", None)
        context_blocks.append(str(doc if content is None else content).strip())
    context = "\n\n".join(block for block in context_blocks if block)

    # Named `ragex_config` (not `config`) so the TOML dict loaded in the configuration
    # cell is not overwritten by running this cell.
    ragex_config = RAGExConfig()
    ragex_config.pertubation_depth = 1
    ragex_config.pertubation_mode = 'paragraphs'
    explainer = RAGExExplainable(llm_client=client, config=ragex_config)
    explanation = explainer.explain(query=question_text, answer=final_answer, context=context)
    metrics = explainer.metrics()

    # Collect every non-empty perturbed answer across all explained tokens and strategies.
    perturbed_answers = [
        detail.get("perturbed_answer")
        for result_item in explanation.get("results", [])
        for detail in result_item.get("details", [])
        if detail.get("perturbed_answer")
    ]

    answer_scores = evaluator.evaluate(perturbed_answers, baseline_answer=final_answer)

    results.append(
        {
            "question": question_text,
            "final_answer": final_answer,
            "trace": trace,
            "explanation": explanation,
            "metrics": metrics,
            "answer_scores": answer_scores,
            "documents": all_documents,
        }
    )

--- Starting Multi-Hop Search for: 'Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad' ---

[ Hop 1 ]
Executing search with query: 'Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad'
Generating next query...

[ Hop 2 ]
Executing search with query: '“Jackson-Weiss syndrome”'

Generating final answer...

--- Multi-Hop Search Complete. Final Answer: Childhood osteopetrosis is characterized by craniosynostosis. ---
--- Multi-Hop Context: 

 ('<doc id="chunk-1-1" from_hop="1" search_query="Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation\n\nOptions:\nA: bc\nB: c\nC: ac\nD: ad">\n[Document(id=\'1eb7a40b-ddf9-4884-98e7-b57658414621\', metadata=

In [23]:
# Render each collected result: the answer, every retrieved document with the most
# important tokens highlighted, and the textual RAG-Ex report.
for result_idx, res in enumerate(results, start=1):
    print(f"=== Result {result_idx} ===")
    print(res["question"])

    # Result-local names: `documents` in particular must not be reused here, because it
    # would overwrite the corpus loaded earlier and passed to `rag_engine.setup`.
    res_explanation = res["explanation"]
    res_metrics = res["metrics"]
    res_documents = res["documents"]

    # The answer is model output, so escape it before embedding it in HTML.
    html_parts = ["<h2>Answer</h2>", f"<p>{html.escape(str(res['final_answer']))}</p><hr>"]

    explained = res_explanation.get("results", [])
    # `default=None` keeps an explanation without results from raising in max().
    max_fi = max((r.get("importance", 0.0) for r in explained), default=None)

    # Evidence = tokens sharing the highest importance score; empty tokens are dropped
    # because they cannot be highlighted meaningfully.
    # NOTE(review): when every importance is 0.0 all tokens are highlighted - confirm
    # that this is the intended presentation.
    evidences = []
    if max_fi is not None:
        for item in explained:
            token = item.get("token", "")
            if token and item.get("importance", 0.0) >= max_fi:
                evidences.append(token)

    # Separate index name so the result counter used in the END banner is not shadowed.
    for doc_idx, doc in enumerate(res_documents, start=1):
        content = getattr(doc, "page_content", None)
        body = highlight_text(str(doc if content is None else content), evidences)
        html_parts.append(f"<h3>Document {doc_idx}</h3>")
        html_parts.append(f"<pre>{body}</pre>")
    display(HTML("".join(html_parts)))

    print("Explain metrics:", res_metrics)
    print(RAGExExplainable.prettify(res_explanation))

    print(f"=== Result {result_idx} END ===")

=== Result 1 ===
Childhood osteopetrosis is characterized by – a) B/L frontal bossingb) Multiple # (fracture)c) Hepatosplenomegalyd) Cataracte) Mental retardation

Options:
A: bc
B: c
C: ac
D: ad


Explain metrics: {'duration_seconds': 221.8930835002102, 'steps': 15}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Mucolipidosis III X-linked hypophosphatemia 1-alpha hydroxylase deficiency Te..."
  Importance: 0.9869
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9135
      - Similarity: 0.0865
      - NLI: ENTAILMENT (ent: 0.972) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.9869
      - Similarity: 0.0131
      - NLI: ENTAILMENT (ent: 0.997) - Can explain: ✓

[CONTEXT] Token 2: "History and Physical The parents usually complain that the baby’s head doesn’..."
  Importance: 0.9869
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9135
      - Similarity: 0.0865
      - NLI: ENTAILMENT (ent: 0.998) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.9869
      - Similarity: 0.0131
      - NLI: ENTAILMENT (ent: 0.997) - Can explain: ✓

[CONTEXT] Token 4: "at trauma centers is not mandatory. The facial b

Explain metrics: {'duration_seconds': 231.06677049957216, 'steps': 17}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "by an offending stimulus, most commonly bladder or bowel distension, which ca..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.032) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: ENTAILMENT (ent: 0.576) - Can explain: ✓

[CONTEXT] Token 2: "Consequently, patients present with multisystem complaints ranging from nause..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.122) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.3392
      - Similarity: 0.6608
      - NLI: NEUTRAL (ent: 0.380) - Can explain: ✗

[CONTEXT] Token 4: "typical angina usually presents as chest discomfort or a

Explain metrics: {'duration_seconds': 94.64830820914358, 'steps': 13}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Patients developing diarrhea while receiving treatment with antibiotics shoul..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.995) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.989) - Can explain: ✓

[CONTEXT] Token 2: "the spread of hospital-acquired infections such as Clostridium difficile and ..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.999) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.988) - Can explain: ✓

[CONTEXT] Token 4: "suggested for this patient population is cytolyt

Explain metrics: {'duration_seconds': 90.71981362532824, 'steps': 13}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "Patients developing diarrhea while receiving treatment with antibiotics shoul..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.079) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.989) - Can explain: ✓

[CONTEXT] Token 2: "the spread of hospital-acquired infections such as Clostridium difficile and ..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.992) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.984) - Can explain: ✓

[CONTEXT] Token 4: "suggested for this patient population is cytolytic 

Explain metrics: {'duration_seconds': 238.19464033376426, 'steps': 17}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "of the neck Blurred vision Diplopia Dizziness/vertigo Drop attacks Dysarthria..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: CONTRADICTION (ent: 0.001) - Can explain: ✗

[CONTEXT] Token 2: "middle cerebral artery territory that can cause disorientation in around 80% ..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8623
      - Similarity: 0.1377
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8973
      - Similarity: 0.1027
      - NLI: CONTRADICTION (ent: 0.001) - Can explain: ✗

[CONTEXT] Token 4: "while basal ganglia, thalamus, and cingulate co

Explain metrics: {'duration_seconds': 96.96658412460238, 'steps': 15}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "bilaterally. Each hip bone, in turn, is composed of three bones: the ilium, t..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9932
      - Similarity: 0.0068
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.9966
      - Similarity: 0.0034
      - NLI: NEUTRAL (ent: 0.116) - Can explain: ✗

[CONTEXT] Token 2: "bone grafting is another option that some surgeons prefer, particularly on he..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9932
      - Similarity: 0.0068
      - NLI: NEUTRAL (ent: 0.101) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.9966
      - Similarity: 0.0034
      - NLI: NEUTRAL (ent: 0.220) - Can explain: ✗

[CONTEXT] Token 4: "favorable cosmetic outcomes, and enable acceptable sexual fu

Explain metrics: {'duration_seconds': 76.96914683422074, 'steps': 11}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "suicides. Besides the manic phase and impulsive behavior, these individuals c..."
  Importance: 0.9555
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: ENTAILMENT (ent: 0.652) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: ENTAILMENT (ent: 0.634) - Can explain: ✓

[CONTEXT] Token 2: "index, and its use has to be closely monitored. Compliance with lithium thera..."
  Importance: 0.9555
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: NEUTRAL (ent: 0.334) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8506
      - Similarity: 0.1494
      - NLI: ENTAILMENT (ent: 0.647) - Can explain: ✓

[CONTEXT] Token 4: "for management and hence avoidance of the cascade o

Explain metrics: {'duration_seconds': 245.69705933332443, 'steps': 17}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "performing the procedure and the overall number of attempts required before a..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.361) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗

[CONTEXT] Token 2: "as hypertension or hypotension, cardiac arrhythmias, and respiratory failure ..."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.363) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗

[CONTEXT] Token 4: "dura and likely cerebrospinal fluid (CSF) leak occurring se

Explain metrics: {'duration_seconds': 152.6546696666628, 'steps': 15}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "of cardiovascular and all-cause mortality.[24] Symptom Control Beta-blockers:..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9163
      - Similarity: 0.0837
      - NLI: NEUTRAL (ent: 0.007) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.9163
      - Similarity: 0.0837
      - NLI: NEUTRAL (ent: 0.117) - Can explain: ✗

[CONTEXT] Token 2: "a level of PEEP, which balances the benefit of optimal recruitment with the r..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9163
      - Similarity: 0.0837
      - NLI: NEUTRAL (ent: 0.008) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.9163
      - Similarity: 0.0837
      - NLI: NEUTRAL (ent: 0.218) - Can explain: ✗

[CONTEXT] Token 4: "from improving alveolar recruitment & oxygenation. PEEP need

Explain metrics: {'duration_seconds': 95.00724574970081, 'steps': 13}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "or with obesity, the optimal dosage for folic acid supplementation is less de..."
  Importance: 0.9962
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8388
      - Similarity: 0.1612
      - NLI: NEUTRAL (ent: 0.030) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8388
      - Similarity: 0.1612
      - NLI: ENTAILMENT (ent: 0.927) - Can explain: ✓

[CONTEXT] Token 2: "by smaller-than-normal RBCs with reduced hemoglobin content, as seen in iron ..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8388
      - Similarity: 0.1612
      - NLI: NEUTRAL (ent: 0.046) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8420
      - Similarity: 0.1580
      - NLI: ENTAILMENT (ent: 0.580) - Can explain: ✓

[CONTEXT] Token 4: "or total circulating red cell mass below the reference

Explain metrics: {'duration_seconds': 77.71106666699052, 'steps': 11}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "increased sensitivity and lower pain thresholds, with the remodeling of the c..."
  Importance: 0.9688
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8289
      - Similarity: 0.1711
      - NLI: NEUTRAL (ent: 0.128) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.115) - Can explain: ✗

[CONTEXT] Token 2: "is Injured The inflammatory cascade can sensitize peripheral nociceptors. Sub..."
  Importance: 0.9688
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.071) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8571
      - Similarity: 0.1429
      - NLI: NEUTRAL (ent: 0.088) - Can explain: ✗

[CONTEXT] Token 4: "response to a nonpainful stimulus) Hyperalgesia (excessive p

Explain metrics: {'duration_seconds': 181.5229893331416, 'steps': 15}
RAG-Ex Explanation Results

[CONTEXT] Token 0: "unstable heart failure with left ventricle ejection fraction (LVEF) less than..."
  Importance: 0.6390
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.6303
      - Similarity: 0.3697
      - NLI: CONTRADICTION (ent: 0.050) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.6235
      - Similarity: 0.3765
      - NLI: NEUTRAL (ent: 0.006) - Can explain: ✗

[CONTEXT] Token 2: "period of the structures implicated into the circuit. The clinical tolerance ..."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.9864
      - Similarity: 0.0136
      - NLI: CONTRADICTION (ent: 0.055) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: NEUTRAL (ent: 0.004) - Can explain: ✗

[CONTEXT] Token 4: "failed attempts, immediate intubation and arteri

# Simple German QA Dataset

One document is created per sentence.

In [6]:
from src.evaluation.simple_qa_dataset import get_all_contexts_as_documents, get_dataset

# Load the QA items and the context documents that belong to them.
dataset = get_dataset()
all_documents = get_all_contexts_as_documents()

# Explainer set up with perturbation depth 1 in 'sentences' mode.
config = RAGExConfig()
config.pertubation_depth = 1
config.pertubation_mode = 'sentences'
explainer = RAGExExplainable(llm_client=client, config=config)

results = []
for i, item in enumerate(dataset):
    # NOTE(review): assumes all_documents is index-aligned with dataset and that
    # element [1] of each entry is the list of documents — confirm against
    # get_all_contexts_as_documents().
    documents = all_documents[i][1]

    evidence = item["evidence"]
    question_text = item["question"]

    # Produce the baseline (unperturbed) answer from the full set of documents.
    answer_prompt = client._create_final_answer_prompt(
        question_text,
        _format_documents(documents, from_query=question_text),
        extra='Only answer based on your context not your knowledge. Do not include any explanations, reasoning, or extra fields.\n Example: Final Answer: Housing',
    )
    answer_message = client._base_llm.invoke(answer_prompt)
    final_answer = answer_message.content.strip()

    # Ground-truth answer; read here but not used further in this cell.
    gt = item["answer"]

    # Build the context string: each document's stripped text is followed by a
    # "\n" entry, empty texts are dropped, and everything is joined with "\n\n".
    context_blocks = []
    for doc in documents:
        raw = getattr(doc, "page_content", None)
        text = str(doc if raw is None else raw).strip()
        context_blocks.extend([text, "\n"])

    context = "\n\n".join(filter(None, context_blocks))

    explanation = explainer.explain(
        query=question_text,
        answer=final_answer,
        context=context,
        ground_truth_evidence=evidence,
    )
    metrics = explainer.metrics()

    # Gather every non-empty perturbed answer across all results and their details.
    perturbed_answers = [
        candidate
        for token_result in explanation.get("results", [])
        for detail in token_result.get("details", [])
        if (candidate := detail.get("perturbed_answer"))
    ]

    # `evaluator` is not created in this cell; it has to exist in the kernel already.
    answer_scores = evaluator.evaluate(perturbed_answers, baseline_answer=final_answer)

    record = {
        "question": question_text,
        "final_answer": final_answer,
        "explanation": explanation,
        "metrics": metrics,
        "answer_scores": answer_scores,
        "documents": documents,
    }
    results.append(record)

Perturbating 1 of 3
Perturbating 2 of 3
Perturbating 3 of 3
Perturbating 1 of 3
Perturbating 2 of 3
Perturbating 3 of 3
Perturbating 1 of 3
Perturbating 2 of 3
Perturbating 3 of 3
Perturbating 1 of 3
Perturbating 2 of 3
Perturbating 3 of 3
Perturbating 1 of 3
Perturbating 2 of 3
Perturbating 3 of 3
Perturbating 1 of 7
Perturbating 2 of 7
Perturbating 3 of 7
Perturbating 4 of 7
Perturbating 5 of 7
Perturbating 6 of 7
Perturbating 7 of 7
Perturbating 1 of 6
Perturbating 2 of 6
Perturbating 3 of 6
Perturbating 4 of 6
Perturbating 5 of 6
Perturbating 6 of 6
Perturbating 1 of 8
Perturbating 2 of 8
Perturbating 3 of 8
Perturbating 4 of 8
Perturbating 5 of 8
Perturbating 6 of 8
Perturbating 7 of 8
Perturbating 8 of 8
Perturbating 1 of 6
Perturbating 2 of 6
Perturbating 3 of 6
Perturbating 4 of 6
Perturbating 5 of 6
Perturbating 6 of 6
Perturbating 1 of 9
Perturbating 2 of 9
Perturbating 3 of 9
Perturbating 4 of 9
Perturbating 5 of 9
Perturbating 6 of 9
Perturbating 7 of 9
Perturbating 8 of 9


In [None]:
# Render every collected result: the joined document text and question as plain
# text, then the answer and the highlighted documents as HTML, then the
# explain metrics and the prettified explanation.
for i, res in enumerate(results):
    print(f"=== Result {i + 1} ===")
    print(' '.join(doc.page_content for doc in res["documents"]))
    print(res["question"])

    result = res["explanation"]
    metrics = res["metrics"]
    documents = res["documents"]

    # The answer is free-form LLM text, so escape it before embedding it in HTML
    # (document text is already escaped inside highlight_text).
    html_out = "<h2>Answer</h2>"
    html_out += f"<p>{html.escape(res['final_answer'])}</p><hr>"

    explained_tokens = result.get("results", [])
    fi_scores = [token.get("importance", 0.0) for token in explained_tokens]
    # default=0.0 keeps the cell running when an explanation has no results;
    # in that case nothing is highlighted.
    max_fi = max(fi_scores, default=0.0)

    # Highlight only the token(s) that reach the maximum importance score.
    evidences = [
        token.get("token", "")
        for token in explained_tokens
        if token.get("importance", 0.0) >= max_fi
    ]

    # Separate index name so the outer loop variable `i` is not overwritten.
    for doc_idx, doc in enumerate(documents):
        html_out += f"<h3>Document {doc_idx + 1}</h3>"
        body = highlight_text(doc.page_content, evidences)
        html_out += f"<pre>{body}</pre>"
    display(HTML(html_out))

    print("------------")
    print("Explain metrics:", metrics)
    print(RAGExExplainable.prettify(result))

=== Result 1 ===
Max hat ein rotes Auto. Tom hat ein blaues Auto. Lisa fährt einen grünen Wagen.
Welche Autofarbe hat Tom?
['Tom hat ein blaues Auto.']


------------
Explain metrics: {'duration_seconds': 29.204636000096798, 'steps': 7}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 5 tokens
  • Union: 5 tokens

[CONTEXT] Token 0: "Max hat ein rotes Auto."
  Importance: 0.5504
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.4709
      - Similarity: 0.5291
      - NLI: ENTAILMENT (ent: 0.992) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.997) - Can explain: ✓

[CONTEXT] Token 2: "Tom hat ein blaues Auto."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8556
      - Similarity: 0.1444
      - NLI: NEUTRAL (ent: 0.001) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.943) - Can explain: ✓

[CONTEXT] Tok

------------
Explain metrics: {'duration_seconds': 17.856158416252583, 'steps': 6}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 4 tokens
  • Union: 4 tokens

[CONTEXT] Token 0: "Peter hat einen Hund namens Bello."
  Importance: 0.3093
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.1862
      - Similarity: 0.8138
      - NLI: ENTAILMENT (ent: 0.994) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.993) - Can explain: ✓

[CONTEXT] Token 2: "Anna besitzt eine Katze."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.6018
      - Similarity: 0.3982
      - NLI: NEUTRAL (ent: 0.001) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.1291
      - Similarity: 0.8709
      - NLI: ENTAILMENT (ent: 0.988) - Can explain: ✓

[C

------------
Explain metrics: {'duration_seconds': 18.98262641718611, 'steps': 6}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 7 tokens
  • Union: 7 tokens

[CONTEXT] Token 0: "Sarah arbeitet als Lehrerin an einer Grundschule."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8518
      - Similarity: 0.1482
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.4335
      - Similarity: 0.5665
      - NLI: ENTAILMENT (ent: 0.984) - Can explain: ✓

[CONTEXT] Token 2: "Ihr Bruder ist Arzt."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.888) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.956) - Can expl

------------
Explain metrics: {'duration_seconds': 17.78548837499693, 'steps': 6}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 4 tokens
  • Union: 4 tokens

[CONTEXT] Token 0: "Klaus wohnt in Berlin."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.7980
      - Similarity: 0.2020
      - NLI: CONTRADICTION (ent: 0.003) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.981) - Can explain: ✓

[CONTEXT] Token 2: "Seine Schwester lebt in München."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.976) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.984) - Can explain: ✓

[

------------
Explain metrics: {'duration_seconds': 17.475657375063747, 'steps': 6}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 0.0000
  • Top Importance: 1.0000
  • Intersection: 0 tokens
  • Union: 10 tokens

[CONTEXT] Token 0: "Julia spielt gerne Tennis am Wochenende."
  Importance: 0.9665
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.7303
      - Similarity: 0.2697
      - NLI: NEUTRAL (ent: 0.002) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.0994
      - Similarity: 0.9006
      - NLI: ENTAILMENT (ent: 0.995) - Can explain: ✓

[CONTEXT] Token 2: "Ihr Freund fotografiert gern."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.962) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.989) - Can ex

------------
Explain metrics: {'duration_seconds': 63.31552462512627, 'steps': 14}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 13 tokens
  • Union: 13 tokens

[CONTEXT] Token 0: "Mark geht oft in verschiedene Restaurants."
  Importance: 0.3630
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.1880
      - Similarity: 0.8120
      - NLI: CONTRADICTION (ent: 0.007) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.1880
      - Similarity: 0.8120
      - NLI: CONTRADICTION (ent: 0.003) - Can explain: ✗

[CONTEXT] Token 2: "Als Kind mochte er nur Nudeln."
  Importance: 0.3630
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.1880
      - Similarity: 0.8120
      - NLI: CONTRADICTION (ent: 0.000) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.1880
      - Similarity: 0.8120
      - NLI: CONTRADICTION (e

------------
Explain metrics: {'duration_seconds': 35.644876750186086, 'steps': 12}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 12 tokens
  • Union: 12 tokens

[CONTEXT] Token 0: "Emma wurde 1998 geboren und hat dieses Jahr ihren Geburtstag bereits gefeiert."
  Importance: 1.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8900
      - Similarity: 0.1100
      - NLI: CONTRADICTION (ent: 0.000) - Can explain: ✗
    • random_noise:
      - Importance (raw): 0.8309
      - Similarity: 0.1691
      - NLI: ENTAILMENT (ent: 0.992) - Can explain: ✓

[CONTEXT] Token 2: "Ihr älterer Bruder Felix kam 1993 zur Welt und arbeitet seit fünf Jahren als ..."
  Importance: 0.9336
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.8309
      - Similarity: 0.1691
      - NLI: ENTAILMENT (ent: 0.765) - Can explain: ✓
    • random_noise:
      - 

------------
Explain metrics: {'duration_seconds': 57.307032415643334, 'steps': 16}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 0.1538
  • Top Importance: 0.0000
  • Intersection: 4 tokens
  • Union: 26 tokens

[CONTEXT] Token 0: "David ist 45 Jahre alt und seit 15 Jahren verheiratet."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.831) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.893) - Can explain: ✓

[CONTEXT] Token 2: "Sein Kollege Stefan ist kinderlos und reist viel."
  Importance: 0.0000
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      - NLI: ENTAILMENT (ent: 0.950) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.0000
      - Similarity: 1.0000
      -

------------
Explain metrics: {'duration_seconds': 44.17843341687694, 'steps': 12}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 13 tokens
  • Union: 13 tokens

[CONTEXT] Token 0: "Maria plant ihre Sommerreise sehr sorgfältig."
  Importance: 0.7334
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3539
      - Similarity: 0.6461
      - NLI: ENTAILMENT (ent: 0.998) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.4503
      - Similarity: 0.5497
      - NLI: ENTAILMENT (ent: 0.998) - Can explain: ✓

[CONTEXT] Token 2: "Letztes Jahr war sie in Italien und hat Rom besichtigt."
  Importance: 0.5764
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.3539
      - Similarity: 0.6461
      - NLI: ENTAILMENT (ent: 0.998) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.3539
      - Similarity: 0.6461
      - NL

------------
Explain metrics: {'duration_seconds': 61.97016845922917, 'steps': 18}
RAG-Ex Explanation Results

🎯 Interpretability (Feature Importance vs Ground Truth):
  • Jaccard Score: 1.0000
  • Top Importance: 1.0000
  • Intersection: 13 tokens
  • Union: 13 tokens

[CONTEXT] Token 0: "Tim ist sehr sportlich und aktiv."
  Importance: 0.7028
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.5953
      - Similarity: 0.4047
      - NLI: ENTAILMENT (ent: 0.977) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.5953
      - Similarity: 0.4047
      - NLI: ENTAILMENT (ent: 0.980) - Can explain: ✓

[CONTEXT] Token 2: "Als Kind nahm er Schwimmunterricht, genau wie seine Schwester heute noch gern..."
  Importance: 0.7028
  Per-Strategy Details:
    • leave_one_out:
      - Importance (raw): 0.5953
      - Similarity: 0.4047
      - NLI: ENTAILMENT (ent: 0.981) - Can explain: ✓
    • random_noise:
      - Importance (raw): 0.5953
      - Similarity: 0.40

## Qualitative Evaluation

Es gibt drei Bereiche:
- Satzweise Perturbation auf dem MedMCQA-Datensatz
- Dokumentweise Perturbation auf dem MedMCQA-Datensatz
- Satzweise Perturbation auf dem eigenen, einfachen QA-Datensatz

Pro Antwort siehst du:
- Die Frage
- Die Antwort
- Pro Dokument highlighting welche Stelle relevant war
    - Wenn kein Highlight existiert, sind alle Stellen gleich relevant

- Explain metrics: Gibt die Dauer und die Anzahl der LLM-Schritte an
- RAG-Ex Explanation Results: Irrelevant für Qualitative Evaluation

### Was man bereits direkt sieht:
Je länger der Kontext (Anzahl und Größe der Dokumente), desto schlechter funktioniert es.