# Upper Bound Evaluation (Real Context)

This notebook evaluates the *upper bound* by giving the model the real context
from the dataset and scoring against the gold answer.


In [1]:
from __future__ import annotations

from pathlib import Path
import json
import pandas as pd
import dspy
import asyncio
import random
import numpy as np

# Import LiteLLM config helper
from litellm_client import load_llm_config

# Semicolon-separated Q&A sheet read by the data-loading cell; the path is
# relative, so it resolves against the notebook's working directory.
CSV_PATH = Path('../data/data_evaluation/GSKI_Fragen-Antworten-Fundstellen.csv')
# Number of rows to sample for a quick run; 0 keeps the full dataset.
SAMPLE_SIZE = 0  # set >0 to sample
# One seed shared by the pandas sampling, the DSPy LM and the RAGAS judge calls.
SEED = 42

# Seed the global Python and NumPy RNGs once, before any stochastic step runs.
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Load the semicolon-separated Q&A sheet ('utf-8-sig' tolerates a leading BOM).
qa_raw = pd.read_csv(CSV_PATH, sep=';', encoding='utf-8-sig')

# Keep only the columns the evaluation reads and drop incomplete rows *before*
# sampling. Sampling first could pick rows that are discarded afterwards,
# leaving fewer than SAMPLE_SIZE usable examples.
df = qa_raw[['Frage', 'Fundstellen im IT-Grundschutz-Kompendium 2023', 'Antwort']].dropna()

# Optional down-sampling for quick runs; SAMPLE_SIZE == 0 keeps every row.
if SAMPLE_SIZE and SAMPLE_SIZE > 0:
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED)

df.head()


Unnamed: 0,Frage,Fundstellen im IT-Grundschutz-Kompendium 2023,Antwort
0,Was ist der Unterschied zwischen Prozess- und ...,Hinweise zum Schichtenmodell und zur Modellier...,Prozess-Bausteine gelten in der Regel für sämt...
1,Welche grundlegenden Sicherheitsmaßnahmen müss...,APP.3.2.A1 Sichere Konfiguration eines Webserv...,Nach der Installation eines Webservers muss ei...
2,Wie müssen Webserver-Dateien geschützt werden?,APP.3.2.A2 Schutz der Webserver-Dateien (B)\r\...,"Alle Dateien auf dem Webserver, insbesondere S..."
3,Welche Sicherheitsanforderungen gelten für Dat...,APP.3.2.A3 Absicherung von Datei-Uploads und -...,Alle mithilfe des Webservers veröffentlichten ...
4,Welche Ereignisse müssen auf einem Webserver p...,APP.3.2.A4 Protokollierung von Ereignissen (B)...,Der Webserver muss mindestens folgende Ereigni...


In [3]:
# DSPy signature for the upper-bound QA task: the model is given the question
# together with the context passage and returns a short answer.
# NOTE: the class docstring is deliberately left untouched. The MIPROv2 log
# lists this exact text as instruction candidate 0, so it is presumably used as
# the task prompt; editing it would change model behaviour.
class UpperBoundAnswer(dspy.Signature):
    """Answer in German, short and precise (2-3 sentences).
    Use only the provided context. If the answer is not in the context, say so."""
    # Inputs: the user question and the context passage to answer from.
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    # Output: the generated answer (the field description asks for short, precise German).
    response: str = dspy.OutputField(desc='Antwort auf Deutsch, kurz und praezise.')


class UpperBoundModule(dspy.Module):
    """Single-step DSPy program that answers a question from a supplied context."""

    def __init__(self) -> None:
        super().__init__()
        # One plain Predict step over the UpperBoundAnswer signature.
        self.predict = dspy.Predict(UpperBoundAnswer)

    def forward(self, question: str, context: str):
        """Run the predictor on one question/context pair and return its result.

        Callers in this notebook read the answer from the ``response``
        attribute of the returned object.
        """
        return self.predict(question=question, context=context)


In [4]:
import json

def build_examples(df: pd.DataFrame):
    """Turn the Q&A frame into a list of plain dict records.

    Args:
        df: Frame with the columns "Frage",
            "Fundstellen im IT-Grundschutz-Kompendium 2023" and "Antwort".

    Returns:
        One dict per row, in row order, with the keys "question", "context"
        and "answer".
    """
    context_column = "Fundstellen im IT-Grundschutz-Kompendium 2023"
    return [
        {
            "question": record["Frage"],
            "context": record[context_column],
            "answer": record["Antwort"],
        }
        for _index, record in df.iterrows()
    ]


def build_dspy_examples(examples):
    """Wrap plain example dicts as ``dspy.Example`` objects.

    Args:
        examples: Iterable of dicts with the keys "question", "context" and
            "answer".

    Returns:
        A list of ``dspy.Example`` objects in input order. The gold answer is
        stored under ``response``; "question" and "context" are marked as the
        input fields.
    """

    def _to_dspy(record):
        # The gold answer goes under `response`, the signature's output field name.
        example = dspy.Example(
            question=record["question"],
            context=record["context"],
            response=record["answer"],
        )
        return example.with_inputs("question", "context")

    return [_to_dspy(record) for record in examples]


async def _score_with_ragas(rows, eval_model=None, batch_size=16, concurrency=10):
    """Score generated answers with RAGAS.

    Computes faithfulness, answer correctness and answer relevancy for every
    row, then aggregates each metric over all rows.

    Args:
        rows: Sequence of dicts with the keys "question", "answer",
            "contexts" (passed to RAGAS as ``retrieved_contexts``) and
            "ground_truth_answer".
        eval_model: Name of the judge model. Falls back to the ``model`` field
            of ``load_llm_config()`` when falsy.
        batch_size: Number of rows scheduled per ``asyncio.gather`` call.
        concurrency: Upper bound on rows being scored at the same time.

    Returns:
        A ``(scores, stats)`` tuple. ``scores`` is a list with one dict per row
        (input order) mapping metric name to value. ``stats`` maps each metric
        name to ``{"avg", "min", "max"}``. Both are empty when ``rows`` is
        empty.
    """
    # Nothing to score: return empty results instead of failing on the
    # aggregation below (indexing scores[0] / dividing by len(scores)).
    if len(rows) == 0:
        return [], {}

    import instructor
    import litellm
    from ragas.embeddings.litellm_provider import LiteLLMEmbeddings
    from ragas.llms import llm_factory
    from ragas.metrics.collections import (
        AnswerCorrectness,
        Faithfulness,
        AnswerRelevancy
    )

    async def seeded_completion(**kwargs):
        # Merge instead of passing seed/temperature as extra keywords: if the
        # client already supplies either one, `f(**kwargs, seed=...)` raises
        # TypeError for the duplicated keyword. The pinned values always win.
        call_kwargs = {
            **kwargs,
            "seed": SEED,        # per-request seed
            "temperature": 0.2,  # low temperature to reduce sampling variance
        }
        return await litellm.acompletion(**call_kwargs)

    llm_cfg = load_llm_config()
    eval_model = eval_model or llm_cfg.model

    # Endpoint and key are set on the litellm module itself (global settings).
    litellm.api_base = llm_cfg.api_base
    litellm.api_key = llm_cfg.api_key
    client = instructor.from_litellm(seeded_completion, mode=instructor.Mode.MD_JSON, is_async=True)
    ragas_llm = llm_factory(eval_model, client=client, adapter="litellm")

    embeddings = LiteLLMEmbeddings(
        model=llm_cfg.embedding_model,
        api_key=llm_cfg.api_key,
        api_base=llm_cfg.api_base,
        encoding_format="float",
    )

    scorers = {
        "faithfulness": Faithfulness(llm=ragas_llm),
        "answer_correctness": AnswerCorrectness(llm=ragas_llm, embeddings=embeddings),
        "answer_relevancy": AnswerRelevancy(llm=ragas_llm, embeddings=embeddings),
    }

    async def _score_row(row, sem):
        # The three metrics of one row are awaited one after another while the
        # row holds a semaphore slot.
        async with sem:
            return {
                "faithfulness": (
                    await scorers["faithfulness"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                        retrieved_contexts=row["contexts"],
                    )
                ).value,
                "answer_correctness": (
                    await scorers["answer_correctness"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                        reference=row["ground_truth_answer"],
                    )
                ).value,
                "answer_relevancy": (
                    await scorers["answer_relevancy"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                    )
                ).value,
            }

    async def _score_dataset_batched(rows, batch_size, concurrency):
        # Batches are awaited one after another; inside a batch the semaphore
        # caps how many rows are scored concurrently.
        sem = asyncio.Semaphore(concurrency)
        results = []
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            tasks = [asyncio.create_task(_score_row(r, sem)) for r in batch]
            results.extend(await asyncio.gather(*tasks))
        return results

    scores = await _score_dataset_batched(rows, batch_size, concurrency)
    stats = {
        k: {
            "avg": sum(s[k] for s in scores) / len(scores),
            "min": min(s[k] for s in scores),
            "max": max(s[k] for s in scores),
        }
        for k in scores[0].keys()
    }
    return scores, stats


def make_program(program_type, dspy_trainset, mipro_auto="light", mipro_threads=6):
    """Build the DSPy program variant named by ``program_type``.

    Args:
        program_type: "base" (uncompiled module), "fewshot"
            (BootstrapFewShot) or "mipro_v2" (MIPROv2 with SemanticF1).
        dspy_trainset: Examples handed to the optimizer's ``compile`` call;
            not used for "base".
        mipro_auto: ``auto`` setting forwarded to MIPROv2.
        mipro_threads: ``num_threads`` forwarded to MIPROv2.

    Returns:
        The uncompiled or compiled program.

    Raises:
        ValueError: If ``program_type`` is not one of the three known names.
    """
    if program_type == "base":
        return UpperBoundModule()
    elif program_type == "fewshot":
        # No metric is passed to the bootstrapper (metric=None).
        bootstrapper = dspy.teleprompt.BootstrapFewShot(
            metric=None,
            max_bootstrapped_demos=4,
            max_labeled_demos=4,
        )
        return bootstrapper.compile(UpperBoundModule(), trainset=dspy_trainset)
    elif program_type == "mipro_v2":
        # Imported lazily so the other variants do not need this module.
        from dspy.evaluate import SemanticF1

        optimizer = dspy.MIPROv2(
            metric=SemanticF1(decompositional=True),
            auto=mipro_auto,
            num_threads=mipro_threads,
        )
        return optimizer.compile(UpperBoundModule(), trainset=dspy_trainset)

    raise ValueError(f"Unknown program_type: {program_type}")


async def evaluate_models(
    models: list[str],
    examples,
    eval_model=None,
    program_types=None,
    output_path: Path | None = None,
):
    """Generate answers for every model/program pair and score them with RAGAS.

    For each model the global DSPy LM is reconfigured. For each program type a
    program is built with ``make_program``, run on every example, and the
    answers are scored with ``_score_with_ragas``.

    NOTE(review): the programs are compiled on the same ``examples`` they are
    evaluated on afterwards, so the "fewshot" and "mipro_v2" scores are not
    held-out results.

    Args:
        models: Model names passed to ``dspy.LM``.
        examples: Records with the keys "question", "context" and "answer".
        eval_model: Judge model forwarded to ``_score_with_ragas``.
        program_types: Program variants to evaluate; defaults to ``["base"]``.
        output_path: If given, all per-example rows (including their metrics)
            are written there as JSON Lines after the last model finished.

    Returns:
        A DataFrame with one row per (model, program) holding the
        ``<metric>_avg`` columns.
    """
    cfg = load_llm_config()
    selected_programs = program_types or ["base"]
    trainset = build_dspy_examples(examples)

    summary_records = []
    all_rows = []

    def _answer_row(program, example, model_name, program_name):
        # Run the program on one example and package everything RAGAS and the
        # JSONL dump need. Key order is kept stable for the output file.
        prediction = program(question=example["question"], context=example["context"])
        return {
            "question": example["question"],
            "contexts": [example["context"]],
            "ground_truth_answer": example["answer"],
            "ground_truth_context": example["context"],
            "answer": prediction.response,
            "model": model_name,
            "program": program_name,
        }

    for model_name in models:
        print(f"\n=== Evaluating: {model_name} ===")
        generator_lm = dspy.LM(
            model=model_name,
            api_base=cfg.api_base,
            api_key=cfg.api_key,
            temperature=0.2,
            seed=SEED,
        )
        dspy.configure(lm=generator_lm)

        for program_name in selected_programs:
            print(f"Program: {program_name}")
            program = make_program(program_name, trainset)

            rows = [
                _answer_row(program, example, model_name, program_name)
                for example in examples
            ]

            per_row_scores, stats = await _score_with_ragas(rows, eval_model=eval_model)
            # Attach each row's metric dict so the JSONL dump is self-contained.
            for row, row_metrics in zip(rows, per_row_scores):
                row["metrics"] = row_metrics
            all_rows.extend(rows)

            summary = {"model": model_name, "program": program_name}
            for metric_name, metric_stats in stats.items():
                summary[f"{metric_name}_avg"] = metric_stats["avg"]
            summary_records.append(summary)

    if output_path is not None:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("w", encoding="utf-8") as handle:
            handle.writelines(
                json.dumps(row, ensure_ascii=False) + "\n" for row in all_rows
            )

    return pd.DataFrame(summary_records)


examples = build_examples(df)

MODELS = [
    "openai/gpt-oss-120b",
    # "openai/granite-4-h-tiny",
]

PROGRAMS = [
    "base",
    "fewshot",
    "mipro_v2",
]

# You can set a separate eval model for RAGAS if you want:
RAGAS_EVAL_MODEL = None
OUTPUT_JSONL = Path("../data/data_evaluation/upper_bound_outputs_gpt.jsonl")

results_df = await evaluate_models(
    MODELS,
    examples,
    eval_model=RAGAS_EVAL_MODEL,
    program_types=PROGRAMS,
    output_path=OUTPUT_JSONL,
)
results_df



=== Evaluating: openai/gpt-oss-120b ===
Program: base
Program: fewshot


 10%|▉         | 4/42 [00:00<00:02, 17.52it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2026/02/02 16:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 33

2026/02/02 16:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2026/02/02 16:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2026/02/02 16:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Program: mipro_v2
Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


 44%|████▍     | 4/9 [00:00<00:00, 10.85it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/6


 44%|████▍     | 4/9 [00:00<00:00, 11.18it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/6


 44%|████▍     | 4/9 [00:00<00:00, 13.38it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/6


 11%|█         | 1/9 [00:00<00:00, 11.72it/s]
2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer in German, short and precise (2-3 sentences).
Use only the provided context. If the answer is not in the context, say so.

2026/02/02 16:39:31 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Formuliere eine knappe, aber vollständige Antwort **auf Deutsch** in **2‑3 Sätzen**, die **ausschließlich** aus dem bereitgestellten Kontext stammt.  
- Wandele Aufzählungen, Stichpunkte und Aufzählungs‑Buchstaben in flüssige, deklarative Sätze um.  
- Bewahre die Modalverben „MUSS“, „SOLL“, „SOLLTE“, „DÜRFT“, „KANN“ usw. und deren verpflichtende bzw. optionale Bedeutung.  
- Entferne alle Referenz‑ und Gliederungscodes (z. B. APP.3.2.A1, Nummerierungen) sowie Formatierungen, behalte jedoch die fachlichen Begriffe und technischen Details bei.  
- Wenn die gesuchte Information im Kontext nicht vorhanden ist, antworte explizit mit „Die

Average Metric: 26.97 / 33 (81.7%): 100%|██████████| 33/33 [00:00<00:00, 138.80it/s]

2026/02/02 16:39:32 INFO dspy.evaluate.evaluate: Average Metric: 26.971635721131733 / 33 (81.7%)
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 81.73

  sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True)
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 10 =====



Average Metric: 26.32 / 33 (79.7%): 100%|██████████| 33/33 [00:00<00:00, 141.87it/s]

2026/02/02 16:39:32 INFO dspy.evaluate.evaluate: Average Metric: 26.315728715707625 / 33 (79.7%)
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.74 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74]
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 81.73


2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 10 =====



Average Metric: 26.34 / 33 (79.8%): 100%|██████████| 33/33 [00:00<00:00, 117.36it/s]

2026/02/02 16:39:32 INFO dspy.evaluate.evaluate: Average Metric: 26.33840691428802 / 33 (79.8%)
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.81 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].





2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81]
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 81.73


2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 10 =====


Average Metric: 26.71 / 33 (81.0%): 100%|██████████| 33/33 [00:00<00:00, 151.44it/s]

2026/02/02 16:39:32 INFO dspy.evaluate.evaluate: Average Metric: 26.713695853818685 / 33 (81.0%)
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.95 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95]
2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 81.73


2026/02/02 16:39:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 10 =====



Average Metric: 25.71 / 33 (77.9%): 100%|██████████| 33/33 [00:00<00:00, 162.18it/s]

2026/02/02 16:39:33 INFO dspy.evaluate.evaluate: Average Metric: 25.707325354366223 / 33 (77.9%)
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.9 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9]
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 81.73


2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 10 =====



Average Metric: 27.13 / 33 (82.2%): 100%|██████████| 33/33 [00:00<00:00, 53.93it/s]

2026/02/02 16:39:33 INFO dspy.evaluate.evaluate: Average Metric: 27.13398268398685 / 33 (82.2%)
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 82.22
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.22 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22]
2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 10 =====



Average Metric: 26.34 / 33 (79.8%): 100%|██████████| 33/33 [00:00<00:00, 465.71it/s]

2026/02/02 16:39:34 INFO dspy.evaluate.evaluate: Average Metric: 26.33840691428802 / 33 (79.8%)
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.81 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22, 79.81]
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 10 =====



Average Metric: 26.64 / 33 (80.7%): 100%|██████████| 33/33 [00:00<00:00, 105.96it/s]

2026/02/02 16:39:34 INFO dspy.evaluate.evaluate: Average Metric: 26.637972656641452 / 33 (80.7%)
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.72 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22, 79.81, 80.72]
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 10 =====



Average Metric: 25.58 / 33 (77.5%): 100%|██████████| 33/33 [00:00<00:00, 73.57it/s]

2026/02/02 16:39:34 INFO dspy.evaluate.evaluate: Average Metric: 25.579959598386875 / 33 (77.5%)
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.52 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22, 79.81, 80.72, 77.52]
2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 10 =====



Average Metric: 26.64 / 33 (80.7%): 100%|██████████| 33/33 [00:00<00:00, 290.17it/s]

2026/02/02 16:39:35 INFO dspy.evaluate.evaluate: Average Metric: 26.637972656641452 / 33 (80.7%)
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.72 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22, 79.81, 80.72, 77.52, 80.72]
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 10 =====



Average Metric: 27.13 / 33 (82.2%): 100%|██████████| 33/33 [00:00<00:00, 238.26it/s]

2026/02/02 16:39:35 INFO dspy.evaluate.evaluate: Average Metric: 27.13398268398685 / 33 (82.2%)
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 82.22 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [81.73, 79.74, 79.81, 80.95, 77.9, 82.22, 79.81, 80.72, 77.52, 80.72, 82.22]
2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 82.22


2026/02/02 16:39:35 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 82.22!





Unnamed: 0,model,program,faithfulness_avg,answer_correctness_avg,answer_relevancy_avg
0,openai/gpt-oss-120b,base,0.744868,0.752881,0.819628
1,openai/gpt-oss-120b,fewshot,0.749588,0.749589,0.755468
2,openai/gpt-oss-120b,mipro_v2,0.725935,0.761426,0.749566


In [5]:

MODELS = [
    # "openai/gpt-oss-120b",
    "openai/granite-4-h-tiny",
]

PROGRAMS = [
    "base",
    "fewshot",
    "mipro_v2",
]

# You can set a separate eval model for RAGAS if you want:
RAGAS_EVAL_MODEL = None
OUTPUT_JSONL = Path("../data/data_evaluation/upper_bound_outputs_granite.jsonl")

results_df = await evaluate_models(
    MODELS,
    examples,
    eval_model=RAGAS_EVAL_MODEL,
    program_types=PROGRAMS,
    output_path=OUTPUT_JSONL,
)
results_df



=== Evaluating: openai/granite-4-h-tiny ===
Program: base
Program: fewshot


 10%|▉         | 4/42 [00:00<00:02, 16.56it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2026/02/02 16:50:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 33

2026/02/02 16:50:15 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2026/02/02 16:50:15 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2026/02/02 16:50:15 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Program: mipro_v2
Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


 44%|████▍     | 4/9 [00:00<00:00,  9.07it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/6


 44%|████▍     | 4/9 [00:00<00:00, 13.31it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/6


 44%|████▍     | 4/9 [00:00<00:00, 13.45it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 6/6


 11%|█         | 1/9 [00:00<00:01,  7.23it/s]
2026/02/02 16:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2026/02/02 16:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Answer in German, short and precise (2-3 sentences).
Use only the provided context. If the answer is not in the context, say so.

2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Erkläre präzise, in 2-3 Sätzen auf Deutsch, die wesentlichen Sicherheitsmaßnahmen, die beim Einrichten eines Webservers erforderlich sind, basierend auf dem gegebenen Kontext.

2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Erkläre präzise, in 2-3 Sätzen, die Unterschiede und spezifischen Schutzmaßnahmen für prozess- und systemorientierte Bausteine im IT-Grundschutz-Kompendium. Nutze das bereitgestellte Kontextmaterial.

2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: 

2026/02/02 16:50:17 INFO dspy.teleprompt.mi

Average Metric: 28.13 / 33 (85.2%): 100%|██████████| 33/33 [00:00<00:00, 178.88it/s]

2026/02/02 16:50:17 INFO dspy.evaluate.evaluate: Average Metric: 28.12715160300349 / 33 (85.2%)
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 85.23

  sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True)
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 10 =====



Average Metric: 29.59 / 33 (89.7%): 100%|██████████| 33/33 [00:00<00:00, 108.70it/s]

2026/02/02 16:50:17 INFO dspy.evaluate.evaluate: Average Metric: 29.592491405164218 / 33 (89.7%)
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 89.67
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 89.67 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67]
2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 89.67


2026/02/02 16:50:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 10 =====



Average Metric: 30.45 / 33 (92.3%): 100%|██████████| 33/33 [00:00<00:00, 143.70it/s]

2026/02/02 16:50:18 INFO dspy.evaluate.evaluate: Average Metric: 30.450404638883903 / 33 (92.3%)
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 92.27
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.27 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27]
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 10 =====



Average Metric: 30.35 / 33 (92.0%): 100%|██████████| 33/33 [00:00<00:00, 132.18it/s]

2026/02/02 16:50:18 INFO dspy.evaluate.evaluate: Average Metric: 30.349902248289347 / 33 (92.0%)
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.97 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97]
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 10 =====



Average Metric: 30.26 / 33 (91.7%): 100%|██████████| 33/33 [00:00<00:00, 142.39it/s]

2026/02/02 16:50:18 INFO dspy.evaluate.evaluate: Average Metric: 30.257129518474056 / 33 (91.7%)
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.69 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69]
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 10 =====



Average Metric: 29.82 / 33 (90.4%): 100%|██████████| 33/33 [00:00<00:00, 141.46it/s]

2026/02/02 16:50:18 INFO dspy.evaluate.evaluate: Average Metric: 29.818018172303887 / 33 (90.4%)
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.36 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].





2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36]
2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 10 =====


Average Metric: 30.45 / 33 (92.3%): 100%|██████████| 33/33 [00:00<00:00, 339.26it/s]

2026/02/02 16:50:19 INFO dspy.evaluate.evaluate: Average Metric: 30.450404638883903 / 33 (92.3%)
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.27 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36, 92.27]
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 10 =====



Average Metric: 30.05 / 33 (91.1%): 100%|██████████| 33/33 [00:00<00:00, 78.81it/s]

2026/02/02 16:50:19 INFO dspy.evaluate.evaluate: Average Metric: 30.05240156668728 / 33 (91.1%)
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.07 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36, 92.27, 91.07]
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 92.27


2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 10 =====



Average Metric: 30.71 / 33 (93.1%): 100%|██████████| 33/33 [00:00<00:00, 155.48it/s]

2026/02/02 16:50:19 INFO dspy.evaluate.evaluate: Average Metric: 30.712216636410187 / 33 (93.1%)





2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 93.07
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 93.07 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36, 92.27, 91.07, 93.07]
2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 93.07


2026/02/02 16:50:19 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 10 =====


Average Metric: 30.05 / 33 (91.1%): 100%|██████████| 33/33 [00:00<00:00, 805.41it/s]

2026/02/02 16:50:20 INFO dspy.evaluate.evaluate: Average Metric: 30.05240156668728 / 33 (91.1%)
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 91.07 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36, 92.27, 91.07, 93.07, 91.07]
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 93.07


2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 10 =====



Average Metric: 30.71 / 33 (93.1%): 100%|██████████| 33/33 [00:00<00:00, 2137.87it/s]

2026/02/02 16:50:20 INFO dspy.evaluate.evaluate: Average Metric: 30.712216636410187 / 33 (93.1%)
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 93.07 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.23, 89.67, 92.27, 91.97, 91.69, 90.36, 92.27, 91.07, 93.07, 91.07, 93.07]
2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 93.07


2026/02/02 16:50:20 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 93.07!





Unnamed: 0,model,program,faithfulness_avg,answer_correctness_avg,answer_relevancy_avg
0,openai/granite-4-h-tiny,base,0.810184,0.706191,0.849721
1,openai/granite-4-h-tiny,fewshot,0.787009,0.713835,0.844699
2,openai/granite-4-h-tiny,mipro_v2,0.880695,0.701805,0.830923


# Ground Truth Qualität

- **Zusammenfassung**: Faithfulness liegt im Mittel bei ~0.83 und Answer Relevancy bei ~0.63. Das deutet darauf hin, dass viele Gold-Antworten nicht vollstaendig durch den gegebenen Kontext gedeckt sind und nur locker zur exakten Frage passen.
- **Beobachtete Probleme**:
  - Uebergeneralisierte oder unvollstaendige Zusammenfassungen (z. B. „KPIs, Register, Berichte“, ohne die konkrete Anforderung zu treffen).
  - Antworten paraphrasieren breitere Leitlinien, statt sich strikt am zitierten Kontext zu orientieren (z. B. Outsourcing- oder ISMS-Fragen, bei denen mehrere Abschnitte vermischt werden).
- **Auswirkung auf die Metriken**:
  - Niedrige Faithfulness, wenn Details fehlen oder nicht explizit im Kontext stehen.
  - Niedrige Relevancy, wenn die Antwort das Thema allgemein trifft, aber nicht die konkrete Frage.
- **Empfehlungen**:
  - Die Zuordnung Frage → Kontext → Gold-Antwort schaerfen.
  - Antworten extraktiver formulieren (z. B. Schluesselsaetze oder -klauseln uebernehmen).
  - Zusatzinformationen entfernen, die nicht im Kontext stehen.
  - Pro Frage nur den minimal notwendigen, praezisen Kontext bereitstellen.

In [6]:
async def score_gt_with_ragas(
    examples,
    eval_model=None,
    metrics=("faithfulness",),  # choose: "faithfulness", "answer_correctness", "answer_relevancy"
    batch_size=16,
    concurrency=10,
):
    """Score the gold answers of ``examples`` with RAGAS metrics.

    The gold answer of every example is passed to the scorers as the
    *response*, so the scores describe the ground truth itself rather than a
    model output. Note that ``answer_correctness`` therefore compares the gold
    answer against itself (response and reference are the same text).

    Side effect: sets ``litellm.api_base`` and ``litellm.api_key`` from the
    configuration returned by ``load_llm_config()``.

    Args:
        examples: Iterable of mappings with the keys ``"question"``,
            ``"context"`` and ``"answer"``.
        eval_model: Model name handed to ``llm_factory``. Falls back to the
            ``model`` attribute of ``load_llm_config()`` when falsy.
        metrics: Names of the metrics to compute. Only ``"faithfulness"``,
            ``"answer_correctness"`` and ``"answer_relevancy"`` are
            recognised; any other name is ignored.
        batch_size: Number of rows scheduled together per ``asyncio.gather``.
        concurrency: Maximum number of rows scored at the same time.

    Returns:
        A tuple ``(scores, stats)``. ``scores`` holds one dict per example
        (metric name -> value) in input order. ``stats`` maps each computed
        metric to its ``"avg"``, ``"min"`` and ``"max"``. For empty
        ``examples`` the result is ``([], {})``.

    Raises:
        ValueError: If ``batch_size`` or ``concurrency`` is smaller than 1.
    """
    import asyncio
    import instructor
    import litellm
    from ragas.embeddings.litellm_provider import LiteLLMEmbeddings
    from ragas.llms import llm_factory
    from ragas.metrics.collections import AnswerCorrectness, Faithfulness, AnswerRelevancy

    # A semaphore of size 0 would block forever and a non-positive batch size
    # would yield no batches at all, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency}")

    async def seeded_completion(**kwargs):
        # Force seed and temperature on every judge call. Merging into a single
        # dict (instead of passing them next to **kwargs) avoids a TypeError
        # when the caller already supplies one of these keys.
        call_kwargs = {**kwargs, "seed": SEED, "temperature": 0.2}
        return await litellm.acompletion(**call_kwargs)

    llm_cfg = load_llm_config()
    eval_model = eval_model or llm_cfg.model

    litellm.api_base = llm_cfg.api_base
    litellm.api_key = llm_cfg.api_key
    client = instructor.from_litellm(
        seeded_completion,
        mode=instructor.Mode.MD_JSON,
        is_async=True,
    )
    ragas_llm = llm_factory(eval_model, client=client, adapter="litellm")

    embeddings = LiteLLMEmbeddings(
        model=llm_cfg.embedding_model,
        api_key=llm_cfg.api_key,
        api_base=llm_cfg.api_base,
        encoding_format="float",
    )

    # Only instantiate the scorers that were requested.
    scorers = {}
    if "faithfulness" in metrics:
        scorers["faithfulness"] = Faithfulness(llm=ragas_llm)
    if "answer_correctness" in metrics:
        scorers["answer_correctness"] = AnswerCorrectness(llm=ragas_llm, embeddings=embeddings)
    if "answer_relevancy" in metrics:
        scorers["answer_relevancy"] = AnswerRelevancy(llm=ragas_llm, embeddings=embeddings)

    rows = [
        {
            "question": ex["question"],
            "contexts": [ex["context"]],
            "answer": ex["answer"],  # response = ground truth
            "ground_truth_answer": ex["answer"],
        }
        for ex in examples
    ]

    async def _score_row(row, sem):
        # The semaphore caps how many rows are being scored at once.
        async with sem:
            out = {}
            if "faithfulness" in scorers:
                out["faithfulness"] = (
                    await scorers["faithfulness"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                        retrieved_contexts=row["contexts"],
                    )
                ).value
            if "answer_correctness" in scorers:
                out["answer_correctness"] = (
                    await scorers["answer_correctness"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                        reference=row["ground_truth_answer"],
                    )
                ).value
            if "answer_relevancy" in scorers:
                out["answer_relevancy"] = (
                    await scorers["answer_relevancy"].ascore(
                        user_input=row["question"],
                        response=row["answer"],
                    )
                ).value
            return out

    async def _score_dataset_batched(rows, batch_size, concurrency):
        sem = asyncio.Semaphore(concurrency)
        results = []
        # Batches run one after another; gather keeps the row order per batch.
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            tasks = [asyncio.create_task(_score_row(r, sem)) for r in batch]
            results.extend(await asyncio.gather(*tasks))
        return results

    scores = await _score_dataset_batched(rows, batch_size, concurrency)

    # Nothing to aggregate: avoid indexing into / dividing by an empty list.
    if not scores:
        return scores, {}

    stats = {}
    for metric_name in scorers:
        values = [s[metric_name] for s in scores]
        stats[metric_name] = {
            "avg": sum(values) / len(values),
            "min": min(values),
            "max": max(values),
        }
    return scores, stats


In [8]:
gt_scores, gt_stats = await score_gt_with_ragas(
    examples,
    metrics=("faithfulness", "answer_relevancy"),
)

gt_output_path = Path("../data/data_evaluation/gt_scores.jsonl")
gt_output_path.parent.mkdir(parents=True, exist_ok=True)

with gt_output_path.open("w", encoding="utf-8") as f:
    for ex, metrics in zip(examples, gt_scores):
        row = {
            "question": ex["question"],
            "context": ex["context"],
            "ground_truth_answer": ex["answer"],
            "metrics": metrics,
        }
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

gt_stats


{'faithfulness': {'avg': 0.8347127739984883, 'min': 0.25, 'max': 1.0},
 'answer_relevancy': {'avg': 0.6312429580957198,
  'min': 0.2178090669544465,
  'max': 0.9000045447383475}}

In [13]:
import json
import pandas as pd
from pathlib import Path

GT_JSONL = Path("../data/data_evaluation/gt_scores.jsonl")

# Every line of the JSONL file is one scored example.
with GT_JSONL.open("r", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f]

df = pd.DataFrame(rows)

# Lift the per-example scores out of the nested "metrics" dict into columns.
for metric_name in ("faithfulness", "answer_relevancy"):
    df[metric_name] = df["metrics"].apply(lambda m, key=metric_name: m.get(key))

review_columns = ["question", "context", "ground_truth_answer", "faithfulness", "answer_relevancy"]

# The 20 examples with the lowest faithfulness, ascending.
worst_faith = df[review_columns].sort_values("faithfulness").head(20)

# The 20 examples with the lowest answer relevancy, ascending.
worst_rel = df[review_columns].sort_values("answer_relevancy").head(20)




In [14]:
from html import escape

from IPython.display import HTML

# Render the lowest-faithfulness examples for manual review. The question,
# gold answer and context are free-form text, so they are HTML-escaped;
# otherwise characters such as "<" or "&" would be interpreted as markup and
# parts of the text could vanish from the rendered output.
html = "".join(
    f"""
    <p><b>Question:</b> {escape(str(r['question']))}<br>
       <b>Faithfulness:</b> {r['faithfulness']:.3f} |
       <b>Relevancy:</b> {r['answer_relevancy']:.3f}<br>
       <b>Ground Truth:</b> {escape(str(r['ground_truth_answer']))}<br>
       <b>Context:</b> {escape(str(r['context']))}
    </p>
    <hr>
    """
    for _, r in worst_faith.iterrows()
)
HTML(html)


In [15]:
from html import escape

from IPython.display import HTML

# Render the lowest-relevancy examples for manual review. The question, gold
# answer and context are free-form text, so they are HTML-escaped; otherwise
# characters such as "<" or "&" would be interpreted as markup and parts of
# the text could vanish from the rendered output.
html = "".join(
    f"""
    <p><b>Question:</b> {escape(str(r['question']))}<br>
       <b>Faithfulness:</b> {r['faithfulness']:.3f} |
       <b>Relevancy:</b> {r['answer_relevancy']:.3f}<br>
       <b>Ground Truth:</b> {escape(str(r['ground_truth_answer']))}<br>
       <b>Context:</b> {escape(str(r['context']))}
    </p>
    <hr>
    """
    for _, r in worst_rel.iterrows()
)
HTML(html)


In [27]:
# Combined: lowest on either metric.
# Rank each example by the smaller of its two scores and keep the 20 lowest;
# the helper column "worst_score" is dropped again by the column selection.
worst_combined = (
    df.assign(worst_score=df[["faithfulness", "answer_relevancy"]].min(axis=1))
      .sort_values("worst_score")
      .head(20)[["question", "context", "ground_truth_answer", "faithfulness", "answer_relevancy"]]
)


from html import escape

from IPython.display import HTML

# The question, gold answer and context are free-form text, so they are
# HTML-escaped; otherwise characters such as "<" or "&" would be interpreted
# as markup and parts of the text could vanish from the rendered output.
html = "".join(
    f"""
    <p><b>Question:</b> {escape(str(r['question']))}<br>
       <b>Faithfulness:</b> {r['faithfulness']:.3f} |
       <b>Relevancy:</b> {r['answer_relevancy']:.3f}<br>
       <b>Ground Truth:</b> {escape(str(r['ground_truth_answer']))}<br>
       <b>Context:</b> {escape(str(r['context']))}
    </p>
    <hr>
    """
    for _, r in worst_combined.iterrows()
)
HTML(html)


In [29]:
# "best_score" is the mean of the two metrics per example (added to df in place).
df["best_score"] = df[["faithfulness", "answer_relevancy"]].mean(axis=1)

# The 20 examples with the highest mean score, best first.
best = (
    df.sort_values("best_score", ascending=False)
      .head(20)
      [["question", "context", "ground_truth_answer", "faithfulness", "answer_relevancy", "best_score"]]
)

from html import escape

from IPython.display import HTML

# The question, gold answer and context are free-form text, so they are
# HTML-escaped; otherwise characters such as "<" or "&" would be interpreted
# as markup and parts of the text could vanish from the rendered output.
html = "".join(
    f"""
    <p><b>Question:</b> {escape(str(r['question']))}<br>
       <b>Faithfulness:</b> {r['faithfulness']:.3f} |
       <b>Relevancy:</b> {r['answer_relevancy']:.3f}<br>
       <b>Ground Truth:</b> {escape(str(r['ground_truth_answer']))}<br>
       <b>Context:</b> {escape(str(r['context']))}
    </p>
    <hr>
    """
    for _, r in best.iterrows()
)
HTML(html)


In [30]:
# Scores strictly below these thresholds are flagged for review.
REL_THR = 0.7
FAITH_THR = 0.8

df["rel_flag"] = df["answer_relevancy"] < REL_THR
df["faith_flag"] = df["faithfulness"] < FAITH_THR
# Label each row by which threshold(s) it misses: BOTH, REL, FAITH or OK.
df["flag_type"] = df.apply(
    lambda r: "BOTH" if r["rel_flag"] and r["faith_flag"]
    else ("REL" if r["rel_flag"] else ("FAITH" if r["faith_flag"] else "OK")),
    axis=1
)

# Keep only flagged rows (REL / FAITH / BOTH), grouped by flag type and
# ordered by ascending relevancy, then faithfulness, within each group.
flagged = (
    df[df["flag_type"] != "OK"]
      .sort_values(["flag_type", "answer_relevancy", "faithfulness"])
      [["question", "context", "ground_truth_answer", "faithfulness", "answer_relevancy", "flag_type"]]
)

from html import escape

from IPython.display import HTML

# The question, gold answer and context are free-form text, so they are
# HTML-escaped; otherwise characters such as "<" or "&" would be interpreted
# as markup and parts of the text could vanish from the rendered output.
html = "".join(
    f"""
    <p><b>Flag:</b> {escape(str(r['flag_type']))}<br>
       <b>Question:</b> {escape(str(r['question']))}<br>
       <b>Faithfulness:</b> {r['faithfulness']:.3f} |
       <b>Relevancy:</b> {r['answer_relevancy']:.3f}<br>
       <b>Ground Truth:</b> {escape(str(r['ground_truth_answer']))}<br>
       <b>Context:</b> {escape(str(r['context']))}
    </p>
    <hr>
    """
    for _, r in flagged.iterrows()
)
HTML(html)


In [33]:

from pathlib import Path

# Destination for the flagged rows, next to the other evaluation artefacts.
out_csv = Path("../data/data_evaluation/flagged_cases.csv")

# Write without the DataFrame index so the file holds only the data columns.
flagged.to_csv(path_or_buf=out_csv, encoding="utf-8", index=False)

# Echo the destination path as the cell output.
out_csv


PosixPath('../data/data_evaluation/flagged_cases.csv')

## Findings

- TBD: Add key observations after running the notebook.
- TBD: Summarize metrics/results (e.g., faithfulness/answer correctness).
- TBD: Note any dataset or retrieval quality issues discovered.
