# 05_json_preprocessing

Dieses Notebook lädt `grundschutz.json`, erzeugt kompakte Retrieval-Dokumente und speichert optional JSONL/Parquet.

In [None]:
import json
from pathlib import Path
import pandas as pd


In [None]:
# Pfad zur JSON
data_path = Path('../data/data_preprocessed/grundschutz.json')
data = json.loads(data_path.read_text())
data.keys()


In [None]:
# Struktur-Check
len(data.get('schichten', [])), data.get('schichten', [])[0].keys()


In [None]:
def build_retrieval_docs(gs):
    docs = []
    for schicht in gs.get('schichten', []):
        schicht_name = schicht.get('name')
        schicht_typ = schicht.get('typ')
        for b in schicht.get('bausteine', []):
            baustein_id = b.get('id')
            baustein_titel = b.get('titel')
            beschreibung = (b.get('beschreibung') or {}).get('text', '')
            gefaehrdungslage = (b.get('gefaehrdungslage') or {}).get('text', '')

            anforderungen = b.get('anforderungen') or {}
            for level, reqs in anforderungen.items():
                for req in reqs:
                    text = req.get('inhalt', '')
                    if not text:
                        continue
                    docs.append({
                        'id': req.get('id'),
                        'text': text,
                        'meta': {
                            'schicht': schicht_name,
                            'schicht_typ': schicht_typ,
                            'baustein_id': baustein_id,
                            'baustein_titel': baustein_titel,
                            'level': level,  # basis/standard/erhoeht
                            'typ': req.get('typ'),
                            'modal_verben': req.get('modal_verben', []),
                            'beschreibung': beschreibung,
                            'gefaehrdungslage': gefaehrdungslage,
                        }
                    })
    return docs


In [None]:
docs = build_retrieval_docs(data)
len(docs)


In [None]:
df = pd.DataFrame(docs)
df.head()


In [11]:
from pathlib import Path
import sys

# Make notebooks/ importable when running from Jupyter
NOTEBOOK_DIR = Path.cwd()
if (NOTEBOOK_DIR / "litellm_client.py").exists():
    sys.path.insert(0, str(NOTEBOOK_DIR))
elif (NOTEBOOK_DIR / "notebooks" / "litellm_client.py").exists():
    sys.path.insert(0, str(NOTEBOOK_DIR / "notebooks"))

from litellm_client import (
    chat_completion,
    get_embeddings,
    get_qdrant_client,
    load_llm_config,
    load_vectordb_config,
)
# Qdrant-Verbindung prüfen
from qdrant_client.http import models as qmodels

llm_cfg = load_llm_config()
vec_cfg = load_vectordb_config()
client = get_qdrant_client(vec_cfg)

collection_name = "grundschutz_json"



  return QdrantClient(url=url, api_key=cfg.api_key)


In [None]:
# chunks aus df (eine Zeile = ein Chunk)
chunks = df["text"].tolist()
metas = df["meta"].tolist()

embeddings = get_embeddings(chunks, llm_cfg, batch_size=256)
vector_size = len(embeddings[0])

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=qmodels.VectorParams(size=vector_size, distance=qmodels.Distance.COSINE),
)

points = []
for idx, (text, vector, meta) in enumerate(zip(chunks, embeddings, metas)):
    points.append(
        qmodels.PointStruct(
            id=idx,
            vector=vector,
            payload={"text": text, **meta},
        )
    )

BATCH_SIZE = 128
for start in range(0, len(points), BATCH_SIZE):
    client.upsert(collection_name=collection_name, points=points[start:start+BATCH_SIZE])


# INFERENCE

In [43]:
from datasets import Dataset
import pandas as pd

def _retrieve_contexts(question: str, k: int, client, collection_name: str, llm_cfg):
    query_emb = get_embeddings([question], llm_cfg, batch_size=1)[0]
    results = client.query_points(
        collection_name=collection_name,
        query=query_emb,
        limit=k,
    ).points
    return [res.payload.get('text', '') for res in results]

def build_eval_dataset(
    csv_path: str = '../data/data_evaluation/GSKI_Fragen-Antworten-Fundstellen.csv',
    top_k: int = 5,
) -> Dataset:
    llm_cfg = load_llm_config()
    vec_cfg = load_vectordb_config()
    qdrant_client = get_qdrant_client(vec_cfg)
    collection_name = vec_cfg.collection or 'grundschutz_xml'

    df = pd.read_csv(Path(csv_path), sep=';', encoding='utf-8-sig')
    records = []

    for _, row in df.iterrows():
        question = row['Frage']
        ground_truth_answer = row['Antwort']
        ground_truth_context = row['Fundstellen im IT-Grundschutz-Kompendium 2023']

        contexts = _retrieve_contexts(question, top_k, qdrant_client, collection_name, llm_cfg)

        records.append({
            'question': question,
            'contexts': contexts,
            'ground_truth_answer': ground_truth_answer,
            'ground_truth_context': ground_truth_context,
        })

    return Dataset.from_list(records)

dataset = build_eval_dataset(top_k=6)
dataset


  return QdrantClient(url=url, api_key=cfg.api_key)


Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing embeddings 0 to 1 / 1
Processing

Dataset({
    features: ['question', 'contexts', 'ground_truth_answer', 'ground_truth_context'],
    num_rows: 42
})

In [44]:
import json
import dspy
def docs_to_context(docs):
    # docs can be list of strings OR list of dicts
    doc_list = []
    for i, d in enumerate(docs):
        text = d["text"] if isinstance(d, dict) else d
        doc_list.append({"doc_id": i+1, "title": "", "text": text, "source": ""})
    return "<documents>\n" + json.dumps(doc_list, ensure_ascii=False, indent=2) + "\n</documents>"


class RAGAnswerJSON(dspy.Signature):
    """Antworte auf Deutsch, kurz und präzise, max. 2–3 Sätze.
    Nutze ausschließlich den Kontext in <documents>...</documents>.
    Wenn die Antwort nicht im Kontext steht, sage das."""
    question: str = dspy.InputField()
    context: str = dspy.InputField()
    response: str = dspy.OutputField(desc='Antwort auf Deutsch, kurz und präzise, maximal 2–3 Sätze.')

class RAGModuleJSON(dspy.Module):
    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(RAGAnswerJSON)

    def forward(self, question, docs):
        context = docs_to_context(docs)
        return self.predict(question=question, context=context)

rag_json = RAGModuleJSON()


In [45]:
import dspy
from litellm_client import load_llm_config

llm_cfg = load_llm_config()
model = "openai/gpt-oss-120b"
# model = "openai/granite-4-h-tiny"

# LiteLLM‑Proxy (OpenAI‑kompatibel)
dspy_llm = dspy.LM(
    model=model,
    api_base=llm_cfg.api_base,
    api_key=llm_cfg.api_key,
)

dspy.configure(lm=dspy_llm)


In [46]:
import dspy
from dspy.evaluate import SemanticF1

# DSPy Examples aus dem vorhandenen Dataset
examples = []
for row in dataset:
    examples.append(
        dspy.Example(
            question=row["question"],
            docs=row["contexts"],
            response=row["ground_truth_answer"],  # <-- richtiges Feld
        ).with_inputs("question", "docs")
    )


# einfache Splits
trainset = examples[: max(1, len(examples)//5)]
devset = examples[max(1, len(examples)//5):]

metric = SemanticF1(decompositional=True)

# Optimizer (wenig Threads zum Start)
tp = dspy.MIPROv2(metric=metric, auto='light', num_threads=6)
optimized_rag = tp.compile(rag_json, trainset=trainset)


2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 10
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 6

2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


100%|██████████| 2/2 [00:00<00:00, 14.11it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 4/6


100%|██████████| 2/2 [00:00<00:00, 17.12it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/6


 50%|█████     | 1/2 [00:00<00:00, 17.88it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/6


 50%|█████     | 1/2 [00:00<00:00, 17.30it/s]
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Antworte auf Deutsch, kurz und präzise, max. 2–3 Sätze.
Nutze ausschließlich den Kontext in <documents>...</documents>.
Wenn die Antwort nicht im Kontext steht, sage das.

2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Antworte auf Deutsch in maximal 2–3 Sätzen. Verwende ausschließlich den Text, der zwischen den `<documents>…</documents>`‑Tags steht.  

- Zitiere den relevanten Abschnitt **exakt** (inkl. der Wörter „MUSS“ oder „SOLLTE“, falls vorhanden) und **nenne die zugehörige Baustein‑Kennung** (z. B. APP.2.1, SYS.3.2.1).  
- Wenn mehrere passende Passagen existieren, führe nur die **erste** passende Passage an.  
- Sollte die benötigte Information im Kontext nicht vorhanden sein, antworte wörtlich: **„Die Information ist nicht im Kontext enthalten.“**

2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_opt

Average Metric: 4.41 / 6 (73.5%): 100%|██████████| 6/6 [00:00<00:00, 431.80it/s]

2026/01/30 19:04:32 INFO dspy.evaluate.evaluate: Average Metric: 4.409090909033575 / 6 (73.5%)
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 73.48

  sampler = optuna.samplers.TPESampler(seed=seed, multivariate=True)
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 10 =====



Average Metric: 4.15 / 6 (69.1%): 100%|██████████| 6/6 [00:00<00:00, 959.06it/s]

2026/01/30 19:04:32 INFO dspy.evaluate.evaluate: Average Metric: 4.1472222222462225 / 6 (69.1%)
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.12 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3'].
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12]
2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 10 =====



Average Metric: 4.14 / 6 (69.0%): 100%|██████████| 6/6 [00:00<00:00, 2814.02it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.138189588213588 / 6 (69.0%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.97 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 10 =====



Average Metric: 4.15 / 6 (69.1%): 100%|██████████| 6/6 [00:00<00:00, 2785.37it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.1472222222462225 / 6 (69.1%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.12 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 10 =====



Average Metric: 4.01 / 6 (66.8%): 100%|██████████| 6/6 [00:00<00:00, 536.96it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.008147512864494 / 6 (66.8%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.8 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 10 =====



Average Metric: 4.01 / 6 (66.8%): 100%|██████████| 6/6 [00:00<00:00, 348.91it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.009090909081409 / 6 (66.8%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.82 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 10 =====



Average Metric: 4.14 / 6 (69.0%): 100%|██████████| 6/6 [00:00<00:00, 875.00it/s] 

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.138189588213588 / 6 (69.0%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.97 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82, 68.97]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 10 =====



Average Metric: 4.01 / 6 (66.8%): 100%|██████████| 6/6 [00:00<00:00, 2891.63it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.008147512864494 / 6 (66.8%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.8 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82, 68.97, 66.8]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 10 =====



Average Metric: 4.15 / 6 (69.1%): 100%|██████████| 6/6 [00:00<00:00, 3040.82it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.1472222222462225 / 6 (69.1%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.12 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82, 68.97, 66.8, 69.12]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 10 =====



Average Metric: 4.01 / 6 (66.8%): 100%|██████████| 6/6 [00:00<00:00, 2695.57it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.008147512864494 / 6 (66.8%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.8 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82, 68.97, 66.8, 69.12, 66.8]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 10 =====



Average Metric: 4.41 / 6 (73.5%): 100%|██████████| 6/6 [00:00<00:00, 2545.60it/s]

2026/01/30 19:04:33 INFO dspy.evaluate.evaluate: Average Metric: 4.409090909033575 / 6 (73.5%)
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 73.48 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [73.48, 69.12, 68.97, 69.12, 66.8, 66.82, 68.97, 66.8, 69.12, 66.8, 73.48]
2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 73.48


2026/01/30 19:04:33 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 73.48!





In [47]:
dspy_dev_answers = []
split = max(1, len(dataset) // 5)
dev_rows = list(dataset)[split:]
for i, row in enumerate(dev_rows):
    print(f"\n--- SAMPLE {i+1} ---")
    contexts = row["contexts"]
    print("QUESTION:", row["question"])
    pred = optimized_rag(question=row["question"], docs=contexts)
    dspy_dev_answers.append(pred.response)
    print("PREDICTED ANSWER:", pred.response)
    print("GROUND TRUTH:", row["ground_truth_answer"])

# Neues Dataset fürs Scoring
from datasets import Dataset

dev_dataset = Dataset.from_dict({
    "question": [r["question"] for r in dev_rows],
    "contexts": [r["contexts"] for r in dev_rows],
    "ground_truth_answer": [r["ground_truth_answer"] for r in dev_rows],
    "ground_truth_context": [r["ground_truth_context"] for r in dev_rows],
    "answer": dspy_dev_answers,   # <- DSPy Antworten
})




--- SAMPLE 1 ---
QUESTION: Was ist bei der Auswahl eines externen Webhosters zu beachten?
PREDICTED ANSWER: Bei der Auswahl eines externen Webhosters muss vertraglich geregelt werden, wie die Dienste zu erbringen sind, insbesondere Sicherheitsaspekte in einem Service‑Level‑Agreement (SLA). Der Host­er sollte die eingesetzten IT‑Systeme regelmäßig kontrollieren und warten, bei technischen Problemen oder Kompromittierungen zeitnah reagieren und grundlegende technische sowie organisatorische Maßnahmen zum Schutz des Informationsverbundes umsetzen.
GROUND TRUTH: Bei der Nutzung externer Webhosting-Dienste sollte vertraglich geregelt werden, wie die Dienste erbracht werden. Sicherheitsaspekte müssen im Service Level Agreement (SLA) festgehalten werden. Der Webhoster sollte regelmäßig Kontrollen und Wartungen durchführen und bei technischen Problemen oder Kompromittierungen zeitnah reagieren. Er muss grundlegende technische und organisatorische Sicherheitsmaßnahmen umsetzen.

--- SAMPLE 2 -

In [48]:
import asyncio
from ragas.llms import llm_factory
from ragas.embeddings.litellm_provider import LiteLLMEmbeddings
from ragas.metrics.collections import ContextPrecision, ContextRecall, Faithfulness, AnswerCorrectness
import instructor
import litellm

# RAGAS LLM (LiteLLM proxy)
litellm.api_base = llm_cfg.api_base
litellm.api_key = llm_cfg.api_key
client = instructor.from_litellm(litellm.acompletion, mode=instructor.Mode.MD_JSON)
ragas_llm = llm_factory(llm_cfg.model, client=client, adapter='litellm')

embeddings = LiteLLMEmbeddings(
    model=llm_cfg.embedding_model,
    api_key=llm_cfg.api_key,
    api_base=llm_cfg.api_base,
    encoding_format='float',
)

scorers = {
    'context_precision': ContextPrecision(llm=ragas_llm),
    'context_recall': ContextRecall(llm=ragas_llm),
    'faithfulness': Faithfulness(llm=ragas_llm),
    'answer_correctness': AnswerCorrectness(llm=ragas_llm, embeddings=embeddings),
}

async def _score_row(row, sem):
    async with sem:
        return {
            'context_precision': (await scorers['context_precision'].ascore(
                user_input=row['question'],
                reference=row['ground_truth_context'],
                retrieved_contexts=row['contexts'],
            )).value,
            'context_recall': (await scorers['context_recall'].ascore(
                user_input=row['question'],
                reference=row['ground_truth_context'],
                retrieved_contexts=row['contexts'],
            )).value,
            'faithfulness': (await scorers['faithfulness'].ascore(
                user_input=row['question'],
                response=row['answer'],
                retrieved_contexts=row['contexts'],
            )).value,
            'answer_correctness': (await scorers['answer_correctness'].ascore(
                user_input=row['question'],
                response=row['answer'],
                reference=row['ground_truth_answer'],
            )).value,
        }

async def score_dataset_batched(ds, batch_size=10, concurrency=5):
    sem = asyncio.Semaphore(concurrency)
    rows = list(ds)
    results = []
    for i in range(0, len(rows), batch_size):
        batch = rows[i : i + batch_size]
        tasks = [asyncio.create_task(_score_row(r, sem)) for r in batch]
        results.extend(await asyncio.gather(*tasks))
    return results

scores = await score_dataset_batched(dev_dataset, batch_size=16, concurrency=10)
stats = {
    k: {
        'avg': sum(s[k] for s in scores) / len(scores),
        'min': min(s[k] for s in scores),
        'max': max(s[k] for s in scores),
    }
    for k in scores[0].keys()
}
print(stats)


{'context_precision': {'avg': 0.8942156862489662, 'min': 0.4166666666458333, 'max': 0.9999999999833333}, 'context_recall': {'avg': 0.9254901960784314, 'min': 0.3333333333333333, 'max': 1.0}, 'faithfulness': {'avg': 0.7097508700449877, 'min': 0.0, 'max': 1.0}, 'answer_correctness': {'avg': 0.6126667759628727, 'min': 0.1397254220608889, 'max': 0.991924659349372}}


## Findings

- TBD: Add key observations after running the notebook.
- TBD: Summarize metrics/results (e.g., faithfulness/answer correctness).
- TBD: Note any dataset or retrieval quality issues discovered.
