## Import the Multi-Run Evaluation Function

In [None]:
import sys
from pathlib import Path

# Make the project's "scripts" and "notebooks" directories importable.
# The project root is the parent directory when the current working directory
# is named "notebooks"; otherwise the working directory itself is the root.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == "notebooks":
    PROJECT_ROOT = NOTEBOOK_DIR.parent
else:
    PROJECT_ROOT = NOTEBOOK_DIR

# Each missing entry is inserted at the front of sys.path, so "notebooks"
# ends up ahead of "scripts" when both are added.
for _subdir in ("scripts", "notebooks"):
    _import_dir = str(PROJECT_ROOT / _subdir)
    if _import_dir not in sys.path:
        sys.path.insert(0, _import_dir)

from run_multi_evaluation import run_multi_evaluation

## Function Parameters

The `run_multi_evaluation()` function accepts the following parameters:

### Required Parameters

| Parameter | Type | Description |
|-----------|------|-------------|
| `llm` | str | LLM model identifier (e.g., `"openai/gpt-oss-120b"`) |
| `embedding_model` | str | Embedding model identifier (e.g., `"openai/octen-embedding-8b"`) |
| `input_data_description` | str | Free-text description of input data and preprocessing |
| `chunk_size` | int | Size of each chunk in the vector database, in characters |
| `chunk_overlap` | int | Number of characters shared between adjacent chunks |
| `top_k` | int | Number of chunks to retrieve per question |
| `output_name` | str | Base name for output files (should include model info) |
| `temperature` | float | LLM temperature setting |

### Optional Parameters (with defaults)

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `num_runs` | int | 100 | Number of evaluation runs |
| `master_seed` | int | 42 | Master seed for generating run seeds |
| `eval_csv_path` | str | `"data/data_evaluation/GSKI_Fragen-Antworten-Fundstellen.csv"` | Path to evaluation CSV |

## Example: Multi-Run Evaluation with GPT-OSS-120B

**Note**: Running 100 evaluations will take a significant amount of time. For testing, you can reduce `num_runs` to a smaller value (e.g., 5).

In [None]:
# Collect the evaluation settings in one place, then launch the multi-run
# evaluation (lower num_runs for quick tests).
evaluation_settings = {
    # Required parameters
    "llm": "openai/gpt-oss-120b",
    "embedding_model": "openai/octen-embedding-8b",
    "input_data_description": "XML Kompendium 2023 (data/grundschutz.xml), character-based chunking with 4000 char chunks and 200 char overlap",
    "chunk_size": 4000,
    "chunk_overlap": 200,
    "top_k": 5,
    "output_name": "gpt-oss-120b_kompendium-xml_multi",  # Include model info in name
    "temperature": 0.2,
    # Multi-run specific parameters
    "num_runs": 5,  # Use 100 for full evaluation, 5 for testing
    "master_seed": 42,
}

results_path = run_multi_evaluation(**evaluation_settings)

print(f"\nResults saved to: {results_path}")

## Output Files

The function generates three files in `data/results/`:

1. **Per-question statistics** (`{output_name}_per_question.csv`):
   - `Frage` - Question
   - `{metric}_mean` - Mean value across runs
   - `{metric}_min` - Minimum value across runs
   - `{metric}_max` - Maximum value across runs
   - `{metric}_std` - Standard deviation across runs

2. **Overall statistics** (`{output_name}_overall.csv`):
   - Aggregated statistics for all questions across all runs

3. **README file** (`{output_name}.md`):
   - Full documentation with configuration and summary statistics

## View Generated Results

In [None]:
import pandas as pd
from IPython.display import display, Markdown

# Directory that holds the evaluation result files
results_dir = PROJECT_ROOT / "data" / "results"

# Print the name of every entry in results_dir matching "*multi*",
# in sorted path order
print("Available multi-run result files:")
multi_run_files = sorted(results_dir.glob("*multi*"))
for result_file in multi_run_files:
    print("  - " + result_file.name)

In [None]:
# Load per-question statistics for the chosen result set
result_name = "gpt-oss-120b_kompendium-xml_multi"

# Reset first: if the file is missing, a DataFrame left over from an earlier
# execution (possibly for a different result_name) must not survive in the
# namespace. The variance-analysis cells check `per_question_df is not None`
# and skip their work when nothing was loaded.
per_question_df = None

per_question_file = results_dir / f"{result_name}_per_question.csv"
if per_question_file.exists():
    # The CSV is read as semicolon-separated; "utf-8-sig" drops a leading BOM
    per_question_df = pd.read_csv(per_question_file, sep=";", encoding="utf-8-sig")
    print(f"Loaded {len(per_question_df)} questions")
    display(per_question_df.head())
else:
    print(f"File not found: {per_question_file}")

In [None]:
# Read the overall statistics CSV for result_name and show it,
# or report the missing path
overall_file = results_dir / f"{result_name}_overall.csv"
if not overall_file.exists():
    print(f"File not found: {overall_file}")
else:
    # Semicolon-separated file; "utf-8-sig" drops a leading BOM
    overall_df = pd.read_csv(
        overall_file,
        sep=";",
        encoding="utf-8-sig",
    )
    print("Overall System Statistics:")
    display(overall_df)

In [None]:
# Render the generated README as Markdown, or report the missing path
readme_file = results_dir / f"{result_name}.md"
if not readme_file.exists():
    print(f"File not found: {readme_file}")
else:
    readme_text = readme_file.read_text(encoding="utf-8")
    display(Markdown(readme_text))

## Analyze Variance

Identify questions whose scores vary strongly across runs; these questions may need more stable prompting or retrieval.

In [None]:
# Runs only when an earlier cell has defined a non-None per_question_df
if globals().get('per_question_df') is not None:
    # Columns shown in both rankings below
    correctness_columns = ['Frage', 'answer_correctness_mean', 'answer_correctness_std']

    # Five questions with the largest answer-correctness standard deviation
    most_variable_rows = per_question_df.nlargest(5, 'answer_correctness_std')
    high_variance = most_variable_rows[correctness_columns]

    display(Markdown("### Questions with Highest Answer Correctness Variance"))
    display(high_variance)

    # Five questions with the smallest standard deviation (most stable)
    least_variable_rows = per_question_df.nsmallest(5, 'answer_correctness_std')
    low_variance = least_variable_rows[correctness_columns]

    display(Markdown("### Questions with Lowest Answer Correctness Variance (Most Stable)"))
    display(low_variance)

In [None]:
# Runs only when an earlier cell has defined a non-None per_question_df
if globals().get('per_question_df') is not None:
    # Metrics whose "<metric>_std" columns are summarized
    metrics = ['context_precision', 'context_recall', 'faithfulness', 'answer_correctness']

    # Look up each metric's per-question standard-deviation column once
    std_columns = {metric: per_question_df[f"{metric}_std"] for metric in metrics}

    # One row per metric; values are multiplied by 100 for display
    variance_summary = [
        {
            'Metric': metric,
            'Avg Std Dev': std_values.mean() * 100,
            'Min Std Dev': std_values.min() * 100,
            'Max Std Dev': std_values.max() * 100,
        }
        for metric, std_values in std_columns.items()
    ]

    variance_df = pd.DataFrame(variance_summary)
    display(Markdown("### Variance Summary (Std Dev in Percentage Points)"))
    display(variance_df.round(2))