## Import the Evaluation Function

In [2]:
import sys
from pathlib import Path

# Locate the project root: if the kernel's working directory is the
# "notebooks" folder, the root is its parent; otherwise the working
# directory itself is used as the root.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == "notebooks":
    PROJECT_ROOT = NOTEBOOK_DIR.parent
else:
    PROJECT_ROOT = NOTEBOOK_DIR

# Put the local source folders at the front of the import path. Entries that
# are already present are skipped, so re-running the cell adds no duplicates.
for _subdir in ("scripts", "notebooks"):
    _candidate = str(PROJECT_ROOT / _subdir)
    if _candidate not in sys.path:
        sys.path.insert(0, _candidate)

from run_evaluation import generate_evaluation_results

ModuleNotFoundError: No module named 'pandas'

## Function Parameters

The `generate_evaluation_results()` function accepts the following parameters:

### Required Parameters

| Parameter | Type | Description |
|-----------|------|-------------|
| `llm` | str | LLM model identifier (e.g., `"openai/gpt-oss-120b"`) |
| `embedding_model` | str | Embedding model identifier (e.g., `"openai/octen-embedding-8b"`) |
| `input_data_description` | str | Free-text description of input data and preprocessing |
| `chunk_size` | int | Character size of chunks in vector database |
| `chunk_overlap` | int | Character overlap between chunks |
| `top_k` | int | Number of chunks to retrieve per question |
| `output_name` | str | Base name for output files (without extension) |
| `temperature` | float | LLM temperature setting |

### Optional Parameters (with defaults)

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `seed` | int | 42 | Random seed for reproducibility |
| `eval_csv_path` | str | `"data/data_evaluation/GSKI_Fragen-Antworten-Fundstellen.csv"` | Path to evaluation CSV |

## Example: XML Kompendium with GPT-OSS-120B

In [None]:
# Evaluation run on the XML Kompendium data.
# All settings are gathered in one mapping and unpacked into the call.
run_settings = {
    # Required parameters
    "llm": "openai/gpt-oss-120b",
    "embedding_model": "openai/octen-embedding-8b",
    "input_data_description": "XML Kompendium 2023 (data/grundschutz.xml), character-based chunking with 4000 char chunks and 200 char overlap",
    "chunk_size": 4000,
    "chunk_overlap": 200,
    "top_k": 5,
    "output_name": "gpt-oss-120b_kompendium-xml",
    "temperature": 0.2,
    # Optional parameter, passed explicitly with its documented default
    "seed": 42,
}
csv_path = generate_evaluation_results(**run_settings)

# The returned value is reported as the location of the saved results
print(f"\nResults saved to: {csv_path}")

## Example: Different Configuration

You can run multiple evaluations with different configurations to compare results.

In [None]:
# Example with different parameters (uncomment to run).
# Compared with the run above, this configuration uses larger chunks
# (8000 chars, 400 overlap), retrieves fewer chunks per question (top_k=3),
# lowers the temperature to 0.1, uses seed 123, and writes to its own
# output_name.

# csv_path = generate_evaluation_results(
#     llm="openai/gpt-oss-120b",
#     embedding_model="openai/octen-embedding-8b",
#     input_data_description="XML Kompendium 2023, larger chunks (8000 chars) with 400 overlap",
#     chunk_size=8000,
#     chunk_overlap=400,
#     top_k=3,  # Fewer but larger chunks
#     output_name="gpt-oss-120b_kompendium-xml-large-chunks",
#     temperature=0.1,  # Lower temperature for more deterministic outputs
#     seed=123,  # Different seed
# )

## Output Files

The function generates two files in `data/results/`:

1. **CSV file** (`{output_name}.csv`): Contains all evaluation data with columns:
   - `Frage` - Original question
   - `Antwort` - Ground truth answer
   - `Fundstellen` - Ground truth context references
   - `Generierte Antwort` - LLM-generated answer
   - `Ermittelte Fundstellen` - Retrieved context chunks
   - `context_precision` - RAGAS metric
   - `context_recall` - RAGAS metric
   - `faithfulness` - RAGAS metric
   - `answer_correctness` - RAGAS metric

2. **README file** (`{output_name}.md`): Documentation including:
   - Input data description
   - Model configuration
   - Preprocessing parameters
   - RAGAS metrics summary (avg, min, max, std dev)

## View Generated Results

In [None]:
import pandas as pd
from IPython.display import display, Markdown

# Folder in which the generated evaluation outputs are looked up
results_dir = PROJECT_ROOT / "data" / "results"

# Print one bullet per directory entry, in sorted path order
print("Available result files:")
entry_names = [entry.name for entry in sorted(results_dir.glob("*"))]
for entry_name in entry_names:
    print(f"  - {entry_name}")

In [None]:
# Load and preview a specific result (replace with your output_name)
result_name = "gpt-oss-120b_kompendium-xml"

csv_file = results_dir / f"{result_name}.csv"
if csv_file.exists():
    # Read as semicolon-separated text; "utf-8-sig" strips a leading BOM if present
    df = pd.read_csv(csv_file, sep=";", encoding="utf-8-sig")
    print(f"Loaded {len(df)} results")
    display(df.head())
else:
    # Reset df so a frame left over from an earlier run (e.g. with a different
    # result_name) is not silently reused by the cells that follow.
    df = None
    print(f"File not found: {csv_file}")

In [None]:
# Render the Markdown report that shares its base name with the CSV
readme_file = results_dir / f"{result_name}.md"
if not readme_file.exists():
    print(f"File not found: {readme_file}")
else:
    readme_text = readme_file.read_text(encoding="utf-8")
    display(Markdown(readme_text))

## Metrics Summary

In [None]:
# Calculate and display metrics summary.
# The guard covers both "df was never assigned" and "df is None".
if 'df' in dir() and df is not None:
    metrics = ["context_precision", "context_recall", "faithfulness", "answer_correctness"]

    # Aggregate per metric column, then transpose: one row per metric,
    # one column per statistic.
    summary = df[metrics].agg(["mean", "min", "max", "std"]).T
    summary.columns = ["Average", "Min", "Max", "Std Dev"]

    # Convert to percentages (multiply by 100), rounded to one decimal place
    summary = (summary * 100).round(1)

    display(Markdown("### RAGAS Metrics Summary (Percentages)"))
    display(summary)
else:
    # Report the situation explicitly instead of finishing without any output
    print("No results loaded - run the cell that reads the result CSV first.")