# Dataset Generation for Unified Evals Model

## Setup

In [1]:
%pip install datasets==3.6.0

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import re
import json
import random
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

from datasets import load_dataset
import mlflow
from evaltune.evaluation.scorers import make_guidelines_scorer

  mlflow.mismatch._check_version_mismatch()


In [2]:
RNG = random.Random(229)
np.random.seed(229)

In [3]:
OUT_DIR = "./evals_benchmark_datasets"
os.makedirs(OUT_DIR, exist_ok=True)

# Ground Truth Labels

In [5]:
# helper functions

def normalize_whitespace(s: str) -> str:
    if s is None:
        return ""
    return re.sub(r"\s+", " ", s).strip()

def save_minimal_dataset(df: pd.DataFrame, name: str) -> str:
    """
    Save df with exactly columns: input, output, label
    """
    df = df[["input", "output", "label"]].copy()
    path_csv = os.path.join(OUT_DIR, f"{name}.csv")
    df.to_csv(path_csv, index=False)
    return path_csv

def stratified_sample(df: pd.DataFrame, n: int, label_col: str = "label") -> pd.DataFrame:
    """
    Try to sample n rows while keeping a reasonable mix of labels.
    If one class is rare, it will take as many as possible.
    """
    if len(df) <= n:
        return df.sample(frac=1.0, random_state=229).reset_index(drop=True)

    df_pos = df[df[label_col] == 1]
    df_neg = df[df[label_col] == 0]

    # Target roughly balanced, but limited by rare class
    half = n // 2
    n_pos = min(len(df_pos), half)
    n_neg = min(len(df_neg), n - n_pos)

    # If not enough negatives/positives, fill from the other
    remaining = n - (n_pos + n_neg)
    if remaining > 0:
        if len(df_pos) - n_pos > len(df_neg) - n_neg:
            extra = df_pos.sample(n=min(remaining, len(df_pos) - n_pos), random_state=229)
        else:
            extra = df_neg.sample(n=min(remaining, len(df_neg) - n_neg), random_state=229)
        sampled = pd.concat([
            df_pos.sample(n=n_pos, random_state=229),
            df_neg.sample(n=n_neg, random_state=229),
            extra
        ], ignore_index=True)
    else:
        sampled = pd.concat([
            df_pos.sample(n=n_pos, random_state=229),
            df_neg.sample(n=n_neg, random_state=229),
        ], ignore_index=True)

    sampled = sampled.sample(frac=1.0, random_state=229).reset_index(drop=True)
    return sampled

## Translation Dataset

In [32]:

def build_translation_wmt_mqm_non_en_to_en(
    n_target: int = 800,
    seed: int = 229,
    group_by=("lp", "year"),
) -> pd.DataFrame:
    """
    Translation dataset with:
      input  = non-English source text
      output = English MT output

    Filters to lp like 'de-en', 'zh-en', ... (i.e., target is English, source is not).
    """
    ds = load_dataset("RicardoRei/wmt-mqm-human-evaluation")
    df = ds["train"].to_pandas()

    # Keep only non-English -> English pairs
    # e.g., 'de-en', 'ru-en', 'zh-en', ...
    df = df[df["lp"].str.endswith("-en") & ~df["lp"].str.startswith("en-")].copy()

    # Minimal columns
    df["input"] = df["src"].map(normalize_ws)   # non-English
    df["output"] = df["mt"].map(normalize_ws)   # English
    df["score"] = pd.to_numeric(df["score"], errors="coerce")

    df = df[(df["input"].str.len() > 0) & (df["output"].str.len() > 0)]
    df = df.dropna(subset=["score"])

    # Median split within (lp, year) => pass/fail without picking a numeric threshold
    if group_by:
        med = df.groupby(list(group_by))["score"].transform("median")
    else:
        med = df["score"].median()
    df["label"] = (df["score"] >= med).astype(int)

    df_min = df[["input", "output", "label"]]

    # Sample
    if len(df_min) > n_target:
        df_min = df_min.sample(n=n_target, random_state=seed).reset_index(drop=True)
    else:
        df_min = df_min.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    return df_min


In [14]:
# ============================================
# 1) Translation dataset from MQM (human error annotations)
#    PASS iff no Major/Critical errors
#    Source: alconost/mqm-translation-gold
# ============================================

def build_translation_mqm_dataset(
    n_target: int = 800,
    seed: int = 229
) -> pd.DataFrame:
    """
    Builds a minimal dataset with columns: input, output, label
    label = 1 iff MQM annotations contain no Major/Critical errors.
    """
    RNG.seed(seed)

    # Load MQM dataset
    # Dataset page describes JSONL/TSV structure and MQM fields. :contentReference[oaicite:3]{index=3}
    # ds = load_dataset("alconost/mqm-translation-gold")
    ds = load_dataset("RicardoRei/wmt-mqm-human-evaluation")

    # Some datasets only have a single split; handle robustly.
    split_name = "train" if "train" in ds else list(ds.keys())[0]
    data = ds[split_name]

    rows = []
    for ex in tqdm(data, desc="Processing MQM examples"):
        # We will be defensive about field names because HF datasets vary.
        # The dataset page documents fields; if any mismatch occurs, inspect first 1-2 rows.
        src = ex.get("source") or ex.get("src") or ex.get("source_text") or ""
        mt = ex.get("translation") or ex.get("hypothesis") or ex.get("output") or ex.get("target") or ""
        src = normalize_whitespace(src)
        mt = normalize_whitespace(mt)

        # MQM errors are typically provided as a list/array of annotations
        # Each annotation includes severity: Minor/Major/Critical
        errors = ex.get("errors") or ex.get("mqm_errors") or ex.get("annotations") or []

        major_or_critical = False
        if isinstance(errors, list):
            for e in errors:
                # e might be dict with key 'severity' or similar
                if isinstance(e, dict):
                    sev = (e.get("severity") or e.get("Severity") or "").strip().lower()
                    if sev in {"major", "critical"}:
                        major_or_critical = True
                        break

        label = 0 if major_or_critical else 1

        if src and mt:
            rows.append({"input": src, "output": mt, "label": int(label)})

    df = pd.DataFrame(rows)

    # Sample to size with decent label mix
    df = stratified_sample(df, n=min(n_target, 1000), label_col="label")
    return df

## Summarization Dataset

In [7]:
def normalize_ws(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def build_summarization_frank_minimal(
    frank_data_dir: str = "./../../frank/data",
    split: str = "test",          # "test" | "validation" | "train" | "all"
    n_target: int = 800,
    seed: int = 229
) -> pd.DataFrame:
    """
    Build FRANK summarization dataset with exactly:
      input  = article
      output = summary
      label  = pass/fail from human 'Factuality'
    Uses human_annotations.json for labels (human annotated).
    """
    path = f"{frank_data_dir}/human_annotations.json"
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    for ex in data:
        if split != "all":
            if ex.get("split") != split:
                continue

        # human_annotations.json does NOT include article/summary text in your peek;
        # it includes hash + model_name + labels. So we need benchmark_data.json for text.
        # We'll join on (hash, model_name, split).
        # (Your peek shows benchmark_data.json has those + article/summary.)
        rows.append({
            "hash": str(ex.get("hash")),
            "model_name": str(ex.get("model_name")),
            "split": ex.get("split"),
            "Factuality": ex.get("Factuality")
        })

    ann = pd.DataFrame(rows)
    if ann.empty:
        raise RuntimeError(f"No FRANK annotations found for split={split} at {path}")

    # Load benchmark_data.json for article/summary
    bench_path = f"{frank_data_dir}/benchmark_data.json"
    with open(bench_path, "r", encoding="utf-8") as f:
        bench = json.load(f)

    bench_rows = []
    for ex in bench:
        if split != "all":
            if ex.get("split") != split:
                continue
        bench_rows.append({
            "hash": str(ex.get("hash")),
            "model_name": str(ex.get("model_name")),
            "split": ex.get("split"),
            "article": normalize_ws(ex.get("article", "")),
            "summary": normalize_ws(ex.get("summary", "")),
        })

    bench_df = pd.DataFrame(bench_rows)
    if bench_df.empty:
        raise RuntimeError(f"No FRANK benchmark rows found for split={split} at {bench_path}")

    # Join (hash, model_name, split)
    df = ann.merge(bench_df, on=["hash", "model_name", "split"], how="inner")

    if df.empty:
        raise RuntimeError(
            "FRANK join produced 0 rows. "
            "Check that 'hash' and 'model_name' match between human_annotations.json and benchmark_data.json."
        )

    # Construct minimal columns
    # In FRANK, Factuality is a human label; treat 1.0 as pass, 0.0 as fail.
    # If it contains floats, coerce safely.
    def to_label(x):
        if x is None:
            return None
        try:
            return 1 if float(x) >= 1.0 else 0
        except Exception:
            return None

    df["label"] = df["Factuality"].apply(to_label)
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)

    df_min = df.rename(columns={"article": "input", "summary": "output"})[["input", "output", "label"]]

    # Drop empty text rows
    df_min = df_min[(df_min["input"].str.len() > 0) & (df_min["output"].str.len() > 0)]

    # Sample to target size
    if len(df_min) > n_target:
        df_min = df_min.sample(n=n_target, random_state=seed).reset_index(drop=True)
    else:
        df_min = df_min.sample(frac=1.0, random_state=seed).reset_index(drop=True)

    return df_min


## Extraction Dataset

In [8]:
# ============================================
# 3) Extraction dataset (NER) with gold human annotations
#
# We will:
#   - load CoNLL-2003 (gold labels are human annotated)
#   - generate an "output" as predicted entity list using a pretrained NER model
#   - label PASS iff predicted entities exactly match gold entities (no numeric thresholds)
#
# Dataset: eriktks/conll2003 :contentReference[oaicite:5]{index=5}
# ============================================

def conll_tags_to_entities(tokens: List[str], ner_tags: List[str]) -> List[Tuple[str, str]]:
    """
    Convert BIO tags into a list of (entity_text, entity_type).
    ner_tags in CoNLL are typically like 'B-PER', 'I-ORG', 'O', etc.
    """
    entities = []
    cur_tokens = []
    cur_type = None

    def flush():
        nonlocal cur_tokens, cur_type
        if cur_tokens and cur_type:
            entities.append((" ".join(cur_tokens), cur_type))
        cur_tokens, cur_type = [], None

    for tok, tag in zip(tokens, ner_tags):
        if tag == "O" or tag is None:
            flush()
            continue
        if tag.startswith("B-"):
            flush()
            cur_type = tag[2:]
            cur_tokens = [tok]
        elif tag.startswith("I-"):
            t = tag[2:]
            if cur_type == t and cur_tokens:
                cur_tokens.append(tok)
            else:
                # ill-formed, start new
                flush()
                cur_type = t
                cur_tokens = [tok]
        else:
            # unknown tag format
            flush()

    flush()
    return entities


def format_entities_as_string(entities: List[Tuple[str, str]]) -> str:
    """
    Create a stable string representation for entity lists.
    """
    # Sort for deterministic comparison (CoNLL tags are order-preserving but predicted outputs might not be).
    ents = sorted([(normalize_whitespace(e), t) for e, t in entities], key=lambda x: (x[1], x[0]))
    return json.dumps([{"text": e, "type": t} for e, t in ents], ensure_ascii=False)


def build_extraction_conll_dataset(
    n_target: int = 800,
    seed: int = 229,
    hf_ner_model: str = "dslim/bert-base-NER",  # lightweight baseline
    max_examples_to_scan: int = 5000
) -> pd.DataFrame:
    """
    Build (input, output, label) for NER extraction:
    - input = sentence text
    - output = predicted entity list (json string)
    - label = 1 iff exact match to gold entity list
    """
    RNG.seed(seed)

    ds = load_dataset("conll2003", trust_remote_code=True)
    split = "validation" if "validation" in ds else "test" if "test" in ds else "train"
    data = ds[split]

    # Load NER pipeline
    from transformers import pipeline
    ner_pipe = pipeline("token-classification", model=hf_ner_model, aggregation_strategy="simple")

    rows = []

    # Scan examples until we have enough rows; exact-match positives can be rare
    # so we allow scanning more than n_target.
    for i, ex in enumerate(tqdm(data, desc=f"Processing CoNLL-2003 ({split})")):
        if i >= max_examples_to_scan and len(rows) >= n_target:
            break

        tokens = ex["tokens"]
        # conll2003 provides numeric tags; map to string names
        # Feature might be 'ner_tags' (ints) with 'features' mapping
        tag_ids = ex["ner_tags"]
        tag_names = [data.features["ner_tags"].feature.names[t] for t in tag_ids]

        sentence = " ".join(tokens)
        sentence = normalize_whitespace(sentence)

        gold_entities = conll_tags_to_entities(tokens, tag_names)
        gold_str = format_entities_as_string(gold_entities)

        # Model prediction
        pred = ner_pipe(sentence)
        # pred items have entity_group + word; 'simple' aggregation gives grouped entities
        pred_entities = []
        for p in pred:
            etype = p.get("entity_group") or p.get("entity") or ""
            text = p.get("word") or ""
            if etype and text:
                # Map common model labels to CoNLL types if needed
                # dslim/bert-base-NER uses PER/ORG/LOC/MISC already
                pred_entities.append((normalize_whitespace(text), etype))

        pred_str = format_entities_as_string(pred_entities)

        label = int(pred_str == gold_str)

        rows.append({
            "input": sentence,
            "output": pred_str,
            "label": label
        })

    df = pd.DataFrame(rows)

    # For extraction, positives can be very rare. We don't force balance too hard;
    # we sample up to target size with whatever mix we have.
    if len(df) > n_target:
        df = df.sample(n=n_target, random_state=229).reset_index(drop=True)
    else:
        df = df.sample(frac=1.0, random_state=229).reset_index(drop=True)

    return df

## Putting it all together

In [9]:
TARGET_N = 800  

In [33]:
# 1) Translation (MQM)
df_trans = build_translation_wmt_mqm_non_en_to_en(n_target=TARGET_N)
print(df_trans.shape)
print(df_trans["label"].value_counts())
print(df_trans.iloc[0]["input"][:120])
print(df_trans.iloc[0]["output"][:120])


(800, 3)
label
1    458
0    342
Name: count, dtype: int64
波黑奥委会主席：坚信中国有能力在特殊时期办好冬奥盛会-新华网
Chairman of the Olympic Committee of Bosnia and Herzegovina: firmly confident that China has the ability to hold the Win


In [34]:
df_trans.head()

Unnamed: 0,input,output,label
0,波黑奥委会主席：坚信中国有能力在特殊时期办好冬奥盛会-新华网,Chairman of the Olympic Committee of Bosnia an...,1
1,兰花：至少有 两万种兰花 -- 多种多样的令人惊异。,"Orchids: There are at least 20,000 species of ...",1
2,当然，你会无助地发现你的头也被挤压和拉伸， 所以你可能无法理解究竟发生了什么。,"Now of course, your head would be squeezed and...",0
3,“很多疾病早期检测、切除不到位，跟医生手术室用的灯有关 。 ”,“The lack of early detection and resection of ...,0
4,还有一部分品牌，为了推广自己的产品，不择手段，跟医院的医生挂钩，由他们协助产品的推广，从一听...,"There are also some brands, in order to promot...",0


In [35]:
path_trans = save_minimal_dataset(df_trans, "translation_mqm_input_output_label")
print("Saved translation dataset:", path_trans)
print(df_trans["label"].value_counts(dropna=False))

Saved translation dataset: ./evals_benchmark_datasets/translation_mqm_input_output_label.csv
label
1    458
0    342
Name: count, dtype: int64


In [21]:
# 2) Summarization (FRANK)
config = FrankConfig(
    local_json_path="./frank/data/<ACTUAL_FILENAME>.json"
)

df_sum = build_summarization_frank_dataset(
    n_target=TARGET_N,
    config=config
)

Processing FRANK (HF mirror: mtc/frank-test-set-with-faithfulness-annotation): 100%|██████████| 1575/1575 [00:00<00:00, 4377.02it/s]


In [29]:
df_sum = build_summarization_frank_minimal(
    frank_data_dir="./../../frank/data",
    split="test",
    n_target=800
)

print(df_sum.shape)
print(df_sum["label"].value_counts())
df_sum.head()

(800, 3)
label
0    520
1    280
Name: count, dtype: int64


Unnamed: 0,input,output,label
0,France's Dubuisson carded a 67 to tie with ove...,rory mcilroy will take a one-shot lead into th...,0
1,Share this withEmailFacebookMessengerMessenger...,a man has been found guilty of the murder of a...,1
2,Homeless people in the Bay Area are being hand...,community technology alliance is giving away f...,1
3,Crowds who turned out for the Anzac Day memori...,a contractor doing a sound check inside the ex...,1
4,(CNN)Two Transportation Security Administratio...,screeners have been fired after conspiring to ...,0


In [31]:
path_sum = save_minimal_dataset(df_sum, "summarization_frank_input_output_label")
print("Saved summarization dataset:", path_sum)

Saved summarization dataset: ./evals_benchmark_datasets/summarization_frank_input_output_label.csv


In [11]:
# 3) Extraction (CoNLL-2003 NER)
df_extr = build_extraction_conll_dataset(n_target=TARGET_N)

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0
Processing CoNLL-2003 (validation): 100%|██████████| 3250/3250 [02:06<00:00, 25.61it/s]


In [12]:
print(df_extr.shape)
print(df_extr["label"].value_counts())
df_extr.head()

(800, 3)
label
1    505
0    295
Name: count, dtype: int64


Unnamed: 0,input,output,label
0,"The Briton , who lost his World Boxing Council...","[{""text"": ""##riton"", ""type"": ""MISC""}, {""text"":...",0
1,Coach Berti Vogts has called up a virtually id...,"[{""text"": ""Germany"", ""type"": ""LOC""}, {""text"": ...",1
2,"On Friday , the Dow Jones index closed down 31...","[{""text"": ""Dow Jones"", ""type"": ""MISC""}]",1
3,GOLF - LEADING SCORES AT GREATER MILWAUKEE OPEN .,"[{""text"": ""GREAT"", ""type"": ""MISC""}, {""text"": ""...",0
4,EUROBONDS - Primary market activity was sharpl...,"[{""text"": ""U. S"", ""type"": ""LOC""}, {""text"": ""U....",0


In [13]:
path_extr = save_minimal_dataset(df_extr, "extraction_conll_ner_input_output_label")
print("Saved extraction dataset:", path_extr)
print(df_extr["label"].value_counts(dropna=False))

Saved extraction dataset: ./evals_benchmark_datasets/extraction_conll_ner_input_output_label.csv
label
1    505
0    295
Name: count, dtype: int64


# Feature Creation

In [4]:
# =============================================================================
# Unit-test style guideline specs (text only) so we can instantiate with any judge model
# =============================================================================

TRANSLATION_SPECS = {
    "meaning_preservation": [
        "The English output preserves the meaning and intent of the non-English input at the sentence and document level.",
    ],
    "no_added_info": [
        "The output does not introduce facts, entities, or details that are not present or implied in the input.",
    ],
    "coverage": [
        "All important details from the input are included (key claims, qualifiers, and relationships).",
    ],
    "named_entities_numbers": [
        "Proper nouns, dates, numbers, currencies, and units are translated/transcribed correctly without changing values.",
    ],
    "terminology_consistency": [
        "Repeated terms and named entities are translated consistently throughout the output.",
    ],
    "fluency": [
        "The output is grammatical, idiomatic English with natural phrasing and correct syntax.",
    ],
    "style_register": [
        "The output uses an appropriate tone/register for the input (e.g., news-like text remains formal and neutral).",
    ],
    "formatting": [
        "Punctuation, capitalization, and structural formatting (lists/headers/line breaks) are appropriate and consistent.",
    ],
}

SUMMARIZATION_SPECS = {
    "salience": [
        "The summary captures the central topic and main takeaway(s) of the source text.",
    ],
    "coverage": [
        "The summary includes key details needed to understand the main takeaway(s).",
    ],
    "faithfulness": [
        "All claims are supported by the source; no hallucinated or unsupported facts are introduced.",
    ],
    "attribution_specificity": [
        "The summary avoids over-specific or overly certain claims unless explicitly supported by the source.",
    ],
    "entity_number_fidelity": [
        "Names, numbers, dates, locations, and attributions match the source without distortion.",
    ],
    "coherence": [
        "The summary is logically ordered and easy to follow with clear sentence-to-sentence flow.",
    ],
    "conciseness": [
        "The summary is succinct with minimal redundancy and no filler.",
    ],
    "non_contradiction": [
        "The summary does not contradict the source or itself.",
    ],
}

EXTRACTION_SPECS = {
    "schema_format": [
        "The output strictly follows the expected schema/format and uses correct data types for each field.",
    ],
    "no_hallucinations": [
        "The output does not include entities/values that are not present in the input text.",
    ],
    "span_grounding": [
        "Extracted strings are grounded in the input text (verbatim spans or faithful aliases).",
    ],
    "type_correctness": [
        "Each extracted item has the correct type/category (e.g., PER/ORG/LOC/MISC or your task's schema types).",
    ],
    "completeness_recall": [
        "All relevant entities/values present in the input are extracted; no major items are missing.",
    ],
    "dedup_normalization": [
        "Entities/values are normalized consistently (casing/whitespace) and duplicates are removed appropriately.",
    ],
    "boundary_precision": [
        "Entity boundaries are precise (not overly broad/narrow); extracted spans correspond to the intended entity mention.",
    ],
    "consistency": [
        "The output is internally consistent (the same entity/value is not represented in conflicting ways).",
    ],
}


In [None]:
# ---------------------------------------------------------------------------
# Databricks connection & MLflow tracing
# ---------------------------------------------------------------------------
# MLflow tracking
os.environ["DATABRICKS_TOKEN"] = ""
os.environ["DATABRICKS_HOST"] = ""
os.environ["MLFLOW_TRACKING_URI"] = ""
os.environ["MLFLOW_REGISTRY_URI"] = ""
os.environ["MLFLOW_EXPERIMENT_ID"] = ""

# API keys for models (Hidden for public access)
os.environ["OPENROUTER_API_KEY"] = ""
JUDGE_MODEL = "openrouter:/anthropic/claude-3.5-sonnet"

# alternatives:
# JUDGE_MODEL = "openrouter:/openai/gpt-4"
# JUDGE_MODEL = "openrouter:/openai/gpt-4o-mini"
# JUDGE_MODEL = "openrouter:/meta-llama/llama-3.1-405b"


In [7]:
mlflow.dspy.autolog(log_traces=True, log_traces_from_eval=True, log_compiles=True, log_evals=True)

In [15]:
# ---------------------------------------------------------------------------
# Create scorer objects from the spec dicts defined above
# ---------------------------------------------------------------------------

def specs_to_scorers(specs: dict, model: str) -> list:
    """Convert {name: [guidelines]} dict to list of MLflow Guidelines scorers."""
    return [
        make_guidelines_scorer(name=name, guidelines=guidelines, model=model)
        for name, guidelines in specs.items()
    ]

translation_scorers = specs_to_scorers(TRANSLATION_SPECS, JUDGE_MODEL)
summarization_scorers = specs_to_scorers(SUMMARIZATION_SPECS, JUDGE_MODEL)
extraction_scorers = specs_to_scorers(EXTRACTION_SPECS, JUDGE_MODEL)

print(f"Translation scorers:   {[s.name for s in translation_scorers]}")
print(f"Summarization scorers: {[s.name for s in summarization_scorers]}")
print(f"Extraction scorers:    {[s.name for s in extraction_scorers]}")

Translation scorers:   ['meaning_preservation', 'no_added_info', 'coverage', 'named_entities_numbers', 'terminology_consistency', 'fluency', 'style_register', 'formatting']
Summarization scorers: ['salience', 'coverage', 'faithfulness', 'attribution_specificity', 'entity_number_fidelity', 'coherence', 'conciseness', 'non_contradiction']
Extraction scorers:    ['schema_format', 'no_hallucinations', 'span_grounding', 'type_correctness', 'completeness_recall', 'dedup_normalization', 'boundary_precision', 'consistency']


In [16]:
# ---------------------------------------------------------------------------
# Helper: convert DataFrame rows → MLflow eval data (with pre-existing outputs)
# ---------------------------------------------------------------------------

def df_to_eval_data(df: pd.DataFrame) -> list:
    """Convert DataFrame to MLflow eval data with pre-existing outputs.

    Uses 'inputs' for the source text and 'outputs' for the model output,
    so mlflow.genai.evaluate() scores existing outputs without regeneration.
    """
    return [
        {
            "inputs": {"question": row["input"]},
            "outputs": {"response": row["output"]},
        }
        for _, row in df.iterrows()
    ]


# ---------------------------------------------------------------------------
# Core scoring function
# ---------------------------------------------------------------------------

def run_judge_scoring(
    df: pd.DataFrame,
    scorers: list,
    task_name: str,
) -> pd.DataFrame:
    """Score all rows with LLM judge and return df with new feature columns.

    Each Guidelines scorer produces a yes/no verdict per row, which is
    converted to 1/0 and added as a new column.
    """
    eval_data = df_to_eval_data(df)

    with mlflow.start_run(run_name=f"{task_name}_judge_scoring"):
        result = mlflow.genai.evaluate(
            data=eval_data,
            scorers=scorers,
        )

    # Extract per-row results
    eval_table = result.result_df

    # Debug: show available columns so we can identify the naming pattern
    print(f"[{task_name}] Eval table shape: {eval_table.shape}")
    print(f"[{task_name}] Eval table columns: {list(eval_table.columns)}")

    # Map yes/no → 1/0 for each scorer
    df_features = df.copy().reset_index(drop=True)
    for scorer in scorers:
        # MLflow version may use different column name patterns
        for col_pattern in [f"{scorer.name}/value"]:
            if col_pattern in eval_table.columns:
                df_features[scorer.name] = (
                    eval_table[col_pattern]
                    .map({"yes": 1, "no": 0})
                    .fillna(0)
                    .astype(int)
                    .values
                )
                break

    added_cols = [c for c in df_features.columns if c not in ["input", "output", "label"]]
    print(f"[{task_name}] Added {len(added_cols)} feature(s): {added_cols}")
    return df_features

## Small-Scale Validation (10 rows of summarization)

Run this first to validate that scorers work, features appear, and Databricks traces look correct.
After confirming, proceed to the full pipeline below.

In [21]:
# Load summarization dataset
df_sum = pd.read_csv(f"{OUT_DIR}/summarization_frank_input_output_label.csv")

# Take 10 rows: 5 label=0 + 5 label=1
df_test = pd.concat([
    df_sum[df_sum["label"] == 0].head(10),
    df_sum[df_sum["label"] == 1].head(10),
]).reset_index(drop=True)

print(f"Test set: {len(df_test)} rows")
print(f"Label distribution: {df_test['label'].value_counts().to_dict()}")

Test set: 20 rows
Label distribution: {0: 10, 1: 10}


In [22]:
df_test

Unnamed: 0,input,output,label
0,France's Dubuisson carded a 67 to tie with ove...,rory mcilroy will take a one-shot lead into th...,0
1,(CNN)Two Transportation Security Administratio...,screeners have been fired after conspiring to ...,0
2,Marcy Smith was woken up by her son David to f...,the family of a man who died in a blaze at her...,0
3,Ryan Walls took pictures of 101 passengers dur...,an edinburgh taxi driver who took thousands of...,0
4,"It works by looking for a combination of ""mark...",an international test for alzheimer\'s disease...,0
5,Share this withEmailFacebookMessengerMessenger...,a man has been taken to hospital after a crash...,0
6,Share this withEmailFacebookMessengerMessenger...,the former leader of birmingham city council h...,0
7,The shooting occurred at a hostel attached to ...,a 19-year-old indian student has been shot dea...,0
8,Share this withEmailFacebookTwitterWhatsAppLin...,"it\'s a tale of the indian matchbox industry, ...",0
9,"Denise Fergus said she had been ""let down so m...",the mother of murdered toddler james bulger ha...,0


In [23]:
# Run scoring on the small set (80 LLM calls: 10 rows x 8 scorers)
df_test_scored = run_judge_scoring(df_test, summarization_scorers, "claude3_5")

Evaluating:   0%|          | 0/20 [Elapsed: 00:00, Remaining: ?] 

[claude3_5] Eval table shape: (20, 20)
[claude3_5] Eval table columns: ['trace_id', 'attribution_specificity/value', 'coverage/value', 'salience/value', 'conciseness/value', 'entity_number_fidelity/value', 'non_contradiction/value', 'faithfulness/value', 'coherence/value', 'trace', 'client_request_id', 'state', 'request_time', 'execution_duration', 'request', 'response', 'trace_metadata', 'tags', 'spans', 'assessments']
[claude3_5] Added 8 feature(s): ['salience', 'coverage', 'faithfulness', 'attribution_specificity', 'entity_number_fidelity', 'coherence', 'conciseness', 'non_contradiction']


In [25]:
# Inspect the small-scale results
feature_cols = [c for c in df_test_scored.columns if c not in ["input", "output", "label"]]

print(f"\n--- Correlation with ground-truth label ---")
for col in feature_cols:
    corr = df_test_scored["label"].corr(df_test_scored[col])
    print(f"  {col:30s}  corr = {corr:+.3f}")

print(f"\n--- Feature means ---")
for col in feature_cols:
    print(f"  {col:30s}  mean = {df_test_scored[col].mean():.2f}")


--- Correlation with ground-truth label ---
  salience                        corr = +0.333
  coverage                        corr = +0.734
  faithfulness                    corr = +0.800
  attribution_specificity         corr = +0.816
  entity_number_fidelity          corr = +0.734
  coherence                       corr = +0.105
  conciseness                     corr = +0.000
  non_contradiction               corr = +0.816

--- Feature means ---
  salience                        mean = 0.10
  coverage                        mean = 0.35
  faithfulness                    mean = 0.50
  attribution_specificity         mean = 0.60
  entity_number_fidelity          mean = 0.35
  coherence                       mean = 0.35
  conciseness                     mean = 0.50
  non_contradiction               mean = 0.60


## Full Pipeline: Score All 3 Datasets (800 rows each)

Only run this after validating the small-scale test above.
This will make ~19,200 LLM judge calls and take ~30-60 minutes.

In [26]:
# Load all three datasets
df_trans = pd.read_csv(f"{OUT_DIR}/translation_mqm_input_output_label.csv")
df_sum   = pd.read_csv(f"{OUT_DIR}/summarization_frank_input_output_label.csv")
df_extr  = pd.read_csv(f"{OUT_DIR}/extraction_conll_ner_input_output_label.csv")

print(f"Translation:   {df_trans.shape}")
print(f"Summarization: {df_sum.shape}")
print(f"Extraction:    {df_extr.shape}")

Translation:   (800, 3)
Summarization: (800, 3)
Extraction:    (800, 3)


## Summarization

In [27]:
# Score each dataset (each run is traced in Databricks)
df_sum_scored   = run_judge_scoring(df_sum, summarization_scorers, "summarization")

Evaluating:   0%|          | 0/800 [Elapsed: 00:00, Remaining: ?] 

[summarization] Eval table shape: (800, 20)
[summarization] Eval table columns: ['trace_id', 'non_contradiction/value', 'conciseness/value', 'coverage/value', 'faithfulness/value', 'salience/value', 'coherence/value', 'entity_number_fidelity/value', 'attribution_specificity/value', 'trace', 'client_request_id', 'state', 'request_time', 'execution_duration', 'request', 'response', 'trace_metadata', 'tags', 'spans', 'assessments']
[summarization] Added 8 feature(s): ['salience', 'coverage', 'faithfulness', 'attribution_specificity', 'entity_number_fidelity', 'coherence', 'conciseness', 'non_contradiction']


In [28]:
# Save enriched datasets to new CSVs
df_sum_scored.to_csv(f"{OUT_DIR}/summarization_frank_features.csv", index=False)

print("Saved enriched datasets:")
print(f"  {OUT_DIR}/summarization_frank_features.csv")

Saved enriched datasets:
  ./evals_benchmark_datasets/summarization_frank_features.csv


## Drop rows with errored LLM judge assessments

The Databricks traces UI shows ~22 rows where individual scorer assessments errored.
These rows got 0s from `fillna(0)` instead of real scores. Fetch traces from Databricks
and drop any row where at least one scorer assessment has an error.

In [None]:
# Find the summarization scoring run
runs = mlflow.search_runs(
    experiment_ids=[""],
    filter_string='tags.mlflow.runName = "summarization_judge_scoring"',
    output_format="list",
)
run_id = runs[0].info.run_id
print(f"Run: {run_id}")

# Fetch all traces — must include spans so the `request` field is populated
# (request reads from root span's INPUTS attribute; without spans it's None)
traces_df = mlflow.search_traces(
    experiment_ids=[""],
    run_id=run_id,
)
print(f"Traces: {len(traces_df)}")

# Check each trace's assessments for any scorer error
def has_any_assessment_error(assessments):
    if not assessments:
        return True
    for a in assessments:
        feedback = a.get("feedback", {})
        if feedback.get("error"):
            return True
    return False

traces_df["has_error"] = traces_df["assessments"].apply(has_any_assessment_error)
print(f"Traces with at least one scorer error: {traces_df['has_error'].sum()}")

# Diagnostic: show what the request field actually looks like
sample_req = traces_df["request"].iloc[0]
print(f"\nrequest type: {type(sample_req)}")
print(f"request sample: {str(sample_req)[:300]}")

Run: 5dadb487ebb04b2f9f183a95b38a0a9b


  traces_df = mlflow.search_traces(


Traces: 800
Traces with at least one scorer error: 27

request type: <class 'dict'>
request sample: {'question': 'Ofsted says it has found evidence of children being taught in squalid conditions in three places in Birmingham which have now closed.Anyone running illegal schools could face a jail term of up to 51 weeks.Ministers are also consulting on plans for more regulation of places teaching for


In [43]:
traces_df[traces_df['has_error'] == True].head()

Unnamed: 0,trace_id,trace,client_request_id,state,request_time,execution_duration,request,response,trace_metadata,tags,spans,assessments,has_error,question_text
14,tr-fcc516b007750aeb8d61553b8a4c143d,"{""info"": {""trace_id"": ""tr-fcc516b007750aeb8d61...",tr-fcc516b007750aeb8d61553b8a4c143d,OK,1771550511878,0,{'question': 'This is the moment a paedophile ...,"{'response': 'married adeli , 32 , arranged to...",{'mlflow.source.git.commit': '2d77d10e012ac76e...,"{'mlflow.user': 'annie@evrim.ai', 'mlflow.trac...","[{'trace_id': '/MUWsAd1CuuNYVU7ikwUPQ==', 'spa...",[{'assessment_id': 'a-106093c9ed8b428b8033d9a2...,True,This is the moment a paedophile was caught by ...
122,tr-452c716c95402a76b558d725dbf4105e,"{""info"": {""trace_id"": ""tr-452c716c95402a76b558...",tr-452c716c95402a76b558d725dbf4105e,OK,1771550335092,0,{'question': 'An HIV-positive Ohio man accused...,"{'response': 'keith anthony allen , 27 , plead...",{'mlflow.source.git.commit': '2d77d10e012ac76e...,"{'mlflow.user': 'annie@evrim.ai', 'mlflow.trac...","[{'trace_id': 'RSxxbJVAKna1WNcl2/QQXg==', 'spa...",[{'assessment_id': 'a-e261ab2ad2c947ceaaef97c1...,True,An HIV-positive Ohio man accused of sexually a...
126,tr-71619882aa9ad9d0937c90d2d525882a,"{""info"": {""trace_id"": ""tr-71619882aa9ad9d0937c...",tr-71619882aa9ad9d0937c90d2d525882a,OK,1771550328697,0,{'question': '(CNN)Deputies rushed Kenneth Mor...,{'response': 'UNK is accused of killing an emp...,{'mlflow.source.git.commit': '2d77d10e012ac76e...,"{'mlflow.user': 'annie@evrim.ai', 'mlflow.trac...","[{'trace_id': 'cWGYgqqa2dCTfJDS1SWIKg==', 'spa...",[{'assessment_id': 'a-bd0c2e7c0b6e4b989358938a...,True,(CNN)Deputies rushed Kenneth Morgan Stancil II...
128,tr-79d98567d2a4490a41ee3c26393c6064,"{""info"": {""trace_id"": ""tr-79d98567d2a4490a41ee...",tr-79d98567d2a4490a41ee3c26393c6064,OK,1771550325732,0,{'question': 'This is the moment a paedophile ...,"{'response': 'UNK UNK , who is of UNK origin ,...",{'mlflow.source.git.commit': '2d77d10e012ac76e...,"{'mlflow.user': 'annie@evrim.ai', 'mlflow.trac...","[{'trace_id': 'edmFZ9KkSQpB7jwmOTxgZA==', 'spa...",[{'assessment_id': 'a-57452f735b0c467690050351...,True,This is the moment a paedophile was caught by ...
184,tr-696d53b9a17151d19e47f45e8e54b72c,"{""info"": {""trace_id"": ""tr-696d53b9a17151d19e47...",tr-696d53b9a17151d19e47f45e8e54b72c,OK,1771550246549,0,{'question': 'It's not intended to be a safety...,{'response': 'footage shows the skater confide...,{'mlflow.source.git.commit': '2d77d10e012ac76e...,"{'mlflow.user': 'annie@evrim.ai', 'mlflow.trac...","[{'trace_id': 'aW1TuaFxUdGeR/RejlS3LA==', 'spa...",[{'assessment_id': 'a-2eee1568d9eb43fe95206f7a...,True,It's not intended to be a safety video. But th...


In [44]:
import json

def extract_question(request_val):
    """Extract the input text from the trace request field."""
    if request_val is None:
        return None
    if isinstance(request_val, str):
        try:
            request_val = json.loads(request_val)
        except (json.JSONDecodeError, TypeError):
            return None
    if isinstance(request_val, dict):
        if "question" in request_val:
            return request_val["question"]
        inputs = request_val.get("inputs", {})
        if isinstance(inputs, dict) and "question" in inputs:
            return inputs["question"]
    return None

traces_df["question_text"] = traces_df["request"].apply(extract_question)
print(f"Non-null question_text: {traces_df['question_text'].notna().sum()}/{len(traces_df)}")

# Collect the input texts of errored traces
error_texts = set(
    traces_df[traces_df["has_error"]]["question_text"].dropna().tolist()
)
print(f"Unique error input texts: {len(error_texts)}")

# Filter df_sum_scored by excluding rows whose input text matches an errored trace
mask = df_sum_scored["input"].isin(error_texts)
print(f"Rows matched for removal: {mask.sum()}")

df_sum_cleaned = df_sum_scored[~mask].reset_index(drop=True)
print(f"\nCleaned: {df_sum_cleaned.shape}")
print(df_sum_cleaned["label"].value_counts())

scorer_cols = [c for c in df_sum_cleaned.columns if c not in ["input", "output", "label"]]
print(f"\n--- Feature stats after cleaning ---")
for col in scorer_cols:
    corr = df_sum_cleaned["label"].corr(df_sum_cleaned[col])
    print(f"  {col:30s}  mean={df_sum_cleaned[col].mean():.2f}  corr={corr:+.3f}")

df_sum_cleaned.to_csv(f"{OUT_DIR}/summarization_frank_features.csv", index=False)
print(f"\nSaved: {len(df_sum_cleaned)} rows (dropped {mask.sum()} errors)")

Non-null question_text: 800/800
Unique error input texts: 13
Rows matched for removal: 32

Cleaned: (768, 11)
label
0    501
1    267
Name: count, dtype: int64

--- Feature stats after cleaning ---
  salience                        mean=0.16  corr=+0.427
  coverage                        mean=0.24  corr=+0.545
  faithfulness                    mean=0.42  corr=+0.711
  attribution_specificity         mean=0.47  corr=+0.629
  entity_number_fidelity          mean=0.27  corr=+0.620
  coherence                       mean=0.41  corr=+0.380
  conciseness                     mean=0.50  corr=+0.212
  non_contradiction               mean=0.43  corr=+0.726

Saved: 768 rows (dropped 32 errors)


## Others

In [None]:
# df_trans_scored = run_judge_scoring(df_trans, translation_scorers, "translation")
# df_extr_scored  = run_judge_scoring(df_extr, extraction_scorers, "extraction")

In [None]:
# df_trans_scored.to_csv(f"{OUT_DIR}/translation_mqm_features.csv", index=False)
# df_extr_scored.to_csv(f"{OUT_DIR}/extraction_conll_ner_features.csv", index=False)