In [1]:
import pandas as pd

def _norm_query(s: str) -> str:
    if s is None:
        return ""
    return " ".join(str(s).strip().lower().split())

def load_and_intersect_by_query(path_rag: str, path_rag_filtered: str, path_temporal_filtered: str):
    df1 = pd.read_parquet(path_rag)
    df2 = pd.read_parquet(path_rag_filtered)
    df3 = pd.read_parquet(path_temporal_filtered)

    for name, d in [("rag", df1), ("rag_filtered", df2), ("temporal_filtered", df3)]:
        if "query" not in d.columns:
            raise KeyError(f"{name}: missing column 'query'")

    df1 = df1.copy()
    df2 = df2.copy()
    df3 = df3.copy()

    df1["_q"] = df1["query"].map(_norm_query)
    df2["_q"] = df2["query"].map(_norm_query)
    df3["_q"] = df3["query"].map(_norm_query)

    s1 = set(df1["_q"].dropna().tolist()) - {""}
    s2 = set(df2["_q"].dropna().tolist()) - {""}
    s3 = set(df3["_q"].dropna().tolist()) - {""}

    common_q = sorted(s1 & s2 & s3)

    out1 = df1[df1["_q"].isin(common_q)].drop(columns=["_q"])
    out2 = df2[df2["_q"].isin(common_q)].drop(columns=["_q"])
    out3 = df3[df3["_q"].isin(common_q)].drop(columns=["_q"])

    counts = {
        "rag_unique_queries": len(s1),
        "rag_filtered_unique_queries": len(s2),
        "temporal_filtered_unique_queries": len(s3),
        "common_unique_queries": len(common_q),
        "rag_rows_kept": int(len(out1)),
        "rag_filtered_rows_kept": int(len(out2)),
        "temporal_filtered_rows_kept": int(len(out3)),
    }

    return out1, out2, out3, common_q, counts

PATH_RAG = "eval_df_rag_default.parquet"
PATH_RAG_F = "eval_df_rag_filtered.parquet"
PATH_TEMP_F = "eval_df_temporalrag.parquet" 

rag_df_common, ragf_df_common, tempf_df_common, common_queries, counts = load_and_intersect_by_query(
    PATH_RAG, PATH_RAG_F, PATH_TEMP_F)


In [2]:
counts

{'rag_unique_queries': 200,
 'rag_filtered_unique_queries': 200,
 'temporal_filtered_unique_queries': 200,
 'common_unique_queries': 200,
 'rag_rows_kept': 200,
 'rag_filtered_rows_kept': 200,
 'temporal_filtered_rows_kept': 200}

In [3]:
from vllm import LLM

JUDGE_MODEL = "mistralai/Mistral-Small-24B-Instruct-2501"

judge = LLM(
    model=JUDGE_MODEL,
    dtype="bfloat16",
    max_model_len=32768,
    gpu_memory_utilization=0.90,
)


INFO 01-11 04:39:37 [utils.py:253] non-default args: {'dtype': 'bfloat16', 'max_model_len': 32768, 'disable_log_stats': True, 'model': 'mistralai/Mistral-Small-24B-Instruct-2501'}
INFO 01-11 04:39:38 [model.py:514] Resolved architecture: MistralForCausalLM
INFO 01-11 04:39:38 [model.py:1661] Using max model len 32768
INFO 01-11 04:39:38 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.


[2026-01-11 04:39:39] INFO tekken.py:184: Adding special tokens <SPECIAL_20>, ..., <SPECIAL_999>
[2026-01-11 04:39:39] INFO tekken.py:195: Non special vocabulary size is 130072 with 1000 special tokens.
[2026-01-11 04:39:39] INFO tekken.py:567: Cutting non special vocabulary to first 130072 tokens.


[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:39:40 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='mistralai/Mistral-Small-24B-Instruct-2501', speculative_config=None, tokenizer='mistralai/Mistral-Small-24B-Instruct-2501', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None

[0;36m(EngineCore_DP0 pid=9953)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:39:46 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


consolidated.safetensors:   0%|          | 0.00/47.1G [00:00<?, ?B/s]

[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:42:42 [weight_utils.py:487] Time spent downloading weights for mistralai/Mistral-Small-24B-Instruct-2501: 175.629203 seconds
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:42:42 [weight_utils.py:527] No consolidated.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:42:51 [default_loader.py:308] Loading weights took 9.27 seconds
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:42:52 [gpu_model_runner.py:3659] Model loading took 43.9150 GiB memory and 188.933548 seconds
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:00 [backends.py:643] Using cache directory: /home/mlcore/.cache/vllm/torch_compile_cache/afa38a0225/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:00 [backends.py:703] Dynamo bytecode transform time: 7.98 s
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:09 [backends.py:261] Cache the graph of compile range (1, 8192) for later use
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:25 [backends.py:278] Compiling a graph for compile range (1, 8192) takes 20.61 s
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:25 [monitor.py:34] torch.compile takes 28.58 s in total
[0;36m(EngineCore_DP0 pid=9953)[0;0m

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:05<00:00,  9.71it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 13.43it/s]


[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:37 [gpu_model_runner.py:4587] Graph capturing finished in 9 secs, took 2.61 GiB
[0;36m(EngineCore_DP0 pid=9953)[0;0m INFO 01-11 04:43:37 [core.py:259] init engine (profile, create kv cache, warmup model) took 44.36 seconds
INFO 01-11 04:43:37 [llm.py:360] Supported tasks: ['generate']


In [7]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL, trust_remote_code=True)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

The tokenizer you are loading from 'mistralai/Mistral-Small-24B-Instruct-2501' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [9]:
import json, re
import pandas as pd
from vllm import SamplingParams

SYSTEM_PROMPT_JUDGE = """
Ты — строгий судья качества RAG-саммари. На входе: QUERY, ANCHOR_DATE, CONTEXT (набор источников с датами) и SUMMARY.
Оценивай ТОЛЬКО по данному CONTEXT. Запрещено использовать внешние знания.

Поставь оценки по шкале 1..5 по пяти критериям:

1) relevance (соответствие запросу):
5 = полностью отвечает на QUERY, по теме
3 = частично по теме, есть пропуски/лишнее
1 = почти не отвечает на QUERY

2) groundedness (соответствие саммари контексту):
5 = все существенные утверждения подтверждаются CONTEXT
3 = есть 1–2 неподтвержденных/сомнительных утверждения
1 = много неподтвержденного или противоречия CONTEXT

3) facts_numbers (точность фактов и чисел относительно CONTEXT):
5 = числа/факты корректны
3 = есть мелкие неточности или расплывчатость
1 = есть явные ошибки в цифрах/фактах

4) dates_timeline (правильность дат и порядка событий относительно CONTEXT):
5 = даты/порядок корректны, не путает “раньше/позже”
3 = небольшие огрехи, но без критики
1 = путает даты/порядок или выдает старое за новое

5) recency (актуальность относительно ANCHOR_DATE):
5 = правильно выделяет самое свежее из CONTEXT и делает на этом акцент
3 = частично, но фокус размыт/уходит в старое
1 = в основном не про актуальное или искажает актуальность

Верни СТРОГО JSON без Markdown и без лишнего текста:
{
  "scores": {"relevance":1..5,"groundedness":1..5,"facts_numbers":1..5,"dates_timeline":1..5,"recency":1..5},
  "confidence": 0..1,
  "issues": ["..."] (0..5),
  "evidence": [{"summary_quote":"≤25 слов","context_quote":"≤25 слов","note":"..."}] (0..3),
  "reasoning": ["..."] (3..7)
}

Если CONTEXT недостаточен для проверки части утверждений — снизь groundedness и confidence и укажи это в issues.
""".strip()

def _safe_json_load(s: str):
    s = (s or "").strip()
    try:
        return json.loads(s), None
    except Exception as e1:
        m = re.search(r"\{.*\}", s, flags=re.DOTALL)
        if m:
            try:
                return json.loads(m.group(0)), None
            except Exception as e2:
                return None, f"{repr(e1)} | {repr(e2)}"
        return None, repr(e1)

def judge_pointwise_on_df(
    judge_model,
    judge_tokenizer,
    df_eval: pd.DataFrame,
    system_name: str,
    gen_batch_size: int = 6,
    max_out_tokens: int = 512,
):
    need = {"query", "anchor_date", "context", "summary"}
    miss = need - set(df_eval.columns)
    if miss:
        raise KeyError(f"{system_name}: missing {miss}")

    prompts, meta = [], []
    for r in df_eval.itertuples(index=False):
        q = str(getattr(r, "query"))
        ad = str(getattr(r, "anchor_date"))
        ctx = str(getattr(r, "context"))
        sm = str(getattr(r, "summary"))

        user = f"QUERY: {q}\nANCHOR_DATE: {ad}\n\nCONTEXT:\n{ctx}\n\nSUMMARY:\n{sm}\n"
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT_JUDGE},
            {"role": "user", "content": user},
        ]
        prompt = judge_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)
        meta.append({"system": system_name, "query": q, "anchor_date": ad})

    sampling = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=int(max_out_tokens))

    outs_text = []
    for start in range(0, len(prompts), int(gen_batch_size)):
        chunk = prompts[start:start + int(gen_batch_size)]
        outs = judge_model.generate(chunk, sampling)
        for o in outs:
            outs_text.append(o.outputs[0].text.strip())

    rows = []
    for m, raw in zip(meta, outs_text):
        parsed, err = _safe_json_load(raw)
        row = dict(m)
        row["judge_raw"] = raw
        row["judge_parse_error"] = err

        for col in ["relevance","groundedness","facts_numbers","dates_timeline","recency","confidence"]:
            row[col] = None
        row["issues"] = None
        row["evidence"] = None
        row["reasoning"] = None

        if isinstance(parsed, dict):
            sc = parsed.get("scores", {})
            if isinstance(sc, dict):
                row["relevance"] = sc.get("relevance")
                row["groundedness"] = sc.get("groundedness")
                row["facts_numbers"] = sc.get("facts_numbers")
                row["dates_timeline"] = sc.get("dates_timeline")
                row["recency"] = sc.get("recency")
            row["confidence"] = parsed.get("confidence")
            row["issues"] = parsed.get("issues")
            row["evidence"] = parsed.get("evidence")
            row["reasoning"] = parsed.get("reasoning")

        rows.append(row)

    return pd.DataFrame(rows)


In [10]:
jud_rag = judge_pointwise_on_df(judge, tokenizer, rag_df_common,  "rag",              gen_batch_size=6, max_out_tokens=1024)
jud_ragf = judge_pointwise_on_df(judge, tokenizer, ragf_df_common, "rag_filtered",    gen_batch_size=6, max_out_tokens=1024)
jud_temp = judge_pointwise_on_df(judge, tokenizer, tempf_df_common,"temporal_filtered", gen_batch_size=6, max_out_tokens=1024)

jud_all = pd.concat([jud_rag, jud_ragf, jud_temp], ignore_index=True)

jud_all.to_parquet("judge_scores.parquet", index=False)
print("Saved judge_scores.parquet, rows:", len(jud_all))

score_cols = ["relevance","groundedness","facts_numbers","dates_timeline","recency","confidence"]

summary = (
    jud_all.groupby("system")[score_cols]
    .mean(numeric_only=True)
    .sort_values(["groundedness","recency","relevance"], ascending=False)
)

parse_fail_rate = (
    jud_all.assign(parse_fail=jud_all["judge_parse_error"].notna())
    .groupby("system")["parse_fail"]
    .mean()
)

print("MEAN SCORES:")
display(summary)

print("\nPARSE FAIL RATE:")
display(parse_fail_rate)

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]



Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/6 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/2 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Saved judge_scores.parquet, rows: 600
MEAN SCORES:


Unnamed: 0_level_0,relevance,groundedness,facts_numbers,dates_timeline,recency,confidence
system,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
temporal_filtered,3.983051,2.949153,3.101695,3.050847,3.728814,0.755085
rag,4.175676,2.891892,2.952703,2.952703,3.824324,0.762162
rag_filtered,4.265306,2.843537,3.0,2.911565,3.70068,0.751701



PARSE FAIL RATE:


system
rag                  0.245
rag_filtered         0.240
temporal_filtered    0.390
Name: parse_fail, dtype: float64

In [12]:
prompts[-1]

NameError: name 'prompts' is not defined