In [151]:
# ---------------- Imports ----------------
import html
import json
import os
import random

from datetime import datetime

import pandas as pd
import yaml



In [152]:
# ---------------- Args ----------------
# File-name stems (no ".jsonl" extension) of the three generated-utterance
# files that get merged; the config cell resolves them against the
# "generated-utterances" directory.
prompted_input_file_name = "20251219T1217-llama-3.1-8b-instruct-pr-m3generated"
sft_input_file_name = "20251221T0934-20251220t0857-llama-3.1-8b-instruct-sft-m3trained"
orl_input_file_name = "20251227T0854-20251226t0940-llama-3.1-8b-instruct-seq-std-m3trained"

# Seed used both for Python's `random` module and for the pandas sampling step.
random_state_seed = 42
# Target number of rows in the exported batch; it is split evenly across
# domains with integer division, so the actual total can be slightly lower.
sample_size = 100



In [153]:
# ---------------- Config ----------------
# Project-wide settings live in a YAML file addressed relative to the notebook.
with open("../../../config/config.yaml", "r") as config_fh:
    config = yaml.safe_load(config_fh)

# Seed Python's global RNG before anything below draws from it.
random.seed(random_state_seed)

paths_cfg = config["paths"]
proj_store = paths_cfg["proj_store"]
models_folderpath = paths_cfg["models"]

# Run timestamp, used as a prefix for every file written by this notebook.
timestamp = datetime.now().strftime("%Y%m%dt%H%M%S")

evaluation_root = os.path.join(proj_store, "evaluation")

# --- Inputs: one JSONL file per generation run ---
utterances_dir = os.path.join(evaluation_root, "generated-utterances")
prompted_input_file = os.path.join(utterances_dir, f"{prompted_input_file_name}.jsonl")
sft_input_file = os.path.join(utterances_dir, f"{sft_input_file_name}.jsonl")
orl_input_file = os.path.join(utterances_dir, f"{orl_input_file_name}.jsonl")

# --- Outputs: merged JSONL and the sampled CSV batch ---
human_eval_folder = os.path.join(evaluation_root, "human-evaluation")
combined_input_folder = os.path.join(human_eval_folder, "combined-utterances")
output_file_dir = os.path.join(human_eval_folder, "batches")

for folder in (human_eval_folder, combined_input_folder, output_file_dir):
    os.makedirs(folder, exist_ok=True)

combined_input_file = os.path.join(combined_input_folder, f"{timestamp}-combined-utterances.jsonl")
output_file = os.path.join(output_file_dir, f"{timestamp}-batch.csv")



In [154]:
def read_jsonl_with_order(path):
    """Read a JSONL file, keeping both file order and a block_id index.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of being handed to the JSON parser.

    Args:
        path: Path of a UTF-8 encoded JSONL file with one JSON object per line.

    Returns:
        A tuple ``(ordered, lookup)`` where ``ordered`` is the list of parsed
        objects in file order and ``lookup`` maps each ``block_id`` to the
        same object instance.

    Raises:
        ValueError: If a record has no ``block_id`` (or it is null), or if a
            ``block_id`` occurs more than once. The message carries the
            1-based line number of the offending record.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    ordered = []
    lookup = {}

    with open(path, "r", encoding="utf-8") as f:
        # 1-based numbering so the reported line matches what an editor shows.
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue

            obj = json.loads(line)
            block_id = obj.get("block_id")

            if block_id is None:
                raise ValueError(f"Missing block_id in file {path} at line {line_no}")

            if block_id in lookup:
                raise ValueError(
                    f"Duplicate block_id {block_id} in file {path} at line {line_no}"
                )

            ordered.append(obj)
            lookup[block_id] = obj

    return ordered, lookup



# Load the three runs. Only the prompted file's record order is kept; for the
# other two runs just the block_id lookup (second tuple element) is needed.
prompted_ordered, prompted_lookup = read_jsonl_with_order(prompted_input_file)
sft_lookup = read_jsonl_with_order(sft_input_file)[1]
orl_lookup = read_jsonl_with_order(orl_input_file)[1]



In [155]:
# The three runs must cover exactly the same block_ids before they can be merged.
prompted_ids = set(prompted_lookup)
sft_ids = set(sft_lookup)
orl_ids = set(orl_lookup)

# Any pairwise difference along the chain means the three sets are not all equal.
if prompted_ids != sft_ids or sft_ids != orl_ids:
    raise RuntimeError(
        "Block ID mismatch detected:\n"
        f"- Missing in SFT: {sorted(prompted_ids - sft_ids)}\n"
        f"- Missing in ORL: {sorted(prompted_ids - orl_ids)}\n"
        f"- Missing in Prompted: {sorted((sft_ids | orl_ids) - prompted_ids)}"
    )


In [156]:
# Merge the three runs into one record per block_id, in the prompted file's order.
combined_records = []

for base in prompted_ordered:
    block_id = base["block_id"]

    # Shallow copy: the keys changed below are re-bound on the copy, so the
    # original prompted record is left untouched.
    record = base.copy()

    # Drop system messages from the context, when a context is present.
    if "context_messages" in record:
        record["context_messages"] = [
            msg for msg in record["context_messages"]
            if msg.get("role") != "system"
        ]

    # Rename prompted generated_response.
    # A null value is rejected just like a missing key, matching the checks
    # applied to the SFT and ORL responses below.
    if record.get("generated_response") is None:
        raise ValueError(f"Missing generated_response in prompted for block_id={block_id}")

    record["generated_response_prompted"] = record.pop("generated_response")

    # Add the responses from the other two runs for the same block.
    record["generated_response_sft"] = sft_lookup[block_id].get("generated_response")
    record["generated_response_orl"] = orl_lookup[block_id].get("generated_response")

    if record["generated_response_sft"] is None:
        raise ValueError(f"Missing generated_response in sft for block_id={block_id}")
    if record["generated_response_orl"] is None:
        raise ValueError(f"Missing generated_response in orl for block_id={block_id}")

    combined_records.append(record)



In [157]:
# Persist the merged records as JSONL: one JSON object per line, non-ASCII
# characters written as-is.
with open(combined_input_file, "w", encoding="utf-8") as out_fh:
    out_fh.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n" for rec in combined_records
    )


In [158]:
# ---------------- Load combined file ----------------
# Builds one row per block: the HTML-formatted context plus the four candidate
# responses (real, prompted, sft, orl) in a shuffled order, with the true
# source of each response stored in a companion "*_label" column.

# Display names for chat roles; unknown roles fall back to their capitalized name.
# Defined once here because it does not change between records.
ROLE_MAP = {
    "user": "Respondent",
    "assistant": "Elicitor",
}

# Dedicated RNG so that re-running this cell alone reproduces the same
# shuffles. It is seeded with the same value as the global RNG, so a fresh
# top-to-bottom run gives the same orderings as shuffling with the global
# `random` module, provided nothing else has drawn from it in between.
shuffle_rng = random.Random(random_state_seed)

rows = []

with open(combined_input_file, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)

        block_id = item["block_id"]
        if item.get("domain") is None:
            raise ValueError(f"Missing domain for block_id={block_id}")
        domain = item["domain"]

        # ---------------- Format context into HTML ----------------
        context_lines = []
        for turn in item["context_messages"]:
            raw_role = turn["role"]
            role = ROLE_MAP.get(raw_role, raw_role.capitalize())
            # Escape &, < and > so utterance text cannot be read as markup;
            # quotes are left as-is since the text is never placed in an attribute.
            # NOTE(review): assumes the stored content is plain text, not
            # already-escaped HTML - confirm.
            utterance = html.escape(str(turn["content"]), quote=False)
            context_lines.append(f"<li><b>{html.escape(role, quote=False)}</b>: {utterance}</li>")

        context_str = "<ul>\n" + "\n".join(context_lines) + "\n</ul>"

        # ---------------- Collect responses ----------------
        if item.get("real_response") is None:
            raise ValueError(f"Missing real_response for block_id={block_id}")

        responses = [
            ("real", item["real_response"]),
            ("prompted", item["generated_response_prompted"]),
            ("sft", item["generated_response_sft"]),
            ("orl", item["generated_response_orl"]),
        ]

        # Shuffle responses (evaluation blindness)
        shuffle_rng.shuffle(responses)

        # ---------------- Build row ----------------
        row = {
            "block_id": block_id,
            "domain": domain,
            "context_turns": context_str,
        }

        for i, (label, utt) in enumerate(responses, start=1):
            row[f"response_{i}"] = f"<b>Elicitor</b>: {html.escape(str(utt), quote=False)}"
            row[f"response_{i}_label"] = label

        rows.append(row)



In [159]:
## Create DataFrame
# One row per block; the columns are the keys of the row dicts built above.

df = pd.DataFrame(rows)

# Rows available per domain, shown as the cell output to inform the sampling step.
df["domain"].value_counts()



domain
judicial_proceedings           4949
oral_history                   4028
academic_interviews            1125
journalistic_investigations     183
Name: count, dtype: int64

In [160]:
## Stratified sampling
# Draw the same number of rows from every domain. The per-domain quota uses
# integer division, so when sample_size is not a multiple of the number of
# domains the batch holds fewer than sample_size rows.

n_domains = df["domain"].nunique()
if n_domains == 0:
    raise ValueError("Cannot sample: df contains no domains")

sample_per_domain = sample_size // n_domains
if sample_per_domain < 1:
    # Without this guard the sampling below would silently return an empty batch.
    raise ValueError(
        f"sample_size={sample_size} is too small to draw at least one row "
        f"from each of the {n_domains} domains"
    )

# Fail early, naming the domains, instead of surfacing pandas' generic
# "larger sample than population" error.
domain_sizes = df["domain"].value_counts()
undersized = domain_sizes[domain_sizes < sample_per_domain]
if not undersized.empty:
    raise ValueError(
        f"Domains with fewer than {sample_per_domain} rows: {undersized.to_dict()}"
    )

df_sampled = (
    df.groupby("domain", group_keys=False, sort=False)
      .sample(n=sample_per_domain, random_state=random_state_seed)
      .reset_index(drop=True)
)

# Domain normalization: "oral_history" -> "Oral History".
# Applied after sampling; the transformation is idempotent, so re-running the
# cell on already-normalized values leaves them unchanged.

df["domain"] = df["domain"].str.replace("_", " ").str.title()
df_sampled["domain"] = df_sampled["domain"].str.replace("_", " ").str.title()



In [161]:
# Diagnostics and save
# Show a preview, the per-domain share, and the batch shape, then export the
# batch as CSV without the index column.

display(
    df_sampled.head(),
    df_sampled["domain"].value_counts(normalize=True),
    df_sampled.shape,
)

df_sampled.to_csv(output_file, index=False)



Unnamed: 0,block_id,domain,context_turns,response_1,response_1_label,response_2,response_2_label,response_3,response_3_label,response_4,response_4_label
0,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,sft,<b>Elicitor</b>: You mentioned your vision. Do...,orl
1,wikinews-00000:84,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: Yes there is. Sug...,<b>Elicitor</b>: What about MySpace and YouTub...,orl,<b>Elicitor</b>: So it sounds like you're real...,prompted,<b>Elicitor</b>: Is your goal to raise money o...,sft,<b>Elicitor</b>: Why do you think there are so...,real
2,voa-news-00065:1,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: It's very importa...,<b>Elicitor</b>: So it's mainly the restrictio...,prompted,<b>Elicitor</b>: Do you have any data on how m...,sft,"<b>Elicitor</b>: So, the European Union is tal...",orl,<b>Elicitor</b>: What do you expect from Polan...,real
3,wikinews-00002:120,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: Maybe. I have a l...,"<b>Elicitor</b>: Yeah, yeah. The Southern Bapt...",real,"<b>Elicitor</b>: Yeah. I don't know, but I thi...",orl,<b>Elicitor</b>: I did. I grew up in the Midwe...,sft,"<b>Elicitor</b>: I was raised Catholic, but I ...",prompted
4,wikinews-00002:200,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I'm not sure. The...,"<b>Elicitor</b>: Yes, he did. He said, 'I thin...",orl,<b>Elicitor</b>: He's a very dynamic person to...,real,<b>Elicitor</b>: I must have made a mistake. ...,prompted,"<b>Elicitor</b>: Yeah, and he said that he was...",sft


domain
Journalistic Investigations    0.25
Judicial Proceedings           0.25
Academic Interviews            0.25
Oral History                   0.25
Name: proportion, dtype: float64

(100, 11)