In [117]:
# ---------------- Imports ----------------
import os
import pandas as pd
import json
import re
from datetime import datetime

import yaml



In [118]:
# ---------------- Args ----------------
# Base name (without extension) of the raw batch-results CSV to process.
# The input path and every output filename in this notebook are derived from it.
results_file_name = "Batch-5365353-batch-results"




In [119]:
# ---------------- Config ----------------
# Shared project settings; the path is resolved relative to the working directory.
with open("../../../config/config.yaml", "r") as config_fh:
    config = yaml.safe_load(config_fh)

paths_cfg = config["paths"]
proj_store = paths_cfg["proj_store"]
models_folderpath = paths_cfg["models"]

# Run timestamp in the form YYYYMMDDtHHMMSS.
timestamp = datetime.now().strftime("%Y%m%dt%H%M%S")

# Every human-evaluation artefact lives under one directory of the project store.
human_eval_dir = os.path.join(proj_store, "evaluation", "human-evaluation")

# Input: the raw batch export.
results_dir = os.path.join(human_eval_dir, "raw-results")
results_file = os.path.join(results_dir, results_file_name + ".csv")

# Output: the formatted results table.
formatted_results_dir = os.path.join(human_eval_dir, "formatted-results")
formatted_results_file = os.path.join(
    formatted_results_dir, results_file_name + "-formatted.csv"
)

# Output: the aggregated analysis tables.
analysis_results_dir = os.path.join(human_eval_dir, "analysis")

# Make sure both output directories exist before anything is written to them.
for out_dir in (formatted_results_dir, analysis_results_dir):
    os.makedirs(out_dir, exist_ok=True)



## Formatting

In [120]:
# Load the raw batch export, then preview its columns, its shape and its first two rows.
results_table = pd.read_csv(results_file)

display(results_table.columns, results_table.shape, results_table.head(2))


Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.block_id',
       'Input.domain', 'Input.context_turns', 'Input.response_1',
       'Input.response_1_label', 'Input.response_2', 'Input.response_2_label',
       'Input.response_3', 'Input.response_3_label', 'Input.response_4',
       'Input.response_4_label', 'Answer.taskAnswers', 'Approve', 'Reject'],
      dtype='object')

(300, 41)

Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,Reward,CreationTime,MaxAssignments,RequesterAnnotation,AssignmentDurationInSeconds,...,Input.response_1_label,Input.response_2,Input.response_2_label,Input.response_3,Input.response_3_label,Input.response_4,Input.response_4_label,Answer.taskAnswers,Approve,Reject
0,30OITAWPCG17BTMN0FF5BQDRSC79HI,3KQ8X25AAV07QCNWJRVYKJQMMIV67J,Judge Language Model Responses (<2 mins per task),Score short model generated text based on prov...,"model response quality, language models",$0.25,Wed Dec 31 15:12:03 PST 2025,3,BatchId:5365353;OriginalHitTemplateId:928390905;,10800,...,real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,sft,<b>Elicitor</b>: You mentioned your vision. Do...,orl,"[{""Conformity1"":{""1"":false,""2"":false,""3"":false...",,
1,30OITAWPCG17BTMN0FF5BQDRSC79HI,3KQ8X25AAV07QCNWJRVYKJQMMIV67J,Judge Language Model Responses (<2 mins per task),Score short model generated text based on prov...,"model response quality, language models",$0.25,Wed Dec 31 15:12:03 PST 2025,3,BatchId:5365353;OriginalHitTemplateId:928390905;,10800,...,real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,sft,<b>Elicitor</b>: You mentioned your vision. Do...,orl,"[{""Conformity1"":{""1"":false,""2"":true,""3"":false,...",,


In [121]:
# Keep only the identifiers, the task inputs and the raw answer payload.
# Each of the four response slots contributes its text column and its label column.
response_columns = [
    f"Input.response_{slot}{suffix}"
    for slot in range(1, 5)
    for suffix in ("", "_label")
]
keep_columns = [
    "HITId",
    "AssignmentId",
    "Input.block_id",
    "Input.domain",
    "Input.context_turns",
    *response_columns,
    "Answer.taskAnswers",
]
results_table = results_table[keep_columns]

display(results_table.head(2))

Unnamed: 0,HITId,AssignmentId,Input.block_id,Input.domain,Input.context_turns,Input.response_1,Input.response_1_label,Input.response_2,Input.response_2_label,Input.response_3,Input.response_3_label,Input.response_4,Input.response_4_label,Answer.taskAnswers
0,30OITAWPCG17BTMN0FF5BQDRSC79HI,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,sft,<b>Elicitor</b>: You mentioned your vision. Do...,orl,"[{""Conformity1"":{""1"":false,""2"":false,""3"":false..."
1,30OITAWPCG17BTMN0FF5BQDRSC79HI,3YT88D1N1ZWFL1W9FTMULPMT2DM3KQ,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,sft,<b>Elicitor</b>: You mentioned your vision. Do...,orl,"[{""Conformity1"":{""1"":false,""2"":true,""3"":false,..."


In [122]:
# Response slot number (the trailing digit of a metric name such as "Conformity1")
# -> name of the Input column holding that response.
suffix_to_input = {slot: f"Input.response_{slot}" for slot in range(1, 5)}

def parse_task_answers(task_str):
    """Decode one ``Answer.taskAnswers`` cell into ``{metric: chosen score}``.

    The payload is a JSON object (in this batch wrapped in a one-element list)
    that maps each metric name to a dict of ``{"<score>": bool}`` flags.

    Parameters
    ----------
    task_str : str or missing value
        Raw JSON text of the cell. A missing value (NaN / None) is accepted.

    Returns
    -------
    dict
        Metric name -> the first score whose flag is truthy, as an ``int``,
        or ``None`` when no flag is set for that metric. Empty dict when the
        cell is missing or the payload is an empty list.

    Raises
    ------
    json.JSONDecodeError
        If ``task_str`` is not valid JSON.
    ValueError
        If the selected score key is not an integer literal.
    """
    if pd.isna(task_str):
        return {}
    data = json.loads(task_str)
    if isinstance(data, list):
        # Unwrap the single answers object. An empty list carries no answers;
        # previously it fell through and crashed on ``list.items()``.
        data = data[0] if data else {}
    # for each metric dict, pick the key where value is True and cast to int
    out = {}
    for metric, choices in data.items():
        chosen = next((k for k, v in choices.items() if v), None)
        out[metric] = int(chosen) if chosen is not None else None
    return out

# 1) Parse every answer payload, then spread the resulting dicts into one column
#    per metric (Conformity1, Control2, ...).
parsed_answers = results_table["Answer.taskAnswers"].apply(parse_task_answers)
expanded = parsed_answers.apply(pd.Series)

# 2) rename using the suffix->Input mapping
def rename_with_input(col):
    """Prefix a metric column with the Input column its trailing number refers to.

    ``col`` is returned unchanged when it does not end in digits or when the
    number has no entry in ``suffix_to_input``; otherwise the result is
    ``"<input column>.<col>"``.
    """
    trailing_number = re.search(r'(\d+)$', col)
    if trailing_number is None:
        return col
    prefix = suffix_to_input.get(int(trailing_number.group(1)))
    if not prefix:
        return col
    return f"{prefix}.{col}"

# Tag every expanded metric column with the response it belongs to.
expanded.columns = list(map(rename_with_input, expanded.columns))

# 3) Put the metric columns next to the original table, minus the raw JSON payload.
without_raw_answers = results_table.drop(columns=["Answer.taskAnswers"])
new_df = pd.concat([without_raw_answers, expanded], axis=1)

display(new_df.head(2))




Unnamed: 0,HITId,AssignmentId,Input.block_id,Input.domain,Input.context_turns,Input.response_1,Input.response_1_label,Input.response_2,Input.response_2_label,Input.response_3,...,Input.response_3.GoalRel3,Input.response_4.GoalRel4,Input.response_1.Probing1,Input.response_2.Probing2,Input.response_3.Probing3,Input.response_4.Probing4,Input.response_1.Progression1,Input.response_2.Progression2,Input.response_3.Progression3,Input.response_4.Progression4
0,30OITAWPCG17BTMN0FF5BQDRSC79HI,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,...,5,4,4,5,4,5,5,4,4,4
1,30OITAWPCG17BTMN0FF5BQDRSC79HI,3YT88D1N1ZWFL1W9FTMULPMT2DM3KQ,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,...,4,4,1,4,3,3,2,4,4,4


In [123]:
# Remove the leading "Input." from all column names.
# The pattern is a raw string: "\." inside a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
new_df.columns = new_df.columns.str.replace(r"^Input\.?", "", regex=True)

# Move the response number to the end of every per-response column so they all
# follow the "<stub>_<n>" shape that the later wide-to-long reshape splits on:
#   response_1.Conformity1 -> response.Conformity_1
#   response_1_label       -> response.label_1
# Columns that are not listed (ids, context, the response texts) keep their names.
metric_names = ["Conformity", "Control", "GoalRel", "Probing", "Progression"]
column_renames = {}
for slot in range(1, 5):
    column_renames[f"response_{slot}_label"] = f"response.label_{slot}"
    for metric in metric_names:
        column_renames[f"response_{slot}.{metric}{slot}"] = f"response.{metric}_{slot}"

new_df = new_df.rename(columns=column_renames)


display(new_df.columns)
display(new_df.head(2))




Index(['HITId', 'AssignmentId', 'block_id', 'domain', 'context_turns',
       'response_1', 'response.label_1', 'response_2', 'response.label_2',
       'response_3', 'response.label_3', 'response_4', 'response.label_4',
       'response.Conformity_1', 'response.Conformity_2',
       'response.Conformity_3', 'response.Conformity_4', 'response.Control_1',
       'response.Control_2', 'response.Control_3', 'response.Control_4',
       'response.GoalRel_1', 'response.GoalRel_2', 'response.GoalRel_3',
       'response.GoalRel_4', 'response.Probing_1', 'response.Probing_2',
       'response.Probing_3', 'response.Probing_4', 'response.Progression_1',
       'response.Progression_2', 'response.Progression_3',
       'response.Progression_4'],
      dtype='object')

Unnamed: 0,HITId,AssignmentId,block_id,domain,context_turns,response_1,response.label_1,response_2,response.label_2,response_3,...,response.GoalRel_3,response.GoalRel_4,response.Probing_1,response.Probing_2,response.Probing_3,response.Probing_4,response.Progression_1,response.Progression_2,response.Progression_3,response.Progression_4
0,30OITAWPCG17BTMN0FF5BQDRSC79HI,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,...,5,4,4,5,4,5,5,4,4,4
1,30OITAWPCG17BTMN0FF5BQDRSC79HI,3YT88D1N1ZWFL1W9FTMULPMT2DM3KQ,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,<b>Elicitor</b>: It sounds like you've had a l...,prompted,<b>Elicitor</b>: Can you walk at all?,...,4,4,1,4,3,3,2,4,4,4


In [124]:
# Reorder the columns so that each response's text, label and five scores sit
# next to each other, response 1 first and response 4 last.
leading_columns = ['AssignmentId', 'HITId', 'block_id', 'domain', 'context_turns']
score_names = ['Conformity', 'Control', 'GoalRel', 'Probing', 'Progression']

arranged_columns = list(leading_columns)
for slot in range(1, 5):
    arranged_columns.append(f'response_{slot}')
    arranged_columns.append(f'response.label_{slot}')
    arranged_columns.extend(f'response.{score}_{slot}' for score in score_names)

new_df_arrange = new_df[arranged_columns]


display(new_df_arrange.head(5))



Unnamed: 0,AssignmentId,HITId,block_id,domain,context_turns,response_1,response.label_1,response.Conformity_1,response.Control_1,response.GoalRel_1,...,response.GoalRel_3,response.Probing_3,response.Progression_3,response_4,response.label_4,response.Conformity_4,response.Control_4,response.GoalRel_4,response.Probing_4,response.Progression_4
0,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,4,4,5,...,5,4,4,<b>Elicitor</b>: You mentioned your vision. Do...,orl,4,4,4,5,4
1,3YT88D1N1ZWFL1W9FTMULPMT2DM3KQ,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,2,2,2,...,4,3,4,<b>Elicitor</b>: You mentioned your vision. Do...,orl,4,3,4,3,4
2,3RSDURM971KDF41N8S4HDO51RHKYE6,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,2,2,2,...,4,3,4,<b>Elicitor</b>: You mentioned your vision. Do...,orl,4,4,4,3,4
3,33CKWXB74AIH5XW8KF2TSHBXVN811T,385MDVING2DEEVULHQSI1N1H35VWJZ,wikinews-00000:84,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: Yes there is. Sug...,<b>Elicitor</b>: What about MySpace and YouTub...,orl,4,3,3,...,3,4,4,<b>Elicitor</b>: Why do you think there are so...,real,3,4,3,3,4
4,3F0BG9B9NGLS6PMDLKQB7EDK5UGY7K,385MDVING2DEEVULHQSI1N1H35VWJZ,wikinews-00000:84,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: Yes there is. Sug...,<b>Elicitor</b>: What about MySpace and YouTub...,orl,3,2,3,...,5,3,5,<b>Elicitor</b>: Why do you think there are so...,real,4,3,4,3,4


In [125]:
# Reshape from wide to long: the numbered column groups (<stub>_1 ... <stub>_4)
# are stacked so that every response gets its own row.
id_vars = ["AssignmentId", "HITId", "block_id", "domain", "context_turns"]
stub_columns = [
    "response",
    "response.label",
    "response.Conformity",
    "response.Control",
    "response.GoalRel",
    "response.Probing",
    "response.Progression",
]

long_df = (
    pd.wide_to_long(
        new_df_arrange,
        stubnames=stub_columns,
        i=id_vars,
        j="response_number",
        sep="_",
        suffix=r"\d+",
    )
    .reset_index()
    # Drop response_number: which slot (1-4) a response came from is not kept.
    .drop(columns=["response_number"])
)

display(long_df.head())




Unnamed: 0,AssignmentId,HITId,block_id,domain,context_turns,response,response.label,response.Conformity,response.Control,response.GoalRel,response.Probing,response.Progression
0,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,4,4,5,4,5
1,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,<b>Elicitor</b>: It sounds like you've had a l...,prompted,4,4,5,5,4
2,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,<b>Elicitor</b>: Can you walk at all?,sft,5,5,5,4,4
3,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,<b>Elicitor</b>: You mentioned your vision. Do...,orl,4,4,4,5,4
4,3YT88D1N1ZWFL1W9FTMULPMT2DM3KQ,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,2,2,2,1,2


In [126]:
# Remove the leading "response." (with the dot) from all column names.
# - Raw string: "\." in a plain string literal is an invalid escape sequence
#   and triggers a warning on recent Python versions.
# - The ^ anchor limits the strip to the prefix, so the bare "response" column
#   and any name that merely contains "response." further in are left alone.
long_df.columns = long_df.columns.str.replace(r"^response\.", "", regex=True)

display(long_df.head(2))


Unnamed: 0,AssignmentId,HITId,block_id,domain,context_turns,response,label,Conformity,Control,GoalRel,Probing,Progression
0,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,"<b>Elicitor</b>: It looks very painful, your l...",real,4,4,5,4,5
1,3MTMREQS5MG60MPBDMK7W8UUVUNAWP,30OITAWPCG17BTMN0FF5BQDRSC79HI,wikinews-00000:38,Journalistic Investigations,<ul>\n<li><b>Respondent</b>: I have very littl...,<b>Elicitor</b>: It sounds like you've had a l...,prompted,4,4,5,5,4


In [127]:
# Order the rows by block_id, then by label within each block (both ascending;
# pass ascending=False for descending), and renumber the index from 0.
sort_keys = ["block_id", "label"]
sorted_df = long_df.sort_values(by=sort_keys, ascending=True).reset_index(drop=True)

display(sorted_df.columns, sorted_df.head())



Index(['AssignmentId', 'HITId', 'block_id', 'domain', 'context_turns',
       'response', 'label', 'Conformity', 'Control', 'GoalRel', 'Probing',
       'Progression'],
      dtype='object')

Unnamed: 0,AssignmentId,HITId,block_id,domain,context_turns,response,label,Conformity,Control,GoalRel,Probing,Progression
0,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,4,5,4,5,5
1,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,5,5,4,5,4
2,3VELCLL3HBH8VCQGREHEHJFRWXR1FU,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,1,5,3,1,3
3,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,prompted,4,4,4,5,5
4,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,prompted,5,5,5,5,5


In [128]:
# Switch the remaining CamelCase columns to snake_case; columns that are
# already snake_case (block_id, domain, context_turns, response, label) are
# not listed and therefore keep their names.
snake_case_names = {
    'AssignmentId': 'assignment_id',
    'HITId': 'hit_id',
    'Conformity': 'conformity',
    'Control': 'control',
    'GoalRel': 'goal_rel',
    'Probing': 'probing',
    'Progression': 'progression',
}
sorted_df = sorted_df.rename(columns=snake_case_names)

display(sorted_df.head())


Unnamed: 0,assignment_id,hit_id,block_id,domain,context_turns,response,label,conformity,control,goal_rel,probing,progression
0,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,4,5,4,5,5
1,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,5,5,4,5,4
2,3VELCLL3HBH8VCQGREHEHJFRWXR1FU,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,orl,1,5,3,1,3
3,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,prompted,4,4,4,5,5
4,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,prompted,5,5,5,5,5


In [129]:
# Persist the formatted long table.
# NOTE(review): index=True writes the positional row index as an unnamed first
# column, whereas the analysis CSVs in this notebook are written with
# index=False. Confirm that downstream readers expect that extra column.
sorted_df.to_csv(formatted_results_file, header=True, index=True)



## Analysis

In [130]:
# Clean up the label column for reporting:
#   1. underscores become hyphens,
#   2. every word is title-cased,
#   3. the acronyms that title-casing turned into "Sft" / "Orl" are upper-cased again.
label_values = sorted_df["label"].str.replace("_", "-", regex=False).str.title()
for titled, acronym in (("Sft", "SFT"), ("Orl", "ORL")):
    label_values = label_values.str.replace(titled, acronym, regex=False)
sorted_df["label"] = label_values

# Human-readable column headers for the analysis tables.
display_names = {
    'domain': 'Domain',
    'label': 'Label',
    'conformity': 'Conformity',
    'control': 'Conversational Control',
    'goal_rel': 'Outcome Relevance',
    'probing': 'Probing Effectiveness',
    'progression': 'Progression',
}
sorted_df = sorted_df.rename(columns=display_names)

display(sorted_df.head())


Unnamed: 0,assignment_id,hit_id,block_id,Domain,context_turns,response,Label,Conformity,Conversational Control,Outcome Relevance,Probing Effectiveness,Progression
0,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,ORL,4,5,4,5,5
1,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,ORL,5,5,4,5,4
2,3VELCLL3HBH8VCQGREHEHJFRWXR1FU,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: Do you remember how you felt ...,ORL,1,5,3,1,3
3,3WT783CTQ2F25YOY29K2FP7XLWQBCC,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,Prompted,4,4,4,5,5
4,3TDXMTX3D2SNCPT3YSZGFCJ0K0Z6IO,3I01FDIL7C6FZTWMTAH0AIQFUE42DA,covid-19-threshold-00008:47,Academic Interviews,<ul>\n<li><b>Respondent</b>: So when we were o...,<b>Elicitor</b>: So you were able to manage th...,Prompted,5,5,5,5,5


In [131]:

# Metric columns of the exported tables. The two orders differ and are kept
# exactly as they were so the layout of the written CSV files does not change.
per_label_metrics = [
    "Conformity",
    "Progression",
    "Outcome Relevance",
    "Probing Effectiveness",
    "Conversational Control",
]
all_labels_metrics = [
    "Progression",
    "Conversational Control",
    "Outcome Relevance",
    "Probing Effectiveness",
    "Conformity",
]


def summarize_by_domain(frame, metrics):
    """Return the per-Domain mean of each metric, rounded to 4 decimals.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a "Domain" column and every column named in ``metrics``.
    metrics : list of str
        Metric columns to average; the result keeps this column order.

    Returns
    -------
    pd.DataFrame
        One row per domain: a "Domain" column followed by the metric means.
    """
    domain_means = frame.groupby("Domain")[metrics].mean().round(4)
    # reset_index turns the "Domain" group keys back into an ordinary column.
    return domain_means.reset_index()


# Get the list of unique labels. Missing labels are skipped: they can never
# match the == filter below and have no usable file name.
labels = sorted_df["Label"].dropna().unique()

tables = {}
for lbl in labels:
    subset = sorted_df[sorted_df["Label"] == lbl]
    summary = summarize_by_domain(subset, per_label_metrics)
    tables[lbl] = summary

    # Export directly to CSV (one file per label)
    clean_lbl = lbl.lower().replace("-", "_")
    filename = f"{results_file_name}-{clean_lbl}.csv"
    summary.to_csv(os.path.join(analysis_results_dir, filename), index=False)
    print(f"Saved {filename}")


# -------------------------------
# Create the "all" table (ignore labels)
# -------------------------------
all_summary = summarize_by_domain(sorted_df, all_labels_metrics)
tables["all"] = all_summary

# Export "all" table
all_filename = f"{results_file_name}-all.csv"
all_summary.to_csv(os.path.join(analysis_results_dir, all_filename), index=False)
print(f"Saved {all_filename}")


# Example: display the "all" table
display("Table for all (no label distinction):")
display(tables["all"])


Saved Batch-5365353-batch-results-orl.csv
Saved Batch-5365353-batch-results-prompted.csv
Saved Batch-5365353-batch-results-real.csv
Saved Batch-5365353-batch-results-sft.csv
Saved Batch-5365353-batch-results-all.csv


'Table for all (no label distinction):'

Unnamed: 0,Domain,Progression,Conversational Control,Outcome Relevance,Probing Effectiveness,Conformity
0,Academic Interviews,4.0133,3.9633,4.0833,3.91,4.1167
1,Journalistic Investigations,3.7333,3.5667,3.8233,3.53,3.8733
2,Judicial Proceedings,3.8033,3.7767,4.1067,3.7533,4.0267
3,Oral History,3.9467,3.8233,4.06,3.7167,4.1067


In [132]:
# Define the desired order
label_order = ["Real", "Prompted", "SFT", "ORL"]

# pd.Categorical turns every value that is not listed in `categories` into NaN,
# and the groupby below would then silently leave those rows out of the
# summary. Fail loudly instead so that a new label cannot go unnoticed.
unexpected_labels = set(sorted_df["Label"].dropna().unique()) - set(label_order)
if unexpected_labels:
    raise ValueError(
        f"Labels missing from label_order: {sorted(unexpected_labels)}"
    )

# Convert Label column to categorical with the custom order
sorted_df["Label"] = pd.Categorical(sorted_df["Label"], categories=label_order, ordered=True)

# Group by label and calculate mean of each metric (rounded to 4 decimals).
# observed=True restricts the result to labels that actually occur.
summary = (
    sorted_df.groupby(
        "Label", observed=True
    )[["Progression", "Conversational Control", "Outcome Relevance", "Probing Effectiveness", "Conformity"]]
      .mean()
      .round(4)
      .reset_index()
)

filename = f"{results_file_name}-per-label.csv"
summary.to_csv(os.path.join(analysis_results_dir, filename), index=False)

display(summary)


Unnamed: 0,Label,Progression,Conversational Control,Outcome Relevance,Probing Effectiveness,Conformity
0,Real,3.8933,3.7467,3.9233,3.7133,3.97
1,Prompted,4.25,4.0733,4.25,4.1733,4.2033
2,SFT,3.65,3.64,4.01,3.57,3.9567
3,ORL,3.7033,3.67,3.89,3.4533,3.9933
