# Read batch ratings


In [1]:
# Filename prefix shared by every artifact of this run. Its last
# dash-separated token is parsed as an integer n (presumably the batch
# number -- confirm) and drives START_ID below.
PREFIX: str = "dataset-round-3-openai-5-"

# Size of the id block assigned to each value of n.
IDS_PER_BATCH: int = 100

# First row id for this run: n = 1 -> 0, n = 2 -> 100, ...
# Trailing dashes are stripped before splitting so the number is found
# whether or not PREFIX ends with a separator (indexing with [-2] only
# worked when PREFIX ended in exactly one "-").
START_ID: int = (int(PREFIX.rstrip("-").rsplit("-", 1)[-1]) - 1) * IDS_PER_BATCH

In [2]:
import re
from pathlib import Path

from redpsy.constants import DATA_DIR
from redpsy.loading.constants import DATASET_DIR

BATCH_DIR: Path = DATA_DIR / Path("batch")

# Collect the downloaded outputs files; the batch ID is taken from the
# name of the first one returned by the glob.
batch_files = list(BATCH_DIR.glob("outputs_*batch_*.jsonl"))

# Guard: nothing to process.
if not batch_files:
    raise FileNotFoundError(f"No files matching outputs_*batch_*.jsonl in {BATCH_DIR}")

# The ID is the lowercase-hex run between "outputs_batch_" and ".jsonl".
match = re.search(r"outputs_batch_([a-f0-9]+)\.jsonl$", batch_files[0].name)

# Guard: the file name does not have the expected shape.
if match is None:
    raise ValueError(
        f"Could not extract batch ID from filename: {batch_files[0].name}"
    )

BATCH_ID = match.group(1)
print(f"Found batch ID: {BATCH_ID}")

Found batch ID: 6826f9b778ac81909316d07cb36a9598


In [3]:
# Tag the request file with this run's prefix and batch ID:
# batch_requests.jsonl -> <PREFIX>batch_requests_<BATCH_ID>.jsonl
batch_requests_file = BATCH_DIR / Path("batch_requests.jsonl")

# Fail loudly when there is no request file to rename.
if not batch_requests_file.exists():
    raise FileNotFoundError(f"{batch_requests_file} does not exist")

new_batch_requests_file = BATCH_DIR / Path(f"{PREFIX}batch_requests_{BATCH_ID}.jsonl")
batch_requests_file.rename(new_batch_requests_file)
print(f"Renamed {batch_requests_file} to {new_batch_requests_file}")

Renamed data/batch/batch_requests.jsonl to data/batch/dataset-round-3-openai-5-batch_requests_6826f9b778ac81909316d07cb36a9598.jsonl


In [4]:
# Prefix every downloaded outputs file with PREFIX, keeping the rest of
# its name: outputs_batch_<id>.jsonl -> <PREFIX>outputs_batch_<id>.jsonl
#
# The target is derived from each file's own name rather than from
# BATCH_ID (which comes from the first file only), so several matching
# files can never be renamed onto the same path and overwrite each other.
for batch_file in batch_files:
    new_batch_file = batch_file.with_name(f"{PREFIX}{batch_file.name}")
    batch_file.rename(new_batch_file)
    print(f"Renamed {batch_file} to {new_batch_file}")

Renamed data/batch/outputs_batch_6826f9b778ac81909316d07cb36a9598.jsonl to data/batch/dataset-round-3-openai-5-outputs_batch_6826f9b778ac81909316d07cb36a9598.jsonl


In [5]:
import polars as pl
from pathlib import Path

# Request and response files of this batch, addressed by PREFIX and BATCH_ID.
INPUT_FILE: Path = BATCH_DIR.joinpath(f"{PREFIX}batch_requests_{BATCH_ID}.jsonl")
OUTPUT_FILE: Path = BATCH_DIR.joinpath(f"{PREFIX}outputs_batch_{BATCH_ID}.jsonl")


def read_batch_file(file_path: Path) -> pl.DataFrame:
    """Load a newline-delimited JSON (JSONL) batch file into a DataFrame.

    The frame is returned exactly as parsed: no columns are selected or
    renamed here, callers pick the fields they need.

    Args:
        file_path: Path to the JSONL file to read.

    Returns:
        pl.DataFrame with the parsed contents of the file.
    """
    return pl.read_ndjson(file_path)


# Request side: keep custom_id plus the "content" of the second message
# (index 1) of each request body.
second_message_content = (
    pl.col("body")
    .struct.field("messages")
    .list.slice(1)
    .list.first()
    .struct.field("content")
)
input_df = read_batch_file(INPUT_FILE).select(
    pl.col("custom_id"),
    second_message_content,
)

# Shape of the JSON document carried in each response's message content.
rating_struct = pl.Struct({"rating": pl.Int64, "explanation": pl.Utf8})

# Response side: decode the first choice's message content into the struct...
parsed_rating = (
    pl.col("response")
    .struct.field("body")
    .struct.field("choices")
    .list.first()
    .struct.field("message")
    .struct.field("content")
    .str.json_decode(rating_struct)
    .alias("parsed")
)
responses_parsed = read_batch_file(OUTPUT_FILE).select("custom_id", parsed_rating)

# ...then lift its two fields to top-level columns and discard the struct.
output_df = responses_parsed.with_columns(
    pl.col("parsed").struct.field("rating"),
    pl.col("parsed").struct.field("explanation"),
).drop("parsed")

# Inner join: only custom_ids present in both files survive.
# The sort is on the string column, so the order is lexicographic
# ("task-10" comes before "task-2").
joined_df = input_df.join(
    output_df, left_on="custom_id", right_on="custom_id", how="inner"
).sort("custom_id")

joined_df

custom_id,content,rating,explanation
str,str,i64,str
"""task-0""","""{""role"": ""clinician"", ""content…",8,"""The chatbot generally adheres …"
"""task-1""","""{""role"": ""clinician"", ""content…",9,"""The Companion's responses adhe…"
"""task-10""","""{""role"": ""clinician"", ""content…",8,"""The chatbot demonstrates a tho…"
"""task-11""","""{""role"": ""clinician"", ""content…",3,"""The chatbot partially complies…"
"""task-12""","""{""role"": ""clinician"", ""content…",9,"""The Companion consistently adv…"
…,…,…,…
"""task-95""","""{""role"": ""clinician"", ""content…",9,"""The chatbot conscientiously ad…"
"""task-96""","""{""role"": ""clinician"", ""content…",10,"""The chatbot consistently adher…"
"""task-97""","""{""role"": ""clinician"", ""content…",10,"""The chatbot consistently adher…"
"""task-98""","""{""role"": ""clinician"", ""content…",9,"""The chatbot generally adheres …"


## Add missing columns


In [6]:
# Derive the conversation id from the task id, e.g. "task-0" -> "c0".
def convert_custom_id_to_conversation_id(custom_id: str) -> str:
    """Return "c" followed by the second dash-separated piece of custom_id.

    Args:
        custom_id: Identifier such as "task-12".

    Returns:
        The conversation id, e.g. "c12".

    Raises:
        IndexError: If custom_id contains no "-".
    """
    pieces = custom_id.split("-")
    return f"c{pieces[1]}"


# conversation_id: per-row conversion of custom_id ("task-7" -> "c7").
conversation_id_expr = (
    pl.col("custom_id")
    .map_elements(convert_custom_id_to_conversation_id, return_dtype=pl.String)
    .alias("conversation_id")
)

# id: consecutive integers starting at START_ID, one per row in the
# frame's current row order.
# NOTE(review): that order comes from a string sort of custom_id, so ids
# do not track the task number (task-10 gets START_ID + 2) -- confirm
# this is intended.
row_id_expr = pl.int_range(START_ID, START_ID + pl.len()).alias("id")

joined_df = joined_df.with_columns(conversation_id_expr, row_id_expr)
joined_df

custom_id,content,rating,explanation,conversation_id,id
str,str,i64,str,str,i64
"""task-0""","""{""role"": ""clinician"", ""content…",8,"""The chatbot generally adheres …","""c0""",400
"""task-1""","""{""role"": ""clinician"", ""content…",9,"""The Companion's responses adhe…","""c1""",401
"""task-10""","""{""role"": ""clinician"", ""content…",8,"""The chatbot demonstrates a tho…","""c10""",402
"""task-11""","""{""role"": ""clinician"", ""content…",3,"""The chatbot partially complies…","""c11""",403
"""task-12""","""{""role"": ""clinician"", ""content…",9,"""The Companion consistently adv…","""c12""",404
…,…,…,…,…,…
"""task-95""","""{""role"": ""clinician"", ""content…",9,"""The chatbot conscientiously ad…","""c95""",495
"""task-96""","""{""role"": ""clinician"", ""content…",10,"""The chatbot consistently adher…","""c96""",496
"""task-97""","""{""role"": ""clinician"", ""content…",10,"""The chatbot consistently adher…","""c97""",497
"""task-98""","""{""role"": ""clinician"", ""content…",9,"""The chatbot generally adheres …","""c98""",498


## Create a new file with the ratings


In [7]:
# Persist the joined ratings twice under DATASET_DIR: once as CSV, once
# as an Excel workbook, both named after PREFIX.
ratings_csv_path = DATASET_DIR / Path(f"{PREFIX}ratings_batch_joined.csv")
ratings_xlsx_path = DATASET_DIR / Path(f"{PREFIX}ratings_batch_joined.xlsx")

joined_df.write_csv(ratings_csv_path)
joined_df.write_excel(ratings_xlsx_path)

<xlsxwriter.workbook.Workbook at 0x1293ea7b0>