# Reader Study

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import tqdm

In [17]:
def clean_date(timestamp):
    """Format a timestamp as e.g. ``March 5, 2020 - 02:07 PM``.

    The input is passed through ``pd.Timestamp``, so anything that
    constructor accepts (string, ``datetime``, ``Timestamp``) is valid.
    The day and year are taken from the attributes rather than from
    ``strftime`` so they are not zero-padded.
    """
    moment = pd.Timestamp(timestamp)
    month_name = moment.strftime("%B")
    clock = moment.strftime("%I:%M %p")
    return f"{month_name} {moment.day}, {moment.year} - {clock}"

def generate_note_title(enc_dept_name, note_type, auth_prov_type, deid_service_date):
    """Build a display title for one note from whichever fields are present.

    Fallback order (a field counts as present when it is truthy):

    * no service date            -> ``Clinical Note``
    * date only / no note type   -> ``Clinical Note | <date>``
    * note type, no department   -> ``<type> | <date>``
    * department and note type   -> ``<dept> - <type> | <date>``
    * plus author provider type  -> ``<dept> - <type> (<provider>) | <date>``

    The date is rendered with ``clean_date``.
    """
    # Without a date, every richer format is unavailable.
    if not deid_service_date:
        return "Clinical Note"

    when = clean_date(deid_service_date)

    if not note_type:
        return f"Clinical Note | {when}"
    if not enc_dept_name:
        # The provider type is only shown together with a department.
        return f"{note_type} | {when}"
    if auth_prov_type:
        return f"{enc_dept_name} - {note_type} ({auth_prov_type}) | {when}"
    return f"{enc_dept_name} - {note_type} | {when}"

def generate_note_titles(row):
    """Return one title per note for a dataset row.

    ``row`` must provide four parallel sequences under the keys
    ``enc_dept_names``, ``note_types``, ``auth_prov_types`` and
    ``deid_service_dates``. They are walked in lockstep, stopping at the
    shortest, and each tuple is handed to ``generate_note_title``.
    """
    source_columns = (
        "enc_dept_names",
        "note_types",
        "auth_prov_types",
        "deid_service_dates",
    )
    per_note_fields = zip(*(row[column] for column in source_columns))
    return [generate_note_title(*fields) for fields in per_note_fields]

# Reader Study Evaluation Dataset

In [18]:
# Load the evaluation cases and attach a list of display titles to each row.
reader_evaluation_dataset = pd.read_parquet("reader_evaluation_dataset.parquet")
reader_evaluation_dataset["note_titles"] = reader_evaluation_dataset.apply(generate_note_titles, axis=1)

# Keep only the generated indication from each model's results, renamed so
# the two models can sit side by side in one frame.
claude_results = pd.read_csv("../inference/results/reader_evaluation_dataset/claude3_5_0_250.csv")
claude_evaluation = claude_results[["llm_indication"]].rename(
    columns={"llm_indication": "llm_indication_claude"}
)

# NOTE: despite its name, this variable holds the Qwen2.5-7B-Instruct output.
qwen_results = pd.read_csv("../inference/results/reader_evaluation_dataset/Qwen_Qwen2.5-7B-Instruct_0_250.csv")
llama_evaluation = qwen_results[["llm_indication"]].rename(
    columns={"llm_indication": "llm_indication_qwen"}
)

# Column-wise concat aligns on the row index of the three frames.
reader_evaluation_dataset = pd.concat(
    [reader_evaluation_dataset, claude_evaluation, llama_evaluation],
    axis=1,
)

In [19]:
SEED = 123
indication_choices = [
    "original_history",
    "additional_history",
    "llm_indication_claude",
    "llm_indication_qwen"
]
# Destination columns: slot N names which source column is shown as indication N.
indication_choice_columns = [
    "indication1_choice",
    "indication2_choice",
    "indication3_choice",
    "indication4_choice",
]

# Seeded generator so the per-case ordering is identical every time this cell
# runs. Previously SEED was defined but never used, so each run produced a
# different randomisation.
rng = np.random.default_rng(SEED)


def randomize_indications(_):
    """Return a random ordering of ``indication_choices`` for one row.

    The row itself is ignored; the argument exists only so the function can
    be used with ``DataFrame.apply(axis=1)``. The result is a Series labelled
    with the destination column names, holding the four source column names
    in shuffled order.
    """
    choices = indication_choices.copy()
    rng.shuffle(choices)
    return pd.Series(choices, index=indication_choice_columns)


# Apply the function row-wise and assign the new columns to the DataFrame
reader_evaluation_dataset[indication_choice_columns] = \
    reader_evaluation_dataset.apply(randomize_indications, axis=1)

reader_evaluation_dataset.to_parquet("randomized_reader_study_evaluation.parquet")

# Case Assignment

In [20]:
# Case assignment for 20 readers x 25 cases, with every one of the 250 cases
# read exactly twice.
#
# X is a 10 x 10 matrix of group sizes. Cell (i, j) is 3 when (j - i) % 10 is
# in 0..4 and 2 otherwise, so every row and every column contains five 3s and
# five 2s and sums to 25.
X = [[3 if (j - i) % 10 < 5 else 2 for j in range(10)] for i in range(10)]

# Y has the same shape as X (a list of lists; first index is the row, second
# the column), but each cell holds the actual case indices: consecutive cases
# are dealt out row by row, X[i][j] at a time.
index = list(range(250))
Y = []
start_index = 0

for size_row in X:
    group_row = []
    for group_size in size_row:
        stop_index = start_index + group_size
        group_row.append(index[start_index:stop_index])
        # the next group starts where this one ended
        start_index = stop_index
    Y.append(group_row)

# Readers 1-10: each reader gets the union of the groups in one row of Y.
case_assignments = [
    [case for group in group_row for case in group]
    for group_row in Y
]

# Readers 11-20: each reader gets the union of the groups in one column of Y.
# Y2 is Y flattened in column-major order, so entries 10*c .. 10*c+9 are the
# ten groups of column c.
Y2 = [Y[r][c] for c in range(10) for r in range(10)]

for c in range(10):
    column_groups = Y2[10 * c:10 * c + 10]
    case_assignments.append([case for group in column_groups for case in group])

In [21]:
import pandas as pd

NUM_READERS = 20
WORDS_PER_LINE = 10
LINES_PER_BLOCK = 20

# For every reader, write one directory per assigned case ("set") containing:
#   exam.txt            - the exam type
#   note1..note10.txt   - title plus re-wrapped note text (empty file if absent)
#   indication1..4.txt  - the four indications in that case's randomised order
for reader_idx in tqdm.tqdm(range(NUM_READERS)):
    reader_evaluation_dataset_subset = (
        reader_evaluation_dataset
        .iloc[case_assignments[reader_idx]]
        .reset_index(drop=True)
    )

    # After reset_index the row label equals the row's position in the subset.
    for set_idx, row in reader_evaluation_dataset_subset.iterrows():
        basepath = f"/mnt/sohn2022/Adrian/rad-llm-pmhx/dataset/reader_study/evaluation/public/user{reader_idx+1}/set{set_idx+1}"
        os.makedirs(basepath, exist_ok=True)

        with open(f"{basepath}/exam.txt", "w", encoding="utf-8") as f:
            f.write(str(row["exam_type"]))

        note_titles = row["note_titles"]
        note_texts = row["note_texts"]

        # Always create exactly ten note files; slots beyond the available
        # notes are left as empty files.
        for note_no in range(1, 11):
            slot = note_no - 1
            with open(f"{basepath}/note{note_no}.txt", "w", encoding="utf-8") as f:
                if slot >= len(note_titles) or slot >= len(note_texts):
                    continue

                f.write(note_titles[slot] + "\n")

                # Re-flow the text to WORDS_PER_LINE words per line, with a
                # blank line after every LINES_PER_BLOCK lines.
                words = note_texts[slot].split()
                wrapped_lines = [
                    " ".join(words[start:start + WORDS_PER_LINE])
                    for start in range(0, len(words), WORDS_PER_LINE)
                ]
                for line_no, line in enumerate(wrapped_lines, start=1):
                    f.write(line + "\n")
                    if line_no % LINES_PER_BLOCK == 0:
                        f.write("\n")

        # indicationN_choice holds the NAME of the column whose text is shown
        # in slot N, hence the double lookup.
        for indication_no in range(1, 5):
            with open(f"{basepath}/indication{indication_no}.txt", "w", encoding="utf-8") as f:
                f.write(row[row[f"indication{indication_no}_choice"]])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:29<00:00,  1.49s/it]
