In [9]:
import json
import re

In [2]:
# Load the validation questions (JSON Lines: one JSON object per line).
# Blank lines are skipped so a stray empty line does not raise
# json.JSONDecodeError; the encoding is pinned so decoding does not depend on
# the platform default.
with open('data/curebench_valset_pharse1.jsonl', 'r', encoding='utf-8') as f:
    val_data = [json.loads(line) for line in f if line.strip()]

# Load the annotated results (a single JSON document).
with open('data/curebench_results_validation.json', 'r', encoding='utf-8') as f:
    ann_data = json.load(f)

In [3]:
# Peek at the first record of each dataset to compare their fields.
val_data[0], ann_data[0]

({'id': 'U9PHZ83RKYV8',
  'question_type': 'multi_choice',
  'question': 'Which drug brand name is associated with the treatment of acne?',
  'correct_answer': 'A',
  'options': {'A': 'Salicylic Acid',
   'B': 'Minoxidil',
   'C': 'Ketoconazole',
   'D': 'Fluocinonide'}},
 {'id': 'U9PHZ83RKYV8',
  'question_type': 'multi_choice',
  'question': 'Which drug brand name is associated with the treatment of acne?',
  'options': {'A': 'Salicylic Acid',
   'B': 'Minoxidil',
   'C': 'Ketoconazole',
   'D': 'Fluocinonide'},
  'llm_answer': "The question asks for the drug brand name associated with the treatment of acne. Among the given options, 'Salicylic Acid' is commonly used in acne treatment products.\n\n[FinalAnswer] A: Salicylic Acid. Salicylic Acid is a well-known treatment for acne, often found in over-the-counter acne treatment products. It helps to exfoliate the skin and unclog pores, making it effective in managing acne symptoms. [#: 1]",
  'output': '{\'id\': \'chatcmpl-5de9dda9-cce9

In [27]:
# Sanity check: the text after the first [FinalAnswer] tag of record 0.
# (Not displayed, because it is not the last expression of the cell.)
ann_data[0]['llm_answer'].split('[FinalAnswer]')[1]

# Captures the option letter, a colon and the text that follows a
# [FinalAnswer] tag, up to the next '.' or newline.
answer_pattern = r'\[FinalAnswer\]\s*([A-Z]:\s*[^.\n]+)'
# Same expression as answer_pattern, so it is defined as an alias of it.
remain_pattern = answer_pattern
# Matches a bare [FinalAnswer] tag.
second_remain_pattern = r'\[FinalAnswer\]'

In [25]:
# Inspect the raw answer of record 1 (its text ends with a second [FinalAnswer] tag).
ann_data[1]['llm_answer']

"Patients experiencing severe allergic reactions during or after receiving fosaprepitant for injection should seek immediate medical attention. This is crucial to ensure their safety and receive appropriate treatment. Therefore, the correct option is 'B': 'Inform their healthcare provider immediately and seek emergency medical care.'.\n\n[FinalAnswer] B: 'Inform their healthcare provider immediately and seek emergency medical care.' Patients experiencing severe allergic reactions during or after receiving fosaprepitant for injection should inform their healthcare provider immediately and seek emergency medical care. This is essential to manage the reaction effectively and ensure their safety. [FinalAnswer]"

In [31]:
def process_text(text):
    """Turn a raw LLM answer into plain reasoning text.

    The first ``[FinalAnswer]`` tag is replaced by the phrase
    ``The correct answer is``; any further ``[FinalAnswer]`` tags and every
    other square-bracketed tag (e.g. ``[#: 1]``) are removed; runs of
    whitespace are collapsed to single spaces and both ends are trimmed.

    Args:
        text: The raw answer string. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``''`` when ``text`` is ``None`` or empty.
    """
    # A null/empty answer has nothing to clean; returning early also avoids
    # the TypeError that the re functions raise when given None.
    if not text:
        return ''

    # Replace only the first [FinalAnswer] tag with a readable lead-in.
    text, count = re.subn(r'\[FinalAnswer\]', 'The correct answer is', text, count=1)

    # Drop repeated [FinalAnswer] tags (they can only exist if one was found).
    if count == 1:
        text = re.sub(r'\[FinalAnswer\]', '', text)

    # Remove all other tags enclosed in square brackets (e.g. [#: 1], [xyz]).
    text = re.sub(r'\[[^\]]+\]', '', text)

    # Collapse whitespace (including newlines) into single spaces and trim.
    return re.sub(r'\s+', ' ', text).strip()

In [32]:

# Scratch checks; only the value of the last expression is displayed by the cell.
# Captured "<letter>: <text>" answer of the last record (AttributeError if the pattern does not match).
re.search(answer_pattern, ann_data[-1]['llm_answer']).group(1)
# Strip the matched answer span, then any leftover [FinalAnswer] tag, from record 2.
re.sub(second_remain_pattern, '', re.sub(remain_pattern, '', ann_data[2]['llm_answer']))
# Full cleaning of record 2 via process_text.
process_text(ann_data[2]['llm_answer'])

"The question asks for a contraindication for the use of Gadavist. Among the given options, a history of severe hypersensitivity reactions to Gadavist would be the most critical contraindication, as it could lead to severe allergic reactions. The correct answer is B: 'History of severe hypersensitivity reactions to Gadavist'. This is because severe hypersensitivity reactions can be life-threatening and using Gadavist in such cases could exacerbate the condition, leading to severe allergic reactions."

True

In [52]:
# Option letter for record 42: first character of the captured answer, upper-cased.
# Raises AttributeError if the pattern does not match (re.search returns None).
re.search(answer_pattern, ann_data[42]['llm_answer']).group(1).strip()[0].upper()

'B'

In [54]:
# Reference answer letter of the first validation record, upper-cased.
val_data[0]['correct_answer'].upper() 

'A'

In [46]:
# NOTE(review): this compares the letter extracted from ann_data[42] with the
# correct answer of val_data[0]. The indices differ, so unless those two records
# share an id this is not a like-for-like check — confirm intent.
re.search(answer_pattern, ann_data[42]['llm_answer']).group(1).strip()[0] == val_data[0]['correct_answer']

False

In [55]:

# Number of annotated records loaded from the results file.
len(ann_data)

459

In [78]:

# Index the validation records by 'id' for constant-time lookup.
val_data_by_id = {entry['id']: entry for entry in val_data}

all_cleaned_data = []

for ann_entry in ann_data:
    entry_id = ann_entry['id']
    val_entry = val_data_by_id.get(entry_id)

    if val_entry is None:
        # No reference record for this id: report it and leave the entry out.
        print(f"Warning: id {entry_id} in ann_data not found in val_data")
        continue

    # Read the model output once. A missing key or a null value becomes ''
    # so the regex search and process_text below cannot fail on it.
    llm_answer = ann_entry.get('llm_answer') or ''

    match = re.search(answer_pattern, llm_answer)
    if match:
        simplified_answer = match.group(1).strip()
        # The captured text starts with the option letter; compare it
        # case-insensitively with the reference answer.
        is_correct = simplified_answer[0].upper() == val_entry.get('correct_answer', '').upper()
    else:
        # No "[FinalAnswer] <letter>: ..." pattern found: counted as incorrect.
        simplified_answer = "No answer"
        is_correct = False

    cleaned_data = {
        "id": entry_id,
        "question_type": val_entry.get('question_type', ''),
        "question": val_entry.get('question', ''),
        "full_answer": llm_answer,
        "simplified_answer": simplified_answer,
        "reasoning": process_text(llm_answer),
        "correct_or_not": is_correct,
    }

    all_cleaned_data.append(cleaned_data)

# Persist the cleaned records for the later stages of the notebook.
with open('data/curebench_validation_preprocessed.json', 'w', encoding='utf-8') as f:
    json.dump(all_cleaned_data, f, indent=4)

'No answer'

In [79]:
print("# correct answer, # incorrect answers")

# Split the record ids by their 'correct_or_not' flag.
correct_answered_ids = [x['id'] for x in all_cleaned_data if x['correct_or_not']]
incorrect_answered_ids = [x['id'] for x in all_cleaned_data if not x['correct_or_not']]

# Displayed as the cell result: (number correct, number incorrect).
len(correct_answered_ids), len(incorrect_answered_ids)


# correct answer, # incorrect answers


(335, 124)

In [80]:
print('# unanswered: ')
# Ids of the records whose simplified answer is the "No answer" placeholder.
unanswered_ids = [
    record['id']
    for record in all_cleaned_data
    if record['simplified_answer'] == "No answer"
]
len(unanswered_ids)

# unanswered: 


43

In [82]:
# Verify the claim before printing it: every id without an extracted answer
# must also be among the incorrect ids.
unanswered_id_set = set(unanswered_ids)
assert unanswered_id_set <= set(incorrect_answered_ids), "some unanswered ids are not marked incorrect"
print('All unanswered questions are incorrect')
# Keep the order of incorrect_answered_ids; the set makes each membership
# test constant-time instead of a scan over the whole list.
unanswered_questions = [x for x in incorrect_answered_ids if x in unanswered_id_set]
unanswered_questions[0:5]

All unanswered questions are incorrect


['Eic83tab2ldk',
 '0NpConxVPMFN',
 '7YXi98NhIrhP',
 '1hPNUhXOcSyQ',
 'RqM53KI1SHyE']

# Generate a clean fine-tuning dataset

In [83]:

# Read back the preprocessed records from the JSON file on disk.
with open('data/curebench_validation_preprocessed.json', 'r') as preprocessed_file:
    preprocessed_val_data = json.load(preprocessed_file)




In [84]:
# Inspect the first preprocessed record.
preprocessed_val_data[0]

{'id': 'U9PHZ83RKYV8',
 'question_type': 'multi_choice',
 'question': 'Which drug brand name is associated with the treatment of acne?',
 'full_answer': "The question asks for the drug brand name associated with the treatment of acne. Among the given options, 'Salicylic Acid' is commonly used in acne treatment products.\n\n[FinalAnswer] A: Salicylic Acid. Salicylic Acid is a well-known treatment for acne, often found in over-the-counter acne treatment products. It helps to exfoliate the skin and unclog pores, making it effective in managing acne symptoms. [#: 1]",
 'simplified_answer': 'A: Salicylic Acid',
 'reasoning': "The question asks for the drug brand name associated with the treatment of acne. Among the given options, 'Salicylic Acid' is commonly used in acne treatment products. The correct answer is A: Salicylic Acid. Salicylic Acid is a well-known treatment for acne, often found in over-the-counter acne treatment products. It helps to exfoliate the skin and unclog pores, makin

# Correcting the wrong answers