In [1]:
import json
from cns_obsidian.utils import load_variable_json
from cns_obsidian.utils import save_variable_json

In [2]:
import json

def assert_question_structure_mc(entry):
    """Validate the structure of a multiple-choice question entry.

    The entry's "question" value is stripped of newline characters and
    parsed as JSON. The parsed object must be a dict with string
    "question_stem", "discussion" and "correct_answer" fields and a list
    "answer_choices" of 4 to 9 strings labelled "A.", "B.", ... in order.
    "correct_answer" must occur as a substring of at least one choice.

    Args:
        entry: Mapping with a "question" key holding a JSON-encoded string.

    Raises:
        AssertionError: If the JSON cannot be parsed or any structural
            check fails; the message names the failed check.
    """
    try:
        question_data = json.loads(entry["question"].replace('\n', ''))
    except json.JSONDecodeError as exc:
        raise AssertionError("The 'question' field is not a valid JSON string") from exc

    assert isinstance(question_data, dict), "The 'question' field should contain a dictionary"

    required_question_fields = ["question_stem", "answer_choices", "discussion", "correct_answer"]
    for field in required_question_fields:
        assert field in question_data, f"Question data is missing the '{field}' field"

    assert isinstance(question_data["question_stem"], str), "'question_stem' should be a string"
    assert isinstance(question_data["answer_choices"], list), "'answer_choices' should be a list"
    assert isinstance(question_data["discussion"], str), "'discussion' should be a string"
    assert isinstance(question_data["correct_answer"], str), "'correct_answer' should be a string"

    # Check answer choices: the message now reports the range actually enforced.
    answer_choices = question_data["answer_choices"]
    num_choices = len(answer_choices)
    assert 4 <= num_choices <= 9, (
        f"There should be between 4 and 9 answer choices, got {num_choices}"
    )

    expected_letters = "ABCDEFGHI"[:num_choices]
    for i, choice in enumerate(answer_choices):
        # Guard the type first so a malformed choice fails as an AssertionError
        # (like every other check here) rather than an AttributeError.
        assert isinstance(choice, str), f"Answer choice {i+1} should be a string"
        assert choice.startswith(f"{expected_letters[i]}."), f"Answer choice {i+1} should start with '{expected_letters[i]}.'"

    # Substring match: the correct answer may be just the letter or the full choice text.
    assert any(question_data["correct_answer"] in choice for choice in answer_choices), "The correct answer should match one of the answer choices"

def assert_question_structure_ddx(entry):
    """Validate the structure of a differential-diagnosis question entry.

    The entry's "question" value is stripped of newline characters and
    parsed as JSON. The parsed object must be a dict with a string
    "one-liner" field and a "ddx" list of 5 to 11 strings.

    Args:
        entry: Mapping with a "question" key holding a JSON-encoded string.

    Raises:
        AssertionError: If the JSON cannot be parsed or any structural
            check fails; the message names the failed check.
    """
    try:
        question_data = json.loads(entry["question"].replace('\n', ''))
    except json.JSONDecodeError as exc:
        raise AssertionError("The 'question' field is not a valid JSON string") from exc

    assert isinstance(question_data, dict), "The 'question' field should contain a dictionary"

    required_question_fields = ["one-liner", "ddx"]
    for field in required_question_fields:
        assert field in question_data, f"Question data is missing the '{field}' field"

    assert isinstance(question_data["one-liner"], str), "'one-liner' should be a string"
    assert isinstance(question_data["ddx"], list), "'ddx' should be a list"

    # Check differential diagnosis: the message now reports the range actually enforced.
    num_items = len(question_data["ddx"])
    assert 5 <= num_items <= 11, (
        f"There should be between 5 and 11 items in the differential diagnosis, got {num_items}"
    )

    for item in question_data["ddx"]:
        assert isinstance(item, str), "Each item in the differential diagnosis should be a string"

def assert_question_structure_ift(json_entry):
    """Validate the structure of a question/answer-list entry.

    The entry's "question" value is stripped of newline characters and
    parsed as JSON. The result must be a list whose items are dicts, each
    carrying string-valued "question" and "answer" keys.

    Args:
        json_entry: Mapping with a "question" key holding a JSON-encoded string.

    Raises:
        AssertionError: If the JSON cannot be parsed or any structural
            check fails; the message names the failed check.
    """
    flattened = json_entry["question"].replace('\n', '')
    try:
        qa_pairs = json.loads(flattened)
    except json.JSONDecodeError:
        raise AssertionError("The 'question' field is not a valid JSON string")

    assert isinstance(qa_pairs, list), "The 'question' field should contain a list"

    # Key-presence checks run for both keys before any value-type check.
    presence_checks = (
        ("question", "Each question dict should have a 'question' key"),
        ("answer", "Each question dict should have an 'answer' key"),
    )
    type_checks = (
        ("question", "The 'question' value should be a string"),
        ("answer", "The 'answer' value should be a string"),
    )

    for pair in qa_pairs:
        assert isinstance(pair, dict), "Each item in the question list should be a dictionary"
        for key, message in presence_checks:
            assert key in pair, message
        for key, message in type_checks:
            assert isinstance(pair[key], str), message


In [None]:
# Dataset selection: `journal` picks the sub-directory and `typ` picks the
# dataset file; the validation cell below branches on `typ` being "ddx",
# "mc" or "ift".
journal = "Neurosurgery"
typ = "ddx"

# NOTE(review): absolute cluster path; consider a configurable data root.
data = load_variable_json(
    f"/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/{journal}/dataset_claude_{typ}.json"
)

In [None]:
# Validate every entry of the selected dataset and split the entries into
# those that pass and those that fail (invalid JSON or failed structure check).
data = load_variable_json(f"/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/{journal}/dataset_claude_{typ}.json")

# Lists to store successful and failed entries
successful_entries = []
failed_entries = []

for point in data:
    try:
        # Parse here first so malformed JSON surfaces as JSONDecodeError (with
        # position info); the assert_* helpers convert it to AssertionError.
        json.loads(point['question'].replace('\n', ''))

        # Validate based on the type
        if typ == "ddx":
            assert_question_structure_ddx(point)
        elif typ == "mc":
            assert_question_structure_mc(point)
        elif typ == "ift":
            assert_question_structure_ift(point)

        # If no exception occurs, add to successful entries
        successful_entries.append(point)

    except json.JSONDecodeError as exc:
        # Show the text from the failure offset onward. `exc.pos` is the index
        # into the parsed (newline-stripped) string, so slice that same string.
        print(f"Error in custom_id: {point['custom_id']}")
        to_print = point['question'].replace('\n', '')[exc.pos:]
        print("Problematic part of the question: {}\n\n\n".format(to_print))

        # Add failed entry to the failed_entries list
        failed_entries.append(point)

    except Exception as exc:
        # Handle other exceptions (such as failed assertions)
        print(f"Exception: {exc}")
        to_print = point['question'].replace('\n', '')
        print(f"Question: {to_print}")

        # Add to the failed entries (previously missing, so structurally
        # invalid entries were dropped from both lists and the counts below).
        failed_entries.append(point)

print(len(successful_entries), len(failed_entries))
