In [1]:
import ast
import json
import re
import xml.etree.ElementTree as ET

import pandas as pd

In [2]:
# Case-insensitive heuristic that flags questions phrased in causal terms.
#
# The whole alternation is anchored with a leading \b so that each keyword can
# only match at the start of a word. Without it, "defects" matched "effects",
# "unrelated to" matched "related to", and any word ending in "if" could open
# the "if ... then" branch. A side effect of the anchoring is that "because" no
# longer matches through its embedded "cause".
#
# The trailing comment on each alternative is an example question it matches.
causal_regex = re.compile(
    r"\b(?:"
    r"why\b"                                    # "Why does prolonged stress contribute to heart disease?"
    r"|causes?\b"                               # "What causes insulin resistance in type 2 diabetes?"
    r"|how\s+(?:come|did)\b"                    # "How did genetic mutations lead to this rare disorder?"
    r"|effects?\b|affects?\b"                   # "How does hypertension affect kidney function?"
    r"|leads?\s+to\b"                           # "What does chronic inflammation lead to in autoimmune diseases?"
    r"|what\s+(?:will|might)?\s*happen(?:s)?\b" # "What happens if a stroke is left untreated?"
    r"|what\s+(?:to\s+do|should\s+be\s+done)\b" # "What should be done for a patient with sepsis?"
    r"|impacts?\b"                              # "What is the impact of obesity on cardiovascular health?"
    r"|consequences?\b"                         # "What are the consequences of delayed cancer diagnosis?"
    r"|outcomes?\b"                             # "What are the outcomes of untreated gestational diabetes?"
    r"|results?\s+(?:from|in)\b"                # "What results from prolonged high cholesterol levels?"
    r"|influence(?:s)?\b"                       # "How does smoking influence the progression of COPD?"
    r"|implications?\b"                         # "What are the implications of genetic predisposition to Alzheimer's?"
    r"|associated\s+with\b"                     # "What complications are associated with uncontrolled diabetes?"
    r"|linked\s+to\b"                           # "What risk factors are linked to cardiovascular disease?"
    r"|related\s+to\b"                          # "What symptoms are related to liver failure?"
    r"|if\s+.+then\b"                           # "If the patient has high BP, then what will occur?"
    r"|when\s+.+happens\b"                      # "When hypoglycemia happens, what is the first sign?"
    r"|could\s+.+cause\b"                       # "Could anemia cause heart palpitations?"
    r"|what\s+is\s+the\s+reason\b"              # "What is the reason behind sudden weight loss?"
    r"|reason\s+for\b"                          # "What is the reason for high creatinine levels?"
    r")",
    re.IGNORECASE,
)


In [None]:
def contains_keyword(message, keywords):
    """Return True if ``message`` contains any of ``keywords`` as a whole term.

    Matching is case-insensitive and each keyword is treated literally
    (regex metacharacters are escaped).

    Args:
        message: Text to search.
        keywords: Iterable of keyword strings. Empty strings are ignored.

    Returns:
        bool: True when at least one keyword occurs in ``message`` without a
        word character directly before or after it; False otherwise,
        including when no usable keyword is supplied.
    """
    alternatives = [re.escape(keyword) for keyword in keywords if keyword]
    if not alternatives:
        # An empty alternation would match at every word boundary and report
        # a hit for almost any text.
        return False
    # Lookarounds instead of \b: identical for keywords that start and end with
    # a word character, and they also work for keywords such as "C++" whose
    # edge is a non-word character (where \b can never match before a space).
    pattern = re.compile(
        r'(?<!\w)(?:' + '|'.join(alternatives) + r')(?!\w)',
        re.IGNORECASE,
    )
    return bool(pattern.search(message))

def contains_causal_relation(message):
    """Return True if ``message`` matches the module-level ``causal_regex``.

    Args:
        message: Question text. Non-string values (for example the float NaN
            that pandas uses for an empty CSV cell) are treated as non-causal
            instead of raising ``TypeError`` inside the regex search.

    Returns:
        bool: True when ``causal_regex`` finds a match anywhere in ``message``.
    """
    if not isinstance(message, str):
        return False
    return bool(causal_regex.search(message))


def format_options(option_dict):
    """Return the options as a dict-style string without the 'correct answer' entry.

    The input is parsed as a Python literal when possible, so the entry is
    removed regardless of where it sits in the dict or how its value is quoted.
    A value containing an apostrophe is double-quoted in a dict repr; a
    single-quote-only pattern would miss it and leave the answer in the output.

    Args:
        option_dict: A dict of options, or its string representation.

    Returns:
        str: ``str()`` of the options with the 'correct answer' key removed.
        Input that has no such key is returned as ``str(option_dict)``
        unchanged. Input that cannot be parsed as a dict literal is cleaned
        with a regex fallback.
    """
    answer_key = 'correct answer'
    option_str = str(option_dict)

    if isinstance(option_dict, dict):
        parsed = option_dict
    else:
        try:
            parsed = ast.literal_eval(option_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a pure literal (e.g. contains a bare nan); use the regex path.
            parsed = None

    if isinstance(parsed, dict):
        if answer_key not in parsed:
            return option_str
        return str({key: value for key, value in parsed.items() if key != answer_key})

    # Fallback: strip the key-value pair textually, accepting either quote
    # style for the value and allowing backslash-escaped quotes inside it.
    option_str = re.sub(
        r"""'correct answer':\s*(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")""",
        "",
        option_str,
    )
    # Clean up the separators left behind by the removal (middle, first, last).
    option_str = re.sub(r",\s*,", ",", option_str)
    option_str = re.sub(r"^\{\s*,\s*", "{", option_str)
    option_str = re.sub(r",\s*}$", "}", option_str)
    return option_str


In [None]:
# Read the source CSV, then work on a separate copy so `idf` keeps the data as loaded.
idf = pd.read_csv('data/reasoning_FCT_reasoning_FCT.csv')
dataset = idf.copy(deep=True)
# Report the number of rows that were read.
print(dataset.shape[0])


18866


In [8]:
# Split the rows of `dataset` into two record lists: questions that match the
# causal heuristic and all remaining ("generic") questions.
data_causal = []
data_generic = []
for qid, row in dataset.iterrows():
    question = row['question']
    # Options with the 'correct answer' entry stripped out.
    formatted_options = format_options(row['options'])
    # NOTE(review): this prints one line per row of the dataset; consider
    # previewing only a handful of rows if the output gets too long.
    print(formatted_options)
    record = {
        'id': qid,  # the DataFrame index label doubles as the question id
        'question': question,
        'options': formatted_options,
        'correct_answer': row['correct_answer'],
        'correct_index': row['correct_index'],
    }
    bucket = data_causal if contains_causal_relation(question) else data_generic
    bucket.append(record)

# Build one DataFrame per category from the collected records.
df_causal = pd.DataFrame(data_causal)
df_gen = pd.DataFrame(data_generic)

{'0': 'They are anilides with propyl group in ortho.', '1': 'They are benzamides with methoxy group in ortho.', '2': 'They are benzenesulfonamides with a methyl group in ortho.', '3': 'They are ortho-halogenated derivatives of phenothiazine.'}
{'0': 'Young children (Early Childhood Education and first courses of Primary Education).', '1': 'Older children (last courses of Primary Education).', '2': 'Preadolescents (first courses of Secondary Education).', '3': 'Adolescents (last years of Secondary Education and Bachillerato).'}
{'0': '＜130/80 mmHg', '1': '150-160/90-95 mmHg', '2': '＜140/90 mmHg', '3': '＜140/85 mmHg'}
{'0': 'Cause less gingival shrinking', '1': 'Cause Less scars', '2': 'Less intra-operative bleeding', '3': 'None of the above'}
{'0': 'Bicarbonate', '1': 'Albumin', '2': 'Phosphate', '3': 'Ammonia'}
{'0': 'Uterine prolapsed', '1': 'Stress urinary incontinence', '2': 'Vesicovaginal fistula', '3': 'Uteric fistula'}
{'0': 'Hydrolysis of glucocerebroside', '1': 'DNA interstrand

In [None]:
# Write each split to its own CSV file (generic first, then causal).
for frame, out_path in (
    (df_gen, "path_to_save_generic_question.csv"),
    (df_causal, "path_to_save_causal_question.csv"),
):
    frame.to_csv(out_path)