In [1]:
import os
import json
import xml.etree.ElementTree as ET

from pathlib import Path

def extract_qa_from_xml(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    doc_id = root.attrib.get("id")
    source = root.attrib.get("source")
    url = root.attrib.get("url")

    qa_list = []
    for qapair in root.findall(".//QAPair"):
        question = qapair.find("Question").text.strip()
        answer = qapair.find("Answer").text.strip()
        qa_list.append({
            "doc_id": doc_id,
            "source": source,
            "url": url,
            "question": question,
            "answer": answer
        })
    
    return qa_list

def process_all_xml_to_jsonl(input_dir, output_path):
    input_path = Path(input_dir)
    print(input_path)
    all_qa_pairs = []

    for xml_file in input_path.glob("*.xml"):
        try:
            qa_pairs = extract_qa_from_xml(xml_file)
            all_qa_pairs.extend(qa_pairs)
        except Exception as e:
            print(f"Failed to process {xml_file}: {e}")

    with open(output_path, "w", encoding="utf-8") as f_out:
        for qa in all_qa_pairs:
            json_line = json.dumps(qa, ensure_ascii=False)
            f_out.write(json_line + "\n")

    print(f"Done. Extracted {len(all_qa_pairs)} QA pairs to {output_path}")


input_xml_folder = "./2_GARD_QA"  
output_jsonl_file = "qa_pairs.jsonl"
process_all_xml_to_jsonl(input_xml_folder, output_jsonl_file)


2_GARD_QA
Failed to process 2_GARD_QA\0002079.xml: 'NoneType' object has no attribute 'strip'
Failed to process 2_GARD_QA\0002080.xml: 'NoneType' object has no attribute 'strip'
Failed to process 2_GARD_QA\0002253.xml: 'NoneType' object has no attribute 'strip'
Failed to process 2_GARD_QA\0002747.xml: 'NoneType' object has no attribute 'strip'
Failed to process 2_GARD_QA\0006509.xml: 'NoneType' object has no attribute 'strip'
Done. Extracted 5384 QA pairs to qa_pairs.jsonl


In [2]:
import json

def convert_to_chat_format(input_jsonl, output_jsonl):
    with open(input_jsonl, "r", encoding="utf-8") as fin, \
         open(output_jsonl, "w", encoding="utf-8") as fout:

        for line in fin:
            entry = json.loads(line)
            question = entry["question"].strip()
            answer = entry["answer"].strip()

            chat_obj = {
                "messages": [
                    {"role": "system", "content": "You are a helpful medical assistant."},
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": answer}
                ]
            }

            fout.write(json.dumps(chat_obj, ensure_ascii=False) + "\n")

    print(f"Converted chat-style entries written to {output_jsonl}")


input_file = "qa_pairs.jsonl"
output_file = "gard_qa_prompts.jsonl"
convert_to_chat_format(input_file, output_file)


Converted chat-style entries written to gard_qa_prompts.jsonl
