### Data extraction

In [3]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET


def _child_text(parent, path, default=""):
    """Return the text of the first element under *parent* matching *path*.

    *default* is returned when the element is absent or carries no text.
    """
    node = parent.find(path)
    if node is None or not node.text:
        return default
    return node.text


def extract_data(files):
    """Flatten question/answer XML files into one row per answer.

    Each file is parsed and every ``NLM-QUESTION`` directly under the root
    is walked down through ``SUB-QUESTIONS/SUB-QUESTION`` to
    ``ANSWERS/ANSWER``. One row is produced for each answer element.

    Args:
        files: Iterable of paths (or file objects) accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Q_id``, ``message``,
        ``sub_Q_id``, ``focus``, ``type``, ``A_id`` and ``answer``. Rows whose
        question has no message text are dropped; the surviving rows keep
        their original positional index. The columns are present even when
        no rows were extracted.

    Raises:
        OSError: If a file cannot be opened.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    columns = ["Q_id", "message", "sub_Q_id", "focus", "type", "A_id", "answer"]
    rows = []

    for path in files:
        root = ET.parse(path).getroot()

        for question in root.findall("NLM-QUESTION"):
            # The id attribute is looked up under either of two names.
            question_id = question.get("questionid") or question.get("qid")
            # A missing or empty MESSAGE becomes None (not "") so that the
            # dropna() below discards the question instead of crashing here.
            message = _child_text(question, "MESSAGE", default=None)

            for sub_question in question.findall("SUB-QUESTIONS/SUB-QUESTION"):
                sub_question_id = sub_question.get("subqid")
                focus = _child_text(sub_question, "ANNOTATIONS/FOCUS")
                question_type = _child_text(sub_question, "ANNOTATIONS/TYPE")

                for answer in sub_question.findall("ANSWERS/ANSWER"):
                    answer_text = answer.text if answer.text is not None else ""
                    rows.append({
                        "Q_id": question_id,
                        "message": message,
                        "sub_Q_id": sub_question_id,
                        "focus": focus,
                        "type": question_type,
                        "A_id": answer.get("answerid"),
                        "answer": answer_text,
                    })

    # Passing the columns explicitly keeps the schema stable for empty input,
    # which would otherwise make the column-based dropna() raise KeyError.
    # Every answer is a plain string, so there is nothing list-like to explode.
    df = pd.DataFrame(rows, columns=columns)

    return df.dropna(subset=["message"])

In [5]:
# Parse both training XML files into a single DataFrame.
df = extract_data(
    [
        "TRAIN DATASETS/TREC-2017-LiveQA-Medical-Train-1.xml",
        "TRAIN DATASETS/TREC-2017-LiveQA-Medical-Train-2.xml",
    ]
)

In [7]:
# Copy of the extracted data without the three identifier columns.
df_no_id = df.drop(columns=["Q_id", "sub_Q_id", "A_id"])

##### CSV

In [10]:
import os

# Create the output directory if it does not exist. exist_ok avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs("data", exist_ok=True)

# Write both variants (with and without identifier columns) as CSV.
df.to_csv("data/data.csv", index=False)
df_no_id.to_csv("data/data_no_id.csv", index=False)

##### JSON

In [17]:
import csv
import json

def csv_to_json(csv_file, json_file):
    """Convert a CSV file with a header row into a JSON array of objects.

    Each data row becomes one JSON object keyed by the header names; all
    values are kept as the strings read from the CSV.

    Args:
        csv_file: Path of the UTF-8 encoded CSV file to read.
        json_file: Path of the JSON file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
    """
    # newline="" lets the csv module do its own line handling, so line breaks
    # embedded in quoted fields are not rewritten by universal-newline mode.
    with open(csv_file, "r", encoding="utf-8", newline="") as csvfile:
        data = list(csv.DictReader(csvfile))

    # Explicit encoding so the output does not depend on the platform default.
    with open(json_file, "w", encoding="utf-8") as jsonfile:
        json.dump(data, jsonfile, indent=4)

# Produce a JSON counterpart for each CSV export.
for csv_path, json_path in (
    ("data/data.csv", "data/data.json"),
    ("data/data_no_id.csv", "data/data_no_id.json"),
):
    csv_to_json(csv_path, json_path)