In [50]:
import json
import os
import xmltodict
import re
import time
import pandas as pd
from jsonpath import jsonpath
import glob


In [51]:

# Initialize a dictionary to store the data
data = {
    "Questions": [],
    "Answers": [],
}

# Function to process an XML file
def processXmlFile(completePath):
    # Open the XML file
    with open(completePath) as f:
        # Read the contents of the file
        xmlstring = f.read()

        try:
            # Parse the XML string into a dictionary using xmltodict library
            dataDict = xmltodict.parse(xmlstring, xml_attribs=False)
            
            # Extract the QAPair and Focus information from the dictionary
            listOfQA = json.loads(json.dumps(jsonpath(dataDict, '$..' + "QAPair")[0]))
        except Exception as e:
            # Handle exceptions, such as empty QAPair or Focus
            return

        # Check if there is only a single QA pair, and convert it to a list if needed
        if isinstance(listOfQA, dict):
            listOfQA = [listOfQA]
        
        # Process each QA pair
        for qaPair in listOfQA:
            try:
                # Clean up the answer text
                x = re.sub(' +', ' ', qaPair['Answer'])
                x = re.sub('Key Points', "", x)
                x = x.replace("\n", "").replace("-", "")
                
                # Append the processed data to the data dictionary
                data['Answers'].append(x)
                data['Questions'].append(qaPair['Question'])
            except:
                # Handle any exceptions that occur during processing
                return

In [52]:
# List of folders with empty answers
foldersWithEmptyAnswers = [
    "10_MPlus_ADAM_QA",
    "11_MPlusDrugs_QA",
    "12_MPlusHerbsSupplements_QA",
    "readme.txt",  # As it does not contain any QAs
    "QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip",  # Will use it later,
    "ProcessedData.csv"
]

# Base path for the folders
BASE_PATH = "MedQuAD"

# Iterate over the folders in the base path
for folder in os.listdir(BASE_PATH):
    print(folder)
    # Check if the folder is in the list of folders with empty answers
    if folder in foldersWithEmptyAnswers:
        # If the folder is in the list, skip it and continue with the next folder
        continue
    else:
        # If the folder is not in the list, process it
        print("Processing folder:", folder)
        start = time.time()

        # Iterate over the XML files in the current folder
        for xmlFileName in os.listdir(os.path.join(BASE_PATH, folder)):
            completePath = os.path.join(BASE_PATH, folder, xmlFileName)
            
            # Process the XML file
            processXmlFile(completePath)

        print("Took", time.time() - start)


12_MPlusHerbsSupplements_QA
8_NHLBI_QA_XML
Processing folder: 8_NHLBI_QA_XML
Took 0.35936927795410156
9_CDC_QA
Processing folder: 9_CDC_QA
Took 0.09395003318786621
7_SeniorHealth_QA
Processing folder: 7_SeniorHealth_QA
Took 0.1974344253540039
2_GARD_QA
Processing folder: 2_GARD_QA
Took 5.483072280883789
1_CancerGov_QA
Processing folder: 1_CancerGov_QA
Took 0.3397035598754883
11_MPlusDrugs_QA
4_MPlus_Health_Topics_QA
Processing folder: 4_MPlus_Health_Topics_QA
Took 0.6619377136230469
6_NINDS_QA
Processing folder: 6_NINDS_QA
Took 0.2737085819244385
5_NIDDK_QA
Processing folder: 5_NIDDK_QA
Took 0.34223008155822754
10_MPlus_ADAM_QA
QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip
readme.txt
3_GHR_QA
Processing folder: 3_GHR_QA
Took 1.517181634902954


In [53]:
df_medquad = pd.DataFrame(data)
df_medquad.head()

Unnamed: 0,Questions,Answers
0,What is (are) Hypersensitivity Pneumonitis ?,"Hypersensitivity pneumonitis (noomoNItis), or ..."
1,What causes Hypersensitivity Pneumonitis ?,Repeatedly breathing in foreign substances can...
2,Who is at risk for Hypersensitivity Pneumoniti...,People who repeatedly breathe in foreign subst...
3,What are the symptoms of Hypersensitivity Pneu...,Signs and symptoms of hypersensitivity pneumon...
4,How to diagnose Hypersensitivity Pneumonitis ?,"To diagnose hypersensitivity pneumonitis (HP),..."


In [54]:
file_pattern = 'medical-question-answer-data/*.json'

# Create empty lists to store the questions and answers
questions = []
answers = []

# Iterate over the JSON files
for file_path in glob.glob(file_pattern):
    with open(file_path, 'r') as json_file:
        data = json.load(json_file)
    
    for item in data:
        question = item['question']
        answer = item['answer']
        
        questions.append(question)
        answers.append(answer)

# Create a DataFrame using the lists of questions and answers
df_iclinic = pd.DataFrame({'Questions': questions, 'Answers': answers})

In [55]:
df_iclinic.head()

Unnamed: 0,Questions,Answers
0,my 5 1/2-year-old son displays adhd symptoms f...,adhd and bipolar mood disorder (bmd) can coexi...
1,my son has add and mild autism. he has been su...,stimulants in general tend to decrease appetit...
2,my son is 13 and is depressed. he has been tak...,while any of the stimulant medications can inc...
3,my 17-year-old has stopped taking concerta aft...,seventy percent of teens diagnosed when they a...
4,i've been taking respa-ar for allergies. i can...,try claritin-d which is located behind the pha...


In [56]:
df_iclinic.shape, df_medquad.shape

((29752, 2), (16402, 2))

In [57]:
final_medical_dataset = pd.concat([df_iclinic, df_medquad], axis=0, join='outer', ignore_index=True)


In [58]:
final_medical_dataset.shape

(46154, 2)

In [59]:
final_medical_dataset.head()

Unnamed: 0,Questions,Answers
0,my 5 1/2-year-old son displays adhd symptoms f...,adhd and bipolar mood disorder (bmd) can coexi...
1,my son has add and mild autism. he has been su...,stimulants in general tend to decrease appetit...
2,my son is 13 and is depressed. he has been tak...,while any of the stimulant medications can inc...
3,my 17-year-old has stopped taking concerta aft...,seventy percent of teens diagnosed when they a...
4,i've been taking respa-ar for allergies. i can...,try claritin-d which is located behind the pha...


In [60]:

final_medical_dataset.reset_index(drop=True, inplace=True)

# Save the DataFrame as a CSV file
final_medical_dataset.to_csv('final_medical_dataset.csv')

In [61]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download the required resources for tokenization
nltk.download('punkt')

# Assuming you have already loaded the DataFrame from CSV or other sources

# Function to count the number of words/tokens in a text
def count_words(text):
    tokens = word_tokenize(text)  # Tokenize the text into words
    return len(tokens)

# Apply the count_words function to each answer in the DataFrame
final_medical_dataset['Word Count'] = final_medical_dataset['Answers'].apply(lambda x: count_words(str(x)))

# Calculate the maximum, minimum, mean, and median word counts
max_word_count = final_medical_dataset['Word Count'].max()
min_word_count = final_medical_dataset['Word Count'].min()
mean_word_count = final_medical_dataset['Word Count'].mean()
median_word_count = final_medical_dataset['Word Count'].median()

# Print the results
print("Maximum Word Count:", max_word_count)
print("Minimum Word Count:", min_word_count)
print("Mean Word Count:", mean_word_count)
print("Median Word Count:", median_word_count)

[nltk_data] Downloading package punkt to /home/tesla/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Maximum Word Count: 4705
Minimum Word Count: 1
Mean Word Count: 143.6153096156346
Median Word Count: 86.0
