In [1]:
import os
from pathlib import Path

# Relative path to the MedQuAD Q/A corpus text file that is parsed later in the notebook.
input_txt = "../docs/medquad_medquad_qa_corpus.txt"

In [2]:
# -- 2. Load MedQuAD Corpus (TXT) --
def parse_medquad_txt(txt_path):
    """
    Parse MedQuAD Q/A pairs from a plain-text file.

    Expected layout (the blank separator line is optional):
        Q: <question>
        A: <answer>
        <blank line>

    A pair is recorded only when a line starting with "Q:" is immediately
    followed by a line starting with "A:". A "Q:" line without such an
    answer is skipped on its own, so it cannot hide the pairs that follow it.
    Only the text on the "A:" line itself is kept as the answer; any
    continuation lines after it are ignored.

    Args:
        txt_path: Path of the UTF-8 encoded text file to read.

    Returns:
        List of dicts with 'question' and 'answer' keys, in file order.
        An empty list if the file contains no pairs.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    qa_pairs = []
    with open(txt_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    i = 0
    while i < len(lines):
        has_answer = (
            lines[i].startswith("Q:")
            and i + 1 < len(lines)
            and lines[i + 1].startswith("A:")
        )
        if has_answer:
            # Drop the two-character "Q:" / "A:" marker and surrounding whitespace.
            q = lines[i][2:].strip()
            a = lines[i + 1][2:].strip()
            qa_pairs.append({"question": q, "answer": a})
            # Consume exactly the two lines of the pair; a blank separator,
            # if present, is skipped by the branch below.
            i += 2
        else:
            # Blank line, stray text, or a question with no answer: move on
            # one line so the next "Q:" line is never jumped over.
            i += 1
    return qa_pairs

In [3]:
# Parse the corpus file into a list of {'question', 'answer'} dicts and report how many pairs were found.
qa_pairs = parse_medquad_txt(input_txt)
print(f"Loaded {len(qa_pairs)} Q/A pairs.")

Loaded 16407 Q/A pairs.


In [6]:
import pandas as pd
# Build a DataFrame from the parsed pairs (one row per pair, with 'question' and 'answer' columns).
df = pd.DataFrame(qa_pairs)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,question,answer
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points - Adult acute lymphoblastic leukemi...
1,What are the symptoms of Adult Acute Lymphobla...,"Signs and symptoms of adult ALL include fever,..."
2,How to diagnose Adult Acute Lymphoblastic Leuk...,Tests that examine the blood and bone marrow a...
3,What is the outlook for Adult Acute Lymphoblas...,Certain factors affect prognosis (chance of re...
4,Who is at risk for Adult Acute Lymphoblastic L...,Previous chemotherapy and exposure to radiatio...
...,...,...
16402,What is (are) Parasites - Zoonotic Hookworm ?,"There are many different species of hookworms,..."
16403,Who is at risk for Parasites - Zoonotic Hookwo...,Dog and cat hookworms are found throughout the...
16404,How to diagnose Parasites - Zoonotic Hookworm ?,Cutaneous larva migrans (CLM) is a clinical di...
16405,What are the treatments for Parasites - Zoonot...,The zoonotic hookworm larvae that cause cutane...


In [7]:
# Preview five questions; the fixed seed keeps the selection reproducible.
question_preview = df["question"].sample(5, random_state=42)
print("Sample questions:")
for question_text in question_preview:
    print("-", question_text)

# Same seed on the same-length column, so these rows match the questions above.
answer_preview = df["answer"].sample(5, random_state=42)
print("Sample answers:")
for answer_text in answer_preview:
    # Show only the first 120 characters of each answer.
    print("-", answer_text[:120], "...")


Sample questions:
- What is (are) Linear porokeratosis ?
- What are the treatments for Gum (Periodontal) Disease ?
- What are the symptoms of Nystagmus, congenital motor, autosomal recessive ?
- How many people are affected by pseudohypoaldosteronism type 1 ?
- What is (are) Adult Central Nervous System Tumors ?
Sample answers:
- Linear porokeratosis is a skin condition that most often begins in infancy or early childhood, but it can occur at any a ...
- Controlling the Infection The main goal of treatment is to control the infection. The number and types of treatment will ...
- What are the signs and symptoms of Nystagmus, congenital motor, autosomal recessive? The Human Phenotype Ontology provid ...
- PHA1 is a rare condition that has been estimated to affect 1 in 80,000 newborns. ...
- Key Points - An adult central nervous system tumor is a disease in which abnormal cells form in the tissues of the brain ...
