In [150]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

from collections import defaultdict

In [158]:
# First we collect all the files that are there
manifest_dirs = os.listdir("project2/data/raw/clinical")
xml_files = []

for dir in manifest_dirs:
    dir_files = os.listdir("project2/data/raw/clinical/" + dir)
    xml_file = None
    for file in dir_files:
        if file.endswith(".xml"):
            xml_file = file

    if xml_file is None:
        print(f"ERROR: no XML file in data dir {dir}")

    xml_files.append((dir, xml_file))

print(f"Found {len(xml_files)} XML files for the metadata")

Found 11141 XML files for the metadata


In [None]:
patients = {}
missing_col_counts = defaultdict(int)

for dir, file in xml_files:
    try:
        tree = ET.parse("project2/data/raw/clinical/" + dir + "/" + file)
    except:
        print(f"Unable to parse {dir}/{file}")
        continue

    patient = tree.getroot()[-1]
    assert patient.tag.endswith("patient")

    patient_dict = {}

    for column in patient:
        tag = column.tag.split("}")[-1]
        value = column.text

        # Some values are lists, so we need to handle them differently
        if tag in ['race_list', 
                   'metastatic_site_list', 'relation_testicular_cancer_list', 
                   'postoperative_tx_list']:
            values = []
            for v in column:
                values.append(v.text)

            if values == [None]:
                value = pd.NA
            else:
                value = ", ".join(values)

        if tag == "blood_relative_cancer_history_list":
            relatives = []
            for relative in column:
                d = {}
                for c in column[0]:
                    subtag = c.tag.split("}")[-1]
                    subvalue = c.text
                    d[subtag] = subvalue

                type_col = "cancer_diagnosis_cancer_type_icd9_text_name"
                if not type_col in d.keys():
                    type_col = "relative_family_cancer_hx_text"
                if not type_col in d.keys():
                    type_col = "family_history_cancer_type_other"
                if not type_col in d.keys():
                    type_col = "family_cancer_type_txt"

                try:
                    comb = f'{d["family_member_relationship_type"]}:{d[type_col]}'
                except:
                    print(f"Unable to parse family history: {d}")
                if comb is not None and comb != "None:None" and comb not in relatives:
                    relatives.append(comb)

            if len(relatives) > 1:
                print(relatives)

            if relatives == []:
                value = pd.NA
            else:
                value = ", ".join(relatives)

        # TODO: other columns that have a weird format, but could be included as well are:
        missing_columns = [
            "stage event",
            "new_tumor_events",
            "drugs",
            "radiations",
            "follow_ups",
            "history_hepato_carcinoma_risk_factors",
            "loss_expression_of_mismatch_repair_proteins_by_ihc_results",
            "antireflux_treatment_types",
            "sites_of_primary_melanomas",
            "viral_hepatitis_serologies",
            "prior_systemic_therapy_types",
            "anatomic_neoplasm_subdivisions",
            "first_nonlymph_node_metastasis_anatomic_sites",
            "patient_history_immune_system_and_related_disorders_names",
            "lymph_node_location_positive_pathology_names",
            "fdg_or_ct_pet_performed_outcomes",
            "diagnostic_mri_result_outcomes",
            "diagnostic_ct_result_outcomes",
            "human_papillomavirus_types",
            "treatment",
            "relation_testicular_cancer_list",
            "postoperative_tx_list",
        ]

        if tag in missing_columns:
            nonempty = False
            try:
                for c in column:
                    nonempty = True
            except:
                pass

            if nonempty:
                missing_col_counts[tag] += 1
            value = pd.NA

        # Sometimes there's a random newline in the value
        if value is not None and isinstance(value, str):
            value = value.strip()

        patient_dict[tag] = value
    
    patients[patient_dict["patient_id"]] = patient_dict

print(f"Parsed metadata for {len(patients)} patients")

print("\nThese columns are not correctly parsed because they need manual intervention\n" + \
      "     (amount of samples that have data here between brackets):")
for col, count in missing_col_counts.items():
    print(f"- {col}: {count}")

# Make it into a dataframe
df = pd.DataFrame(patients).T

Parsed metadata for 11141 patients

These columns are not correctly parsed because they need manual intervention
(amount of samples that have data here between brackets):
- new_tumor_events: 10941
- drugs: 4322
- radiations: 3239
- follow_ups: 9599
- anatomic_neoplasm_subdivisions: 1095
- first_nonlymph_node_metastasis_anatomic_sites: 1095
- loss_expression_of_mismatch_repair_proteins_by_ihc_results: 626
- sites_of_primary_melanomas: 468
- prior_systemic_therapy_types: 468
- history_hepato_carcinoma_risk_factors: 425
- viral_hepatitis_serologies: 377
- antireflux_treatment_types: 627
- relation_testicular_cancer_list: 134
- postoperative_tx_list: 134
- patient_history_immune_system_and_related_disorders_names: 307
- lymph_node_location_positive_pathology_names: 307
- human_papillomavirus_types: 307
- treatment: 307
- fdg_or_ct_pet_performed_outcomes: 34
- diagnostic_ct_result_outcomes: 64
- diagnostic_mri_result_outcomes: 14


In [161]:
df

Unnamed: 0,additional_studies,tumor_tissue_site,histological_type,other_dx,gender,vital_status,days_to_birth,days_to_last_known_alive,days_to_death,days_to_last_followup,...,histologic_grading_tier_category,maximum_tumor_dimension,alcoholic_exposure_category,history_of_diabetes,days_to_diabetes_onset,history_of_chronic_pancreatitis,days_to_pancreatitis_onset,relative_cancer_types,history_of_disease,tests_performed
6561,,Lung,Lung Squamous Cell Carcinoma- Not Otherwise Sp...,No,MALE,Alive,-24234,,,24,...,,,,,,,,,,
A0CT,,Breast,Infiltrating Ductal Carcinoma,No,FEMALE,Alive,-25959,,,1918,...,,,,,,,,,,
A41J,,Thyroid,Thyroid Papillary Carcinoma - Classical/usual,No,FEMALE,Alive,-10411,,,477,...,,,,,,,,,,
4916,,Kidney,Kidney Clear Cell Renal Carcinoma,No,FEMALE,Alive,-25470,,,1373,...,,,,,,,,,,
A45H,,Thyroid,Thyroid Papillary Carcinoma - Classical/usual,No,MALE,Alive,-16541,,,238,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A8PF,,Stomach,"Stomach, Intestinal Adenocarcinoma, Tubular Type",No,MALE,Dead,-28116,,76,,...,,,,,,,,,,
5715,,Lung,Lung Adenocarcinoma Mixed Subtype,No,FEMALE,Alive,-25383,,,62,...,,,,,,,,,,
A28C,,Colon,Colon Adenocarcinoma,No,MALE,Dead,-27073,,2475,2475,...,,,,,,,,,,
A1IA,,,,No,FEMALE,Alive,-11744,,,1887,...,,,,,,,,,,


In [163]:
df.to_csv("project2/data/processed/metadata.csv", index=False)