In [4]:
import pandas as pd
import math
import os
import xml.etree.ElementTree as ET
import csv
import pdb

In [5]:
# Run configuration: these labels are interpolated into the CSV file names used below.
# Leftover exploration, kept for reference:
# subdirs = os.listdir('/Users/awxlong/Desktop/my-studies/temp_data/CRC/TCGA-CRC/gdc_download_20240629_132916.799980/')
cohort_name = 'COAD'  # cohort label; part of every input/output CSV file name
task_name = 'g0_arrest'  # task label; presumably matches the Task_<task>_*.csv reference table -- TODO confirm

In [6]:
# Parent directory containing subdirectories with XML files
# (the path suggests a GDC download of TCGA-CRC data -- confirm the provenance).
parent_directory = '/Users/awxlong/Desktop/my-studies/temp_data/CRC/TCGA-CRC/gdc_download_20240629_132916.799980/'
# Directory holding the CSV inputs and outputs. It is concatenated directly into
# f-string paths (no os.path.join), so it must keep its trailing slash.
root_dir = '/Users/awxlong/Desktop/my-studies/hpc_exps/Data/'

# Output CSV file: one row per parsed clinical XML, one column per extracted feature
output_file = f'{root_dir}{cohort_name}_clinical_features.csv'


In [33]:
# Define the clinical features we want to extract.
# Each active entry is looked up as an XML tag name, and the order of the list
# is also the column order of the output CSV. Commented-out entries are features
# that were considered and rejected; the reason is noted next to each one.
clinical_features = [
    'gender',
    # 'tumor_tissue_site', # MULTICOLLINEARITY with icd_o_site (presumably icd_o_3_site)
    # 'histological_type', # MULTICOLLINEARITY
    # 'height', # IRRELEVANT
    # 'tissue_source_site', # BIOLOGICALLY IRRELEVANT; example D7: Greater Poland Center
    # 'days_to_last_known_alive', # MULTICOLLINEARITY with age
    'weight',
    'age_at_initial_pathologic_diagnosis',
    'race',
    # 'ethnicity', # only one value: not hispanic or latino
    'other_dx', # DISCUSS: other diagnosis; only 3 values and highly imbalanced
    'pathologic_stage', # correlated with pathologic T, N, M
    # 'pathologic_T',
    # 'pathologic_N',
    # 'pathologic_M',
    # 'clinical_stage', # 100% NA rate; clinical_t, _n, _m discarded for similar reasons
    # 'history_of_neoadjuvant_treatment', # only 2 yes
    # 'informed_consent_verified', # BIOLOGICALLY IRRELEVANT
    'icd_o_3_histology', # cell type and biological activity
    'icd_o_3_site', # anatomical location of tumor origin
    # 'icd_10',       # not limited to cancer; dropped to avoid MULTICOLLINEARITY
    # 'anatomic_neoplasm_subdivision', # CORRELATES WITH ICD-O-3
    # 'tissue_retrospective_collection_indicator', # BIOLOGICALLY IRRELEVANT
    'lymph_node_examined_count',
    # 'primary_lymph_node_presentation_assessment', # 98% is YES
    # 'number_of_lymphnodes_positive_by_he', # PROBABLY THE SAME AS LYMPH NODE COUNT
    # 'number_of_lymphnodes_positive_by_ihc', # PROBABLY THE SAME AS LYMPH NODE COUNT
    # 'vital_status', # no point for DEAD patients
    # 'gleason_grading', # only '\n' (newline) values
    # 'igcccg_stage', # only NaN
    # 'psa_value', # 100% NA rate
    # 'ann_arbor', # only '\n' values; its sub-elements such as serum_markers can also be dropped (original note was truncated -- confirm)
    'person_neoplasm_cancer_status', # DISCUSS: does it causally correlate with pathologic_stage?
    'circumferential_resection_margin', # 79% NA rate
    'venous_invasion',
    'lymphatic_invasion',
    'perineural_invasion_present',
    'microsatellite_instability', # 79% NA rate
    'history_of_colon_polyps',
    # 'synchronous_colon_cancer_present', # 95% NO, and may correlate with history_of_colon_polyps
    'colon_polyps_present', # HIGH NA rate
    'residual_tumor', # DISCUSS: does it correlate with g0 arrest? Residual tumor may or may not be present
    'radiation_therapy', # >90% NA rate
    'primary_therapy_outcome_success', # >90% NA rate
    # 'other_malignancy_anatomic_site', # this is from org_omf
    'preoperative_pretreatment_cea_level',
    'non_nodal_tumor_deposits',
    'kras_mutation_found', # HIGH NA rate
    'braf_gene_analysis_result', # HIGH NA rate
    'loss_expression_of_mismatch_repair_proteins_by_ihc',
    # 'number_of_first_degree_relatives_with_cancer_diagnosis', # BIOLOGICALLY IRRELEVANT
    'postoperative_rx_tx', # HIGH NA rate; rx is radiotherapy and tx is chemotherapy
    'new_tumor_event_after_initial_treatment', # HIGH NA rate
    'prescribed_dose', # HIGH NA rate
    'number_cycles',  # HIGH NA rate
    'measure_of_response', # HIGH rate of UNKNOWN values

]

def find_element(root, tag):
    """Return the first element under ``root`` whose tag name is ``tag``.

    ElementTree stores namespaced tags as ``{namespace-uri}local_name``. The
    namespace part is stripped before comparing, so callers can pass the bare
    local name. The comparison is exact: a request for ``weight`` does not
    match ``birth_weight`` (a plain suffix test would).

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element to search; ``root`` itself and all descendants are visited in
        document order.
    tag : str
        Local tag name (``'gender'``), or a fully qualified name
        (``'{uri}gender'``) to match the namespace as well.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The first matching element, or ``None`` when nothing matches.
    """
    qualified = tag.startswith('{')
    for elem in root.iter():
        name = elem.tag
        if not isinstance(name, str):
            # Comment / processing-instruction nodes carry a non-string tag.
            continue
        if not qualified:
            # Drop the '{namespace-uri}' prefix, if any.
            name = name.rsplit('}', 1)[-1]
        if name == tag:
            return elem
    return None

def extract_feature(root, feature):
    """Return the text of the first element matching ``feature``.

    The lookup is delegated to ``find_element``. When no element matches, the
    string ``'Unknown'`` is returned. When an element matches, its ``.text`` is
    returned unchanged, which may be ``None`` for an empty element.
    """
    match = find_element(root, feature)
    if match is None:
        return 'Unknown'
    return match.text

# List to store patient data: one row per parsed XML file, laid out as
# [patient_id, <value of each entry in clinical_features>].
patient_data = []

# Iterate through all subdirectories in the parent directory.
# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to make the CSV row order (and the XML picked when a folder holds several)
# reproducible between runs.
for subdir in sorted(os.listdir(parent_directory)):
    subdir_path = os.path.join(parent_directory, subdir)
    if not os.path.isdir(subdir_path):
        continue

    for filename in sorted(os.listdir(subdir_path)):
        # Only XML files; exclude those with OMF annotations since they're a minority
        if not filename.endswith('.xml') or 'org_omf' in filename:
            continue

        file_path = os.path.join(subdir_path, filename)
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            # The patient ID is the third dot-separated field of the file name
            patient_id = filename.split('.')[2]

            # Extract all specified clinical features, in list order
            features = [patient_id] + [extract_feature(root, feature) for feature in clinical_features]
            patient_data.append(features)
        except Exception as e:
            # Best effort: report the offending file and carry on with the next folder
            print(f"Error processing file {file_path}: {str(e)}")

        break  # Assuming one XML per subdirectory

# Write data to CSV file.
# The encoding is pinned to UTF-8 because this file is read back with
# pd.read_csv, which decodes as UTF-8 by default, whereas open() without an
# explicit encoding falls back to the platform's locale encoding.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header: 'PatientID' followed by a title-cased, space-separated label per feature
    header = ['PatientID'] + [feature.replace('_', ' ').title() for feature in clinical_features]
    csv_writer.writerow(header)

    # One row per parsed XML file
    csv_writer.writerows(patient_data)

print(f"Data has been written to {output_file}")

# Load the clinical feature table and the local cohort table from root_dir
clinical_csv = f'{root_dir}{cohort_name}_clinical_features.csv'
cohort_csv = f'{root_dir}local_cohort_{cohort_name}.csv'
df1 = pd.read_csv(clinical_csv)
df2 = pd.read_csv(cohort_csv)

# Unique patient IDs listed in the cohort table
# (presumably the patients that have whole-slide images -- confirm)
wsi_patient_ids = df2['PatientID'].unique()

# Keep only the clinical rows whose PatientID appears in the cohort table
in_cohort = df1['PatientID'].isin(wsi_patient_ids)
filtered_patient_data = df1[in_cohort]

# Write the filtered data to a new CSV file (without the DataFrame index)
wsi_clinical_csv = f'{root_dir}{cohort_name}_WSI_clinical_features.csv'
filtered_patient_data.to_csv(wsi_clinical_csv, index=False)

print('filtered patients with WSI merged')

# Re-load the filtered clinical table from disk
df = pd.read_csv(f'{root_dir}{cohort_name}_WSI_clinical_features.csv')

# Reference table for the task. The path is built from root_dir and task_name
# instead of being hard-coded, so changing the configuration is enough to switch
# tasks; with the current settings it resolves to the same file as before.
# Alternative reference table used previously: f'{root_dir}Task_{task_name}_resnet50.csv'
ref = pd.read_csv(f'{root_dir}Task_{task_name}_uni.csv')

# Left join: every row of ref is kept and clinical columns are attached by PatientID.
# NOTE(review): if df holds several rows for one PatientID, the matching ref rows
# are duplicated by this merge -- check for repeated IDs before relying on row counts.
merged_df = pd.merge(ref, df, on='PatientID', how='left')

# Columns that are not wanted in the final table
# (drop() raises KeyError if any of them is missing)
dropped_columns = [
    'Unnamed: 0', 'folder', 'filename', 'slide', 'tissue', 'patch', 'feature',
    'slide_nb', 'tissue_nb', 'patch_nb', 'feature_nb',
]
final_df = merged_df.drop(dropped_columns, axis=1)

# NOTE(review): unlike the filtered clinical CSV, this file is written WITH the
# DataFrame index, which shows up as an 'Unnamed: 0' column when read back.
# Left unchanged because downstream readers may depend on it -- confirm, then
# consider index=False.
final_df.to_csv(f'{root_dir}{cohort_name}_WSI_clinical_{task_name}_features.csv')

Data has been written to /Users/awxlong/Desktop/my-studies/hpc_exps/Data/COAD_clinical_features.csv
filtered patients with WSI merged


In [None]:
# Patient IDs from the cohort table that occur more than once in the clinical table.
# Occurrences are counted once with value_counts instead of filtering the whole
# frame for every ID; the result keeps the order of wsi_patient_ids.
patient_id_counts = df1['PatientID'].value_counts()
duplicated_ids = set(patient_id_counts[patient_id_counts > 1].index)
repeated_patient_ids = [pid for pid in wsi_patient_ids if pid in duplicated_ids]
repeated_patient_ids

In [None]:
# Show both shapes: only the last expression of a cell is displayed, so two
# separate lines would silently discard the first one.
# Author's note from the last run: 605 and 604 (presumably the row counts).
df1.shape, df2.shape

In [None]:
# Preview the first rows of the merged table as a sanity check.
final_df.head()