In [None]:
import pandas as pd
import math
import os
import xml.etree.ElementTree as ET
import csv
import pdb

In [None]:
# subdirs = os.listdir('/Users/awxlong/Desktop/my-studies/temp_data/CRC/TCGA-CRC/gdc_download_20240629_132916.799980/')
# Cohort label; it is interpolated into the input and output CSV file names
# used by the cells below.
cohort_name = 'COAD'
# Task label; in the visible part of the notebook it is referenced only by a
# commented-out read_csv call.
task_name = 'g0_arrest'

In [None]:
# Parent directory containing subdirectories with XML files.
# Set the TCGA_CLINICAL_XML_DIR environment variable to point the notebook at
# another download location; the original local path is the fallback.
parent_directory = (
    os.environ.get('TCGA_CLINICAL_XML_DIR')
    or '/Users/awxlong/Desktop/my-studies/temp_data/CRC/TCGA-CRC/gdc_download_20240629_132916.799980/'
)

# Directory that holds the cohort CSVs read and written by this notebook.
# Set TCGA_DATA_DIR to override. Joining with '' guarantees a trailing path
# separator, which matters because file paths are built from root_dir by plain
# string concatenation.
root_dir = os.path.join(
    os.environ.get('TCGA_DATA_DIR') or '/Users/awxlong/Desktop/my-studies/hpc_exps/Data/',
    '',
)

# Output CSV file
output_file = f'{root_dir}{cohort_name}_clinical_features.csv'


In [None]:
# Define the clinical features we want to extract.
# Each entry is an XML tag name looked up in every patient file. The order of
# this list is also the column order of the output CSV (after the PatientID
# column), and each name is turned into a header by replacing '_' with ' ' and
# title-casing it. Commented-out entries are not extracted.
clinical_features = [
    'gender',
    # 'tumor_tissue_site',
    # 'histological_type',
    'height',
    'weight',
    'age_at_initial_pathologic_diagnosis',
    'race',
    # 'ethnicity', # only not hispanic or latino
    'other_dx',
    'pathologic_stage',
    'pathologic_T',
    'pathologic_N',
    'pathologic_M',
    'psa_value',  # NOTE(review): also appears commented out below next to gleason_grading - confirm it should be active
    'clinical_stage',
    'history_of_neoadjuvant_treatment',
    # 'icd_o_3_histology', # probably the same as histological type
    'icd_o_3_site', # may have different meanings
    'icd_10',
    'anatomic_neoplasm_subdivision',
    'lymph_node_examined_count',
    'number_of_lymphnodes_positive_by_he',
    # 'number_of_lymphnodes_positive_by_ihc',
    # 'vital_status',
    # 'gleason_grading',
    # 'psa_value',
    'person_neoplasm_cancer_status',
    'circumferential_resection_margin',
    'venous_invasion',
    'lymphatic_invasion',
    'perineural_invasion_present',
    'microsatellite_instability',
    'history_of_colon_polyps',
    'synchronous_colon_cancer_present',
    'colon_polyps_present',
    'radiation_therapy',
    'primary_therapy_outcome_success',
    # 'other_malignancy_anatomic_site', # this is from org_omf
    
]

# Function to find element regardless of namespace
def find_element(root, tag):
    """Return the first element under ``root`` whose tag name equals ``tag``.

    ElementTree spells a namespaced tag as ``{namespace-uri}local_name``. The
    namespace part is stripped before comparing, so callers can pass the bare
    name. The comparison is exact: a plain suffix test would let
    ``radiation_therapy`` match ``additional_radiation_therapy`` and return
    whichever of the two comes first in the document.

    Args:
        root: Element whose subtree (``root`` included) is searched in
            document order.
        tag: Tag name to look for. Normally the bare local name; a fully
            qualified ``{uri}name`` is also accepted.

    Returns:
        The first matching element, or ``None`` when nothing matches.
    """
    for elem in root.iter():
        name = elem.tag
        # Comment and processing-instruction nodes carry a non-string tag.
        if not isinstance(name, str):
            continue
        if name == tag or name.rsplit('}', 1)[-1] == tag:
            return elem
    return None

# Function to extract feature value
def extract_feature(root, feature):
    """Return the text of the element that ``find_element`` locates for ``feature``.

    Args:
        root: XML element to search.
        feature: Tag name of the clinical feature.

    Returns:
        The element's ``text`` attribute (``None`` for an element that has no
        text), or the string ``'Unknown'`` when no element is found.
    """
    match = find_element(root, feature)
    if match is None:
        return 'Unknown'
    return match.text

# List to store patient data: one row per parsed XML file, laid out as
# [patient_id, <value of each entry in clinical_features, in list order>].
patient_data = []

# Iterate through all subdirectories in the parent directory.
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the output, and the choice of XML when a subdirectory holds several,
# reproducible across runs and machines.
for subdir in sorted(os.listdir(parent_directory)):
    subdir_path = os.path.join(parent_directory, subdir)
    if not os.path.isdir(subdir_path):
        continue

    for filename in sorted(os.listdir(subdir_path)):
        # exclude those with OMF annotations since they're a minority
        if not filename.endswith('.xml') or 'org_omf' in filename:
            continue

        file_path = os.path.join(subdir_path, filename)

        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Extract patientID from the filename: the third dot-separated field
            patient_id = filename.split('.')[2]

            # Extract all specified clinical features
            features = [patient_id] + [extract_feature(root, feature) for feature in clinical_features]

            patient_data.append(features)

        except Exception as e:
            # Best effort: report the failure and carry on with the next subdirectory.
            print(f"Error processing file {file_path}: {str(e)}")

        break  # Assuming one XML per subdirectory

# Write data to CSV file.
# The encoding is pinned to UTF-8 instead of the platform default: the file is
# read back with pd.read_csv, which decodes as UTF-8 unless told otherwise, so
# any non-ASCII text from the XML would otherwise break on non-UTF-8 locales.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write header: 'PatientID' followed by a title-cased label per feature
    # (e.g. 'pathologic_stage' -> 'Pathologic Stage').
    header = ['PatientID'] + [feature.replace('_', ' ').title() for feature in clinical_features]
    csv_writer.writerow(header)

    # Write patient data
    csv_writer.writerows(patient_data)

print(f"Data has been written to {output_file}")

# Load the clinical-feature table and the local cohort table.
df1 = pd.read_csv(f'{root_dir}{cohort_name}_clinical_features.csv')
df2 = pd.read_csv(f'{root_dir}local_cohort_{cohort_name}.csv')

# Distinct PatientIDs listed in the cohort table
# (presumably the patients that have a WSI - confirm against that file).
wsi_patient_ids = df2['PatientID'].unique()

# Keep only the clinical rows whose PatientID appears in the cohort table.
# Rows are filtered rather than merged, so no cohort-table columns are added.
has_wsi = df1['PatientID'].isin(wsi_patient_ids)
filtered_patient_data = df1[has_wsi]

# Persist the filtered table next to the other cohort files.
filtered_patient_data.to_csv(f'{root_dir}{cohort_name}_WSI_clinical_features.csv', index=False)

print('filtered patients with WSI merged')

In [None]:
# PatientIDs that occur on more than one row of df2, in the order they appear
# in wsi_patient_ids. Occurrences are counted once up front instead of
# re-scanning df2 for every ID. value_counts() ignores missing values, so a
# missing PatientID is never reported as repeated.
patient_id_counts = df2['PatientID'].value_counts()
ids_seen_more_than_once = set(patient_id_counts[patient_id_counts > 1].index)
repeated_patient_ids = [pid for pid in wsi_patient_ids if pid in ids_seen_more_than_once]
repeated_patient_ids