In [1]:
import pandas as pd
from pathlib import Path

In [3]:
# Name of the dataset to inspect; it selects the input sub-directory below.
dataset_name = 'CPTAC-UCEC'
# Directory expected to hold this dataset's clinical TSV files. The path is
# relative, so it resolves against the notebook's working directory.
clinical_data_path = Path(f"../rawdata/GDC_clinical_data/{dataset_name}/")

In [4]:
# Load every clinical TSV of the dataset, tidy each table, and summarise which
# variables (survival, treatment) are available across the tables.
print(f"Dataset: {dataset_name}")

# Strings that are read as missing values (NaN) when parsing the TSV files.
na_markers = ['\'--', 'not reported', 'Not Reported', 'nan', 'unknown', 'Unknown']

clinical_data_frames = {}  # table name (file stem) -> cleaned DataFrame
all_column_names = set()   # union of the column names kept in any table
for data_file in sorted(clinical_data_path.glob('*.tsv')):
    data_type = data_file.stem
    data = pd.read_csv(data_file, sep='\t', na_values=na_markers)

    if data.empty:
        print(f'Skipping {data_type} because it is empty')
        continue

    # Drop columns that hold no value at all, then exact duplicate rows.
    data = (
        data
        .dropna(axis='columns', how='all')
        .drop_duplicates(ignore_index=True)
    )

    clinical_data_frames[data_type] = data
    all_column_names.update(data.columns)

    if 'case_submitter_id' in data.columns:
        # nunique() skips missing IDs, so NaN is never counted as a patient.
        patient_count = data['case_submitter_id'].nunique()
        print(f'Unique patient count for {data_type}: {patient_count}')
    else:
        print(f'No case_submitter_id column in {data_type}; patient count unavailable')
    print("---")

print()
print()
print("All Variable Names:")
# Sorted because set iteration order of strings changes between kernel
# sessions, which would make the saved output differ on every run.
print(sorted(all_column_names))
print('---')
print("Survival Variables Present:")
survival_variables = ['vital_status', 'days_to_death', 'days_to_last_follow_up', 'days_to_progression', 'days_to_recurrence']
# Iterate the list (not the set) so the result keeps a fixed, readable order.
print([var for var in survival_variables if var in all_column_names])
print('---')
# The clinical table may have been skipped above, and treatment_type may have
# been dropped as an all-NA column, so check both before summarising.
clinical_table = clinical_data_frames.get('clinical')
if clinical_table is not None and 'treatment_type' in clinical_table.columns:
    print("Treatment types:")
    print(clinical_table['treatment_type'].unique())
    print("Treatment value counts:")
    print(clinical_table.value_counts("treatment_type"))
else:
    print("No treatment_type information available in the clinical table")


Dataset: CPTAC-UCEC
Unique patient count for clinical: 241
---
Unique patient count for exposure: 233
---
Skipping family_history because it is empty
Unique patient count for follow_up: 232
---
Unique patient count for pathology_detail: 128
---


All Variable Names:
{'tissue_or_organ_of_origin', 'primary_diagnosis', 'ajcc_pathologic_t', 'pathology_detail_submitter_id', 'type_of_smoke_exposure', 'timepoint_category', 'age_at_index', 'age_at_diagnosis', 'tumor_focality', 'residual_disease', 'disease_response', 'ajcc_pathologic_stage', 'weight', 'pathology_detail_id', 'ecog_performance_status', 'vital_status', 'project_id', 'days_to_birth', 'tobacco_smoking_status', 'cause_of_death', 'morphology', 'ajcc_clinical_m', 'cigarettes_per_day', 'exposure_type', 'case_id', 'days_to_recurrence', 'year_of_death', 'tumor_largest_dimension_diameter', 'year_of_birth', 'follow_up_submitter_id', 'hormonal_contraceptive_use', 'days_to_death', 'ajcc_staging_system_edition', 'diagnosis_id', 'alcohol_intens

In [4]:
# Number of clinical rows per treatment type. These are row counts, not patient
# counts: the same case can appear on several rows with different treatments.
clinical_data_frames['clinical'].value_counts("treatment_type")

treatment_type
Radiation Therapy, NOS                130
Chemotherapy                           70
Immunotherapy (Including Vaccines)      3
Name: count, dtype: int64

In [5]:
# List the distinct values of every clinical column that has fewer than ten of
# them (a missing value counts as one distinct value here).
clinical_table = clinical_data_frames['clinical']
for column_name, column_values in clinical_table.items():
    distinct_values = column_values.unique()
    if len(distinct_values) >= 10:
        continue
    print(f"{column_name} values: {distinct_values}")
    print()

project_id values: ['CPTAC-3']

age_at_index values: [nan 67.]

age_is_obfuscated values: [False True nan]

cause_of_death values: [nan 'Cardiovascular Disorder, NOS' 'Cancer Related' 'Not Cancer Related'
 'Infection']

ethnicity values: ['not hispanic or latino' nan 'hispanic or latino']

gender values: ['female']

race values: ['white' 'other' 'black or african american' 'asian' nan]

vital_status values: ['Alive' 'Dead' nan]

year_of_death values: [  nan 2016. 2019. 2020. 2018. 2022. 2017. 2021.]

ajcc_clinical_m values: ['MX' 'M0' 'M1' nan 'M1b']

ajcc_pathologic_m values: ['MX' 'M1' 'M0' nan]

ajcc_pathologic_n values: ['N0' 'NX' 'N2' 'N2a' 'N1' nan 'N1a']

ajcc_pathologic_t values: ['T1a' 'T3a' 'T2' 'T1b' 'T3b' 'T1' nan 'T4']

ajcc_staging_system_edition values: ['7th' '8th' nan]

diagnosis_is_primary_disease values: [ True False]

last_known_disease_status values: ['Tumor free' 'With tumor' 'Unknown tumor status' nan]

morphology values: ['8380/3' '8140/3']

primary_diagnosis va

In [6]:
# Display the cleaned clinical table for a visual check; the notebook rendering
# abbreviates long/wide frames with "..." rows and columns.
clinical_data_frames['clinical']

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,days_to_birth,days_to_death,ethnicity,gender,...,progression_or_recurrence,residual_disease,site_of_resection_or_biopsy,tissue_or_organ_of_origin,tumor_focality,tumor_grade,year_of_diagnosis,residual_disease.1,treatment_outcome,treatment_type
0,01a80f88-16bf-4ed6-9864-313e9650a6e3,C3N-02012,CPTAC-3,,False,,-25277.0,,not hispanic or latino,female,...,no,RX,Corpus uteri,"Uterus, NOS",Unifocal,G2,2017.0,RX,,
1,01db2fd5-6443-4ed8-8318-3ef4f9012450,C3L-02354,CPTAC-3,,False,,-20260.0,,not hispanic or latino,female,...,yes,R1,Corpus uteri,"Uterus, NOS",Unifocal,G1,2017.0,R1,Complete Response,"Radiation Therapy, NOS"
2,01db2fd5-6443-4ed8-8318-3ef4f9012450,C3L-02354,CPTAC-3,,False,,-20260.0,,not hispanic or latino,female,...,yes,R1,Corpus uteri,"Uterus, NOS",Unifocal,G1,2017.0,R1,Complete Response,Chemotherapy
3,024bde91-ea3e-4157-83c4-8482801b00dc,C3L-01739,CPTAC-3,,False,,-26003.0,,,female,...,no,RX,Corpus uteri,"Uterus, NOS",Unifocal,G3,2017.0,RX,Complete Response,Chemotherapy
4,0701e7c9-898f-4cc3-9b8a-855bef549781,C3L-01633,CPTAC-3,,False,,-22150.0,,not hispanic or latino,female,...,no,R0,Corpus uteri,"Uterus, NOS",Unifocal,G1,2017.0,R0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,f87b09a1-ad59-4c29-b306-73ed26f38a1a,C3N-01764,CPTAC-3,,False,,-26880.0,,,female,...,no,R0,Corpus uteri,"Uterus, NOS",Unifocal,G3,2017.0,R0,Complete Response,"Radiation Therapy, NOS"
298,fab863a1-138e-4caf-8489-409d05511795,C3N-00322,CPTAC-3,,False,,-25752.0,,,female,...,no,R0,Corpus uteri,"Uterus, NOS",Multifocal,G2,2016.0,R0,Complete Response,"Radiation Therapy, NOS"
299,fc9d8045-d918-4860-a134-c520de7c3e55,C3L-01664,CPTAC-3,,False,,-27611.0,,not hispanic or latino,female,...,no,R0,Endometrium,"Uterus, NOS",Unifocal,G2,2017.0,R0,Complete Response,"Radiation Therapy, NOS"
300,ffef8d1d-f99d-4cc0-9f49-46488bfca131,C3L-00586,CPTAC-3,,False,,-18580.0,,not hispanic or latino,female,...,no,RX,Corpus uteri,"Uterus, NOS",Unifocal,G1,2016.0,RX,Complete Response,"Radiation Therapy, NOS"
