We use the clinical data, including follow-up and outcome data, for the TCGA PanCancer Atlas from Liu et al. [1].

[1] Liu et al., "An Integrated TCGA Pan-Cancer Clinical Data Resource to Drive High-Quality Survival Outcome Analytics", Cell, 2018 (https://pubmed.ncbi.nlm.nih.gov/29625055/)


In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
# Directory holding the TCGA input files; later cells read the Liu et al. workbook from it
# and write the processed clinical table back into it.
# Defaults to the original cluster location. Set the TCGA_DATA_PATH environment variable
# to run the notebook against a copy of the data stored somewhere else.
tcga_data_path = os.environ.get('TCGA_DATA_PATH', '/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37')

In [3]:
# Load the 'TCGA-CDR' sheet of the Liu et al. clinical workbook, one row per patient,
# indexed by the patient barcode column.
liu_workbook = os.path.join(tcga_data_path, 'Liu_TCGA_clinical.xlsx')
df_patients_liu = pd.read_excel(
    liu_workbook,
    sheet_name='TCGA-CDR',
    index_col='bcr_patient_barcode',
)

In [4]:
# Discard the first data column and keep every column after it.
# NOTE(review): presumably the dropped column is a row counter from the spreadsheet - confirm.
kept_columns = df_patients_liu.columns[1:]
df_patients_liu = df_patients_liu[kept_columns]

In [5]:
# Show the column names currently held in df_patients_liu.
df_patients_liu.columns

Index(['type', 'age_at_initial_pathologic_diagnosis', 'gender', 'race',
       'ajcc_pathologic_tumor_stage', 'clinical_stage', 'histological_type',
       'histological_grade', 'initial_pathologic_dx_year', 'menopause_status',
       'birth_days_to', 'vital_status', 'tumor_status', 'last_contact_days_to',
       'death_days_to', 'cause_of_death', 'new_tumor_event_type',
       'new_tumor_event_site', 'new_tumor_event_site_other',
       'new_tumor_event_dx_days_to', 'treatment_outcome_first_course',
       'margin_status', 'residual_tumor', 'OS', 'OS.time', 'DSS', 'DSS.time',
       'DFI', 'DFI.time', 'PFI', 'PFI.time', 'Redaction'],
      dtype='object')

In [8]:
# List the distinct first-course treatment outcome labels, most frequent first.
df_patients_liu['treatment_outcome_first_course'].value_counts().index

Index(['[Not Available]', 'Complete Remission/Response', 'Progressive Disease',
       'Stable Disease', '[Unknown]', 'Partial Remission/Response',
       '[Not Applicable]', 'No Measureable Tumor or Tumor Markers',
       '[Discrepancy]', 'Persistent Disease', '[Not Evaluated]',
       'Normalization of Tumor Markers, but Residual Tumor Mass'],
      dtype='object', name='treatment_outcome_first_course')

In [41]:
# Raw `treatment_outcome_first_course` labels, grouped by how they are scored.
# `nan`: labels that carry no usable response information.
nan = ['[Not Available]', '[Unknown]', '[Not Applicable]', '[Not Evaluated]', 'No Measureable Tumor or Tumor Markers',
       '[Discrepancy]','Normalization of Tumor Markers, but Residual Tumor Mass']
# `R`: labels scored as responders.
R = ['Complete Remission/Response', 'Partial Remission/Response' ]
# `NR`: labels scored as non-responders.
NR  = ['Progressive Disease', 'Persistent Disease',  'Stable Disease']


def get_r(x):
    """Map a raw first-course treatment outcome label to a response class.

    Parameters
    ----------
    x : object
        A value from the `treatment_outcome_first_course` column.

    Returns
    -------
    str or float
        'R' if `x` is listed in `R`, 'NR' if it is listed in `NR`, and
        `np.nan` otherwise.
    """
    if x in R:
        return 'R'
    if x in NR:
        return 'NR'
    # Everything else (the labels in `nan`, empty cells, and any label not
    # listed above) has no known response. Return a missing value rather than
    # 'NR' so these patients are not counted as non-responders.
    return np.nan

# Classify each patient's first-course treatment outcome with get_r.
raw_outcome = df_patients_liu['treatment_outcome_first_course']
df_patients_liu['treatment_outcome'] = raw_outcome.apply(get_r)


In [58]:
# Response class derived from the raw first-course outcome label.
df_patients_liu['treatment_outcome'] = df_patients_liu['treatment_outcome_first_course'].apply(get_r)

# Short aliases for the verbose source column names.
df_patients_liu['age'] = df_patients_liu['age_at_initial_pathologic_diagnosis']
df_patients_liu['stage'] = df_patients_liu['ajcc_pathologic_tumor_stage']

# OS endpoint: turn the 0/1 indicator into a readable label and copy the time column.
# Values that are neither 0 nor 1 become NaN.
os_labels = {1: 'dead', 0: 'alive'}
df_patients_liu['os_status'] = df_patients_liu['OS'].map(os_labels)
df_patients_liu['os_time'] = df_patients_liu['OS.time']

# PFI endpoint: same treatment as OS, with its own labels.
pfi_labels = {1: 'Progression', 0: 'censored'}
df_patients_liu['pfi_status'] = df_patients_liu['PFI'].map(pfi_labels)
df_patients_liu['pfi_time'] = df_patients_liu['PFI.time']

In [62]:
# Columns kept in the exported clinical table, in output order.
cols = [
    'age',
    'gender',
    'race',
    'stage',
    'vital_status',
    'tumor_status',
    'treatment_outcome',
    'os_status',
    'os_time',
    'pfi_status',
    'pfi_time',
]
df_clinic = df_patients_liu[cols]

In [65]:
# Save the selected clinical columns as a pickle file inside the TCGA data directory.
clinic_pickle_path = os.path.join(tcga_data_path, 'df_patient_clinic.pkl')
df_clinic.to_pickle(clinic_pickle_path)