Clinical Data Dictionary https://docs.gdc.cancer.gov/Data_Dictionary/viewer/#?view=table-entity-list&anchor=clinical

TCGA Barcode https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/ 

# CBioPortal Clinical

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
import os

In [2]:
cbioportal = pd.read_csv("/shared/cgorner/cbioportal_firehose_brca_tcga_clinical_data.tsv",sep="\t")
cbioportal

Unnamed: 0,Study ID,Patient ID,Sample ID,Diagnosis Age,American Joint Committee on Cancer Metastasis Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,Neoplasm Disease Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Publication Version Type,American Joint Committee on Cancer Tumor Stage Code,Brachytherapy first reference point administered total dose,...,Staging System.1,Surgery for positive margins,Surgery for positive margins other,Surgical procedure first,Time between clamping and freezing,Time between excision and freezing,Tissue Source Site,TMB (nonsynonymous),Person Neoplasm Status,Vial number
0,brca_tcga,TCGA-3C-AAAU,TCGA-3C-AAAU-01,55.0,MX,NX,Stage X,6th,TX,,...,,,,Modified Radical Mastectomy,,,3C,,WITH TUMOR,A
1,brca_tcga,TCGA-3C-AALI,TCGA-3C-AALI-01,50.0,M0,N1a,Stage IIB,6th,T2,,...,,,,Lumpectomy,,,3C,,TUMOR FREE,A
2,brca_tcga,TCGA-3C-AALJ,TCGA-3C-AALJ-01,62.0,M0,N1a,Stage IIB,7th,T2,,...,,,,Modified Radical Mastectomy,,,3C,,TUMOR FREE,A
3,brca_tcga,TCGA-3C-AALK,TCGA-3C-AALK-01,52.0,M0,N0 (i+),Stage IA,7th,T1c,,...,,,,Simple Mastectomy,,,3C,,TUMOR FREE,A
4,brca_tcga,TCGA-4H-AAAK,TCGA-4H-AAAK-01,50.0,M0,N2a,Stage IIIA,7th,T2,,...,,,,Modified Radical Mastectomy,,,4H,,TUMOR FREE,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,brca_tcga,TCGA-WT-AB44,TCGA-WT-AB44-01,77.0,MX,N0 (i-),Stage IA,7th,T1c,,...,,,,Lumpectomy,,,WT,,TUMOR FREE,A
1104,brca_tcga,TCGA-XX-A899,TCGA-XX-A899-01,46.0,MX,N2a,Stage IIIA,7th,T1c,,...,,,,Modified Radical Mastectomy,,,XX,,TUMOR FREE,A
1105,brca_tcga,TCGA-XX-A89A,TCGA-XX-A89A-01,68.0,MX,N0,Stage IIB,7th,T3,,...,,,,Simple Mastectomy,,,XX,,TUMOR FREE,A
1106,brca_tcga,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,61.0,MX,N1a,Stage IIIA,6th,T3,,...,,,,Other,,,Z7,,TUMOR FREE,A


In [3]:
cbioportal["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"].value_counts()

Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code
N0           333
N1a          170
N0 (i-)      154
N1           127
N2a           64
N2            56
N3a           49
N1mi          37
N1b           33
N0 (i+)       28
N3            26
NX            20
N3b            3
N1c            2
N3c            1
N0 (mol+)      1
Name: count, dtype: int64

In [4]:
list(cbioportal.columns)

['Study ID',
 'Patient ID',
 'Sample ID',
 'Diagnosis Age',
 'American Joint Committee on Cancer Metastasis Stage Code',
 'Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code',
 'Neoplasm Disease Stage American Joint Committee on Cancer Code',
 'American Joint Committee on Cancer Publication Version Type',
 'American Joint Committee on Cancer Tumor Stage Code',
 'Brachytherapy first reference point administered total dose',
 'Cancer Type',
 'Cancer Type Detailed',
 'Cent17 Copy Number',
 'Neoplasm American Joint Committee on Cancer Clinical Distant Metastasis M Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Regional Lymph Node N Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Primary Tumor T Stage',
 'Neoplasm American Joint Committee on Cancer Clinical Group Stage',
 'Days to Sample Collection.',
 'Last Alive Less Initial Pathologic Diagnosis Date Calculated Day Value',
 'days_to_patient_progression_free',
 'Days to Sample Procureme

In [5]:
columns_to_include = ["Patient ID", "Sample ID","Number of Samples Per Patient", "Diagnosis Age", "Sex", "Prior Cancer Diagnosis Occurence", "Menopause Status", "Race Category", "ER Status By IHC","PR status by ihc", "Fraction Genome Altered","American Joint Committee on Cancer Tumor Stage Code","Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code","American Joint Committee on Cancer Metastasis Stage Code","Overall Survival Status","Overall Survival (Months)"]
#columns_to_include = ["Patient ID", "Sample ID","Number of Samples Per Patient", "Diagnosis Age", "Sex", "Prior Cancer Diagnosis Occurence", "Menopause Status", "Race Category", "Overall Survival Status","Overall Survival (Months)"]

len(columns_to_include)

16

In [6]:
clinical_columns = ["Diagnosis Age", "Sex", "Prior Cancer Diagnosis Occurence", "Menopause Status", "Race Category", "ER Status By IHC","PR status by ihc", "Fraction Genome Altered","American Joint Committee on Cancer Tumor Stage Code","Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code","American Joint Committee on Cancer Metastasis Stage Code"]
#clinical_columns = ["Diagnosis Age", "Sex", "Prior Cancer Diagnosis Occurence", "Menopause Status", "Race Category"]
len(clinical_columns)

11

In [7]:
cbioportal_survival = cbioportal.filter(columns_to_include)

## Find image

In [8]:
path_to_images = "/shared/fsartori/MuSEPI/dataset/"
all_images = os.listdir(path_to_images)

In [9]:
def find_matching_file(id_value, file_list, path_to_images):
    for file in file_list:
        if file.startswith(id_value):
            return path_to_images + file
    return None

In [10]:
# Add path to image to dataframe based on Sample ID
cbioportal_survival["Image Path"] = cbioportal_survival["Sample ID"].apply(lambda x: find_matching_file(x,all_images,path_to_images))
cbioportal_survival
    

Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Menopause Status,Race Category,ER Status By IHC,PR status by ihc,Fraction Genome Altered,American Joint Committee on Cancer Tumor Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Metastasis Stage Code,Overall Survival Status,Overall Survival (Months),Image Path
0,TCGA-3C-AAAU,TCGA-3C-AAAU-01,1,55.0,Female,No,Pre (<6 months since LMP AND no prior bilatera...,WHITE,Positive,Positive,0.7787,TX,NX,MX,0:LIVING,132.95,
1,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.7164,T2,N1a,M0,0:LIVING,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...
2,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.5340,T2,N1a,M0,0:LIVING,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...
3,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,Female,No,,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.0764,T1c,N0 (i+),M0,0:LIVING,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...
4,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.2364,T2,N2a,M0,0:LIVING,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.3436,T1c,N0 (i-),MX,0:LIVING,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...
1104,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0625,T1c,N2a,MX,0:LIVING,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...
1105,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.1764,T3,N0,MX,0:LIVING,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...
1106,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,Female,Yes,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0452,T3,N1a,MX,0:LIVING,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...


In [11]:
cbioportal_survival["Image Path"].value_counts()

Image Path
/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R6-01Z-00-DX1.CE4ED818-D762-4324-9DEA-2ACB38B9B0B9.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF5-C34833B91B7C.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-9625-FCB55767B70D.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878F-FC0EA7BD0B53.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-01Z-00-DX1.ABF1B042-1970-4E28-8671-43AAD393D2F9.svs    1
                                                                                                   ..
/shared/fsartori/MuSEPI/dataset/TCGA-A1-A0SD-01Z-00-DX1.DB17BFA9-D951-42A8-91D2-F4C2EBC6EB9F.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-A1-A0SE-01Z-00-DX1.04B09232-C6C4-46EF-AA2C-41D078D0A80A.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-A1-A0SF-01Z-00-DX1.7F252D89-EA78-419F-A969-1B7313D77499.svs    1
/shared/fsartori/MuSEPI/dataset/TCGA-A1-A0SH-01Z-00-DX1.90E71B08-E1D9-4

In [12]:
cbioportal_survival["Image Path"] = cbioportal_survival["Image Path"].astype("string")

## Exclude some sites

In [13]:
hospitals_to_exclude = ["TCGA-A1", "TCGA-B6", "TCGA-E9"]
cbioportal_survival = cbioportal_survival[cbioportal_survival["Image Path"].str.contains('|'.join(hospitals_to_exclude))==False]
cbioportal_survival

Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Menopause Status,Race Category,ER Status By IHC,PR status by ihc,Fraction Genome Altered,American Joint Committee on Cancer Tumor Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Metastasis Stage Code,Overall Survival Status,Overall Survival (Months),Image Path
1,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.7164,T2,N1a,M0,0:LIVING,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...
2,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.5340,T2,N1a,M0,0:LIVING,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...
3,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,Female,No,,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.0764,T1c,N0 (i+),M0,0:LIVING,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...
4,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.2364,T2,N2a,M0,0:LIVING,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...
5,TCGA-5L-AAT0,TCGA-5L-AAT0-01,1,42.0,Female,Yes,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0702,T2,N0,M0,0:LIVING,48.52,/shared/fsartori/MuSEPI/dataset/TCGA-5L-AAT0-0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.3436,T1c,N0 (i-),MX,0:LIVING,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...
1104,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0625,T1c,N2a,MX,0:LIVING,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...
1105,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.1764,T3,N0,MX,0:LIVING,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...
1106,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,Female,Yes,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0452,T3,N1a,MX,0:LIVING,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...


In [14]:
#np.savetxt(r'/shared/cgorner/images.txt', cbioportal_survival["Image Path"][cbioportal_survival["Image Path"].str.contains('|'.join(hospitals_to_exclude))==False].dropna().values, fmt="%s")

## Exclude rows without image, survival and where survival is less than zero

In [15]:
cbioportal_survival = cbioportal_survival[(cbioportal_survival["Image Path"].notnull()) & (cbioportal_survival["Overall Survival (Months)"].notnull()) & (cbioportal_survival["Overall Survival (Months)"] > 0)]
cbioportal_survival

Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Menopause Status,Race Category,ER Status By IHC,PR status by ihc,Fraction Genome Altered,American Joint Committee on Cancer Tumor Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Metastasis Stage Code,Overall Survival Status,Overall Survival (Months),Image Path
1,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.7164,T2,N1a,M0,0:LIVING,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...
2,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.5340,T2,N1a,M0,0:LIVING,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...
3,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,Female,No,,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.0764,T1c,N0 (i+),M0,0:LIVING,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...
4,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.2364,T2,N2a,M0,0:LIVING,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...
5,TCGA-5L-AAT0,TCGA-5L-AAT0-01,1,42.0,Female,Yes,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0702,T2,N0,M0,0:LIVING,48.52,/shared/fsartori/MuSEPI/dataset/TCGA-5L-AAT0-0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1103,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.3436,T1c,N0 (i-),MX,0:LIVING,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...
1104,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0625,T1c,N2a,MX,0:LIVING,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...
1105,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,Female,No,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.1764,T3,N0,MX,0:LIVING,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...
1106,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,Female,Yes,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0452,T3,N1a,MX,0:LIVING,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...


## Transform cancer codes

In [16]:
def get_digits(text):
    numbers = []
    for char in text:
        if char.isdigit() or char == 'X':
            numbers.append(char)
    return ''.join(numbers)

In [17]:
cbioportal_survival["American Joint Committee on Cancer Tumor Stage Code"] = cbioportal_survival["American Joint Committee on Cancer Tumor Stage Code"].apply(lambda x: get_digits(str(x)))
cbioportal_survival["American Joint Committee on Cancer Tumor Stage Code"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["American Joint Committee on Cancer Tumor Stage Code"] = cbioportal_survival["American Joint Committee on Cancer Tumor Stage Code"].apply(lambda x: get_digits(str(x)))


American Joint Committee on Cancer Tumor Stage Code
2    533
1    244
3    114
4     33
Name: count, dtype: int64

In [18]:
cbioportal_survival["American Joint Committee on Cancer Metastasis Stage Code"] = cbioportal_survival["American Joint Committee on Cancer Metastasis Stage Code"].apply(lambda x: get_digits(str(x)))
cbioportal_survival["American Joint Committee on Cancer Metastasis Stage Code"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["American Joint Committee on Cancer Metastasis Stage Code"] = cbioportal_survival["American Joint Committee on Cancer Metastasis Stage Code"].apply(lambda x: get_digits(str(x)))


American Joint Committee on Cancer Metastasis Stage Code
0    757
X    149
1     18
Name: count, dtype: int64

In [19]:
cbioportal_survival["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"] = cbioportal_survival["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"].apply(lambda x: get_digits(str(x)))
cbioportal_survival["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"] = cbioportal_survival["Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code"].apply(lambda x: get_digits(str(x)))


Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code
0    433
1    310
2    103
3     66
X     12
Name: count, dtype: int64

## Transform binary categorical columns to int

In [89]:
cbioportal_survival["Sex"] = cbioportal_survival["Sex"].apply(lambda x: 1 if x == "Female" else 0)
cbioportal_survival["Sex"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["Sex"] = cbioportal_survival["Sex"].apply(lambda x: 1 if x == "Female" else 0)


1       1
2       1
3       1
4       1
5       1
       ..
1103    1
1104    1
1105    1
1106    1
1107    1
Name: Sex, Length: 924, dtype: int64

In [90]:
cbioportal_survival["Prior Cancer Diagnosis Occurence"] = cbioportal_survival["Prior Cancer Diagnosis Occurence"].apply(lambda x: 1 if x == "Yes" else 0)
cbioportal_survival["Prior Cancer Diagnosis Occurence"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["Prior Cancer Diagnosis Occurence"] = cbioportal_survival["Prior Cancer Diagnosis Occurence"].apply(lambda x: 1 if x == "Yes" else 0)


1       0
2       0
3       0
4       0
5       1
       ..
1103    0
1104    0
1105    0
1106    1
1107    0
Name: Prior Cancer Diagnosis Occurence, Length: 924, dtype: int64

In [91]:
cbioportal_survival["Overall Survival Status"] = cbioportal_survival["Overall Survival Status"].apply(lambda x: 0 if "0" in x else 1)
cbioportal_survival["Overall Survival Status"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cbioportal_survival["Overall Survival Status"] = cbioportal_survival["Overall Survival Status"].apply(lambda x: 0 if "0" in x else 1)


Overall Survival Status
0    814
1    110
Name: count, dtype: int64

## Impute missing values

In [23]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import OrdinalEncoder

In [24]:
encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(cbioportal_survival)
data_encoded

array([[  0.,   0.,   0., ...,   0., 692.,   0.],
       [  1.,   1.,   0., ...,   0., 447.,   1.],
       [  2.,   2.,   0., ...,   0., 444.,   2.],
       ...,
       [921., 921.,   0., ...,   0., 156., 921.],
       [922., 922.,   0., ...,   0., 668., 922.],
       [923., 923.,   0., ...,   0., 662., 923.]])

In [25]:
iter_imp = IterativeImputer(max_iter=10, random_state=42)
data_imputed = iter_imp.fit_transform(data_encoded)

In [26]:
data_imputed = np.round(data_imputed)  # Important for categorical values
data_decoded = encoder.inverse_transform(data_imputed)

cbioportal_imputed = pd.DataFrame(data_decoded, columns=cbioportal_survival.columns)
cbioportal_imputed


Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Menopause Status,Race Category,ER Status By IHC,PR status by ihc,Fraction Genome Altered,American Joint Committee on Cancer Tumor Stage Code,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code,American Joint Committee on Cancer Metastasis Stage Code,Overall Survival Status,Overall Survival (Months),Image Path
0,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.7164,2,1,0,0,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...
1,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.534,2,1,0,0,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...
2,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,Positive,Positive,0.0764,1,0,0,0,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...
3,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.2364,2,2,0,0,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...
4,TCGA-5L-AAT0,TCGA-5L-AAT0-01,1,42.0,1,1,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0702,2,0,0,0,48.52,/shared/fsartori/MuSEPI/dataset/TCGA-5L-AAT0-0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.3436,1,0,X,0,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...
920,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0625,1,2,X,0,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...
921,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,1,0,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.1764,3,0,X,0,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...
922,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,1,1,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,Positive,Positive,0.0452,3,1,X,0,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...


In [27]:
cbioportal_imputed.isnull().any()

Patient ID                                                                   False
Sample ID                                                                    False
Number of Samples Per Patient                                                False
Diagnosis Age                                                                False
Sex                                                                          False
Prior Cancer Diagnosis Occurence                                             False
Menopause Status                                                             False
Race Category                                                                False
ER Status By IHC                                                             False
PR status by ihc                                                             False
Fraction Genome Altered                                                      False
American Joint Committee on Cancer Tumor Stage Code                          False
Neop

## Categorical to one hot encoding

In [28]:
def categorical_to_onehot(pd_dataframe, columns):
    for c in columns:
        onehots = pd.get_dummies(pd_dataframe[c], prefix=c, dtype=int)
        pd_dataframe = pd_dataframe.drop(c, axis=1)
        pd_dataframe = pd.concat([pd_dataframe, onehots], axis=1)
    return pd_dataframe

In [29]:
cbioportal_onehot = categorical_to_onehot(cbioportal_imputed, ["Menopause Status", "Race Category", "ER Status By IHC", "PR status by ihc", "American Joint Committee on Cancer Tumor Stage Code", "Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code", "American Joint Committee on Cancer Metastasis Stage Code"])
#cbioportal_onehot = categorical_to_onehot(cbioportal_imputed, ["Menopause Status", "Race Category"])


In [30]:
cbioportal_onehot

Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Fraction Genome Altered,Overall Survival Status,Overall Survival (Months),Image Path,...,American Joint Committee on Cancer Tumor Stage Code_3,American Joint Committee on Cancer Tumor Stage Code_4,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_0,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_1,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_2,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_3,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_X,American Joint Committee on Cancer Metastasis Stage Code_0,American Joint Committee on Cancer Metastasis Stage Code_1,American Joint Committee on Cancer Metastasis Stage Code_X
0,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,1,0,0.7164,0,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...,...,0,0,0,1,0,0,0,1,0,0
1,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,1,0,0.534,0,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...,...,0,0,0,1,0,0,0,1,0,0
2,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,1,0,0.0764,0,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...,...,0,0,1,0,0,0,0,1,0,0
3,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,1,0,0.2364,0,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...,...,0,0,0,0,1,0,0,1,0,0
4,TCGA-5L-AAT0,TCGA-5L-AAT0-01,1,42.0,1,1,0.0702,0,48.52,/shared/fsartori/MuSEPI/dataset/TCGA-5L-AAT0-0...,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,1,0,0.3436,0,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...,...,0,0,1,0,0,0,0,0,0,1
920,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,1,0,0.0625,0,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...,...,0,0,0,0,1,0,0,0,0,1
921,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,1,0,0.1764,0,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...,...,1,0,1,0,0,0,0,0,0,1
922,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,1,1,0.0452,0,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...,...,1,0,0,1,0,0,0,0,0,1


In [31]:
cbioportal_onehot["Image ID"] = cbioportal_onehot["Image Path"].apply(lambda x: x.lstrip(path_to_images).rstrip(".svs"))
cbioportal_onehot

Unnamed: 0,Patient ID,Sample ID,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Fraction Genome Altered,Overall Survival Status,Overall Survival (Months),Image Path,...,American Joint Committee on Cancer Tumor Stage Code_4,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_0,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_1,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_2,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_3,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_X,American Joint Committee on Cancer Metastasis Stage Code_0,American Joint Committee on Cancer Metastasis Stage Code_1,American Joint Committee on Cancer Metastasis Stage Code_X,Image ID
0,TCGA-3C-AALI,TCGA-3C-AALI-01,1,50.0,1,0,0.7164,0,131.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALI-0...,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...
1,TCGA-3C-AALJ,TCGA-3C-AALJ-01,1,62.0,1,0,0.534,0,48.42,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALJ-0...,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...
2,TCGA-3C-AALK,TCGA-3C-AALK-01,1,52.0,1,0,0.0764,0,47.57,/shared/fsartori/MuSEPI/dataset/TCGA-3C-AALK-0...,...,0,1,0,0,0,0,1,0,0,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...
3,TCGA-4H-AAAK,TCGA-4H-AAAK-01,1,50.0,1,0,0.2364,0,11.43,/shared/fsartori/MuSEPI/dataset/TCGA-4H-AAAK-0...,...,0,0,0,1,0,0,1,0,0,TCGA-4H-AAAK-01Z-00-DX1.ABF1B042-1970-4E28-867...
4,TCGA-5L-AAT0,TCGA-5L-AAT0-01,1,42.0,1,1,0.0702,0,48.52,/shared/fsartori/MuSEPI/dataset/TCGA-5L-AAT0-0...,...,0,1,0,0,0,0,1,0,0,TCGA-5L-AAT0-01Z-00-DX1.5E171263-30BF-4C6B-88A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,TCGA-WT-AB44-01,1,77.0,1,0,0.3436,0,29.01,/shared/fsartori/MuSEPI/dataset/TCGA-WT-AB44-0...,...,0,1,0,0,0,0,0,0,1,TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88C...
920,TCGA-XX-A899,TCGA-XX-A899-01,1,46.0,1,0,0.0625,0,15.34,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A899-0...,...,0,0,0,1,0,0,0,0,1,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...
921,TCGA-XX-A89A,TCGA-XX-A89A-01,1,68.0,1,0,0.1764,0,16.03,/shared/fsartori/MuSEPI/dataset/TCGA-XX-A89A-0...,...,0,1,0,0,0,0,0,0,1,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...
922,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01,1,61.0,1,1,0.0452,0,107.98,/shared/fsartori/MuSEPI/dataset/TCGA-Z7-A8R5-0...,...,0,0,1,0,0,0,0,0,1,TCGA-Z7-A8R5-01Z-00-DX1.3BDB407F-514C-4131-B05...


In [32]:
cbioportal_onehot["Overall Survival Status"] = cbioportal_onehot["Overall Survival Status"].apply(lambda x: "event" if x ==1 else "censored")

In [33]:
cbioportal_onehot.drop(labels=["Sample ID", "Image Path"], axis=1, inplace=True)

In [34]:
cbioportal_onehot.rename({"Patient ID": "case_id", "Image ID": "slide_id", "Overall Survival Status": "label", "Overall Survival (Months)": "duration"},axis=1, inplace=True)

In [35]:
cbioportal_onehot

Unnamed: 0,case_id,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Fraction Genome Altered,label,duration,Menopause Status_Indeterminate (neither Pre or Postmenopausal),Menopause Status_Peri (6-12 months since last menstrual period),...,American Joint Committee on Cancer Tumor Stage Code_4,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_0,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_1,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_2,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_3,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_X,American Joint Committee on Cancer Metastasis Stage Code_0,American Joint Committee on Cancer Metastasis Stage Code_1,American Joint Committee on Cancer Metastasis Stage Code_X,slide_id
0,TCGA-3C-AALI,1,50.0,1,0,0.7164,censored,131.57,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...
1,TCGA-3C-AALJ,1,62.0,1,0,0.534,censored,48.42,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...
2,TCGA-3C-AALK,1,52.0,1,0,0.0764,censored,47.57,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...
3,TCGA-4H-AAAK,1,50.0,1,0,0.2364,censored,11.43,0,0,...,0,0,0,1,0,0,1,0,0,TCGA-4H-AAAK-01Z-00-DX1.ABF1B042-1970-4E28-867...
4,TCGA-5L-AAT0,1,42.0,1,1,0.0702,censored,48.52,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-5L-AAT0-01Z-00-DX1.5E171263-30BF-4C6B-88A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,1,77.0,1,0,0.3436,censored,29.01,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88C...
920,TCGA-XX-A899,1,46.0,1,0,0.0625,censored,15.34,0,0,...,0,0,0,1,0,0,0,0,1,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...
921,TCGA-XX-A89A,1,68.0,1,0,0.1764,censored,16.03,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...
922,TCGA-Z7-A8R5,1,61.0,1,1,0.0452,censored,107.98,0,0,...,0,0,1,0,0,0,0,0,1,TCGA-Z7-A8R5-01Z-00-DX1.3BDB407F-514C-4131-B05...


## Normalize non-categorical

In [36]:
from sklearn.preprocessing import MinMaxScaler

In [37]:
scaler = MinMaxScaler()

In [38]:
type(cbioportal_onehot["Diagnosis Age"].values)

numpy.ndarray

In [39]:
cbioportal_onehot["Diagnosis Age"] = scaler.fit_transform(cbioportal_onehot["Diagnosis Age"].values.reshape(-1,1))
cbioportal_onehot

Unnamed: 0,case_id,Number of Samples Per Patient,Diagnosis Age,Sex,Prior Cancer Diagnosis Occurence,Fraction Genome Altered,label,duration,Menopause Status_Indeterminate (neither Pre or Postmenopausal),Menopause Status_Peri (6-12 months since last menstrual period),...,American Joint Committee on Cancer Tumor Stage Code_4,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_0,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_1,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_2,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_3,Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_X,American Joint Committee on Cancer Metastasis Stage Code_0,American Joint Committee on Cancer Metastasis Stage Code_1,American Joint Committee on Cancer Metastasis Stage Code_X,slide_id
0,TCGA-3C-AALI,1,0.375000,1,0,0.7164,censored,131.57,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...
1,TCGA-3C-AALJ,1,0.562500,1,0,0.534,censored,48.42,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...
2,TCGA-3C-AALK,1,0.406250,1,0,0.0764,censored,47.57,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...
3,TCGA-4H-AAAK,1,0.375000,1,0,0.2364,censored,11.43,0,0,...,0,0,0,1,0,0,1,0,0,TCGA-4H-AAAK-01Z-00-DX1.ABF1B042-1970-4E28-867...
4,TCGA-5L-AAT0,1,0.250000,1,1,0.0702,censored,48.52,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-5L-AAT0-01Z-00-DX1.5E171263-30BF-4C6B-88A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,1,0.796875,1,0,0.3436,censored,29.01,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88C...
920,TCGA-XX-A899,1,0.312500,1,0,0.0625,censored,15.34,0,0,...,0,0,0,1,0,0,0,0,1,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...
921,TCGA-XX-A89A,1,0.656250,1,0,0.1764,censored,16.03,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...
922,TCGA-Z7-A8R5,1,0.546875,1,1,0.0452,censored,107.98,0,0,...,0,0,1,0,0,0,0,0,1,TCGA-Z7-A8R5-01Z-00-DX1.3BDB407F-514C-4131-B05...


In [40]:
clinical_columns

['Diagnosis Age',
 'Sex',
 'Prior Cancer Diagnosis Occurence',
 'Menopause Status',
 'Race Category',
 'ER Status By IHC',
 'PR status by ihc',
 'Fraction Genome Altered',
 'American Joint Committee on Cancer Tumor Stage Code',
 'Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code',
 'American Joint Committee on Cancer Metastasis Stage Code']

In [41]:
all_clinical_columns = []
for c in clinical_columns:
    for d in cbioportal_onehot.columns:
        if c in d:
            all_clinical_columns.append(d)
all_clinical_columns

['Diagnosis Age',
 'Sex',
 'Prior Cancer Diagnosis Occurence',
 'Menopause Status_Indeterminate (neither Pre or Postmenopausal)',
 'Menopause Status_Peri (6-12 months since last menstrual period)',
 'Menopause Status_Post (prior bilateral ovariectomy OR >12 mo since LMP with no prior hysterectomy)',
 'Menopause Status_Pre (<6 months since LMP AND no prior bilateral ovariectomy AND not on estrogen replacement)',
 'Race Category_AMERICAN INDIAN OR ALASKA NATIVE',
 'Race Category_ASIAN',
 'Race Category_BLACK OR AFRICAN AMERICAN',
 'Race Category_WHITE',
 'ER Status By IHC_Indeterminate',
 'ER Status By IHC_Negative',
 'ER Status By IHC_Positive',
 'PR status by ihc_Indeterminate',
 'PR status by ihc_Negative',
 'PR status by ihc_Positive',
 'Fraction Genome Altered',
 'American Joint Committee on Cancer Tumor Stage Code_1',
 'American Joint Committee on Cancer Tumor Stage Code_2',
 'American Joint Committee on Cancer Tumor Stage Code_3',
 'American Joint Committee on Cancer Tumor Stage C

In [42]:
prefix = "clinical_"
cbioportal_onehot.rename(columns={col: f'{prefix}{col}' for col in all_clinical_columns}, inplace=True)

In [43]:
cbioportal_onehot

Unnamed: 0,case_id,Number of Samples Per Patient,clinical_Diagnosis Age,clinical_Sex,clinical_Prior Cancer Diagnosis Occurence,clinical_Fraction Genome Altered,label,duration,clinical_Menopause Status_Indeterminate (neither Pre or Postmenopausal),clinical_Menopause Status_Peri (6-12 months since last menstrual period),...,clinical_American Joint Committee on Cancer Tumor Stage Code_4,clinical_Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_0,clinical_Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_1,clinical_Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_2,clinical_Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_3,clinical_Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code_X,clinical_American Joint Committee on Cancer Metastasis Stage Code_0,clinical_American Joint Committee on Cancer Metastasis Stage Code_1,clinical_American Joint Committee on Cancer Metastasis Stage Code_X,slide_id
0,TCGA-3C-AALI,1,0.375000,1,0,0.7164,censored,131.57,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...
1,TCGA-3C-AALJ,1,0.562500,1,0,0.534,censored,48.42,0,0,...,0,0,1,0,0,0,1,0,0,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...
2,TCGA-3C-AALK,1,0.406250,1,0,0.0764,censored,47.57,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...
3,TCGA-4H-AAAK,1,0.375000,1,0,0.2364,censored,11.43,0,0,...,0,0,0,1,0,0,1,0,0,TCGA-4H-AAAK-01Z-00-DX1.ABF1B042-1970-4E28-867...
4,TCGA-5L-AAT0,1,0.250000,1,1,0.0702,censored,48.52,0,0,...,0,1,0,0,0,0,1,0,0,TCGA-5L-AAT0-01Z-00-DX1.5E171263-30BF-4C6B-88A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,TCGA-WT-AB44,1,0.796875,1,0,0.3436,censored,29.01,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88C...
920,TCGA-XX-A899,1,0.312500,1,0,0.0625,censored,15.34,0,0,...,0,0,0,1,0,0,0,0,1,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...
921,TCGA-XX-A89A,1,0.656250,1,0,0.1764,censored,16.03,0,0,...,0,1,0,0,0,0,0,0,1,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...
922,TCGA-Z7-A8R5,1,0.546875,1,1,0.0452,censored,107.98,0,0,...,0,0,1,0,0,0,0,0,1,TCGA-Z7-A8R5-01Z-00-DX1.3BDB407F-514C-4131-B05...


## Set Menopause for Males to 0

In [44]:
menopause_cols = [col for col in cbioportal_onehot.columns if "Menopause" in col]

In [45]:
cbioportal_onehot.loc[cbioportal_onehot["clinical_Sex"] == 0, menopause_cols] = 0

## Save

In [None]:
cbioportal_onehot.to_csv("/shared/cgorner/tcga-brca-survival-clinical.csv", index=False)

In [20]:
cbioportal_survival.to_csv("/shared/cgorner/tcga-brca-survival-clinical-raw.csv", index=False)