## Full xena file

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import os

In [2]:
ge = np.load("/home/bram/jointomicscomp/data/GE.npy")
me = np.load("/home/bram/jointomicscomp/data/ME.npy")
gcn = np.load("/home/bram/jointomicscomp/data/GCN.npy")
samples = np.load("/home/bram/jointomicscomp/data/sampleNames.npy")
cancertype = np.load("/home/bram/jointomicscomp/data/cancerType.npy")
cancertypes = np.load("/home/bram/jointomicscomp/data/cancerTypes.npy")

## Curated Data

In [3]:
survival_table = pd.read_table("/home/bram/jointomicscomp/data/Survival_SupplementalTable_S1_20171025_xena_sp", index_col=0).loc[samples]

survival_table

Unnamed: 0_level_0,_PATIENT,cancer type abbreviation,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,...,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4384-01,TCGA-05-4384,LUAD,66.0,MALE,,Stage IIIA,,Lung Adenocarcinoma,,2009.0,...,,0.0,426.0,0.0,426.0,,,1.0,183.0,
TCGA-05-4390-01,TCGA-05-4390,LUAD,58.0,FEMALE,,Stage IB,,Lung Adenocarcinoma,,2005.0,...,,0.0,1126.0,0.0,1126.0,,,1.0,395.0,
TCGA-05-4396-01,TCGA-05-4396,LUAD,76.0,MALE,,Stage IIIB,,Lung Adenocarcinoma,,2006.0,...,,1.0,303.0,,303.0,,,0.0,303.0,
TCGA-05-4405-01,TCGA-05-4405,LUAD,74.0,FEMALE,,Stage IB,,Lung Adenocarcinoma,,2006.0,...,,0.0,610.0,0.0,610.0,,,0.0,610.0,
TCGA-05-4410-01,TCGA-05-4410,LUAD,62.0,MALE,,Stage IB,,Lung Adenocarcinoma,,2007.0,...,,0.0,0.0,0.0,0.0,,,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CF-01,TCGA-ZS-A9CF,LIHC,64.0,MALE,WHITE,Stage II,,Hepatocellular Carcinoma,G2,2008.0,...,R0,0.0,2412.0,0.0,2412.0,1.0,636.0,1.0,636.0,
TCGA-ZS-A9CG-01,TCGA-ZS-A9CG,LIHC,55.0,MALE,WHITE,Stage II,,Hepatocellular Carcinoma,G2,2013.0,...,R0,0.0,341.0,0.0,341.0,0.0,341.0,0.0,341.0,
TCGA-ZT-A8OM-01,TCGA-ZT-A8OM,THYM,73.0,FEMALE,WHITE,,IIb,Thymoma; Type A,,2011.0,...,,0.0,1398.0,0.0,1398.0,,,0.0,1398.0,
TCGA-ZU-A8S4-01,TCGA-ZU-A8S4,CHOL,52.0,MALE,WHITE,Stage I,,Cholangiocarcinoma; intrahepatic,G3,2012.0,...,R0,1.0,98.0,1.0,98.0,1.0,72.0,1.0,72.0,


## Show most common cancer types

In [4]:
counter = dict()
cancer_types = survival_table.iloc[:, 1]
for value in cancer_types:
    if value in counter:
        counter[value] += 1
        
    else:
        counter[value] = 1
        
counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)
counter

[('BRCA', 763),
 ('HNSC', 514),
 ('LGG', 512),
 ('THCA', 497),
 ('PRAD', 487),
 ('LUAD', 441),
 ('UCEC', 410),
 ('BLCA', 401),
 ('STAD', 370),
 ('LUSC', 367),
 ('SKCM', 366),
 ('LIHC', 364),
 ('KIRC', 310),
 ('CESC', 292),
 ('COAD', 282),
 ('KIRP', 272),
 ('SARC', 255),
 ('ESCA', 183),
 ('PAAD', 177),
 ('PCPG', 162),
 ('TGCT', 134),
 ('THYM', 119),
 ('LAML', 117),
 ('READ', 95),
 ('MESO', 87),
 ('UVM', 80),
 ('ACC', 77),
 ('KICH', 66),
 ('UCS', 56),
 ('DLBC', 48),
 ('GBM', 47),
 ('CHOL', 36),
 ('OV', 9)]

In [5]:
(unique, counts) = np.unique(cancertype, return_counts=True)
frequencies = np.asarray((unique, counts)).T
frequencies

array([[  0., 117.],
       [  1.,  77.],
       [  2., 401.],
       [  3., 512.],
       [  4., 763.],
       [  5., 292.],
       [  6.,  36.],
       [  7., 282.],
       [  8., 183.],
       [  9.,  47.],
       [ 10., 514.],
       [ 11.,  66.],
       [ 12., 310.],
       [ 13., 272.],
       [ 14., 364.],
       [ 15., 441.],
       [ 16., 367.],
       [ 17.,  48.],
       [ 18.,  87.],
       [ 19.,   9.],
       [ 20., 177.],
       [ 21., 162.],
       [ 22., 487.],
       [ 23.,  95.],
       [ 24., 255.],
       [ 25., 366.],
       [ 26., 370.],
       [ 27., 134.],
       [ 28., 119.],
       [ 29., 497.],
       [ 30.,  56.],
       [ 31., 410.],
       [ 32.,  80.]])

## Select cancer type and data

In [6]:
# All patients with a certain cancer cancer.
ctype = 'BRCA'
ctype_idx = np.where(cancertypes == ctype)[0][0] # Get index of ctype in total cancers list

# Save all required data in data/cancer_type folder
save_dir = os.path.join("/home/bram/jointomicscomp/data", '{}_OStime'.format(ctype))
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

cancer_type_info_table = survival_table.loc[survival_table["cancer type abbreviation"] == ctype]
print(cancer_type_info_table.shape)
cancer_type_patients = cancer_type_info_table["_PATIENT"]
    
cancer_type_patients

(763, 33)


sample
TCGA-3C-AAAU-01    TCGA-3C-AAAU
TCGA-3C-AALI-01    TCGA-3C-AALI
TCGA-3C-AALJ-01    TCGA-3C-AALJ
TCGA-3C-AALK-01    TCGA-3C-AALK
TCGA-4H-AAAK-01    TCGA-4H-AAAK
                       ...     
TCGA-WT-AB44-01    TCGA-WT-AB44
TCGA-XX-A899-01    TCGA-XX-A899
TCGA-XX-A89A-01    TCGA-XX-A89A
TCGA-Z7-A8R5-01    TCGA-Z7-A8R5
TCGA-Z7-A8R6-01    TCGA-Z7-A8R6
Name: _PATIENT, Length: 763, dtype: object

## Get RNA data from these patients grab corresponding stage

In [7]:
# OS_times.
gene_expression_csv = pd.read_csv("/home/bram/jointomicscomp/data/rna-pancancer-5000-minmax.csv", index_col=0)
gene_expression_csv = gene_expression_csv.loc[samples]  # Take only samples used in this research
print("gene_expression_csv shape: ", gene_expression_csv.shape)

OS_times = []
no_time_counter = 0
no_time_samples = []
for patient_idx, patient in enumerate(cancer_type_patients):
    # Get all occurences of the patient in the original data file.
    x = gene_expression_csv[[patient in label for label in samples]]
    
    y = survival_table[survival_table["_PATIENT"] == patient]["OS.time"].values[0]
    
    if np.isnan(y) or x.empty:
        no_time_counter += 1
        print(x.index.values)
        no_time_samples.append(x.index.values)
        continue
        
    OS_times.append(y)
    
print("{} Samples did not have OS.time in the info dataframe.".format(no_time_counter))
print(len(OS_times))
    


gene_expression_csv shape:  (8396, 5000)
['TCGA-OL-A66H-01']
1 Samples did not have OS.time in the info dataframe.
762


In [8]:
OS_times = np.array(OS_times).astype(int)

In [9]:
no_time_samples

[array(['TCGA-OL-A66H-01'], dtype=object)]

In [10]:
no_time_sample_indices = np.zeros(len(no_time_samples), dtype=int)
for i, no_time in enumerate(no_time_samples):
    sample_idx = np.where(samples == no_time)[0][0]
    no_time_sample_indices[i] = sample_idx

    

In [11]:
# All samples of the cancer type that do not have a stage should be removed from the dataset
print("patient indices to be removed is : ", len(no_time_sample_indices))
ge = np.delete(ge, no_time_sample_indices, 0)
gcn = np.delete(gcn, no_time_sample_indices, 0)
me = np.delete(me, no_time_sample_indices, 0)
samples = np.delete(samples, no_time_sample_indices)
cancertype = np.delete(cancertype, no_time_sample_indices)


patient indices to be removed is :  1


In [12]:
# Use X to denote GE and Y for ME, take indices on Y later
X = ge

# variable stage contains cancer stage (indexed from stageTypes), y contains cancertypes
Xtype = X[cancertype == ctype_idx]
Xrest = X[cancertype != ctype_idx]

yrest = cancertype[cancertype != ctype_idx]

# All BRCA samples
Xtype_samples = samples[cancertype == ctype_idx]

print("Data contains {} samples of Xtype_samples type {} that have Survival time defined".format(Xtype_samples.shape[0], ctype))

Data contains 762 samples of Xtype_samples type BRCA that have Survival time defined


In [13]:
# # Shuffle labels and dataframes for training, but keep the same order of samples and labels
shuffler = np.random.permutation(len(Xtype_samples))

np.save("{}/{}_sampleNames.npy".format(save_dir, ctype), Xtype_samples)
np.save("{}/{}_OSTimes.npy".format(save_dir, ctype), OS_times)

# Code by the Warrior_B
low_bound = np.sort(OS_times)[(len(OS_times) - 1) // 3]
print("low_bound", low_bound)
mid_bound = np.sort(OS_times)[(len(OS_times) - 1) // 3 * 2]
print("mid bound", mid_bound)
print("highest survival time", np.sort(OS_times)[-1])

OSTimes_3cats = []
for y in OS_times:
    if y <= low_bound:
        OSTimes_3cats.append('low')
        
    elif y <= mid_bound:
        OSTimes_3cats.append('mid')
        
    else:
        OSTimes_3cats.append('high')
        
OSTimes_3cats = np.array(OSTimes_3cats)
        
np.save("{}/{}_OSTimes_3cats.npy".format(save_dir, ctype), OSTimes_3cats)


low_bound 574
mid bound 1285
highest survival time 8605


## Stratified Split Data

In [14]:
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.1)

sss1 = split1.split(Xtype, OSTimes_3cats)

trainValidInd = 0
testInd = 0
for i, j in sss1:
    trainValidInd = i
    testInd = j
    


Xtest = Xtype[testInd]
print("Test samples with {}".format(ctype), Xtest.shape)
print("testInd dtype", testInd.dtype)
os_time_3cats_test = OSTimes_3cats[testInd] 
np.save("{}/{}_test_OSTimes_3cats.npy".format(save_dir, ctype), os_time_3cats_test)
np.save("{}/{}_test_samples.npy".format(save_dir, ctype), Xtype_samples[testInd])

XtrainValid = Xtype[trainValidInd]

stagetrainValid = OSTimes_3cats[trainValidInd]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss2 = split1.split(XtrainValid, stagetrainValid)

trainInd = 0
validInd = 0
for i, j in sss2:
    trainInd = i
    validInd = j

Xtrain = XtrainValid[trainInd]
print("Train samples with {}".format(ctype), Xtrain.shape)

stagetrain = stagetrainValid[trainInd]
np.save("{}/{}_train_OSTimes_3cats.npy".format(save_dir, ctype), stagetrain)
np.save("{}/{}_train_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][trainInd])


Xvalid = XtrainValid[validInd]
print("Validation samples with {}".format(ctype), Xvalid.shape)

stagevalid = stagetrainValid[validInd]
np.save("{}/{}_valid_OSTimes_3cats.npy".format(save_dir, ctype), stagevalid)
np.save("{}/{}_valid_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][validInd])


splitRest = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss3 = splitRest.split(Xrest, yrest)
trainIndrest = 0
validIndrest = 0
for i, j in sss3:
    trainIndrest = i
    validIndrest = j


Xresttrain = Xrest[trainIndrest]
print("Train samples from remaining cancer types".format(ctype), Xresttrain.shape)


Xrestvalid = Xrest[validIndrest]
print("Validation samples from remaining cancer types".format(ctype), Xrestvalid.shape)



Test samples with BRCA (77, 5000)
testInd dtype int64
Train samples with BRCA (616, 5000)
Validation samples with BRCA (69, 5000)
Train samples from remaining cancer types (6784, 5000)
Validation samples from remaining cancer types (849, 5000)


In [15]:
# Use same indices used to split X (= GE) on Y (= ME) and Z (= GCN)
Y = me
Ytype = Y[cancertype == ctype_idx]
Yrest = Y[cancertype != ctype_idx]


Ytest = Ytype[testInd]
YtrainValid = Ytype[trainValidInd]
Ytrain = YtrainValid[trainInd]
Yvalid = YtrainValid[validInd]

Yresttrain = Yrest[trainIndrest]
Yrestvalid = Yrest[validIndrest]

Z = gcn
Ztype = Z[cancertype == ctype_idx]
Zrest = Z[cancertype != ctype_idx]

Ztest = Ztype[testInd]
ZtrainValid = Ztype[trainValidInd]
Ztrain = ZtrainValid[trainInd]
Zvalid = ZtrainValid[validInd]

Zresttrain = Zrest[trainIndrest]
Zrestvalid = Zrest[validIndrest]


In [16]:
# Throw together BRCA training data and Rest training data (and same for validation)
XtrainAll = np.vstack((Xtrain, Xresttrain))

XvalidAll = np.vstack((Xvalid, Xrestvalid))

print(XtrainAll.shape)
print(XvalidAll.shape)
print(Xtest.shape)

(7400, 5000)
(918, 5000)
(77, 5000)


In [17]:
(unique, counts) = np.unique(os_time_3cats_test, return_counts=True                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      )
frequencies = np.asarray((unique, counts)).T

print("List Cancer stage from {} and occurences in test set".format(ctype))
print(frequencies)

List Cancer stage from BRCA and occurences in test set
[['high' '26']
 ['low' '26']
 ['mid' '25']]


In [18]:
# Save all splits for testing convenience
np.save("{}/{}_GE_train".format(save_dir, ctype), Xtrain)
np.save("{}/rest_GE_train".format(save_dir), Xresttrain)
np.save("{}/{}_GE_valid".format(save_dir, ctype), Xvalid)
np.save("{}/rest_GE_valid".format(save_dir), Xrestvalid)
np.save("{}/{}_GE_test".format(save_dir, ctype), Xtest)

np.save("{}/{}_ME_train".format(save_dir, ctype), Ytrain)
np.save("{}/rest_ME_train".format(save_dir), Yresttrain)
np.save("{}/{}_ME_valid".format(save_dir, ctype), Yvalid)
np.save("{}/rest_ME_valid".format(save_dir), Yrestvalid)
np.save("{}/{}_ME_test".format(save_dir, ctype), Ytest)

np.save("{}/{}_GCN_train".format(save_dir, ctype), Ztrain)
np.save("{}/rest_GCN_train".format(save_dir), Zresttrain)
np.save("{}/{}_GCN_valid".format(save_dir, ctype), Zvalid)
np.save("{}/rest_GCN_valid".format(save_dir), Zrestvalid)
np.save("{}/{}_GCN_test".format(save_dir, ctype), Ztest)