## Full xena file

In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import os

In [65]:
ge = np.load("/home/bram/jointomicscomp/data/GE.npy")
me = np.load("/home/bram/jointomicscomp/data/ME.npy")
gcn = np.load("/home/bram/jointomicscomp/data/GCN.npy")
samples = np.load("/home/bram/jointomicscomp/data/sampleNames.npy")
cancertype = np.load("/home/bram/jointomicscomp/data/cancerType.npy")
cancertypes = np.load("/home/bram/jointomicscomp/data/cancerTypes.npy")
stageTypes = np.load("/home/bram/jointomicscomp/data/stageTypes.npy")

## Curated Data

In [66]:
survival_table = pd.read_table("/home/bram/jointomicscomp/data/Survival_SupplementalTable_S1_20171025_xena_sp", index_col=0).loc[samples]

print("Shape:", survival_table.shape)


Shape: (8396, 33)


## Show most common cancer types

In [67]:
counter = dict()
cancer_types = survival_table.iloc[:, 1]
for value in cancer_types:
    if value in counter:
        counter[value] += 1
        
    else:
        counter[value] = 1
        
counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)
counter

[('BRCA', 763),
 ('HNSC', 514),
 ('LGG', 512),
 ('THCA', 497),
 ('PRAD', 487),
 ('LUAD', 441),
 ('UCEC', 410),
 ('BLCA', 401),
 ('STAD', 370),
 ('LUSC', 367),
 ('SKCM', 366),
 ('LIHC', 364),
 ('KIRC', 310),
 ('CESC', 292),
 ('COAD', 282),
 ('KIRP', 272),
 ('SARC', 255),
 ('ESCA', 183),
 ('PAAD', 177),
 ('PCPG', 162),
 ('TGCT', 134),
 ('THYM', 119),
 ('LAML', 117),
 ('READ', 95),
 ('MESO', 87),
 ('UVM', 80),
 ('ACC', 77),
 ('KICH', 66),
 ('UCS', 56),
 ('DLBC', 48),
 ('GBM', 47),
 ('CHOL', 36),
 ('OV', 9)]

In [68]:
(unique, counts) = np.unique(cancertype, return_counts=True)
frequencies = np.asarray((unique, counts)).T
frequencies

array([[  0., 117.],
       [  1.,  77.],
       [  2., 401.],
       [  3., 512.],
       [  4., 763.],
       [  5., 292.],
       [  6.,  36.],
       [  7., 282.],
       [  8., 183.],
       [  9.,  47.],
       [ 10., 514.],
       [ 11.,  66.],
       [ 12., 310.],
       [ 13., 272.],
       [ 14., 364.],
       [ 15., 441.],
       [ 16., 367.],
       [ 17.,  48.],
       [ 18.,  87.],
       [ 19.,   9.],
       [ 20., 177.],
       [ 21., 162.],
       [ 22., 487.],
       [ 23.,  95.],
       [ 24., 255.],
       [ 25., 366.],
       [ 26., 370.],
       [ 27., 134.],
       [ 28., 119.],
       [ 29., 497.],
       [ 30.,  56.],
       [ 31., 410.],
       [ 32.,  80.]])

## Select cancer type and data

In [69]:
# All patients with a certain cancer cancer.
ctype = 'LUSC'
ctype_idx = np.where(cancertypes == ctype)[0][0] # Get index of ctype in total cancers list
print(ctype_idx)

# Save all required data in data/cancer_type folder
save_dir = os.path.join("/home/bram/jointomicscomp/data", '{}'.format(ctype))
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

cancer_type_info_table = survival_table.loc[survival_table["cancer type abbreviation"] == ctype]
print(cancer_type_info_table.shape)
cancer_type_patients = cancer_type_info_table["_PATIENT"]
    
cancer_type_patients

16
(367, 33)


sample
TCGA-18-3417-01    TCGA-18-3417
TCGA-18-4721-01    TCGA-18-4721
TCGA-18-5592-01    TCGA-18-5592
TCGA-18-5595-01    TCGA-18-5595
TCGA-21-5782-01    TCGA-21-5782
                       ...     
TCGA-O2-A52S-01    TCGA-O2-A52S
TCGA-O2-A52V-01    TCGA-O2-A52V
TCGA-O2-A52W-01    TCGA-O2-A52W
TCGA-O2-A5IB-01    TCGA-O2-A5IB
TCGA-XC-AA0X-01    TCGA-XC-AA0X
Name: _PATIENT, Length: 367, dtype: object

## Get RNA data from these patients grab corresponding stage

In [70]:
# Stages.
gene_expression_csv = pd.read_csv("/home/bram/jointomicscomp/data/rna-pancancer-5000-minmax.csv", index_col=0)
gene_expression_csv = gene_expression_csv.loc[samples]  # Take only samples used in this research
print("gene_expression_csv shape: ", gene_expression_csv.shape)

stages = dict()
s = set()
no_stage_counter = 0
no_stage_samples = []
for patient_idx, patient in enumerate(cancer_type_patients):
    # Get all occurences of the patient in the original data file.
    x = gene_expression_csv[[patient in label for label in samples]]
    
    y = survival_table[survival_table["_PATIENT"] == patient]["ajcc_pathologic_tumor_stage"].values[0]
    # Get strin to remove nans.
    y = str(y)
    s.add(y)
    if y == "nan" or y == 'Stage X' or y == '[Discrepancy]' or x.empty:
        no_stage_counter += 1
        print(x.index.values)
        no_stage_samples.append(x.index.values)
        continue
    
    # If it's stage 4 then add to stage for. No A's and B's here.
    if y == 'Stage IV':
        stage = 'Stage 4'
    
    # Create stage string for other stages than 4.
    else:
        stage = 'Stage ' + str(y.count('I'))
    
    # Create stage key.
    if stage not in stages:
        stages[stage] = []
    
    # Add to dict.
    stages[stage].append(x)
    
print("{} Samples did not have a stage in the info dataframe.".format(no_stage_counter))
    


gene_expression_csv shape:  (8396, 5000)
['TCGA-22-5473-01']
['TCGA-52-7812-01']
['TCGA-92-8064-01']
3 Samples did not have a stage in the info dataframe.


In [71]:
no_stage_samples

[array(['TCGA-22-5473-01'], dtype=object),
 array(['TCGA-52-7812-01'], dtype=object),
 array(['TCGA-92-8064-01'], dtype=object)]

In [72]:
no_stage_sample_indices = np.zeros(len(no_stage_samples), dtype=int)
for i, no_stage in enumerate(no_stage_samples):
    sample_idx = np.where(samples == no_stage)[0][0]
    no_stage_sample_indices[i] = sample_idx

    

In [73]:
# All samples of the cancer type that do not have a stage should be removed from the dataset
print("patient indices to be removed is : ", len(no_stage_sample_indices))
ge = np.delete(ge, no_stage_sample_indices, 0)
gcn = np.delete(gcn, no_stage_sample_indices, 0)
me = np.delete(me, no_stage_sample_indices, 0)
samples = np.delete(samples, no_stage_sample_indices)
cancertype = np.delete(cancertype, no_stage_sample_indices)


patient indices to be removed is :  3


In [74]:
class Stage:
    
    def __init__(self, rna):
        self.patient_data = rna

In [75]:
# Add the dataframes together.
for stage in stageTypes:
    stages[stage] = Stage(pd.concat(stages[stage]))
    stages[stage].patient_data['Stage'] = stage

samples_ctr = 0
for x in stageTypes:
    print(x, stages[x].patient_data.shape)
    samples_ctr += len(stages[x].patient_data)
    
print("Total number of samples:", samples_ctr)

Stage 1 (170, 5001)
Stage 2 (134, 5001)
Stage 3 (56, 5001)
Stage 4 (4, 5001)
Total number of samples: 364


In [76]:
stages

{'Stage 4': <__main__.Stage at 0x7f347bd442b0>,
 'Stage 1': <__main__.Stage at 0x7f3438c4d970>,
 'Stage 2': <__main__.Stage at 0x7f347bd44910>,
 'Stage 3': <__main__.Stage at 0x7f3437405280>}

In [77]:
# Use X to denote GE and Y for ME, take indices on Y later
X = ge

# variable stage contains cancer stage (indexed from stageTypes), y contains cancertypes
Xtype = X[cancertype == ctype_idx]
Xrest = X[cancertype != ctype_idx]

yrest = cancertype[cancertype != ctype_idx]

# All BRCA samples
Xtype_samples = samples[cancertype == ctype_idx]

print("Data contains {} samples of Xtype_samples type {}".format(Xtype_samples.shape[0], ctype))

Data contains 364 samples of Xtype_samples type LUSC


In [78]:
np.save("{}/{}_sampleNames.npy".format(save_dir, ctype), Xtype_samples)

In [79]:
# Create stage array
stageType = np.zeros(Xtype_samples.shape[0])

for i in range(len(stageTypes)):
    stage_samples = stages[stageTypes[i]].patient_data.index.values
    print(stageTypes[i], stage_samples.shape[0])
    for stage_sample in stage_samples:
        idx = np.where(Xtype_samples == stage_sample)[0][0]
        stageType[idx] = i  # stages in StageTypes array are 0-indexed


Stage 1 170
Stage 2 134
Stage 3 56
Stage 4 4


## Stratified Split Data

In [80]:
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.1)

sss1 = split1.split(Xtype, stageType)

trainValidInd = 0
testInd = 0
for i, j in sss1:
    trainValidInd = i
    testInd = j
    


Xtest = Xtype[testInd]
print("Test samples with {}".format(ctype), Xtest.shape)
stagetest = stageType[testInd] 
np.save("{}/{}_test_stageType.npy".format(save_dir, ctype), stagetest)
np.save("{}/{}_test_samples.npy".format(save_dir, ctype), Xtype_samples[testInd])

XtrainValid = Xtype[trainValidInd]

stagetrainValid = stageType[trainValidInd]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss2 = split1.split(XtrainValid, stagetrainValid)

trainInd = 0
validInd = 0
for i, j in sss2:
    trainInd = i
    validInd = j

Xtrain = XtrainValid[trainInd]
print("Train samples with {}".format(ctype), Xtrain.shape)

stagetrain = stagetrainValid[trainInd]
np.save("{}/{}_train_stageType.npy".format(save_dir, ctype), stagetrain)
np.save("{}/{}_train_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][trainInd])


Xvalid = XtrainValid[validInd]
print("Validation samples with {}".format(ctype), Xvalid.shape)

stagevalid = stagetrainValid[validInd]
np.save("{}/{}_valid_stageType.npy".format(save_dir, ctype), stagevalid)
np.save("{}/{}_valid_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][validInd])


splitRest = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss3 = splitRest.split(Xrest, yrest)
trainIndrest = 0
validIndrest = 0
for i, j in sss3:
    trainIndrest = i
    validIndrest = j


Xresttrain = Xrest[trainIndrest]
print("Train samples from remaining cancer types".format(ctype), Xresttrain.shape)


Xrestvalid = Xrest[validIndrest]
print("Validation samples from remaining cancer types".format(ctype), Xrestvalid.shape)



Test samples with LUSC (37, 5000)
Train samples with LUSC (294, 5000)
Validation samples with LUSC (33, 5000)
Train samples from remaining cancer types (7136, 5000)
Validation samples from remaining cancer types (893, 5000)


In [81]:
# Use same indices used to split X (= GE) on Y (= ME) and Z (= GCN)
Y = me
Ytype = Y[cancertype == ctype_idx]
Yrest = Y[cancertype != ctype_idx]


Ytest = Ytype[testInd]
YtrainValid = Ytype[trainValidInd]
Ytrain = YtrainValid[trainInd]
Yvalid = YtrainValid[validInd]

Yresttrain = Yrest[trainIndrest]
Yrestvalid = Yrest[validIndrest]

Z = gcn
Ztype = Z[cancertype == ctype_idx]
Zrest = Z[cancertype != ctype_idx]

Ztest = Ztype[testInd]
ZtrainValid = Ztype[trainValidInd]
Ztrain = ZtrainValid[trainInd]
Zvalid = ZtrainValid[validInd]

Zresttrain = Zrest[trainIndrest]
Zrestvalid = Zrest[validIndrest]


In [82]:
# Throw together BRCA training data and Rest training data (and same for validation)
XtrainAll = np.vstack((Xtrain, Xresttrain))

XvalidAll = np.vstack((Xvalid, Xrestvalid))

print(XtrainAll.shape)
print(XvalidAll.shape)
print(Xtest.shape)

(7430, 5000)
(926, 5000)
(37, 5000)


In [83]:
(unique, counts) = np.unique(stagetest, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print("List Cancer stage from {} and occurences in test set".format(ctype))
print(frequencies)

List Cancer stage from LUSC and occurences in test set
[[ 0. 17.]
 [ 1. 14.]
 [ 2.  6.]]


In [84]:
# Save all splits for testing convenience
np.save("{}/{}_GE_train".format(save_dir, ctype), Xtrain)
np.save("{}/rest_GE_train".format(save_dir), Xresttrain)
np.save("{}/{}_GE_valid".format(save_dir, ctype), Xvalid)
np.save("{}/rest_GE_valid".format(save_dir), Xrestvalid)
np.save("{}/{}_GE_test".format(save_dir, ctype), Xtest)

np.save("{}/{}_ME_train".format(save_dir, ctype), Ytrain)
np.save("{}/rest_ME_train".format(save_dir), Yresttrain)
np.save("{}/{}_ME_valid".format(save_dir, ctype), Yvalid)
np.save("{}/rest_ME_valid".format(save_dir), Yrestvalid)
np.save("{}/{}_ME_test".format(save_dir, ctype), Ytest)

np.save("{}/{}_GCN_train".format(save_dir, ctype), Ztrain)
np.save("{}/rest_GCN_train".format(save_dir), Zresttrain)
np.save("{}/{}_GCN_valid".format(save_dir, ctype), Zvalid)
np.save("{}/rest_GCN_valid".format(save_dir), Zrestvalid)
np.save("{}/{}_GCN_test".format(save_dir, ctype), Ztest)