## Full xena file

In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import os

In [79]:
ge = np.load("/home/bram/jointomicscomp/data/GE.npy")
me = np.load("/home/bram/jointomicscomp/data/ME.npy")
samples = np.load("/home/bram/jointomicscomp/data/sampleNames.npy")
cancertype = np.load("/home/bram/jointomicscomp/data/cancerType.npy")
cancertypes = np.load("/home/bram/jointomicscomp/data/cancerTypes.npy")
stageTypes = np.load("/home/bram/jointomicscomp/data/stageTypes.npy")

## Curated Data

In [80]:
survival_table = pd.read_table("/home/bram/jointomicscomp/data/Survival_SupplementalTable_S1_20171025_xena_sp", index_col=0)

print("Shape:", survival_table.shape)


Shape: (12591, 33)


## Show most common cancer types

In [81]:
counter = dict()
cancer_types = survival_table.iloc[:, 1]
for value in cancer_types:
    if value in counter:
        counter[value] += 1
        
    else:
        counter[value] = 1
        
counter = sorted(counter.items(), key=lambda x: x[1], reverse=True)
counter

[('BRCA', 1236),
 ('KIRC', 944),
 ('LUAD', 641),
 ('LUSC', 623),
 ('HNSC', 604),
 ('OV', 604),
 ('GBM', 602),
 ('UCEC', 583),
 ('THCA', 580),
 ('PRAD', 566),
 ('COAD', 545),
 ('LGG', 529),
 ('STAD', 511),
 ('SKCM', 479),
 ('LIHC', 438),
 ('BLCA', 436),
 ('KIRP', 352),
 ('CESC', 312),
 ('SARC', 271),
 ('ESCA', 204),
 ('LAML', 200),
 ('PAAD', 196),
 ('PCPG', 187),
 ('READ', 183),
 ('TGCT', 139),
 ('THYM', 126),
 ('ACC', 92),
 ('KICH', 91),
 ('MESO', 87),
 ('UVM', 80),
 ('UCS', 57),
 ('DLBC', 48),
 ('CHOL', 45)]

## Select cancer type and data

In [82]:
# All patients with a certain cancer cancer.
ctype = 'BRCA'
ctype_idx = np.where(cancertypes == ctype)[0][0] # Get index of ctype in total cancers list

# Save all required data in data/cancer_type folder
save_dir = os.path.join("/home/bram/jointomicscomp/data", '{}'.format(ctype))
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

cancer_type_info_table = survival_table.loc[survival_table["cancer type abbreviation"] == ctype]
cancer_type_patients = cancer_type_info_table["_PATIENT"]
    
cancer_type_patients

sample
TCGA-3C-AAAU-01    TCGA-3C-AAAU
TCGA-3C-AALI-01    TCGA-3C-AALI
TCGA-3C-AALJ-01    TCGA-3C-AALJ
TCGA-3C-AALK-01    TCGA-3C-AALK
TCGA-4H-AAAK-01    TCGA-4H-AAAK
                       ...     
TCGA-WT-AB44-01    TCGA-WT-AB44
TCGA-XX-A899-01    TCGA-XX-A899
TCGA-XX-A89A-01    TCGA-XX-A89A
TCGA-Z7-A8R5-01    TCGA-Z7-A8R5
TCGA-Z7-A8R6-01    TCGA-Z7-A8R6
Name: _PATIENT, Length: 1236, dtype: object

## Get RNA data from these patients grab corresponding stage

In [83]:
# Stages.
gene_expression_csv = pd.read_csv("/home/bram/jointomicscomp/data/rna-pancancer-5000-minmax.csv", index_col=0)
gene_expression_csv = gene_expression_csv.loc[samples]  # Take only samples used in this research

stages = dict()
s = set()
no_stage_counter = 0
for patient in cancer_type_patients:
    # Get all occurences of the patient in the original data file.
    x = gene_expression_csv[[patient in label for label in samples]]
    
    y = survival_table[survival_table["_PATIENT"] == patient]["ajcc_pathologic_tumor_stage"].values[0]
    # Get strin to remove nans.
    y = str(y)
    s.add(y)
    if y == "nan" or y == 'Stage X' or y == '[Discrepancy]' or x.empty:
        no_stage_counter += 1
        continue
    
    # If it's stage 4 then add to stage for. No A's and B's here.
    if y == 'Stage IV':
        stage = 'Stage 4'
    
    # Create stage string for other stages than 4.
    else:
        stage = 'Stage ' + str(y.count('I'))
    
    # Create stage key.
    if stage not in stages:
        stages[stage] = []
    
    # Add to dict.
    stages[stage].append(x)
    
print("{} Samples did not have a stage in the info dataframe.".format(no_stage_counter))
    


357 Samples did not have a stage in the info dataframe.


In [84]:
for x in stageTypes:
    print(x, len(stages[x]))
    

Stage 1 139
Stage 2 508
Stage 3 220
Stage 4 12


In [85]:
class Stage:
    
    def __init__(self, rna):
        self.patient_data = rna

In [86]:
# Add the dataframes together.
for stage in stageTypes:
    stages[stage] = Stage(pd.concat(stages[stage]))
    stages[stage].patient_data['Stage'] = stage

samples_ctr = 0
for x in stageTypes:
    print(x, stages[x].patient_data.shape)
    samples_ctr += len(stages[x].patient_data)
    
print("Total number of samples:", samples_ctr)

Stage 1 (159, 5001)
Stage 2 (607, 5001)
Stage 3 (262, 5001)
Stage 4 (14, 5001)
Total number of samples: 1042


In [87]:
stages

{'Stage 2': <__main__.Stage at 0x7f631c563850>,
 'Stage 1': <__main__.Stage at 0x7f631d064340>,
 'Stage 3': <__main__.Stage at 0x7f638e387ca0>,
 'Stage 4': <__main__.Stage at 0x7f631ca71100>}

In [88]:
# Use X to denote GE and Y for ME, take indices on Y later
X = ge

# variable stage contains cancer stage (indexed from stageTypes), y contains cancertypes
Xtype = X[cancertype == ctype_idx]
Xrest = X[cancertype != ctype_idx]

yrest = cancertype[cancertype != ctype_idx]

# All BRCA samples
Xtype_samples = samples[cancertype == ctype_idx]

# Now fetch stage data for these samples

ctr= 0
Xtype_samples_withstage = []
for stage in stageTypes:
    ctr+= np.intersect1d(samples, stages[stage].patient_data.index.values).shape[0]
    Xtype_samples_withstage.extend(np.intersect1d(samples, stages[stage].patient_data.index.values))
    
Xtype_samples_withstage = np.array(Xtype_samples_withstage)


# There are fewer samples whose stage is known, so we need to discard some
# Xtype_nostage = np.setdiff1d(Xtype_samples, Xtype_samples_withstage)
# print(Xtype_nostage.shape)
Xtype_sample_withstage_idx = np.where(np.isin(Xtype_samples, Xtype_samples_withstage))[0]
Xtype_samples = Xtype_samples[Xtype_sample_withstage_idx]
Xtype = Xtype[Xtype_sample_withstage_idx]

print("Data contains {} samples of Xtype_samples type {}".format(Xtype_samples.shape[0], ctype))

Data contains 849 samples of Xtype_samples type BRCA


In [89]:
np.save("{}/{}_sampleNames.npy".format(save_dir, ctype), Xtype_samples)

In [90]:
# Create stage array
stageType = np.zeros(Xtype_samples.shape[0])

for i in range(len(stageTypes)):
    stage_samples = stages[stageTypes[i]].patient_data.index.values
    print(stageTypes[i], stage_samples.shape[0])
    for stage_sample in stage_samples:
        idx = np.where(Xtype_samples == stage_sample)[0][0]
        stageType[idx] = i  # stages in StageTypes array are 0-indexed


Stage 1 159
Stage 2 607
Stage 3 262
Stage 4 14


In [91]:
## Stratified Split Data

In [92]:
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.1)

sss1 = split1.split(Xtype, stageType)

trainValidInd = 0
testInd = 0
for i, j in sss1:
    trainValidInd = i
    testInd = j
    


Xtest = Xtype[testInd]
print("Test samples with {}".format(ctype), Xtest.shape)
stagetest = stageType[testInd] 
np.save("{}/{}_test_stageType.npy".format(save_dir, ctype), stagetest)
np.save("{}/{}_test_samples.npy".format(save_dir, ctype), Xtype_samples[testInd])


XtrainValid = Xtype[trainValidInd]

stagetrainValid = stageType[trainValidInd]

split2 = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss2 = split1.split(XtrainValid, stagetrainValid)

trainInd = 0
validInd = 0
for i, j in sss2:
    trainInd = i
    validInd = j

Xtrain = XtrainValid[trainInd]
print("Train samples with {}".format(ctype), Xtrain.shape)

stagetrain = stagetrainValid[trainInd]
np.save("{}/{}_train_stageType.npy".format(save_dir, ctype), stagetrain)
np.save("{}/{}_train_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][trainInd])


Xvalid = XtrainValid[validInd]
print("Validation samples with {}".format(ctype), Xvalid.shape)

stagevalid = stagetrainValid[validInd]
np.save("{}/{}_valid_stageType.npy".format(save_dir, ctype), stagevalid)
np.save("{}/{}_valid_samples.npy".format(save_dir, ctype), Xtype_samples[trainValidInd][validInd])


splitRest = StratifiedShuffleSplit(n_splits=1, test_size=1/9)

sss3 = splitRest.split(Xrest, yrest)
trainIndrest = 0
validIndrest = 0
for i, j in sss3:
    trainIndrest = i
    validIndrest = j


Xresttrain = Xrest[trainIndrest]
print("Train samples from remaining cancer types".format(ctype), Xresttrain.shape)


Xrestvalid = Xrest[validIndrest]
print("Validation samples from remaining cancer types".format(ctype), Xrestvalid.shape)



Test samples with BRCA (85, 5000)
Train samples with BRCA (687, 5000)
Validation samples with BRCA (77, 5000)
Train samples from remaining cancer types (7334, 5000)
Validation samples from remaining cancer types (917, 5000)


In [93]:
# Use same indices used to split X (= GE) on Y (= ME)
Y = me
Ytype = Y[cancertype == ctype_idx]
Ytype = Ytype[Xtype_sample_withstage_idx]
Yrest = Y[cancertype != ctype_idx]


Ytest = Ytype[testInd]
YtrainValid = Ytype[trainValidInd]
Ytrain = YtrainValid[trainInd]
Yvalid = YtrainValid[validInd]

Yresttrain = Yrest[trainIndrest]
Yrestvalid = Yrest[validIndrest]


In [94]:
# Throw together BRCA training data and Rest training data (and same for validation)
XtrainAll = np.vstack((Xtrain, Xresttrain))

XvalidAll = np.vstack((Xvalid, Xrestvalid))

print(XtrainAll.shape)
print(XvalidAll.shape)
print(Xtest.shape)

(8021, 5000)
(994, 5000)
(85, 5000)


In [95]:
(unique, counts) = np.unique(stagetest, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print("List Cancer stage from {} and occurences in test set".format(ctype))
print(frequencies)

List Cancer stage from BRCA and occurences in test set
[[ 0. 13.]
 [ 1. 49.]
 [ 2. 22.]
 [ 3.  1.]]


In [96]:
# Save all splits for testing convenience
np.save("{}/{}_GE_train".format(save_dir, ctype), Xtrain)
np.save("{}/rest_GE_train".format(save_dir), Xresttrain)
np.save("{}/{}_GE_valid".format(save_dir, ctype), Xvalid)
np.save("{}/rest_GE_valid".format(save_dir), Xrestvalid)
np.save("{}/{}_GE_test".format(save_dir, ctype), Xtest)

np.save("{}/{}_ME_train".format(save_dir, ctype), Ytrain)
np.save("{}/rest_ME_train".format(save_dir), Yresttrain)
np.save("{}/{}_ME_valid".format(save_dir, ctype), Yvalid)
np.save("{}/rest_ME_valid".format(save_dir), Yrestvalid)
np.save("{}/{}_ME_test".format(save_dir, ctype), Ytest)