In [1]:
"""
This notebook creates smaller data files that contain only the three most prevalent cancer types
(BRCA, KIRC, LUAD) from the TCGA data, split into shuffled training, validation and prediction sets.
"""

import pandas as pd
import numpy as np
import os

In [2]:
# Load the per-modality tables of samples with a known cancer type
# (the rows are sorted by cancer type).

# NOTE: "__file__" is a string literal here, not the module variable. realpath()
# resolves it against the current working directory, so ipynb_dir ends up being the
# directory the kernel runs in (normally the folder that holds this notebook).
ipynb_dir = os.path.dirname(os.path.realpath("__file__"))

# The input files live in the data folder two levels above the notebook directory
data_dir = os.path.join(ipynb_dir, '..', '..', 'data')
rna_file, gcn_file, dna_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('RNASeq_3000MAD.csv', 'GCN_3000MAD.csv', 'DNAMe_3000MAD.csv')
)

# The first CSV column becomes the index of each dataframe
rna, gcn, dna = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (rna_file, gcn_file, dna_file)
)

In [3]:
# Load the cancer-type label of every sample (stored in the same order as the dataframe rows)
labels_file = os.path.join(ipynb_dir, '..', '..', 'data', 'cancertype_labels.npy')
labels = np.load(labels_file)

In [4]:
# Build a reduced dataset that keeps only the 3 most common cancer types
# (sample counts: BRCA 766, KIRC 306, LUAD 449).
# np.where(...)[0] gives the positions in `labels` that carry the requested type.
brca, kirc, luad = (
    np.where(labels == type_name)[0]
    for type_name in ('BRCA', 'KIRC', 'LUAD')
)

# Per-type position arrays, in the order BRCA, KIRC, LUAD
cancer3 = [brca, kirc, luad]
len(brca)

766

In [5]:
# Build a stratified 70-10-20 split of the sample positions: every split gets the same
# relative representation of each cancer type. Training and validation sizes are the
# truncated fractions of each type's sample count; the prediction set receives all
# remaining samples of that type.
TRAINING_DATA_SPLIT = 0.7
VALIDATION_DATA_SPLIT = 0.1
PREDICT_DATA_SPLIT = 0.2

# Fixed seed so that "Restart & Run All" always produces the same split. This seeds
# NumPy's global random state, so later np.random calls in the notebook become
# reproducible as well.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# The three fractions have to describe the complete dataset
assert np.isclose(TRAINING_DATA_SPLIT + VALIDATION_DATA_SPLIT + PREDICT_DATA_SPLIT, 1.0), \
    "split fractions must sum to 1"

all_training_ids = []
all_validation_ids = []
all_predict_ids = []

all_training_labels = []
all_validation_labels = []
all_predict_labels = []

# Label written for each entry of cancer3 (same order)
label_names = ["BRCA", "KIRC", "LUAD"]
assert len(label_names) == len(cancer3), "need exactly one label name per index array"

for cancer_type, type_ids in enumerate(cancer3):
    label_name = label_names[cancer_type]

    nr_of_samples = len(type_ids)
    nr_of_training_samples = int(TRAINING_DATA_SPLIT * nr_of_samples)
    nr_of_validation_samples = int(VALIDATION_DATA_SPLIT * nr_of_samples)

    # Random ordering of all sample ids of this cancer type
    random_sample_indices = np.random.choice(a=type_ids, size=nr_of_samples, replace=False)

    # Cut the shuffled ids at two points, giving three consecutive pieces:
    # [:n_train], [n_train:n_train + n_validation], [n_train + n_validation:]
    training_ids, validation_ids, predict_ids = np.split(
        random_sample_indices,
        [nr_of_training_samples, nr_of_training_samples + nr_of_validation_samples])

    print(len(training_ids), len(validation_ids), len(predict_ids))
    all_training_ids.extend(training_ids)
    all_validation_ids.extend(validation_ids)
    all_predict_ids.extend(predict_ids)

    # One label per selected sample, in the same order as the ids above
    all_training_labels.extend([label_name] * len(training_ids))
    all_validation_labels.extend([label_name] * len(validation_ids))
    all_predict_labels.extend([label_name] * len(predict_ids))

# Sanity check: no sample may appear in more than one split (or twice in one split)
all_split_ids = all_training_ids + all_validation_ids + all_predict_ids
assert len(set(all_split_ids)) == len(all_split_ids), "a sample was assigned to more than one split"

print(len(all_training_ids), len(all_validation_ids), len(all_predict_ids))
len(all_predict_labels)

536 76 154
214 30 62
314 44 91
1064 150 307


307

In [6]:
# Select the rows of every modality for each split, based on the ids gathered above.
# .iloc selects by position, so this relies on the dataframes having the same row
# order as `labels` (the ids are positions in `labels`).
rna_training_3types = rna.iloc[all_training_ids]
gcn_training_3types = gcn.iloc[all_training_ids]
dna_training_3types = dna.iloc[all_training_ids]

rna_validation_3types = rna.iloc[all_validation_ids]
gcn_validation_3types = gcn.iloc[all_validation_ids]
dna_validation_3types = dna.iloc[all_validation_ids]

rna_predict_3types = rna.iloc[all_predict_ids]
gcn_predict_3types = gcn.iloc[all_predict_ids]
dna_predict_3types = dna.iloc[all_predict_ids]

# Output folder for the split files. Create it up front: np.save and DataFrame.to_csv
# do not create missing directories and would fail with FileNotFoundError otherwise.
save_dir = os.path.join(ipynb_dir, '..', '..', 'data', '3typesnoclamp')
os.makedirs(save_dir, exist_ok=True)

In [7]:
# TRAINING
# The training ids were collected one cancer type after another, so the selected rows
# are still grouped by type. Draw a single random permutation and apply it to the
# labels and to all three modalities, so samples and labels stay aligned.
training_shuffler = np.random.permutation(len(all_training_labels))

training_labels_shuffle = np.asarray(all_training_labels)[training_shuffler]
rna_training_shuffle, gcn_training_shuffle, dna_training_shuffle = (
    modality.iloc[training_shuffler]
    for modality in (rna_training_3types, gcn_training_3types, dna_training_3types)
)

# Write the labels first, then one CSV file per modality
np.save(os.path.join(save_dir, 'training_3types.npy'), training_labels_shuffle)
for csv_name, shuffled_frame in (
    ('RNASeq_3types_training.csv', rna_training_shuffle),
    ('GCN_3types_training.csv', gcn_training_shuffle),
    ('DNAMe_3types_training.csv', dna_training_shuffle),
):
    shuffled_frame.to_csv(os.path.join(save_dir, csv_name))

In [8]:
# VALIDATION
# The validation ids were collected one cancer type after another, so the selected rows
# are still grouped by type. Draw a single random permutation and apply it to the
# labels and to all three modalities, so samples and labels stay aligned.
validation_shuffler = np.random.permutation(len(all_validation_labels))

validation_labels_shuffle = np.asarray(all_validation_labels)[validation_shuffler]
rna_validation_shuffle, gcn_validation_shuffle, dna_validation_shuffle = (
    modality.iloc[validation_shuffler]
    for modality in (rna_validation_3types, gcn_validation_3types, dna_validation_3types)
)

# Write the labels first, then one CSV file per modality
np.save(os.path.join(save_dir, 'validation_3types.npy'), validation_labels_shuffle)
for csv_name, shuffled_frame in (
    ('RNASeq_3types_validation.csv', rna_validation_shuffle),
    ('GCN_3types_validation.csv', gcn_validation_shuffle),
    ('DNAMe_3types_validation.csv', dna_validation_shuffle),
):
    shuffled_frame.to_csv(os.path.join(save_dir, csv_name))

In [9]:
# PREDICTION
# The prediction ids were collected one cancer type after another, so the selected rows
# are still grouped by type. Draw a single random permutation and apply it to the
# labels and to all three modalities, so samples and labels stay aligned.
predict_shuffler = np.random.permutation(len(all_predict_labels))

predict_labels_shuffle = np.asarray(all_predict_labels)[predict_shuffler]
rna_predict_shuffle, gcn_predict_shuffle, dna_predict_shuffle = (
    modality.iloc[predict_shuffler]
    for modality in (rna_predict_3types, gcn_predict_3types, dna_predict_3types)
)

# Write the labels first, then one CSV file per modality
np.save(os.path.join(save_dir, 'predict_3types.npy'), predict_labels_shuffle)
for csv_name, shuffled_frame in (
    ('RNASeq_3types_predict.csv', rna_predict_shuffle),
    ('GCN_3types_predict.csv', gcn_predict_shuffle),
    ('DNAMe_3types_predict.csv', dna_predict_shuffle),
):
    shuffled_frame.to_csv(os.path.join(save_dir, csv_name))