In [16]:
"""
Notebook that orders the data files by cancer type and records the cancer type name of each sample.
The ordered data and labels are used to plot a UMAP of the cancer types.
"""

import pandas as pd
import numpy as np
import os

In [17]:
# Build the paths of the input files in the data folder two levels above ipynb_dir.
# Gotcha: "__file__" is a plain string literal here, not the __file__ variable, so
# realpath() resolves it against the current working directory and ipynb_dir ends up
# being the kernel's working directory.
ipynb_dir = os.path.dirname(os.path.realpath("__file__"))
_data_parts = (ipynb_dir, '..', '..', 'data')
patient_info_file = os.path.join(*_data_parts, 'Survival_SupplementalTable_S1_20171025_xena_sp')
rna_file = os.path.join(*_data_parts, 'clamped_RNASeq_3000MAD.csv')

In [19]:
# The patient info table is read as tab-separated, the RNA table as comma-separated.
# In both files the first column becomes the row index.
patient_info = pd.read_csv(patient_info_file, sep='\t', index_col=0)
rna = pd.read_csv(rna_file, index_col=0)

In [20]:
common = []
cancer_patients = patient_info.index.values
cancer_types = []

# Materialise the RNA row labels once; they do not change while looping over patients.
rna_labels = list(rna.index)

# Get all occurrences of each patient in the original data file.
# NOTE(review): this is a substring match, so a row is collected once for every
# patient ID contained in its label — presumably IDs are never substrings of one
# another; confirm against the data.
for patient in cancer_patients:
    is_match = [patient in label for label in rna_labels]
    patient_rows = rna.loc[is_match]
    if patient_rows.empty:
        # No RNA rows for this patient: contributes neither rows nor labels.
        continue

    common.append(patient_rows)
    # Look the cancer type up once per patient and repeat it for every matching row,
    # so cancer_types stays aligned row-for-row with the frames in common.
    cancer_type = patient_info.loc[patient, "cancer type abbreviation"]
    cancer_types.extend([cancer_type] * len(patient_rows))

# common now holds the matching rows of the RNA dataframe, grouped per patient in the
# order of patient_info; cancer_types holds one cancer type entry per collected row.

[                   RPS4Y1      XIST      KRT5   CEACAM5      AGR2     KRT6A  \
 TCGA-OR-A5J1-01  0.662724  0.250084  0.230821  0.230821  0.250084  0.230821   
 
                     KRT14   CEACAM6     DDX3Y   TMPRSS4  ...      GNG7  \
 TCGA-OR-A5J1-01  0.230821  0.230821  0.612369  0.274417  ...  0.446097   
 
                   CXorf57     FETUB     RPH3A     ASTN2   CCDC74B  LOC338651  \
 TCGA-OR-A5J1-01  0.475161  0.230821  0.283204  0.447786  0.560662   0.258871   
 
                    TRIM17      GCGR     KCNQ3  
 TCGA-OR-A5J1-01  0.307874  0.250084  0.327475  
 
 [1 rows x 3000 columns],
                    RPS4Y1      XIST      KRT5   CEACAM5     AGR2     KRT6A  \
 TCGA-OR-A5J2-01  0.320041  0.668807  0.259547  0.230821  0.34978  0.230821   
 
                     KRT14   CEACAM6     DDX3Y   TMPRSS4  ...      GNG7  \
 TCGA-OR-A5J2-01  0.269348  0.230821  0.247381  0.230821  ...  0.552552   
 
                   CXorf57     FETUB     RPH3A     ASTN2   CCDC74B  LOC338651  \
 TC

In [21]:
# Stack the per-patient frames row-wise, in the order they were collected, so the RNA
# data only contains samples that were matched to a patient of the patient info table.
rna_bycancertype = pd.concat(common, axis=0)
rna_bycancertype.shape

(8418, 3000)

In [28]:
gcn_file = os.path.join(ipynb_dir, '..', '..', 'data', 'clamped_GCN_3000MAD.csv')
dname_file = os.path.join(ipynb_dir, '..', '..', 'data', 'clamped_DNAMe_3000MAD.csv')

# Row labels of the ordered RNA frame; they define the row order for the other modalities.
sample_order = rna_bycancertype.index.values

# Load each remaining modality and select its rows by label in that same order.
# .loc with a list of labels raises a KeyError if a label is absent from the file.
gcn = pd.read_csv(gcn_file, index_col=0)
gcn_bycancertype = gcn.loc[sample_order]
dna = pd.read_csv(dname_file, index_col=0)
dna_bycancertype = dna.loc[sample_order]

In [30]:
# Convert the per-row cancer type list into a NumPy array so it can be indexed
# with a permutation and saved with np.save.
labels = np.asarray(cancer_types)

In [None]:
# Now we have dataframes and cancer type labels, but they are all ordered.
# They need to be shuffled to avoid overfitting of models, so both an ordered and a
# shuffled version of every file is written.

# Fixed seed so that re-running the notebook reproduces the same shuffled files.
SHUFFLE_SEED = 42

# Every modality must have exactly one row per label; otherwise the positional indexing
# below would pair samples with the wrong label or fail halfway through the saves.
assert len(labels) == len(rna_bycancertype) == len(gcn_bycancertype) == len(dna_bycancertype), \
    "labels and modality dataframes do not have the same number of rows"

# Shuffle labels and dataframes for training, but keep the same order of samples and labels:
# one shared permutation is applied to all of them. A local RandomState is used so the
# global NumPy random state is left untouched.
shuffler = np.random.RandomState(SHUFFLE_SEED).permutation(len(labels))

labels_shuffle = labels[shuffler]
rna_shuffle = rna_bycancertype.iloc[shuffler]
gcn_shuffle = gcn_bycancertype.iloc[shuffler]
dna_shuffle = dna_bycancertype.iloc[shuffler]

data_dir = os.path.join(ipynb_dir, '..', '..', 'data')

# Save ordered output files
# NOTE(review): rna_file, gcn_file and dname_file are the same paths the inputs were
# read from, so the original input files are overwritten with the ordered data.
np.save(os.path.join(data_dir, 'cancertype_labels.npy'), labels)
rna_bycancertype.to_csv(rna_file)
gcn_bycancertype.to_csv(gcn_file)
dna_bycancertype.to_csv(dname_file)

# Save shuffled output files
np.save(os.path.join(data_dir, 'shuffle_cancertype_labels.npy'), labels_shuffle)
rna_shuffle.to_csv(os.path.join(data_dir, 'shuffle_clamped_RNASeq_3000MAD.csv'))
gcn_shuffle.to_csv(os.path.join(data_dir, 'shuffle_clamped_GCN_3000MAD.csv'))
dna_shuffle.to_csv(os.path.join(data_dir, 'shuffle_clamped_DNAMe_3000MAD.csv'))