In [13]:
"""
Orders the data files so that samples are grouped by cancer type, and records the cancer type abbreviation of each sample.
The ordered data and labels are used to plot a UMAP of the cancer types.
"""

import pandas as pd
import numpy as np
import os
import run
# All input files live in the 'data' folder directly under the project root (run.ROOT_DIR).
data_dir = os.path.join(run.ROOT_DIR, 'data')
rna_file = os.path.join(data_dir, 'RNASeq_5000MAD.csv')
patient_info_file = os.path.join(data_dir, 'Survival_SupplementalTable_S1_20171025_xena_sp')

In [14]:
# The RNA matrix is comma-separated and the patient table is tab-separated;
# both use their first column as the row index.
rna = pd.read_csv(rna_file, index_col=0)
patient_info = pd.read_csv(patient_info_file, sep='\t', index_col=0)

In [15]:
# Gather the RNA rows that belong to each entry of `patient_info`, together
# with one cancer type abbreviation per gathered row.
common = []        # one DataFrame of matching RNA rows per matched patient
cancer_types = []  # one cancer type abbreviation per gathered RNA row
cancer_patients = patient_info.index.values

# Materialise the sample labels once instead of re-iterating the index for
# every patient in the loop below.
sample_labels = list(rna.index)

# Get all occurrences of the patient in the original data file.
for patient in cancer_patients:
    # A sample is attributed to the patient when the patient identifier is a
    # substring of the sample label.
    x = rna[[patient in label for label in sample_labels]]
    if x.empty:
        continue

    common.append(x)

    # The cancer type depends only on the patient, so look it up once and
    # repeat it for every matching row.
    cancer_type = patient_info.loc[patient, "cancer type abbreviation"]
    cancer_types.extend([cancer_type] * len(x))

# common now holds all the rows of the dataframe, ordered by patients with known cancer types

In [16]:
# Stack the per-patient row blocks into one RNA frame. Only samples matched to
# a patient in the patient table (i.e. with a known cancer type) are kept.
rna_bycancertype = pd.concat(common, axis=0)
rna_bycancertype.shape

(9094, 5000)

In [17]:
# Load the other modality (DNAMe) with its first column as the row index.
dname_file = os.path.join(run.ROOT_DIR, 'data', 'DNAMe_5000MAD.csv')
dna = pd.read_csv(dname_file, index_col=0)

# Select the rows by label, in exactly the sample order of the ordered RNA frame.
sample_order = rna_bycancertype.index.values
dna_bycancertype = dna.loc[sample_order]

In [18]:
# Turn the per-sample cancer type list into a NumPy array so it can later be
# reordered with an index permutation.
labels = np.asarray(cancer_types)
labels

array(['ACC', 'ACC', 'ACC', ..., 'UVM', 'UVM', 'UVM'], dtype='<U4')

In [19]:
# The dataframes and cancer type labels are now grouped by cancer type.
# Shuffled copies are written as well, so that models trained on them do not
# see the samples in class order (the original motivation: avoid overfitting).

# One permutation is applied to the labels and to both dataframes, so sample i
# still lines up with label i after shuffling. The generator is seeded so that
# re-running the notebook writes the same shuffled files.
SHUFFLE_SEED = 42
shuffler = np.random.default_rng(SHUFFLE_SEED).permutation(len(labels))

labels_shuffle = labels[shuffler]
rna_shuffle = rna_bycancertype.iloc[shuffler]
dna_shuffle = dna_bycancertype.iloc[shuffler]

# Save ordered output files
# NOTE(review): the ordered frames are written back to rna_file / dname_file,
# i.e. they overwrite the input CSVs that were read earlier. Paths are kept
# unchanged because other code may read them — confirm this is intended.
np.save(os.path.join(run.ROOT_DIR, 'data', 'cancertype_labels.npy'), labels)
rna_bycancertype.to_csv(rna_file)
dna_bycancertype.to_csv(dname_file)

# Save shuffled output files
# NOTE(review): these file names say '3000MAD' although the frames come from
# the '5000MAD' inputs. Names are kept unchanged because other code may read
# them — confirm before renaming.
np.save(os.path.join(run.ROOT_DIR, 'data', 'shuffle_cancertype_labels.npy'), labels_shuffle)
rna_shuffle.to_csv(os.path.join(run.ROOT_DIR, 'data', 'shuffle_RNASeq_3000MAD.csv'))
dna_shuffle.to_csv(os.path.join(run.ROOT_DIR, 'data', 'shuffle_DNAMe_3000MAD.csv'))