# Process CNA data

In [1]:
import os
import numpy as np
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"
processed_folder_name = "processed"
hgnc_symbol_entrezgene_id_mapping_file_name = "hgnc_to_entrezgene_id_mapping.tsv"

cancer_type_cna_file_name_mapping = {
    "BLCA": "TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "LUSC": "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "OV": "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
}

hgnc_symbol_entrezgene_id_mapping = dict(pd.read_csv(os.path.join(data_dir, processed_folder_name, hgnc_symbol_entrezgene_id_mapping_file_name), sep="\t").values)


def select_the_row_with_max_expression_per_gene(x):
    x_sum = x.values.sum(axis=1).ravel()
    x_argmax = np.argmax(x_sum)
    return x.iloc[x_argmax, :]


def get_cna_df(cancer_type: str) -> pd.DataFrame:
    cna_file_name = cancer_type_cna_file_name_mapping[cancer_type]
    cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_file_name), sep="\t")

    for index, row in cna_df.iterrows():
        gene_symbol_splitted = row["Gene Symbol"].split("|")
        if len(gene_symbol_splitted) == 1:
            cna_df.at[index, "entrezgene_id"] = hgnc_symbol_entrezgene_id_mapping.get(gene_symbol_splitted[0], np.NaN)
            cna_df.at[index, "ensembl_id"] = np.NaN
        elif len(gene_symbol_splitted) == 2:
            cna_df.at[index, "entrezgene_id"] = hgnc_symbol_entrezgene_id_mapping.get(gene_symbol_splitted[0], np.NaN)
            cna_df.at[index, "ensembl_id"] = gene_symbol_splitted[1]
        else:
            raise Exception("gene_symbol_splitted has more than 1 '|'s")
    
    cna_df = cna_df.drop(columns=["Gene Symbol", "ensembl_id"])
    cna_df = cna_df[~pd.isnull(cna_df["entrezgene_id"])]
    cna_df = cna_df.reset_index(drop=True)
    cna_df = cna_df.groupby("entrezgene_id").apply(lambda x: select_the_row_with_max_expression_per_gene(x)).reset_index(drop=True)
    cna_df = cna_df.T
    cna_df.columns = cna_df.loc["entrezgene_id", :]
    cna_df = cna_df.drop("entrezgene_id", axis=0)
    cna_df.columns = [int(column) for column in cna_df.columns]
    return cna_df

blca_cna_df = get_cna_df(cancer_type="BLCA")
lusc_cna_df = get_cna_df(cancer_type="LUSC")
ov_cna_df = get_cna_df(cancer_type="OV")


# Process RNA data

In [None]:
rna_data_file_name = "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"

rna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, rna_data_file_name), sep="\t")
rna_df["gene_id"] = rna_df["gene_id"].apply(lambda x: int(x.split("|")[1]))

tumor_sample_ids = ["0" + str(i) for i in range(1, 10)]

column_dict = {}
for column in rna_df.columns:
    if column == "gene_id":
        continue
    if column.split("-")[3] not in tumor_sample_ids:
        continue
    column_first_15 = column[:15]
    if column_first_15 in column_dict.keys():
        if column < column_dict[column_first_15]:
            column_dict[column_first_15] = column
        else:
            continue
    else:
        column_dict[column_first_15] = column

rna_df = rna_df[["gene_id"] + list(column_dict.values())]
rna_df.columns = [column[:15] for column in rna_df.columns]

rna_df = rna_df.T
rna_df.columns = rna_df.loc["gene_id", :]
rna_df = rna_df.drop("gene_id", axis=0)
rna_df.columns = [int(column) for column in rna_df.columns]


In [None]:
rna_df

# Process RPPA data

In [8]:
data_dir = "../data"
raw_folder_name = "raw"
rppa_data_file_name = "TCGA-RPPA-pancan-clean.txt"

rppa_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, rppa_data_file_name), sep="\t")
rppa_df = rppa_df.drop(columns=["TumorType"])
rppa_df = rppa_df.T
rppa_df.columns = rppa_df.loc["SampleID", :].tolist()
rppa_df = rppa_df.drop("SampleID", axis=0)

column_dict = {}
for column in rppa_df.columns:
    if column.split("-")[3] not in tumor_sample_ids:
        continue
    column_first_15 = column[:15]
    if column_first_15 in column_dict.keys():
        if column < column_dict[column_first_15]:
            column_dict[column_first_15] = column
        else:
            continue
    else:
        column_dict[column_first_15] = column

rppa_df = rppa_df[list(column_dict.values())]

set([column.split("-")[-1] for column in rppa_df.columns])


In [11]:
set([sample_id.split("-")[-1] for sample_id in rppa_df["SampleID"]])

{'20', '23'}

In [40]:
(ov_cna_df.describe().loc["count", :] != 24776).sum()


1

In [24]:
blca_cna_df = blca_cna_df[pd.isnull(blca_cna_df["entrezgene_id"])]
lusc_cna_df = lusc_cna_df[pd.isnull(lusc_cna_df["entrezgene_id"])]
ov_cna_df = ov_cna_df[pd.isnull(ov_cna_df["entrezgene_id"])]


In [25]:
blca_nan_genes = [column.split("|")[0] for column in blca_cna_df["Gene Symbol"].tolist()]
lusc_nan_genes = [column.split("|")[0] for column in lusc_cna_df["Gene Symbol"].tolist()]
ov_nan_genes = [column.split("|")[0] for column in ov_cna_df["Gene Symbol"].tolist()]


In [28]:
rna_df[rna_df.gene_id.isin(blca_nan_genes + lusc_nan_genes + ov_nan_genes)]


Unnamed: 0,gene_id,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
495,AKAP2,25.9126,555.163,89.4386,23.1758,309.869,118.319,321.741,187.438,112.558,...,1115.512151,1472.113832,441.238305,179.135973,850.465906,988.18573,588.680385,998.222727,941.993199,1036.272131
4243,CRIPAK,1000.0,111.931,134.499,920.217,161.51,202.567,179.859,207.728,401.691,...,119.426172,178.450011,183.994794,183.934078,264.24754,170.322916,298.361856,635.248453,238.273356,234.991948
6526,FLJ45079,0.0,0.0,0.0,0.0,0.0,0.0,1.6577,0.0,0.0,...,2.970308,0.663336,-0.078105,8.367408,1.410988,-0.078105,2.685201,-0.078105,2.310279,5.19024
12284,OCLM,2.4015,1.6105,1.7775,0.7746,0.0,3.209,0.4144,1.0199,0.0,...,5.990349,4.757428,2.636631,31.63021,13.390735,5.261172,8.437542,4.564389,6.66462,8.942182
12897,PALM2,40.3458,60.6361,34.3653,44.9264,43.5908,7.2202,149.606,36.3779,54.4397,...,27.214875,48.545469,30.326484,38.163535,16.352979,27.482717,84.796572,60.75224,17.386517,42.649851
13986,PRAMEF16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
13992,PRAMEF3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
17132,SPHAR,187.599,103.218,174.825,160.225,242.276,209.587,85.1057,152.899,142.532,...,87.281454,116.091629,100.504175,205.147945,146.295648,304.084884,127.005765,110.213938,139.509572,148.587225


In [20]:
blca_nan_genes

['MIR4417',
 'snoU13',
 'RN7SL729P',
 'MIR1273D',
 'RN7SL721P',
 'snoU13',
 'snoU13',
 'PRAMEF16',
 'PRAMEF3',
 'snoU13',
 'C1orf134',
 'snoU13',
 'RN7SL85P',
 'snoU13',
 'RN7SL421P',
 'RN7SL768P',
 'MIR4419A',
 'RN7SL532P',
 'RN7SL24P',
 'snoU13',
 'RN7SL857P',
 'snoU13',
 'snoU13',
 'RN7SL490P',
 'snoU13',
 'RN7SL501P',
 'snoU13',
 'SCARNA24',
 'FKSG48',
 'RN7SL136P',
 'RN7SL281P',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'MIR1273F',
 'MIR5095',
 'MIR1273G',
 'snoU13',
 'SNORA2',
 'snoU13',
 'RN7SL235P',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'SNORD59',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'RN7SL374P',
 'OCLM',
 'snoU109',
 'snoU13',
 'snoU13',
 'snoU13',
 'RN7SL512P',
 'snoU13',
 'snoU13',
 'RN7SL276P',
 'snoU13',
 'snoU13',
 'snoU13',
 'snoU13',
 'SPHAR',
 'RN7SL467P',
 'snoU13',
 'snoU13',
 'RNA5SP81',
 'snoU13',
 'snoU13',
 'snoU13',
 'SNORA2',
 'snoU13',
 'SNORA36',
 'snoZ24

In [5]:
rna_df

Unnamed: 0,gene_id,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
0,100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,,,,,,,,,,
1,100133144,3.2661,2.6815,1.7301,0.0000,0.0000,1.1673,1.4422,0.0000,4.4556,...,4.358154,5.676995,5.219350,14.846708,20.115492,6.997533,18.311906,12.057112,18.628740,17.874417
2,100134869,3.9385,8.9948,6.5650,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,...,2.656360,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
3,10357,149.1350,81.0777,86.4879,53.9117,66.9063,103.5060,94.9316,78.1955,69.2389,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
4,10431,2034.1000,1304.9300,1054.6600,2350.8900,1257.9900,1866.4300,995.0270,1762.1200,1213.5300,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,440590,0.4803,31.4052,0.5925,11.6189,7.8240,85.4392,0.4144,2.3799,1.0571,...,20.923873,1.839530,2.916935,239.014921,1.845753,3.268489,17.164493,3.756246,0.301440,217.431795
20527,79699,648.4150,1166.0200,806.3990,553.8340,795.8120,520.6580,556.1540,913.1870,805.4970,...,1322.386301,1025.213701,814.306556,907.845035,953.276441,905.046317,757.811259,927.963540,845.677334,859.078048
20528,7791,1841.0200,3059.9900,2655.6100,2367.9300,708.0710,855.1940,10924.6000,2122.1600,1939.2200,...,2783.898049,4960.431833,3447.701267,978.304677,2789.057736,3359.241568,4264.469081,3103.609391,3302.569055,2497.814797
20529,23140,1157.5400,1895.9900,1482.4500,1140.2000,796.3710,897.7140,1095.7300,1003.6200,904.8630,...,1284.992478,2054.896390,2420.047163,1302.821382,1119.313995,1740.926312,2702.668453,1370.141309,1915.477072,1247.130940


In [4]:
blca_cna_df


Unnamed: 0,TCGA-2F-A9KO-01,TCGA-2F-A9KP-01,TCGA-2F-A9KQ-01,TCGA-2F-A9KR-01,TCGA-2F-A9KT-01,TCGA-2F-A9KW-01,TCGA-4Z-AA7M-01,TCGA-4Z-AA7N-01,TCGA-4Z-AA7O-01,TCGA-4Z-AA7Q-01,...,TCGA-ZF-AA52-01,TCGA-ZF-AA53-01,TCGA-ZF-AA54-01,TCGA-ZF-AA56-01,TCGA-ZF-AA58-01,TCGA-ZF-AA5H-01,TCGA-ZF-AA5N-01,TCGA-ZF-AA5P-01,entrezgene_id,ensembl_id
0,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,-0.038,...,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,1.169830e+05,
1,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,-0.038,...,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,1.406250e+05,
2,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,-0.038,...,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,3.757900e+05,
3,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,-0.038,...,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,4.418690e+05,
4,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,-0.038,...,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,5.521000e+04,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,-0.020,...,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,3.581000e+03,ENSG00000124334.12
24772,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,-0.020,...,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,1.025100e+04,ENSG00000168939.6
24773,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,-0.020,...,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,6.845000e+03,ENSG00000124333.10
24774,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,-0.020,...,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,-2.147484e+09,ENSG00000182484.10


In [4]:
lusc_cna_df


Unnamed: 0,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3409-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,TCGA-18-3416-01,...,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01,entrezgene_id,ensembl_id
0,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,0.049,...,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073,1.169830e+05,
1,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,0.049,...,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073,1.406250e+05,
2,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,0.049,...,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073,3.757900e+05,
3,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,0.049,...,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073,4.418690e+05,
4,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,0.049,...,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073,5.521000e+04,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,-0.237,...,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008,3.581000e+03,ENSG00000124334.12
24772,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,-0.237,...,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008,1.025100e+04,ENSG00000168939.6
24773,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,-0.237,...,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008,6.845000e+03,ENSG00000124333.10
24774,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,-0.237,...,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008,-2.147484e+09,ENSG00000182484.10


In [5]:
ov_cna_df


Unnamed: 0,TCGA-04-1331-01,TCGA-04-1332-01,TCGA-04-1335-01,TCGA-04-1336-01,TCGA-04-1337-01,TCGA-04-1338-01,TCGA-04-1341-01,TCGA-04-1342-01,TCGA-04-1343-01,TCGA-04-1346-01,...,TCGA-72-4237-01,TCGA-72-4238-01,TCGA-72-4240-01,TCGA-72-4241-01,TCGA-OY-A56P-01,TCGA-OY-A56Q-01,TCGA-VG-A8LO-01,TCGA-WR-A838-01,entrezgene_id,ensembl_id
0,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,-0.396,...,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176,1.169830e+05,
1,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,-0.396,...,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176,1.406250e+05,
2,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,-0.396,...,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176,3.757900e+05,
3,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,-0.396,...,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176,4.418690e+05,
4,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,-0.396,...,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176,5.521000e+04,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,0.356,...,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065,3.581000e+03,ENSG00000124334.12
24772,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,0.356,...,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065,1.025100e+04,ENSG00000168939.6
24773,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,0.356,...,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065,6.845000e+03,ENSG00000124333.10
24774,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,0.356,...,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065,-2.147484e+09,ENSG00000182484.10


# Number of OV samples which occurs both in ov_subtype_info and rna_data

In [14]:
import os
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"
rna_data_file_name = "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"

rna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, rna_data_file_name), sep="\t")

ov_subtype_info_file_name = "ov_subtype_info.tsv"
ov_subtype_info_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, ov_subtype_info_file_name), sep="\t")

len(set([column[:15] for column in rna_df.columns if column != "gene_id"]).intersection(set(ov_subtype_info_df.ID)))


295

In [1]:
import os
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"
rna_data_file_name = "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"

rna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, rna_data_file_name), sep="\t")

cna_file_name = "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_file_name), sep="\t")

# ov_subtype_info_file_name = "o_subtype_info.tsv"
# ov_subtype_info_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, ov_subtype_info_file_name), sep="\t")

# len(set([column[:15] for column in rna_df.columns if column != "gene_id"]).intersection(set(ov_subtype_info_df.ID)))


In [3]:
rna_df

Unnamed: 0,gene_id,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
0,?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,,,,,,,,,,
1,?|100133144,3.2661,2.6815,1.7301,0.0000,0.0000,1.1673,1.4422,0.0000,4.4556,...,4.358154,5.676995,5.219350,14.846708,20.115492,6.997533,18.311906,12.057112,18.628740,17.874417
2,?|100134869,3.9385,8.9948,6.5650,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,...,2.656360,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
3,?|10357,149.1350,81.0777,86.4879,53.9117,66.9063,103.5060,94.9316,78.1955,69.2389,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
4,?|10431,2034.1000,1304.9300,1054.6600,2350.8900,1257.9900,1866.4300,995.0270,1762.1200,1213.5300,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYG11A|440590,0.4803,31.4052,0.5925,11.6189,7.8240,85.4392,0.4144,2.3799,1.0571,...,20.923873,1.839530,2.916935,239.014921,1.845753,3.268489,17.164493,3.756246,0.301440,217.431795
20527,ZYG11B|79699,648.4150,1166.0200,806.3990,553.8340,795.8120,520.6580,556.1540,913.1870,805.4970,...,1322.386301,1025.213701,814.306556,907.845035,953.276441,905.046317,757.811259,927.963540,845.677334,859.078048
20528,ZYX|7791,1841.0200,3059.9900,2655.6100,2367.9300,708.0710,855.1940,10924.6000,2122.1600,1939.2200,...,2783.898049,4960.431833,3447.701267,978.304677,2789.057736,3359.241568,4264.469081,3103.609391,3302.569055,2497.814797
20529,ZZEF1|23140,1157.5400,1895.9900,1482.4500,1140.2000,796.3710,897.7140,1095.7300,1003.6200,904.8630,...,1284.992478,2054.896390,2420.047163,1302.821382,1119.313995,1740.926312,2702.668453,1370.141309,1915.477072,1247.130940


In [2]:
cna_df

Unnamed: 0,Gene Symbol,TCGA-18-3406-01,TCGA-18-3407-01,TCGA-18-3408-01,TCGA-18-3409-01,TCGA-18-3410-01,TCGA-18-3411-01,TCGA-18-3412-01,TCGA-18-3414-01,TCGA-18-3415-01,...,TCGA-NK-A5CX-01,TCGA-NK-A5D1-01,TCGA-NK-A7XE-01,TCGA-O2-A52N-01,TCGA-O2-A52Q-01,TCGA-O2-A52S-01,TCGA-O2-A52V-01,TCGA-O2-A52W-01,TCGA-O2-A5IB-01,TCGA-XC-AA0X-01
0,ACAP3,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,...,0.043,0.021,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073
1,ACTRT2,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,...,0.043,0.021,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073
2,AGRN,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,...,0.043,0.021,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073
3,ANKRD65,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,...,0.043,0.021,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073
4,ATAD3A,0.284,-0.083,-0.775,-0.13,-0.004,-0.458,0.109,-0.132,-0.339,...,0.043,0.021,0.094,-0.375,-0.438,-0.634,-0.134,0.008,0.973,0.073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,IL9R|ENSG00000124334.12,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,...,0.044,-0.447,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008
24772,SPRY3|ENSG00000168939.6,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,...,0.044,-0.447,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008
24773,VAMP7|ENSG00000124333.10,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,...,0.044,-0.447,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008
24774,WASH6P|ENSG00000182484.10,-0.258,0.249,0.319,0.00,0.019,0.013,-0.502,0.080,0.241,...,0.044,-0.447,0.254,-0.047,0.474,-0.175,-0.394,0.051,1.121,0.008


In [18]:
rna_df

Unnamed: 0,gene_id,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
0,?|100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,,,,,,,,,,
1,?|100133144,3.2661,2.6815,1.7301,0.0000,0.0000,1.1673,1.4422,0.0000,4.4556,...,4.358154,5.676995,5.219350,14.846708,20.115492,6.997533,18.311906,12.057112,18.628740,17.874417
2,?|100134869,3.9385,8.9948,6.5650,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,...,2.656360,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
3,?|10357,149.1350,81.0777,86.4879,53.9117,66.9063,103.5060,94.9316,78.1955,69.2389,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
4,?|10431,2034.1000,1304.9300,1054.6600,2350.8900,1257.9900,1866.4300,995.0270,1762.1200,1213.5300,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYG11A|440590,0.4803,31.4052,0.5925,11.6189,7.8240,85.4392,0.4144,2.3799,1.0571,...,20.923873,1.839530,2.916935,239.014921,1.845753,3.268489,17.164493,3.756246,0.301440,217.431795
20527,ZYG11B|79699,648.4150,1166.0200,806.3990,553.8340,795.8120,520.6580,556.1540,913.1870,805.4970,...,1322.386301,1025.213701,814.306556,907.845035,953.276441,905.046317,757.811259,927.963540,845.677334,859.078048
20528,ZYX|7791,1841.0200,3059.9900,2655.6100,2367.9300,708.0710,855.1940,10924.6000,2122.1600,1939.2200,...,2783.898049,4960.431833,3447.701267,978.304677,2789.057736,3359.241568,4264.469081,3103.609391,3302.569055,2497.814797
20529,ZZEF1|23140,1157.5400,1895.9900,1482.4500,1140.2000,796.3710,897.7140,1095.7300,1003.6200,904.8630,...,1284.992478,2054.896390,2420.047163,1302.821382,1119.313995,1740.926312,2702.668453,1370.141309,1915.477072,1247.130940


In [17]:
import os
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"

ov_cna_file_name = "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
ov_cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, ov_cna_file_name), sep="\t")

len(set([column[:15] for column in rna_df.columns if column != "gene_id"]).intersection(set([column for column in ov_cna_df.columns if column != "Gene Symbol"])))


301

In [16]:
ov_cna_df

Unnamed: 0,Gene Symbol,TCGA-04-1331-01,TCGA-04-1332-01,TCGA-04-1335-01,TCGA-04-1336-01,TCGA-04-1337-01,TCGA-04-1338-01,TCGA-04-1341-01,TCGA-04-1342-01,TCGA-04-1343-01,...,TCGA-72-4235-01,TCGA-72-4236-01,TCGA-72-4237-01,TCGA-72-4238-01,TCGA-72-4240-01,TCGA-72-4241-01,TCGA-OY-A56P-01,TCGA-OY-A56Q-01,TCGA-VG-A8LO-01,TCGA-WR-A838-01
0,ACAP3,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
1,ACTRT2,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
2,AGRN,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
3,ANKRD65,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
4,ATAD3A,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,IL9R|ENSG00000124334.12,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24772,SPRY3|ENSG00000168939.6,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24773,VAMP7|ENSG00000124333.10,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24774,WASH6P|ENSG00000182484.10,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065


In [None]:
ov_subtype_info_file_name = "ov_subtype_info.tsv"
ov_subtype_info = pd.read_csv(os.path.join(data_dir, raw_folder_name, ov_subtype_info_file_name), sep="\t")
len(set(ov_subtype_info.ID))


In [3]:
import os
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"
cancer_type_raw_cna_file_name_map = {
    "BLCA": "TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "LUSC": "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "OV": "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
}
cancer_type = "OV"
cancer_specific_cna_file_name = cancer_type_raw_cna_file_name_map[cancer_type]
df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_specific_cna_file_name), sep="\t")


In [8]:
import os
import numpy as np
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"

tumor_sample_ids = ["0" + str(i) for i in range(1, 10)]

cancer_type_raw_cna_file_name_map = {
    "BLCA": "TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "LUSC": "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "OV": "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
}

cancer_type = "BLCA"
cancer_specific_cna_file_name = cancer_type_raw_cna_file_name_map[cancer_type]
cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_specific_cna_file_name), sep="\t")



In [9]:

def select_the_ensemble_id_with_maximum_gene_expression(x):
    x_copy = x.drop(columns=["ensembl_id"])
    x_copy = 
    


def get_one_ensemble_id_per_gene(cna_df):
    cna_df_nan_ensemble_id = cna_df[pd.isnull(cna_df["ensemble_id"])]
    assert (cna_df_nan_ensemble_id.groupby("hgnc_symbol").count() != 1).sum() == 0
    cna_df_not_nan_ensemble_id = cna_df[~pd.isnull(cna_df["ensemble_id"])]
    cna_df_not_nan_ensemble_id = cna_df_not_nan_ensemble_id.groupby("hgnc_symbol").apply(lambda x: )
    




Unnamed: 0,hgnc_symbol,TCGA-2F-A9KO-01,TCGA-2F-A9KP-01,TCGA-2F-A9KQ-01,TCGA-2F-A9KR-01,TCGA-2F-A9KT-01,TCGA-2F-A9KW-01,TCGA-4Z-AA7M-01,TCGA-4Z-AA7N-01,TCGA-4Z-AA7O-01,...,TCGA-ZF-AA51-01,TCGA-ZF-AA52-01,TCGA-ZF-AA53-01,TCGA-ZF-AA54-01,TCGA-ZF-AA56-01,TCGA-ZF-AA58-01,TCGA-ZF-AA5H-01,TCGA-ZF-AA5N-01,TCGA-ZF-AA5P-01,ensembl_id
0,ACAP3,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
1,ACTRT2,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
2,AGRN,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
3,ANKRD65,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
4,ATAD3A,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,IL9R,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000124334.12
24772,SPRY3,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000168939.6
24773,VAMP7,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000124333.10
24774,WASH6P,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000182484.10


In [25]:
import os
import numpy as np
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"




def get_one_ensembl_id_per_gene():
    
    
    



In [27]:
cna_df

Unnamed: 0,hgnc_symbol,TCGA-2F-A9KO-01,TCGA-2F-A9KP-01,TCGA-2F-A9KQ-01,TCGA-2F-A9KR-01,TCGA-2F-A9KT-01,TCGA-2F-A9KW-01,TCGA-4Z-AA7M-01,TCGA-4Z-AA7N-01,TCGA-4Z-AA7O-01,...,TCGA-ZF-AA51-01,TCGA-ZF-AA52-01,TCGA-ZF-AA53-01,TCGA-ZF-AA54-01,TCGA-ZF-AA56-01,TCGA-ZF-AA58-01,TCGA-ZF-AA5H-01,TCGA-ZF-AA5N-01,TCGA-ZF-AA5P-01,ensembl_id
0,ACAP3,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
1,ACTRT2,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
2,AGRN,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
3,ANKRD65,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
4,ATAD3A,-0.235,0.575,-0.010,-0.073,-0.037,-0.842,0.031,0.029,0.002,...,0.561,0.337,0.242,-0.366,0.009,0.040,0.117,-0.018,-0.056,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,IL9R,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000124334.12
24772,SPRY3,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000168939.6
24773,VAMP7,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000124333.10
24774,WASH6P,0.017,-0.087,-0.148,-0.103,-0.128,0.542,0.511,0.031,-0.123,...,-0.413,0.069,0.001,0.025,0.037,0.086,0.217,0.056,0.003,ENSG00000182484.10


In [18]:
[column for column in cna_df.columns if column.split("-")[-1] != "01"]

['Gene Symbol']

In [12]:
import os
import pandas as pd

cancer_type_raw_cna_file_name_map = {
    "BLCA": "TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "LUSC": "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "OV": "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
}

cancer_type = "BLCA"

cancer_specific_cna_file_name = cancer_type_raw_cna_file_name_map[cancer_type]
cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_specific_cna_file_name), sep="\t")
cancer_specific_sample_ids = [column for column in cna_df.columns if column.split("-")[-1] in tumor_sample_ids]
cancer_specific_rna_df = rna_df[rna_df.index.map(lambda x: x[:15] in cancer_specific_sample_ids)]



In [13]:
cancer_specific_rna_df


Unnamed: 0,100130426,100133144,100134869,10357,10431,136542,155060,26823,280660,317712,...,55055,11130,7789,158586,79364,440590,79699,7791,23140,26009
TCGA-2F-A9KO-01A-11R-A38B-07,0.0,20.4373,37.8717,123.09,702.041,0.0,406.997,0.5831,0.0,0.0,...,694.857,554.519,102.624,776.676,1450.73,5.8309,697.959,4262.39,1787.76,572.595
TCGA-2F-A9KP-01A-11R-A38B-07,0.0,16.1382,12.5759,137.886,882.231,0.0,182.272,0.8323,0.0,0.0,...,718.119,969.621,37.037,316.688,1420.72,17.062,1526.43,3105.29,1467.75,860.591
TCGA-2F-A9KQ-01A-11R-A38B-07,0.0,13.3333,10.7742,104.678,954.103,0.0,224.849,0.0,0.0,0.0,...,532.536,795.086,60.2689,378.303,873.435,1.8544,1571.16,3275.38,750.58,621.233
TCGA-2F-A9KR-01A-11R-A38B-07,0.0,15.3523,42.581,146.453,487.21,0.0,472.964,2.3743,0.0,0.0,...,616.848,568.412,45.5869,470.115,1924.15,1.8995,748.86,4465.62,910.788,935.481
TCGA-2F-A9KT-01A-11R-A38B-07,0.0,14.0136,17.6427,142.962,954.777,0.0,234.596,0.0,0.0,0.0,...,726.625,1087.05,17.524,281.515,938.383,6.7835,878.462,3449.41,850.198,630.865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01A-31R-A39I-07,0.4411,5.8271,7.8474,185.673,575.21,0.0,88.6634,0.0,0.0,0.0,...,551.566,1201.15,22.4967,220.997,745.038,2.2056,697.839,5666.08,752.977,619.321
TCGA-ZF-AA58-01A-12R-A42T-07,0.4216,1.7032,2.0911,210.181,1595.7,0.0,193.086,0.0,0.0,0.0,...,532.618,1059.44,37.5211,254.637,1037.1,143.339,711.214,9911.05,745.363,548.904
TCGA-ZF-AA5H-01A-11R-A39I-07,0.0,3.3904,1.1387,186.938,1100.57,0.0,136.196,0.0,0.0,0.0,...,520.105,1248.41,19.0869,186.663,498.2,0.3235,671.924,9212.5,911.319,1053.66
TCGA-ZF-AA5N-01A-11R-A42T-07,1.227,0.0,0.0,155.301,1123.93,0.0,233.129,1.227,0.0,0.0,...,496.638,641.718,13.4969,121.472,534.969,15.9509,358.282,6306.75,850.307,380.368


In [None]:
import os
import pandas as pd

data_dir = "../data"
raw_folder_name = "raw"
processed_folder_name = "processed"
rna_file_name = "EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv"

rna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, rna_file_name), sep="\t")
rna_df["gene_id"] = rna_df["gene_id"].apply(lambda x: x.split("|")[1])
rna_df = rna_df.T
rna_df.columns = rna_df.loc["gene_id"].tolist()
rna_df = rna_df.drop("gene_id")

cancer_type_raw_cna_file_name_map = {
    "BLCA": "TCGA.BLCA.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "LUSC": "TCGA.LUSC.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes",
    "OV": "TCGA.OV.sampleMap_Gistic2_CopyNumber_Gistic2_all_data_by_genes"
}

cancer_type_id_mapping = {
    "BLCA": 0,
    "LUSC": 1,
    "OV": 2
}

tumor_sample_ids = ["0" + str(i) for i in range(1, 10)]

def get_cancer_specific_cna_df(cancer_type: str) -> pd.DataFrame:
    cancer_specific_cna_file_name = cancer_type_raw_cna_file_name_map[cancer_type]
    cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_specific_cna_file_name), sep="\t")
    cancer_specific_sample_ids = [column for column in cna_df.columns if column.split("-")[-1] in tumor_sample_ids]
    cancer_specific_rna_df = rna_df[rna_df.index.map(lambda x: x[:15] in cancer_specific_sample_ids)]
    


def get_cancer_specific_rna_df(cancer_type: str) -> pd.DataFrame:
    cancer_specific_cna_file_name = cancer_type_raw_cna_file_name_map[cancer_type]
    cna_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cancer_specific_cna_file_name), sep="\t")
    cancer_specific_sample_ids = [column for column in cna_df.columns if column.split("-")[-1] in tumor_sample_ids]
    cancer_specific_rna_df = rna_df[rna_df.index.map(lambda x: x[:15] in cancer_specific_sample_ids)]
    
    sample_id_dict = {}
    for index in rna_df.index:
        index_first_15 = index[:15]
        if index_first_15 in sample_id_dict.keys():
            if index < sample_id_dict[index_first_15]:
                sample_id_dict[index_first_15] = index
            else:
                continue
        else:
            sample_id_dict[index_first_15] = index
    
    cancer_specific_rna_df = cancer_specific_rna_df[cancer_specific_rna_df.index.map(lambda x: x in sample_id_dict.values())]
    cancer_specific_rna_df.index = cancer_specific_rna_df.index.map(lambda x: x[:15])
    cancer_specific_id = cancer_type_id_mapping[cancer_type]
    cancer_specific_rna_df["cancer_id"] = cancer_specific_id
    return cancer_specific_rna_df

rna_blca_df = get_cancer_specific_rna_df(cancer_type="BLCA")
rna_lusc_df = get_cancer_specific_rna_df(cancer_type="LUSC")
rna_ov_df = get_cancer_specific_rna_df(cancer_type="OV")

rna_blca_genes_without_nans = set(column for column in rna_blca_df.dropna(axis=1).columns if column != "cancer_id")
rna_lusc_genes_without_nans = set(column for column in rna_lusc_df.dropna(axis=1).columns if column != "cancer_id")
rna_ov_genes_without_nans = set(column for column in rna_ov_df.dropna(axis=1).columns if column != "cancer_id")

selected_genes = rna_blca_genes_without_nans.intersection(rna_lusc_genes_without_nans)
selected_genes = list(selected_genes.intersection(rna_ov_genes_without_nans))

rna_blca_df = rna_blca_df[["cancer_id"] + selected_genes]
rna_lusc_df = rna_lusc_df[["cancer_id"] + selected_genes]
rna_ov_df = rna_ov_df[["cancer_id"] + selected_genes]


#     
#     cancer_specific
    
    

# blca_rna_df = get_cancer_specific_rna_df(cancer_type="BLCA")
    

# cna_blca_df = 
# cna_lusc_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_lusc_file_name), sep="\t")
# cna_ov_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, cna_ov_file_name), sep="\t")


# blca_samples = 
# lusc_samples = [column for column in cna_lusc_df.columns if column.split("-")[-1] in tumor_sample_ids]
# ov_samples = [column for column in cna_ov_df.columns if column.split("-")[-1] in tumor_sample_ids]

# output_file_name = "rna.tsv"

# 

# cancer_symbol_id_map = {
#     "BLCA": 0,
#     "LUSC": 1,
#     "OV": 2
# }

# rna_blca_df["cancer_type"] = cancer_symbol_id_map["BLCA"]
# rna_lusc_df["cancer_type"] = cancer_symbol_id_map["LUSC"]
# rna_ov_df["cancer_type"] = cancer_symbol_id_map["OV"]


# # rna_lusc_df = rna_df[["gene_id"] + [column for column in rna_df.columns if column[:15] in lusc_samples]]
# # rna_lusc_df["cancer_type"] = cancer_symbol_id_map["LUSC"]

# # rna_ov_df = rna_df[["gene_id"] + [column for column in rna_df.columns if column[:15] in ov_samples]]
# # rna_ov_df["cancer_type"] = cancer_symbol_id_map["OV"]


# # # rna_blca_lusc_ov_df = rna_df[[column for column in rna_df.columns if column[:15] in blca_lusc_ov_samples]]

# # # We take only the tumor samples.
# # # If there are multiple samples which have the same project id, TSS id, participant id, and sample id,
# # # then we take the one with lexicographically first vial id. If there is also a tie in vial ids, then we
# # # take the one with (numerically, lexicographically) first (portion id, analyte id).

# # # column_dict = {}
# # # for column in rna_df.columns:
# # #     if column == "gene_id":
# # #         continue

# # #     if column.split("-")[3][:2] not in tumor_sample_ids:
# # #         continue

# # #     column_first_15 = column[:15]
# # #     if column_first_15 in column_dict.keys():
# # #         if column < column_dict[column_first_15]:
# # #             column_dict[column_first_15] = column
# # #         else:
# # #             continue
# # #     else:
# # #         column_dict[column_first_15] = column

# # # columns_before = rna_df.shape[0]
# # # rna_df = rna_df[["gene_id"] + list(column_dict.values())]
# # # columns_after = rna_df.shape[1]
# # # print(f"Number of samples dropped: {columns_before - columns_after}")

# # # rna_df.columns = rna_df.columns.map(lambda x: x[:15])

# # # # Drop the genes with nan values
# # # rows_before = rna_df.shape[0]
# # # rna_df = rna_df.dropna()
# # # rows_after = rna_df.shape[0]

# # # print(f"Number of genes dropped: {rows_before - rows_after}")

# # # 
# # # rna_df = rna_df.T
# # # rna_df.columns = rna_df.loc["gene_id", :].values
# # # rna_df = rna_df.drop("gene_id")

# # # rna_blca_df.to_csv(os.path.join(data_dir, processed_folder_name, "rna_blca.tsv"), sep="\t")
# # # rna_blca_lusc_ov_df.to_csv(os.path.join(data_dir, processed_folder_name, "rna_blca_lusc_ov.tsv"), sep="\t")




In [2]:
rna_blca_df = rna_blca_df[["cancer_id"] + selected_genes]
rna_lusc_df = rna_lusc_df[["cancer_id"] + selected_genes]
rna_ov_df = rna_ov_df[["cancer_id"] + selected_genes]


In [3]:
rna_blca_df

Unnamed: 0,cancer_id,26220,6167,5792,1048,10809,23635,151393,724102,93556,...,51096,9420,55869,285555,203245,64897,5689,84515,346007,6599
TCGA-2F-A9KO-01,0,4.0816,17261.8,9304.96,1388.34,557.434,199.417,89.7959,12.1399,0.0,...,906.122,47.8134,401.166,1.7493,274.636,254.227,2155.69,494.461,8.1633,2893.88
TCGA-2F-A9KP-01,0,2.913,18470.2,36166.9,2.913,862.672,59.0928,61.5897,46.6875,3.7453,...,908.448,5.8261,387.849,0.4161,284.644,230.129,3184.77,362.88,4.9938,7364.13
TCGA-2F-A9KQ-01,0,11.1266,12731.1,22019.5,554.937,2545.2,200.278,37.5522,14.6685,11.5902,...,884.562,50.0695,350.95,1.3908,180.807,454.798,3751.04,247.566,11.5902,4118.68
TCGA-2F-A9KR-01,0,62.2071,15449.7,16615.5,2746.61,745.061,216.063,74.5536,30.4388,0.0,...,1047.07,107.319,419.305,0.9497,246.929,134.861,1684.34,210.365,4.7486,3947.54
TCGA-2F-A9KT-01,0,2256.6,32531.4,12169.0,704.353,2500.85,34.4828,23.7422,11.5885,0.0,...,584.511,9.0447,325.608,0.5653,283.776,196.156,3058.79,279.254,6.2182,4168.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,0,10.1456,20182.2,13210.0,24877.4,1050.73,199.824,91.7512,8.3194,0.4411,...,910.013,67.049,261.579,0.0,302.603,218.35,5419.5,273.048,1.3233,2393.91
TCGA-ZF-AA58-01,0,76.7285,17615.5,10836.0,1.6863,639.966,45.5312,56.0708,3.9713,0.0,...,992.833,29.511,364.671,0.4216,196.037,223.862,4580.52,663.997,3.3727,3927.49
TCGA-ZF-AA5H-01,0,24.91,19008.9,12204.6,0.3235,1685.79,179.546,19.0869,2.3713,1.6175,...,1241.3,21.3514,231.307,0.0,232.601,282.098,2969.79,625.986,3.5586,1872.13
TCGA-ZF-AA5N-01,0,53.9877,24116.6,9538.65,69.9387,3440.49,203.681,35.5828,8.589,9.816,...,709.202,2.454,213.497,1.227,247.853,214.724,4112.88,246.626,1.227,2457.67


In [4]:
rna_lusc_df

Unnamed: 0,cancer_id,26220,6167,5792,1048,10809,23635,151393,724102,93556,...,51096,9420,55869,285555,203245,64897,5689,84515,346007,6599
TCGA-18-3406-01,1,85.0394,19432.3,4011.81,4314.96,1079.53,735.433,239.37,8.6614,0.0,...,1601.57,131.496,258.268,0.7874,154.331,245.669,11203.1,208.661,12.5984,1353.54
TCGA-18-3407-01,1,18.5538,11107.5,12282.1,1661.28,531.399,505.233,56.137,16.8078,0.0,...,962.417,73.7393,284.015,0.4757,260.704,193.625,3354.42,338.249,4.7574,2959.56
TCGA-18-3408-01,1,46.2702,14493.8,4436.6,606.258,1123.54,688.714,247.961,13.6438,1.7796,...,1193.53,104.405,325.078,0.0,136.438,166.691,6805.87,435.414,36.7789,1964.7
TCGA-18-3409-01,1,176.41,8519.49,12577.4,7.1795,634.872,422.564,110.256,4.1026,0.0,...,997.949,96.4103,277.436,3.0769,209.231,267.18,3152.82,271.795,11.7949,4461.54
TCGA-18-3410-01,1,22.8454,10689.7,26295.9,9.8472,3558.37,508.901,105.168,11.4227,1.5755,...,1438.08,69.324,265.086,0.7878,200.882,300.142,8544.58,796.045,16.9371,3124.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-O2-A52S-01,1,50.9106,5417.22,9300.08,110.513,1794.29,216.887,26.4901,49.7889,0.0,...,1838.16,19.0397,164.321,0.8278,239.652,397.765,4372.1,2185.84,14.9007,4804.22
TCGA-O2-A52V-01,1,125.223,9880.03,24764.8,5811.11,452.989,517.701,57.1489,7.0344,0.0,...,966.488,71.0159,367.265,1.6808,214.728,171.867,3525.58,560.143,32.7766,2463.7
TCGA-O2-A52W-01,1,16.1204,14624.9,12086.5,4383.66,315.959,909.726,71.467,12.8157,1.0747,...,807.093,238.044,348.737,1.0747,269.21,134.336,3509.4,998.925,5.9108,2836.11
TCGA-O2-A5IB-01,1,107.817,7304.58,28915.1,1.8637,146.965,480.298,60.9691,38.4239,0.2662,...,1098.78,123.802,411.874,0.5325,173.855,336.795,4549.52,1233.23,14.1108,5179.71


In [5]:
rna_ov_df

Unnamed: 0,cancer_id,26220,6167,5792,1048,10809,23635,151393,724102,93556,...,51096,9420,55869,285555,203245,64897,5689,84515,346007,6599
TCGA-09-2056-01,2,17.3512,42736.8,10804.6,7.3835,1947.76,1098.29,45.4084,22.5233,0.0,...,1234.89,7.7527,242.178,2.5842,115.921,463.683,3447.35,180.157,3.6917,2301.06
TCGA-24-2036-01,2,133.018,7366.3,15297.3,3.8727,370.022,2049.38,47.8811,34.8617,5.6331,...,512.61,82.0316,234.125,3.8727,225.675,286.231,1187.52,421.072,12.6744,3447.79
TCGA-04-1348-01,2,2.863934,17765.965465,5360.464661,1.135489,680.586145,287.028732,44.109155,88.805519,0.001344,...,1044.407337,5.199249,604.097562,0.853963,169.987804,375.998776,2770.108218,601.776729,1.735014,3953.869229
TCGA-04-1357-01,2,9.931712,19921.209171,15318.024901,447.64579,714.452304,648.822477,47.016021,50.613912,-0.445761,...,618.948277,6.597134,289.901411,0.205595,178.869497,253.329331,2828.954171,159.61418,8.247268,3234.405793
TCGA-04-1362-01,2,32.003048,31151.235527,12910.456498,177.173598,1750.751396,375.285613,23.574795,16.717439,-0.262307,...,724.820011,63.833537,439.766368,0.604646,146.924881,283.576963,2820.466774,522.778206,6.173802,3296.288971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-61-2113-01,2,9.169944,6427.638857,12481.134213,6.595177,660.917729,597.468581,74.60573,38.115805,1.103616,...,769.377661,33.745882,150.301967,0.205595,226.572936,244.81345,3828.122894,622.971693,3.481292,4184.288303
TCGA-OY-A56P-01,2,21.903496,31608.134371,28954.228424,0.397578,216.65826,532.254358,68.891488,73.983087,1.331131,...,905.100796,11.598872,274.701489,2.460177,149.080641,318.618721,1342.864823,323.086788,14.123664,11070.963481
TCGA-OY-A56Q-01,2,9.310729,22458.251336,22817.765886,7.195545,436.161464,288.326217,122.677115,25.585509,-0.076415,...,497.633877,0.07027,112.641175,1.009003,155.851268,455.939363,2929.550035,383.773198,6.906894,14216.808615
TCGA-VG-A8LO-01,2,5.119328,22556.248181,16291.623608,11.352935,935.057446,93.741853,73.08585,174.49918,-0.26724,...,558.344099,2.129889,249.360527,1.758762,186.170743,166.260635,4211.524219,484.850786,4.333901,5578.54275


In [6]:
clinical_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, clinical_file_name), sep="\t", encoding="latin")
clinical_df


  clinical_df = pd.read_csv(os.path.join(data_dir, raw_folder_name, clinical_file_name), sep="\t", encoding="latin")


Unnamed: 0,bcr_patient_uuid,bcr_patient_barcode,acronym,gender,vital_status,days_to_birth,days_to_death,days_to_last_followup,days_to_initial_pathologic_diagnosis,age_at_initial_pathologic_diagnosis,...,total_bilirubin_upper_limit,platelet_result_count,fibrosis_ishak_score,fetoprotein_outcome_value,fetoprotein_outcome_upper_limit,fetoprotein_outcome_lower_limit,inter_norm_ratio_lower_limit,family_cancer_type_txt,bilirubin_upper_limit,days_to_last_known_alive
0,B3164F7B-C826-4E08-9EE6-8FF96D29B913,TCGA-OR-A5J1,ACC,MALE,Dead,-21496,1355.0,[Not Available],0,58,...,,,,,,,,,,
1,8E7C2E31-D085-4B75-A970-162526DD07A0,TCGA-OR-A5J2,ACC,FEMALE,Dead,-16090,1677,[Not Available],0,44,...,,,,,,,,,,
2,DFD687BC-6E69-42F7-AF94-D17FC150D1A1,TCGA-OR-A5J3,ACC,FEMALE,Alive,-8624,[Not Applicable],2091.0,0,23,...,,,,,,,,,,
3,5F3E2974-F1DF-47A2-8A8A-29BB525EEEF6,TCGA-OR-A5J4,ACC,FEMALE,Dead,-8451,423,[Not Available],0,23,...,,,,,,,,,,
4,802DBD0D-EF07-4C91-AB8D-1DD39532E947,TCGA-OR-A5J5,ACC,MALE,Dead,-11171,365,[Not Available],0,30,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10951,E00CE846-9A0E-48B5-BE9F-84D443A0F78A,TCGA-YZ-A980,UVM,MALE,Alive,-27716,[Not Applicable],1601.0,0,75,...,,,,,,,,,,
10952,CBA920F4-C57F-47BC-958D-9B7872DF01C8,TCGA-YZ-A982,UVM,FEMALE,Alive,-28938,[Not Applicable],495.0,0,79,...,,,,,,,,,,
10953,DF291BBF-FC62-4C40-9582-289AC78225FD,TCGA-YZ-A983,UVM,FEMALE,Alive,-18769,[Not Applicable],547.0,0,51,...,,,,,,,,,,
10954,743FC661-9BA2-4FA8-966E-508FB4B965E0,TCGA-YZ-A984,UVM,FEMALE,Dead,-18342,1396.0,1280.0,0,50,...,,,,,,,,,,
