## Download data

In [1]:
# Download the five gzip-compressed input tables. `-O <name>` sets the local
# file name for each URL. The first four URLs point at the tcga-xena-hub S3
# bucket and the last one at the gdc-hub bucket.
# "%2F" is a URL-encoded "/". Three lines write it as "\%2F" and one as "%2F";
# the logged request for the first URL shows plain "%2F", i.e. the backslash
# is not sent as part of the URL.
!wget -O CNV.gz https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap\%2FGistic2_CopyNumber_Gistic2_all_data_by_genes.gz
!wget -O Methyl.gz https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap\%2FHumanMethylation27.gz
!wget -O mRNA.gz https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap\%2FHT_HG-U133A.gz
!wget -O RNAseq.gz https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap%2FHiSeqV2_PANCAN.gz
!wget -O gdc_methyl.gz https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-OV.methylation27.tsv.gz

--2021-11-14 22:05:04--  https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap%2FGistic2_CopyNumber_Gistic2_all_data_by_genes.gz
Resolving tcga-xena-hub.s3.us-east-1.amazonaws.com (tcga-xena-hub.s3.us-east-1.amazonaws.com)... 52.217.84.184
Connecting to tcga-xena-hub.s3.us-east-1.amazonaws.com (tcga-xena-hub.s3.us-east-1.amazonaws.com)|52.217.84.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1633969 (1.6M) [binary/octet-stream]
Saving to: ‘CNV.gz’


2021-11-14 22:05:04 (4.62 MB/s) - ‘CNV.gz’ saved [1633969/1633969]

--2021-11-14 22:05:05--  https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.OV.sampleMap%2FHumanMethylation27.gz
Resolving tcga-xena-hub.s3.us-east-1.amazonaws.com (tcga-xena-hub.s3.us-east-1.amazonaws.com)... 52.217.84.184
Connecting to tcga-xena-hub.s3.us-east-1.amazonaws.com (tcga-xena-hub.s3.us-east-1.amazonaws.com)|52.217.84.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322

## Read data into DataFrame

In [2]:
# Decompress exactly the five downloaded archives (not every *.gz that happens
# to be in the working directory). `-f` overwrites outputs left by a previous
# run, so the cell can be re-executed after a kernel restart.
!gunzip -f CNV.gz Methyl.gz mRNA.gz RNAseq.gz gdc_methyl.gz

In [3]:
import pandas as pd

# Names of the decompressed, tab-separated input files.
data_cnv_file = "CNV"
data_mRNA_file = "mRNA"
data_RNAseq_file = "RNAseq"
data_methyl_file = "Methyl"
data_gdc_methyl_file = "gdc_methyl"

# Dictionary key -> file to load; insertion order fixes the order of df_list.
table_sources = {
    "cnv": data_cnv_file,
    "mrna": data_mRNA_file,
    "rnaseq": data_RNAseq_file,
    "methyl": data_methyl_file,
    "gdc_methyl": data_gdc_methyl_file,
}

# Load every table with its first column as the row index.
df_list = {
    label: pd.read_csv(path, sep="\t", index_col=0)
    for label, path in table_sources.items()
}

# Report (rows, columns) of each loaded table.
for key in df_list:
    print(f"{key} \t {df_list[key].shape}")

cnv 	 (24776, 579)
mrna 	 (12042, 593)
rnaseq 	 (20530, 308)
methyl 	 (27578, 616)
gdc_methyl 	 (27578, 613)


## Intersections

### Intersection \#1: CNV + mRNA + DNA methylation

In [11]:
# Column labels present in all three tables (CNV, mRNA, methylation).
# Intersecting the column indexes directly avoids stacking the three full
# tables just to read the shared labels; the order follows the CNV table.
col_idx = (
    df_list["cnv"].columns
    .intersection(df_list["mrna"].columns)
    .intersection(df_list["methyl"].columns)
)
print(len(col_idx))

555
Index(['TCGA-04-1331-01', 'TCGA-04-1332-01', 'TCGA-04-1335-01',
       'TCGA-04-1336-01', 'TCGA-04-1337-01', 'TCGA-04-1338-01',
       'TCGA-04-1341-01', 'TCGA-04-1342-01', 'TCGA-04-1343-01',
       'TCGA-04-1346-01',
       ...
       'TCGA-61-2101-01', 'TCGA-61-2102-01', 'TCGA-61-2104-01',
       'TCGA-61-2109-01', 'TCGA-61-2110-01', 'TCGA-61-2111-01',
       'TCGA-61-2113-01', 'TCGA-61-2612-01', 'TCGA-61-2613-01',
       'TCGA-61-2614-01'],
      dtype='object', length=555)


In [80]:
# Keep only the shared columns in each table of intersection #1 and store the
# results under the "_1" keys.
df_list.update({
    "cnv_1": df_list["cnv"].loc[:, col_idx],
    "mrna_1": df_list["mrna"].loc[:, col_idx],
    "methyl_1": df_list["methyl"].loc[:, col_idx],
})

### Intersection \#2: CNV + RNAseq + DNA methylation

In [81]:
# Column labels present in all three tables (CNV, RNAseq, methylation).
# Intersecting the column indexes directly avoids stacking the three full
# tables just to read the shared labels; the order follows the CNV table.
col_idx = (
    df_list["cnv"].columns
    .intersection(df_list["rnaseq"].columns)
    .intersection(df_list["methyl"].columns)
)
print(len(col_idx))

292


In [82]:
# Keep only the shared columns in each table of intersection #2 and store the
# results under the "_2" keys.
df_list.update({
    "cnv_2": df_list["cnv"].loc[:, col_idx],
    "rnaseq_2": df_list["rnaseq"].loc[:, col_idx],
    "methyl_2": df_list["methyl"].loc[:, col_idx],
})

## Zero/Missing/NA values removal

In [84]:
def remove_missing_zero_data(df):
    """Drop rows that contain any missing value or consist only of zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have no NaN and at least one non-zero entry,
        in their original order.
    """
    complete_rows = df.dropna(axis=0, how='any')
    # After dropna no NaN remains, so a plain "!= 0" mask marks the rows that
    # carry at least one non-zero value.
    has_nonzero = (complete_rows != 0).any(axis=1)
    return complete_rows.loc[has_nonzero]

In [85]:
import numpy as np

# Clean every table in place in the dictionary and report its new shape.
# Only existing keys are reassigned, so iterating over items() is safe.
for key, frame in df_list.items():
    df_list[key] = remove_missing_zero_data(frame)
    print(f"{key} \t {df_list[key].shape}")

cnv 	 (24776, 579)
mrna 	 (12042, 593)
rnaseq 	 (20318, 308)
methyl 	 (21666, 616)
gdc_methyl 	 (21675, 613)
cnv_1 	 (24776, 555)
mrna_1 	 (12042, 555)
methyl_1 	 (21694, 555)
cnv_2 	 (24776, 292)
rnaseq_2 	 (20318, 292)
methyl_2 	 (22528, 292)


## Normalisation

In [86]:
def min_max_normalization(df):
    """Rescale every column of ``df`` to the range [0, 1].

    Each column is transformed independently as
    ``(x - column_min) / (column_max - column_min)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to rescale. It is left unmodified; a new frame is
        returned instead of overwriting the caller's columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``. A column whose
        values are all equal has zero range and comes back as NaN.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # DataFrame/Series arithmetic aligns the per-column statistics on the
    # column labels, so all columns are rescaled in one vectorised step.
    return (df - col_min) / col_range

In [87]:
# Replace every table in the dictionary with its min-max normalised version.
# Only existing keys are reassigned, so iterating over items() is safe.
for table_name, table in df_list.items():
    df_list[table_name] = min_max_normalization(table)

## Concatenation for multi-omics data

In [88]:
# Pairwise combinations: output key -> the two df_list tables stacked row-wise,
# in the order they are concatenated.
pairwise_parts = {
    "mrna_methyl": ("mrna_1", "methyl_1"),
    "methyl_cnv": ("cnv_1", "methyl_1"),
    "cnv_mrna": ("cnv_1", "mrna_1"),
    "methyl_rnaseq": ("methyl_2", "rnaseq_2"),
    "cnv_rnaseq": ("cnv_2", "rnaseq_2"),
}

# NOTE(review): rows are stacked without tagging which table they came from;
# if two tables share row labels the result has repeated labels. Confirm that
# downstream consumers expect this.
final_list = {
    combo: pd.concat([df_list[part] for part in parts], axis=0, join="inner")
    for combo, parts in pairwise_parts.items()
}

# Three-way combinations extend a pairwise result with the remaining table.
final_list["cnv_methyl_mrna"] = pd.concat(
    [final_list["methyl_cnv"], df_list["mrna_1"]], axis=0, join="inner"
)
final_list["cnv_methyl_rnaseq"] = pd.concat(
    [final_list["methyl_rnaseq"], df_list["cnv_2"]], axis=0, join="inner"
)

# Report (rows, columns) of each combined table.
for key in final_list:
    print(f"{key} \t {final_list[key].shape}")

mrna_methyl 	 (33736, 555)
methyl_cnv 	 (46470, 555)
cnv_mrna 	 (36818, 555)
methyl_rnaseq 	 (42846, 292)
cnv_rnaseq 	 (45094, 292)
cnv_methyl_mrna 	 (58512, 555)
cnv_methyl_rnaseq 	 (67622, 292)


In [89]:
# Write each combined table to "<key>.csv" in the working directory.
for key, combined in final_list.items():
    combined.to_csv(f'{key}.csv')

In [90]:
# Write each single-omics table to "<key>.csv" in the working directory.
for key, table in df_list.items():
    table.to_csv(f'{key}.csv')