## Preprocess Gene Copy Number (GCN) data to match the samples of the existing GE and ME datasets

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os

In [2]:
# Load the tab-separated copy-number table (file name suggests GISTIC2 gene-level data — confirm),
# using its first column as the row index; missing entries are replaced with 0.
gcn_path = "/home/bram/jointomicscomp/data/Gistic2_CopyNumber_Gistic2_all_data_by_genes"
gcn = pd.read_csv(gcn_path, sep="\t", index_col=0)
gcn = gcn.fillna(0)

## Preprocess GCN

In [3]:
def readRNA(data, Ngenes=5000, geneCriterion='pmad', scaling='min-max'):
	"""
	Select the most variable genes of a genes x samples table and scale them.

	works for the dataset downloaded from Xena

	data:			pandas DataFrame with one row per gene and one column per sample
	Ngenes:			int, number of top most variant genes to keep
	geneCriterion:	str, currently only 'pmad' is supported (mean absolute deviation from the mean)
	scaling:		str or None; 'min-max', 'z-score', or 'none' / None to skip scaling

	Returns a samples x genes DataFrame restricted to the selected genes, ordered
	from most to least variable. Unless scaling is disabled, every gene (column)
	is scaled independently across the samples.

	Raises NotImplementedError for an unsupported geneCriterion and ValueError
	for an unsupported scaling.
	"""

	if geneCriterion != 'pmad':
		raise NotImplementedError

	# Resolve the scaler up front so that a bad `scaling` value fails before any work is done.
	if scaling is None or scaling == 'none':
		scaler = None
	elif scaling == 'min-max':
		scaler = MinMaxScaler()
	elif scaling == 'z-score':
		scaler = StandardScaler()
	else:
		raise ValueError("scaling must be 'none', 'min-max' or 'z-score', got {!r}".format(scaling))

	# DataFrame.mad() was removed in pandas 2.0; compute the same quantity explicitly:
	# mean absolute deviation around the per-gene mean, over numeric columns only.
	numeric = data.select_dtypes(include='number')
	row_means = numeric.mean(axis=1)
	mad_genes = numeric.sub(row_means, axis=0).abs().mean(axis=1).sort_values(ascending=False)

	selectedGenes = mad_genes.iloc[:Ngenes].index

	# Transpose so that rows are samples and columns are the selected genes.
	data = data.loc[selectedGenes].T

	if scaler is not None:
		scaledData = scaler.fit_transform(data)
		# fit_transform returns a bare array; restore the sample / gene labels.
		data = pd.DataFrame(scaledData, columns=data.columns, index=data.index)

	return data

In [4]:
# Keep the 5000 most variable genes and min-max scale them (the readRNA defaults, spelled out)
gcn_minmax5000 = readRNA(gcn, Ngenes=5000, geneCriterion='pmad', scaling='min-max')

In [5]:
# Show the scaled frame returned by readRNA (rows: samples, columns: selected genes)
gcn_minmax5000

Sample,MYC,PVT1,CASC8,POU5F1B,CCAT1,TMEM75,MIR1205,MIR1207,PCAT1,MIR1208,...,NSFL1C,CENPV,RBP2,INTS6,FAM124A,ELP5,SNPH,SNRPB,RN7SL561P,SIRPB1
TCGA-A5-A0GI-01,0.213333,0.258914,0.261010,0.214347,0.214347,0.258914,0.253317,0.253317,0.261010,0.253317,...,0.263232,0.264040,0.225888,0.261010,0.261010,0.278500,0.263232,0.256776,0.263232,0.263232
TCGA-S9-A7J2-01,0.211183,0.256888,0.258990,0.212199,0.212199,0.256888,0.251276,0.251276,0.258990,0.251276,...,0.267879,0.274949,0.226100,0.258586,0.258586,0.290006,0.267879,0.261463,0.267879,0.267879
TCGA-06-0150-01,0.211398,0.257091,0.259192,0.212414,0.212414,0.257091,0.251480,0.251480,0.259192,0.251480,...,0.394747,0.264242,0.231387,0.260202,0.260202,0.278713,0.394747,0.389444,0.394747,0.394747
TCGA-AR-A1AH-01,0.047742,0.102917,0.105455,0.048969,0.048969,0.102917,0.096142,0.096142,0.105455,0.096142,...,0.277374,0.107475,0.067682,0.107071,0.107071,0.113360,0.277374,0.271041,0.277374,0.277374
TCGA-EK-A2RE-01,0.214409,0.259927,0.262020,0.215421,0.215421,0.259927,0.254338,0.254338,0.262020,0.254338,...,0.257576,0.256364,0.233080,0.097778,0.097778,0.270403,0.257576,0.251070,0.257576,0.257576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-IB-7885-01,0.213548,0.259117,0.261212,0.214562,0.214562,0.259117,0.253521,0.253521,0.261212,0.253521,...,0.244242,0.253333,0.215102,0.234141,0.234141,0.267206,0.244242,0.237620,0.244242,0.244242
TCGA-95-7947-01,0.211183,0.256888,0.258990,0.212199,0.212199,0.256888,0.251276,0.251276,0.258990,0.251276,...,0.257172,0.126869,0.356176,0.266263,0.266263,0.133816,0.257172,0.250662,0.257172,0.257172
TCGA-VQ-AA6F-01,0.338495,0.376823,0.378586,0.339347,0.339347,0.376823,0.372117,0.372117,0.378586,0.372117,...,0.281212,0.217778,0.320643,0.266263,0.266263,0.229704,0.281212,0.274913,0.281212,0.281212
TCGA-BR-8588-01,0.262151,0.304903,0.306869,0.263101,0.263101,0.304903,0.299653,0.299653,0.306869,0.299653,...,0.262424,0.262020,0.248731,0.334747,0.334747,0.276369,0.262424,0.255961,0.262424,0.262424


In [8]:
# Sample identifiers stored with the existing data (presumably the row labels of GE.npy / ME.npy — confirm)
sample_names_path = "/home/bram/jointomicscomp/data/sampleNames.npy"
samples_ge_me = np.load(sample_names_path)
samples_ge_me

array(['TCGA-05-4384-01', 'TCGA-05-4390-01', 'TCGA-05-4396-01', ...,
       'TCGA-ZU-A8S4-01', 'TCGA-ZU-A8S4-11', 'TCGA-ZX-AA5X-01'],
      dtype='<U15')

In [7]:
# Sample IDs present both in the copy-number frame and in samples_ge_me;
# np.intersect1d returns them sorted and without duplicates.
gcn_sample_ids = gcn_minmax5000.index.values
common = np.intersect1d(gcn_sample_ids, samples_ge_me)
print("Samples in common between all three datasets: {}".format(len(common)))

Samples in common between all three datasets: 8417


In [9]:
# Restrict the copy-number frame to the shared samples, in the order given by `common`
gcn_common = gcn_minmax5000.loc[common, :]

In [10]:
# Show the copy-number frame restricted to the common samples
gcn_common

Sample,MYC,PVT1,CASC8,POU5F1B,CCAT1,TMEM75,MIR1205,MIR1207,PCAT1,MIR1208,...,NSFL1C,CENPV,RBP2,INTS6,FAM124A,ELP5,SNPH,SNRPB,RN7SL561P,SIRPB1
TCGA-05-4384-01,0.213548,0.259117,0.261212,0.214562,0.214562,0.259117,0.253521,0.253521,0.261212,0.253521,...,0.263434,0.263636,0.226100,0.260808,0.260808,0.278074,0.263434,0.256980,0.263434,0.263434
TCGA-05-4390-01,0.626452,0.648096,0.649091,0.626933,0.626933,0.648096,0.645438,0.645438,0.649091,0.645438,...,0.266667,0.269091,0.204526,0.234343,0.234343,0.283827,0.266667,0.260240,0.266667,0.266667
TCGA-05-4396-01,0.216344,0.261750,0.263838,0.217354,0.217354,0.261750,0.256175,0.256175,0.263838,0.256175,...,0.271717,0.121616,0.221024,0.256970,0.256970,0.128276,0.271717,0.265335,0.271717,0.271717
TCGA-05-4405-01,0.240860,0.284846,0.286869,0.241838,0.241838,0.284846,0.279445,0.279445,0.286869,0.279445,...,0.249091,0.280202,0.234772,0.196970,0.196970,0.295547,0.249091,0.242511,0.249091,0.249091
TCGA-05-4410-01,0.295914,0.336710,0.338586,0.296821,0.296821,0.336710,0.331700,0.331700,0.338586,0.331700,...,0.325455,0.211919,0.256134,0.254141,0.254141,0.223524,0.325455,0.319544,0.325455,0.325455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZS-A9CF-01,0.000000,0.057942,0.060606,0.001289,0.001289,0.057942,0.050827,0.050827,0.060606,0.050827,...,0.252121,0.066869,0.419416,0.249293,0.249293,0.065843,0.252121,0.258406,0.252121,0.252121
TCGA-ZS-A9CG-01,0.696344,0.713938,0.714747,0.696735,0.696735,0.713938,0.711778,0.711778,0.714747,0.711778,...,0.265657,0.255556,0.223139,0.207071,0.207071,0.269550,0.265657,0.259222,0.265657,0.265657
TCGA-ZT-A8OM-01,0.212258,0.257901,0.260000,0.213273,0.213273,0.257901,0.252296,0.252296,0.260000,0.252296,...,0.262424,0.262222,0.226734,0.262020,0.262020,0.276582,0.262424,0.255961,0.262424,0.262424
TCGA-ZU-A8S4-01,0.215484,0.260940,0.263030,0.216495,0.216495,0.260940,0.255358,0.255358,0.263030,0.255358,...,0.263838,0.256970,0.226946,0.236162,0.236162,0.271042,0.263838,0.257387,0.263838,0.263838


In [13]:
# Output directory for the aligned arrays. Create it up front so the np.save calls
# further down do not fail with FileNotFoundError when the directory is missing.
save_dir = "/home/bram/jointomicscomp/data2"
os.makedirs(save_dir, exist_ok=True)

In [14]:
# Store the copy-number values as a plain array (row / column labels are saved separately)
gcn_out_path = os.path.join(save_dir, "GCN.npy")
np.save(gcn_out_path, gcn_common.to_numpy())

In [54]:
# Convert the shared sample IDs to a string array once, save it, then echo it
common_names = common.astype(str)
np.save(os.path.join(save_dir, "sampleNames.npy"), common_names)
common_names

array(['TCGA-05-4384-01', 'TCGA-05-4390-01', 'TCGA-05-4396-01', ...,
       'TCGA-ZT-A8OM-01', 'TCGA-ZU-A8S4-01', 'TCGA-ZX-AA5X-01'],
      dtype='<U15')

In [46]:
# Save the gene names as a fixed-width string array, like sampleNames.npy.
# The raw column values are typically object dtype, which np.save can only store
# via pickle and np.load then rejects unless allow_pickle=True is passed.
np.save(os.path.join(save_dir, "GCN_featureNames.npy"), gcn_common.columns.values.astype(str))

In [26]:
# Positions in samples_ge_me of the common samples, listed in the same order as `common`.
# `common` is sorted (np.intersect1d output) and defines the row order of gcn_common and of
# the saved sampleNames.npy. Taking the positions in samples_ge_me order instead would only
# line up with it if samples_ge_me were itself sorted and free of duplicates.
_, common_indices, _ = np.intersect1d(samples_ge_me, common, return_indices=True)
assert np.array_equal(samples_ge_me[common_indices], common), \
	"rows selected from GE/ME would not match the order of the common sample names"

In [37]:
# Load the existing GE and ME matrices; their rows are later selected with common_indices
ge_path = "/home/bram/jointomicscomp/data/GE.npy"
me_path = "/home/bram/jointomicscomp/data/ME.npy"
GE = np.load(ge_path)
ME = np.load(me_path)

In [42]:
# Select the rows of GE for the common samples with a single fancy-index
# (returns a copy; an empty selection keeps its 2-D shape instead of collapsing to (0,)).
GE_3datasets = GE[common_indices]
GE_3datasets.shape

(8417, 5000)

In [43]:
# Select the rows of ME for the common samples with a single fancy-index
# (returns a copy; an empty selection keeps its 2-D shape instead of collapsing to (0,)).
ME_3datasets = ME[common_indices]
ME_3datasets.shape

(8417, 5000)

In [44]:
# Write the GE and ME matrices restricted to the common samples into save_dir
for out_name, matrix in (("GE.npy", GE_3datasets), ("ME.npy", ME_3datasets)):
	np.save(os.path.join(save_dir, out_name), matrix)