In [38]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [39]:
# Training expression matrices (gzip-compressed, tab-separated), one file per cohort.
tcga_train_file_location = 'data/train_tcga_expression_matrix_processed.tsv.gz'
target_train_file_location = 'data/train_target_expression_matrix_processed.tsv.gz'
gtex_train_file_location = 'data/train_gtex_expression_matrix_processed.tsv.gz'
# Tybalt matrix: its column names are used further down to decide which gene
# columns to keep and in what order.
# NOTE(review): presumably already scaled to [0, 1], judging by the file name - confirm.
tcga_tybalt_file_location = 'data/pancan_scaled_zeroone_rnaseq.tsv.gz'

In [40]:
# Load the TCGA, TARGET and GTEx training matrices (tab-separated files),
# reading them in that order.
tcga_df, target_df, gtex_df = (
    pd.read_csv(location, sep='\t')
    for location in (
        tcga_train_file_location,
        target_train_file_location,
        gtex_train_file_location,
    )
)

In [54]:
# Keep only complete rows: any row holding at least one missing value is removed.
# The three names are rebound to the filtered frames.
tcga_df, target_df, gtex_df = (
    cohort_df.dropna(axis=0, how='any')
    for cohort_df in (tcga_df, target_df, gtex_df)
)

In [55]:
# Preview the first five rows of the TARGET matrix after incomplete rows were dropped.
target_df.head(n=5)

Unnamed: 0,sample_id,1,2,9,10,12,13,14,15,16,...,102724231,102724398,102724473,102724536,102724631,102724862,102724928,105375355,105378803,105378952
0,TARGET-30-PARSBI-01,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,...,-3.17,-0.913,-9.97,-0.783,-9.97,1.47,-9.97,-0.619,-9.97,-9.97
1,TARGET-20-PADZCG-09,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,...,3.09,-9.97,-9.97,1.18,-9.97,-3.05,-9.97,-2.18,-9.97,-9.97
2,TARGET-10-PARSZH-09,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,...,-0.834,-9.97,-9.97,-3.46,-9.97,-9.97,-9.97,-9.97,-9.97,-9.97
3,TARGET-30-PATBMM-01,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,...,-5.01,-2.47,-3.46,-1.32,-9.97,-0.512,-9.97,0.228,-4.04,-9.97
4,TARGET-50-PAJNAA-01,2.37,5.54,-0.0425,0.346,-0.856,-4.29,5.36,-2.39,5.46,...,-4.29,-9.97,-2.63,-4.04,-9.97,-5.01,-9.97,-2.73,-6.51,-9.97


In [56]:
# Read the gene dictionary; it supplies the entrezgene_id and hgnc_symbol
# columns used below to translate matrix column labels.
gene_ids = pd.read_csv('data/gene_dict.csv', sep=',')

In [57]:
# Keep only the first row seen for each Entrez gene ID, so that every ID maps
# to exactly one entry in the lookup built from this table.
gene_ids = gene_ids.drop_duplicates(subset=["entrezgene_id"], keep="first")

In [58]:
# Preview the first five rows of the de-duplicated gene dictionary.
gene_ids.head(n=5)

Unnamed: 0,hgnc_symbol,entrezgene_id,gene_biotype,description
0,A1BG,1,protein_coding,alpha-1-B glycoprotein [Source:HGNC Symbol;Acc...
1,NAT2,10,protein_coding,N-acetyltransferase 2 [Source:HGNC Symbol;Acc:...
2,ADA,100,protein_coding,adenosine deaminase [Source:HGNC Symbol;Acc:HG...
3,CDH2,1000,protein_coding,cadherin 2 [Source:HGNC Symbol;Acc:HGNC:1759]
4,AKT3,10000,protein_coding,AKT serine/threonine kinase 3 [Source:HGNC Sym...


In [59]:
# Lookup table: Entrez gene ID rendered as a string -> HGNC symbol.
# The keys are strings because they are compared against the matrix column labels.
gene_dict = dict(zip(map(str, gene_ids['entrezgene_id']), gene_ids['hgnc_symbol']))

In [60]:
# For each cohort, take every column after the first one and keep the labels
# that are known Entrez IDs (i.e. keys of gene_dict), preserving column order.
tcga_gene_column_ids = [col for col in tcga_df.columns[1:] if col in gene_dict]
target_gene_column_ids = [col for col in target_df.columns[1:] if col in gene_dict]
gtex_gene_column_ids = [col for col in gtex_df.columns[1:] if col in gene_dict]

# HGNC symbols for the kept columns, position-aligned with the id lists above.
tcga_gene_column_names = [gene_dict[col] for col in tcga_gene_column_ids]
target_gene_column_names = [gene_dict[col] for col in target_gene_column_ids]
gtex_gene_column_names = [gene_dict[col] for col in gtex_gene_column_ids]

In [61]:
# Restrict each matrix to the columns whose label is in the gene dictionary;
# every other column is left out. Despite the "_sorted" suffix, no reordering
# happens in this step - the original column order is kept.
tcga_df_sorted = tcga_df.loc[:, tcga_gene_column_ids]
target_df_sorted = target_df.loc[:, target_gene_column_ids]
gtex_df_sorted = gtex_df.loc[:, gtex_gene_column_ids]

In [62]:
# Relabel the columns from Entrez IDs to HGNC symbols, pairing each id with
# the symbol at the same position in the corresponding names list.
tcga_df_sorted = tcga_df_sorted.rename(
    columns=dict(zip(tcga_gene_column_ids, tcga_gene_column_names))
)
target_df_sorted = target_df_sorted.rename(
    columns=dict(zip(target_gene_column_ids, target_gene_column_names))
)
gtex_df_sorted = gtex_df_sorted.rename(
    columns=dict(zip(gtex_gene_column_ids, gtex_gene_column_names))
)

In [63]:
# Check that the TARGET columns now carry HGNC symbols (first four rows).
target_df_sorted.head(n=4)

Unnamed: 0,A1BG,A2M,NAT1,NAT2,SERPINA3,AADAC,AAMP,AANAT,AARS1,ABAT,...,KLLN,SRRM5,ERICH4,TOMM6,C2CD4D,ARHGEF33,MEF2B,CMC4,ZNF605,TMED7-TICAM2
0,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,3.59,...,-1.51,-0.735,-1.94,7.28,-2.93,1.3,0.567,2.5,2.68,-0.735
1,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,1.11,...,-0.808,-0.997,-4.61,7.39,-6.51,-6.51,-0.302,2.49,1.47,-5.01
2,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,0.527,...,-2.24,-3.05,-9.97,8.02,-2.83,-9.97,-2.05,4.29,0.0014,-9.97
3,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,4.79,...,-1.12,1.54,-4.61,7.78,-0.941,1.49,1.04,2.76,2.67,-3.05


Read the gene column names from the Tybalt TCGA data; they determine which columns are kept, and in what order, in each dataset.


In [65]:
# Load the Tybalt matrix (tab-separated); its column labels serve as the
# reference gene list for the filtering step that follows.
tcga_df_tybalt = pd.read_csv(tcga_tybalt_file_location, sep='\t')

In [66]:
# Walk the Tybalt columns (skipping its first column) and keep those that also
# exist in each cohort, so every resulting list follows the Tybalt column order.
tybalt_gene_columns = tcga_df_tybalt.columns[1:]
tcga_df_columns_filterd = [gene for gene in tybalt_gene_columns if gene in tcga_df_sorted.columns]
target_df_columns_filterd = [gene for gene in tybalt_gene_columns if gene in target_df_sorted.columns]
gtex_df_columns_filterd = [gene for gene in tybalt_gene_columns if gene in gtex_df_sorted.columns]

In [67]:
# Narrow each cohort to the genes shared with Tybalt and put the columns in
# Tybalt order; this is the step where the columns actually get reordered.
tcga_df_sorted = tcga_df_sorted.loc[:, tcga_df_columns_filterd]
target_df_sorted = target_df_sorted.loc[:, target_df_columns_filterd]
gtex_df_sorted = gtex_df_sorted.loc[:, gtex_df_columns_filterd]

In [68]:
# Inspect the TCGA matrix after restricting and reordering its columns to the
# Tybalt gene list; the values have not been rescaled at this point.
tcga_df_sorted

Unnamed: 0,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,SLC34A2,TMPRSS4,KRT6B,GPX2,...,EFCAB6,ABCG5,METTL7A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,10600.000,3230.00,1210.00,24.300,13100.000,5600.000,1610.000,172.00,877.000,500.000,...,32.10,0.459,1780.0,33.10,84.9,42.20,122.0,339.0,18.10,268.0
1,53.400,2000.00,27900.00,81.400,17.800,77200.000,63700.000,3670.00,5.090,5.810,...,42.50,1.090,1450.0,10.20,47.6,25.10,62.8,119.0,12.40,301.0
2,6.160,0.00,0.77,0.000,0.000,0.385,0.385,8.47,0.000,1.160,...,6.55,192.000,1260.0,4.62,1680.0,93.60,24.6,30.0,5.39,1140.0
3,-0.660,2.62,6.78,-0.667,-0.480,1.890,2.200,1.30,-0.274,0.135,...,28.90,2.440,5590.0,14.30,37.7,39.10,47.5,303.0,4.21,582.0
4,119.000,0.00,0.00,20.300,30.200,0.472,15.100,0.00,11.300,0.472,...,1.42,1.420,778.0,4.72,754.0,137.00,50.9,126.0,3.26,426.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9949,47100.000,749.00,124.00,91800.000,959.000,733.000,832.000,5980.00,10300.000,20500.000,...,4.98,0.356,142.0,4.63,207.0,15.70,21.7,814.0,5.69,4060.0
9950,93.200,223.00,0.00,0.000,0.000,0.000,28700.000,8.24,0.000,0.433,...,63.70,78.000,5810.0,24.30,118.0,91.00,45.7,1550.0,72.00,179.0
9951,0.473,0.00,0.00,0.000,0.000,0.000,1.420,0.00,0.000,0.000,...,55.40,2.370,9360.0,17.00,1510.0,423.00,18.3,17.5,7.10,268.0
9952,0.553,35.40,0.00,0.000,0.553,0.000,0.000,0.00,0.000,0.000,...,36.00,11.600,6460.0,11.60,120.0,5.53,50.8,248.0,9.41,81.4


In [69]:
# Scale RNAseq data with min-max scaling (NOT z-scores): each gene column is
# mapped onto the [0, 1] range.
def _minmax_scale_frame(frame):
    """Rescale every column of ``frame`` to [0, 1] with a freshly fitted MinMaxScaler.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric expression matrix (rows are samples, columns are genes).

    Returns
    -------
    pandas.DataFrame
        The scaled values under the same column labels as ``frame``. The row
        index is a new 0..n-1 range, not the index of ``frame``.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(frame)
    # NOTE(review): the input's row index is not carried over, so rows can only
    # be matched back to the source frame by position - confirm this is intended.
    return pd.DataFrame(scaled_values, columns=frame.columns)


# A separate scaler is fitted per cohort, so each cohort is scaled against its
# own per-gene minimum and maximum; the same raw value can therefore map to
# different scaled values in different cohorts.
tcga_df_sort = _minmax_scale_frame(tcga_df_sorted)
target_df_sort = _minmax_scale_frame(target_df_sorted)
gtex_df_sort = _minmax_scale_frame(gtex_df_sorted)

In [71]:
# Inspect the min-max scaled GTEx matrix.
gtex_df_sort

Unnamed: 0,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,SLC34A2,TMPRSS4,KRT6B,GPX2,...,EFCAB6,ABCG5,METTL7A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,0.000040,0.000134,0.000010,0.000026,0.000011,0.000013,0.000189,0.016216,0.000007,0.002576,...,0.006417,0.000203,0.035019,0.003907,0.003498,0.024968,0.062578,0.035116,0.028046,0.072614
1,0.000091,0.000311,0.000025,0.000027,0.000030,0.002648,0.001665,0.000125,0.000045,0.002702,...,0.005926,0.000138,0.024438,0.010645,0.002935,0.001028,0.004933,0.028763,0.017356,0.001358
2,0.000163,0.000023,0.000000,0.000009,0.000133,0.000012,0.000060,0.000047,0.000016,0.000051,...,0.000981,0.000000,0.023546,0.005341,0.006084,0.000675,0.008072,0.005663,0.006149,0.000215
3,0.307263,0.000052,0.040901,0.009725,0.002734,0.091327,0.000076,0.366409,0.008848,0.157616,...,0.004880,0.000584,0.093274,0.057706,0.028175,0.008835,0.217401,0.561669,0.056437,0.003622
4,0.000269,0.000000,0.000007,0.000050,0.000075,0.000018,0.000011,0.000036,0.000017,0.000208,...,0.003306,0.000106,0.380474,0.051971,0.003297,0.000892,0.016598,0.044406,0.028391,0.054567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10514,0.217877,0.000501,0.000685,0.005262,0.278806,0.003015,0.000451,0.006448,0.013374,0.021788,...,0.003333,0.000575,0.021251,0.010717,0.005247,0.005391,0.076576,0.178923,0.038276,0.023411
10515,0.003978,0.008499,0.000077,0.002011,0.000137,0.000470,0.000081,0.133977,0.000038,0.056159,...,0.003806,0.000460,0.004705,0.015735,0.006008,0.004277,0.074455,0.169394,0.013563,0.067232
10516,0.000156,0.000006,0.000000,0.000005,0.000490,0.000016,0.000025,0.000629,0.000007,0.000015,...,0.007491,0.000311,0.090597,0.164158,0.001837,0.004763,0.042642,0.039086,0.001379,0.010430
10517,0.000359,0.000162,0.000009,0.000072,0.000200,0.000209,0.000173,0.000068,0.000069,0.000196,...,0.004824,0.000269,0.140822,0.015520,0.006084,0.000749,0.016343,0.030113,0.019425,0.082746


In [33]:
# Write each rescaled matrix as a gzip-compressed, tab-separated file
# (the row index is written as the first column).
for scaled_frame, out_path in (
    (tcga_df_sort, 'data/rescaled_minmax_tcga_df_sort.tsv.gz'),
    (target_df_sort, 'data/rescaled_minmax_target_df_sort.tsv.gz'),
    (gtex_df_sort, 'data/rescaled_minmax_gtex_df_sort.tsv.gz'),
):
    scaled_frame.to_csv(out_path, sep='\t', compression='gzip')

In [34]:
# Spot-check one gene (KRT5) in the rescaled TCGA matrix.
tcga_df_sort['KRT5'].head(n=4)

0    0.007261
1    0.000037
2    0.000005
3    0.000000
Name: KRT5, dtype: float64

In [35]:
# Same gene in the Tybalt reference matrix, for comparison with the rescaled values.
tcga_df_tybalt['KRT5'].head(n=5)

0    0.034230
1    0.181993
2    0.081082
3    0.180042
4    0.034017
Name: KRT5, dtype: float64

In [36]:
# Spot-check KRT5 in the rescaled TARGET matrix.
target_df_sort['KRT5'].head(n=4)

0    0.459175
1    0.000000
2    0.279709
3    0.585287
Name: KRT5, dtype: float64

In [37]:
# Spot-check KRT5 in the rescaled GTEx matrix.
gtex_df_sort['KRT5'].head(n=4)

0    0.000040
1    0.000091
2    0.000163
3    0.307263
Name: KRT5, dtype: float64