In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [75]:
tcga_train_file_location = 'data/train_tcga_expression_matrix_processed.tsv.gz'
target_train_file_location = 'data/train_target_expression_matrix_processed.tsv.gz'
gtex_train_file_location = 'data/train_gtex_expression_matrix_processed.tsv.gz'
tcga_tybalt_file_location = 'data/pancan_scaled_zeroone_rnaseq.tsv.gz'

In [76]:
tcga_df = pd.read_table(tcga_train_file_location)
target_df = pd.read_table(target_train_file_location)
gtex_df = pd.read_table(gtex_train_file_location)

In [77]:
tcga_df = tcga_df.dropna()
target_df = target_df.dropna()
gtex_df = gtex_df.dropna()

In [78]:
target_df.head()

Unnamed: 0,sample_id,1,2,9,10,12,13,14,15,16,...,102724231,102724398,102724473,102724536,102724631,102724862,102724928,105375355,105378803,105378952
0,TARGET-30-PARSBI-01,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,...,-3.17,-0.913,-9.97,-0.783,-9.97,1.47,-9.97,-0.619,-9.97,-9.97
1,TARGET-20-PADZCG-09,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,...,3.09,-9.97,-9.97,1.18,-9.97,-3.05,-9.97,-2.18,-9.97,-9.97
2,TARGET-10-PARSZH-09,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,...,-0.834,-9.97,-9.97,-3.46,-9.97,-9.97,-9.97,-9.97,-9.97,-9.97
3,TARGET-30-PATBMM-01,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,...,-5.01,-2.47,-3.46,-1.32,-9.97,-0.512,-9.97,0.228,-4.04,-9.97
4,TARGET-50-PAJNAA-01,2.37,5.54,-0.0425,0.346,-0.856,-4.29,5.36,-2.39,5.46,...,-4.29,-9.97,-2.63,-4.04,-9.97,-5.01,-9.97,-2.73,-6.51,-9.97


In [10]:
#read gene dictioany
gene_ids = pd.read_csv('data/gene_dict.csv') 

In [11]:
# drop duplicate ids
gene_ids.drop_duplicates(subset=["entrezgene_id"], inplace=True)

In [12]:
gene_ids.head()

Unnamed: 0,hgnc_symbol,entrezgene_id,gene_biotype,description
0,A1BG,1,protein_coding,alpha-1-B glycoprotein [Source:HGNC Symbol;Acc...
1,NAT2,10,protein_coding,N-acetyltransferase 2 [Source:HGNC Symbol;Acc:...
2,ADA,100,protein_coding,adenosine deaminase [Source:HGNC Symbol;Acc:HG...
3,CDH2,1000,protein_coding,cadherin 2 [Source:HGNC Symbol;Acc:HGNC:1759]
4,AKT3,10000,protein_coding,AKT serine/threonine kinase 3 [Source:HGNC Sym...


In [13]:
gene_dict = {str(i):j for i,j in zip(gene_ids['entrezgene_id'],gene_ids['hgnc_symbol'])}

In [14]:
tcga_gene_column_ids = [i for i in tcga_df.columns[1:] if i in gene_dict.keys()]
target_gene_column_ids = [i for i in target_df.columns[1:] if i in gene_dict.keys()]
gtex_gene_column_ids = [i for i in gtex_df.columns[1:] if i in gene_dict.keys()]

tcga_gene_column_names = [gene_dict[i] for i in tcga_df.columns[1:] if i in gene_dict.keys()]
target_gene_column_names = [gene_dict[i] for i in target_df.columns[1:] if i in gene_dict.keys()]
gtex_gene_column_names = [gene_dict[i] for i in gtex_df.columns[1:] if i in gene_dict.keys()]

In [52]:
tcga_df_sorted = tcga_df[tcga_gene_column_ids]
target_df_sorted = target_df[target_gene_column_ids]
gtex_df_sorted = gtex_df[gtex_gene_column_ids]

In [53]:
tcga_df_sorted = tcga_df_sorted.rename(columns={i:j for i, j in zip(tcga_gene_column_ids, tcga_gene_column_names)})
target_df_sorted = target_df_sorted.rename(columns={i:j for i, j in zip(target_gene_column_ids, target_gene_column_names)})
gtex_df_sorted = gtex_df_sorted.rename(columns={i:j for i, j in zip(gtex_gene_column_ids, gtex_gene_column_names)})

In [54]:
target_df_sorted.head(4)

Unnamed: 0,A1BG,A2M,NAT1,NAT2,SERPINA3,AADAC,AAMP,AANAT,AARS1,ABAT,...,KLLN,SRRM5,ERICH4,TOMM6,C2CD4D,ARHGEF33,MEF2B,CMC4,ZNF605,TMED7-TICAM2
0,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,3.59,...,-1.51,-0.735,-1.94,7.28,-2.93,1.3,0.567,2.5,2.68,-0.735
1,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,1.11,...,-0.808,-0.997,-4.61,7.39,-6.51,-6.51,-0.302,2.49,1.47,-5.01
2,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,0.527,...,-2.24,-3.05,-9.97,8.02,-2.83,-9.97,-2.05,4.29,0.0014,-9.97
3,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,4.79,...,-1.12,1.54,-4.61,7.78,-0.941,1.49,1.04,2.76,2.67,-3.05


Read column names from tybalt tcga data


In [55]:
tcga_df_tybalt = pd.read_table(tcga_tybalt_file_location)

In [56]:
tcga_df_columns_filterd = [i for i in tcga_df_tybalt.columns[1:] if i in tcga_df_sorted]
target_df_columns_filterd = [i for i in tcga_df_tybalt.columns[1:] if i in target_df_sorted]
gtex_df_columns_filterd = [i for i in tcga_df_tybalt.columns[1:] if i in gtex_df_sorted]

In [57]:
tcga_df_sorted = tcga_df_sorted[tcga_df_columns_filterd]
target_df_sorted = target_df_sorted[target_df_columns_filterd]
gtex_df_sorted = gtex_df_sorted[gtex_df_columns_filterd]

In [58]:
tcga_df_sorted = tcga_df_sorted / tcga_df_sorted.std()
tcga_df_sorted = np.log(tcga_df_sorted + 1)

In [59]:
target_df_sorted = target_df_sorted / target_df_sorted.std()
target_df_sorted = np.log(target_df_sorted + 1)

  result = func(self.values, **kwargs)


In [60]:
gtex_df_sorted = gtex_df_sorted / gtex_df_sorted.std()
gtex_df_sorted = np.log(gtex_df_sorted + 1)

In [61]:
# Scale RNAseq data using z-scores
tcga_df_sort = preprocessing.MinMaxScaler().fit_transform(tcga_df_sorted)
target_df_sort = preprocessing.MinMaxScaler().fit_transform(target_df_sorted)
gtex_df_sort = preprocessing.MinMaxScaler().fit_transform(gtex_df_sorted)

tcga_df_sort = pd.DataFrame(tcga_df_sort,
                                columns=tcga_df_sorted.columns)
target_df_sort = pd.DataFrame(target_df_sort,
                                columns=target_df_sorted.columns)
gtex_df_sort = pd.DataFrame(gtex_df_sort,
                                columns=gtex_df_sorted.columns)

In [62]:
tcga_df_sort = tcga_df_sort.dropna()
target_df_sort = target_df_sort.dropna()
gtex_df_sort = gtex_df_sort.dropna()

In [63]:
tcga_df_sort.to_csv('data/rescaled_minmax_tcga_df_sort.tsv.gz', sep='\t', compression='gzip')
target_df_sort.to_csv('data/rescaled_minmax_target_df_sort.tsv.gz', sep='\t', compression='gzip')
gtex_df_sort.to_csv('data/rescaled_minmax_gtex_df_sort.tsv.gz', sep='\t', compression='gzip')

In [64]:
tcga_df_sort['KRT5'].head(4)

0    0.033068
1    0.000176
2    0.000022
3    0.000000
Name: KRT5, dtype: float64

In [65]:
tcga_df_tybalt['KRT5'].head()

0    0.034230
1    0.181993
2    0.081082
3    0.180042
4    0.034017
Name: KRT5, dtype: float64

In [66]:
target_df_sort['KRT5'].head(4)

Series([], Name: KRT5, dtype: float64)

In [67]:
gtex_df_sort['KRT5'].head(4)

Series([], Name: KRT5, dtype: float64)

In [79]:
tcga_df_sort

Unnamed: 0,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,SLC34A2,TMPRSS4,KRT6B,GPX2,...,EFCAB6,ABCG5,METTL7A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,0.033068,0.060887,1.287671e-02,0.000069,0.027122,0.095289,0.027064,0.022617,0.004107,0.025528,...,0.191982,0.000906,0.135526,0.200644,0.043282,0.079768,0.088029,0.206498,0.012990,0.060060
1,0.000176,0.039121,2.219099e-01,0.000227,0.000040,0.589294,0.522898,0.316991,0.000025,0.000312,...,0.236317,0.001627,0.113560,0.076768,0.024901,0.049606,0.048475,0.085863,0.009007,0.066847
2,0.000022,0.000000,9.292843e-06,0.000002,0.000001,0.000008,0.000018,0.001146,0.000001,0.000066,...,0.050510,0.170926,0.100339,0.037349,0.454755,0.157331,0.019919,0.023547,0.004039,0.203294
3,0.000000,0.000055,7.434176e-05,0.000000,0.000000,0.000037,0.000050,0.000176,0.000000,0.000012,...,0.177117,0.003165,0.327437,0.102861,0.019861,0.074490,0.037357,0.189202,0.003196,0.119391
4,0.000390,0.000000,9.579799e-07,0.000058,0.000066,0.000009,0.000274,0.000000,0.000055,0.000030,...,0.013539,0.002004,0.064682,0.038101,0.273325,0.211607,0.039863,0.090356,0.002515,0.091307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9949,0.129180,0.015250,1.340831e-03,0.188759,0.002068,0.014020,0.014227,0.432253,0.045079,0.495498,...,0.039608,0.000788,0.012129,0.037424,0.097652,0.031844,0.017637,0.381923,0.004254,0.455195
9950,0.000306,0.004621,9.579799e-07,0.000002,0.000001,0.000000,0.320654,0.001115,0.000001,0.000028,...,0.312216,0.079941,0.336130,0.158639,0.058847,0.153795,0.036022,0.553875,0.048381,0.040994
9951,0.000004,0.000000,9.579799e-07,0.000002,0.000001,0.000000,0.000036,0.000000,0.000001,0.000005,...,0.284444,0.003086,0.455130,0.118928,0.427565,0.444534,0.014940,0.013913,0.005258,0.060060
9952,0.000004,0.000738,9.579799e-07,0.000002,0.000002,0.000000,0.000011,0.000000,0.000001,0.000005,...,0.209278,0.013425,0.360754,0.085925,0.059766,0.011551,0.039790,0.161153,0.006898,0.018669
