In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [2]:
# Input files, given relative to the notebook's working directory.
# The first three are the per-dataset training expression matrices; the last
# one is the Tybalt TCGA matrix.
tcga_train_file_location = "data/train_tcga_expression_matrix_processed.tsv.gz"
target_train_file_location = "data/train_target_expression_matrix_processed.tsv.gz"
gtex_train_file_location = "data/train_gtex_expression_matrix_processed.tsv.gz"
tcga_tybalt_file_location = "data/pancan_scaled_zeroone_rnaseq.tsv.gz"

In [3]:
# Load the three training expression matrices (tab-separated files), in the
# order TCGA, TARGET, GTEx.
tcga_df, target_df, gtex_df = (
    pd.read_table(path)
    for path in (
        tcga_train_file_location,
        target_train_file_location,
        gtex_train_file_location,
    )
)

In [4]:
# Discard every row that contains at least one missing value.
tcga_df, target_df, gtex_df = (
    frame.dropna() for frame in (tcga_df, target_df, gtex_df)
)

In [13]:
# Git commit of https://github.com/cognoma/genes used to build the download
# URL for the gene table, so the same revision is fetched on every run.
genes_commit = "ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee"

In [14]:
# Gene table from the cognoma/genes repository at the pinned commit
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)

# Download the table and keep only the protein-coding genes
gene_df = pd.read_table(url).query("gene_type == 'protein-coding'")

print(gene_df.shape)
gene_df.head(2)


(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [15]:
# Map Entrez gene ID (as a string) -> gene symbol
gene_dict = dict(zip(map(str, gene_df['entrez_gene_id']), gene_df['symbol']))

In [16]:
# Keep only the columns (the first column is skipped) whose label is a key of
# gene_dict, preserving the order in which they appear in each frame.
tcga_gene_column_ids = [col for col in tcga_df.columns[1:] if col in gene_dict]
target_gene_column_ids = [col for col in target_df.columns[1:] if col in gene_dict]
gtex_gene_column_ids = [col for col in gtex_df.columns[1:] if col in gene_dict]

# Translate the retained IDs to gene symbols, position by position.
tcga_gene_column_names = [gene_dict[col] for col in tcga_gene_column_ids]
target_gene_column_names = [gene_dict[col] for col in target_gene_column_ids]
gtex_gene_column_names = [gene_dict[col] for col in gtex_gene_column_ids]

In [38]:
# Restrict each frame to the gene columns identified above
tcga_df_sorted = tcga_df.loc[:, tcga_gene_column_ids]
target_df_sorted = target_df.loc[:, target_gene_column_ids]
gtex_df_sorted = gtex_df.loc[:, gtex_gene_column_ids]

In [39]:
# Relabel the columns from gene IDs to gene symbols
tcga_df_sorted = tcga_df_sorted.rename(
    columns=dict(zip(tcga_gene_column_ids, tcga_gene_column_names))
)
target_df_sorted = target_df_sorted.rename(
    columns=dict(zip(target_gene_column_ids, target_gene_column_names))
)
gtex_df_sorted = gtex_df_sorted.rename(
    columns=dict(zip(gtex_gene_column_ids, gtex_gene_column_names))
)

In [40]:
# Preview the first four rows of the TARGET frame after its columns were
# renamed to gene symbols.
target_df_sorted.head(4)

Unnamed: 0,A1BG,A2M,NAT1,NAT2,SERPINA3,AADAC,AAMP,AANAT,AARS,ABAT,...,LINC00694,CH507-42P11.6,GAGE10,PRR33,POTEB3,TBC1D3I,CCNYL3,UPK3B,LRRC53,KLF18
0,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,3.59,...,-3.17,-0.913,-9.97,-0.783,-9.97,1.47,-9.97,-0.619,-9.97,-9.97
1,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,1.11,...,3.09,-9.97,-9.97,1.18,-9.97,-3.05,-9.97,-2.18,-9.97,-9.97
2,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,0.527,...,-0.834,-9.97,-9.97,-3.46,-9.97,-9.97,-9.97,-9.97,-9.97,-9.97
3,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,4.79,...,-5.01,-2.47,-3.46,-1.32,-9.97,-0.512,-9.97,0.228,-4.04,-9.97


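Read the gene column names from the Tybalt TCGA data; they are used below to select and order the columns of the TCGA, TARGET and GTEx frames.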


In [41]:
# Load the Tybalt TCGA matrix (tab-separated); the following cells use its
# column labels to choose and order genes.
tcga_df_tybalt = pd.read_csv(tcga_tybalt_file_location, sep='\t')

In [42]:
# Tybalt column labels (first column skipped), in Tybalt's own order
tybalt_gene_columns = tcga_df_tybalt.columns[1:]

# For each dataset, keep the Tybalt labels that are also columns of that frame
tcga_df_columns_filterd = [
    gene for gene in tybalt_gene_columns if gene in tcga_df_sorted.columns
]
target_df_columns_filterd = [
    gene for gene in tybalt_gene_columns if gene in target_df_sorted.columns
]
gtex_df_columns_filterd = [
    gene for gene in tybalt_gene_columns if gene in gtex_df_sorted.columns
]

In [43]:
# Select the shared genes in each frame, in the order they appear in the
# Tybalt column list.
tcga_df_sorted, target_df_sorted, gtex_df_sorted = (
    tcga_df_sorted[tcga_df_columns_filterd],
    target_df_sorted[target_df_columns_filterd],
    gtex_df_sorted[gtex_df_columns_filterd],
)

In [44]:
#tcga_df_sorted = tcga_df_sorted / tcga_df_sorted.std()
#tcga_df_sorted = np.log(tcga_df_sorted + 1)

In [45]:
#target_df_sorted = target_df_sorted / target_df_sorted.std()
#target_df_sorted = np.log(target_df_sorted + 1)

In [46]:
#gtex_df_sorted = gtex_df_sorted / gtex_df_sorted.std()
#gtex_df_sorted = np.log(gtex_df_sorted + 1)

In [47]:
def _minmax_scale_frame(expression_df):
    """Rescale every column of ``expression_df`` with min-max scaling.

    A new scikit-learn ``MinMaxScaler`` (default feature range, i.e. [0, 1])
    is fitted on the frame itself and applied to it, so each column is scaled
    independently using its own minimum and maximum.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Frame to scale; assumed to contain only numeric columns.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels as the input. The result
        carries a fresh default integer index; the row labels of the input
        are not preserved.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(expression_df)
    return pd.DataFrame(scaled_values, columns=expression_df.columns)


# Scale RNAseq data to the [0, 1] range with min-max scaling (NOT z-scores).
# Each dataset gets its own scaler, so the scaled values are relative to that
# dataset's per-column minimum and maximum and are not directly comparable
# across datasets.
tcga_df_sort = _minmax_scale_frame(tcga_df_sorted)
target_df_sort = _minmax_scale_frame(target_df_sorted)
gtex_df_sort = _minmax_scale_frame(gtex_df_sorted)

In [48]:
# Remove any row of the scaled frames that still holds a missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for clarity).
tcga_df_sort = tcga_df_sort.dropna(axis=0, how='any')
target_df_sort = target_df_sort.dropna(axis=0, how='any')
gtex_df_sort = gtex_df_sort.dropna(axis=0, how='any')

In [49]:
# Write each scaled frame as a gzip-compressed, tab-separated file
for scaled_frame, output_path in (
    (tcga_df_sort, 'data/rescaled_minmax_tcga_df_sort.tsv.gz'),
    (target_df_sort, 'data/rescaled_minmax_target_df_sort.tsv.gz'),
    (gtex_df_sort, 'data/rescaled_minmax_gtex_df_sort.tsv.gz'),
):
    scaled_frame.to_csv(output_path, sep='\t', compression='gzip')

In [52]:
# Spot-check a single gene column (KRT5) of the scaled GTEx frame.
gtex_df_sort['KRT5'].head(4)

0    0.000040
1    0.000091
2    0.000163
3    0.307263
Name: KRT5, dtype: float64

In [54]:
# Preview the first rows of the min-max scaled TARGET frame.
target_df_sort.head()

Unnamed: 0,RPS4Y1,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,SLC34A2,...,ABCG5,METTL7A,FAM129A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,0.224604,0.459175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.294505,...,0.41498,0.547677,0.681401,0.714029,0.921951,0.722627,0.888005,0.481132,0.469433,0.699463
1,0.359367,0.0,0.489525,0.779279,0.0,0.0,0.672609,0.320814,0.213448,0.19011,...,0.233468,0.639364,0.87156,0.0,0.457317,0.302613,0.72896,0.179363,0.369145,0.63864
2,0.422665,0.279709,0.0,0.0,0.0,0.0,0.668287,0.279604,0.0,0.0,...,0.0,0.452323,0.649708,0.0,0.518293,0.390646,0.705103,0.834906,0.0,0.668157
3,0.793772,0.585287,0.0,0.0,0.0,0.775893,0.0,0.571935,0.694016,0.19011,...,0.383266,0.408313,0.561968,0.914479,0.9,0.774415,0.984095,0.403302,0.676411,0.450537
4,0.0,0.497171,0.680306,0.0,0.338883,0.308929,0.0,0.0,0.0,0.835714,...,0.400135,0.749389,0.701418,0.745859,0.677439,0.828748,0.724122,0.665094,0.971941,0.86136


In [55]:
# Preview the first rows of the min-max scaled TCGA frame.
tcga_df_sort.head()

Unnamed: 0,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,SLC34A2,TMPRSS4,KRT6B,GPX2,...,ABCG5,METTL7A,FAM129A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,0.007261,0.008478,0.002642114,1.090262e-05,0.003864448,0.019718,0.0059,0.004674,0.0005415271,0.004674,...,0.000157,0.035683,0.049389,0.038328,0.00855,0.015515,0.01302,0.055757,0.001886,0.0092
1,3.7e-05,0.005249,0.06091721,3.583711e-05,5.39233e-06,0.271831,0.233335,0.099728,3.311111e-06,5.5e-05,...,0.000282,0.029043,0.120576,0.011856,0.004794,0.009228,0.006702,0.019572,0.001299,0.010354
2,5e-06,0.0,1.874454e-06,2.912663e-07,1.415929e-07,1e-06,4e-06,0.00023,1.691358e-07,1.2e-05,...,0.038083,0.025219,0.063581,0.005406,0.169184,0.034412,0.002625,0.004934,0.000578,0.039695
3,0.0,7e-06,1.499672e-05,0.0,0.0,7e-06,1e-05,3.5e-05,0.0,2e-06,...,0.000549,0.112354,0.142093,0.016596,0.003797,0.014375,0.005069,0.049836,0.000457,0.020181
4,8.2e-05,0.0,1.932314e-07,9.155893e-06,9.050146e-06,2e-06,5.8e-05,0.0,7.144443e-06,5e-06,...,0.000347,0.01552,0.078688,0.005521,0.075932,0.050368,0.005432,0.020724,0.000359,0.014726


In [58]:
# Preview the first rows of the min-max scaled GTEx frame.
gtex_df_sort.head()

Unnamed: 0,RPS4Y1,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,SLC34A2,...,ABCG5,METTL7A,FAM129A,C8orf48,CDK5R1,FAM81A,GDPD3,SMAGP,POU5F1B,CHST2
0,0.151049,4e-05,0.000134,1e-05,2.6e-05,1.1e-05,1.3e-05,0.084494,0.054011,0.000189,...,0.000203,0.035019,0.001401,0.003907,0.003498,0.024968,0.062578,0.035116,0.028046,0.072614
1,0.205594,9.1e-05,0.000311,2.5e-05,2.7e-05,3e-05,0.002648,0.038608,0.12246,0.001665,...,0.000138,0.024438,0.03511,0.010645,0.002935,0.001028,0.004933,0.028763,0.017356,0.001358
2,0.155245,0.000163,2.3e-05,0.0,9e-06,0.000133,1.2e-05,0.040506,0.048503,6e-05,...,0.0,0.023546,0.02749,0.005341,0.006084,0.000675,0.008072,0.005663,0.006149,0.000215
3,0.324476,0.307263,5.2e-05,0.040901,0.009725,0.002734,0.091327,0.120253,0.250267,7.6e-05,...,0.000584,0.093274,0.026855,0.057706,0.028175,0.008835,0.217401,0.561669,0.056437,0.003622
4,0.276923,0.000269,0.0,7e-06,5e-05,7.5e-05,1.8e-05,0.130696,0.131551,1.1e-05,...,0.000106,0.380474,0.01025,0.051971,0.003297,0.000892,0.016598,0.044406,0.028391,0.054567
