In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [2]:
# Locations of the gzipped, tab-separated input matrices.
# Training expression matrices, one per cohort:
gtex_train_file_location = "data/train_gtex_expression_matrix_processed.tsv.gz"
target_train_file_location = "data/train_target_expression_matrix_processed.tsv.gz"
tcga_train_file_location = "data/train_tcga_expression_matrix_processed.tsv.gz"
# Scaled TCGA matrix; further down it is read for its column names.
tcga_tybalt_file_location = "data/pancan_scaled_zeroone_rnaseq.tsv.gz"

In [3]:
# Load the three tab-separated training expression matrices into DataFrames.
tcga_df = pd.read_csv(tcga_train_file_location, sep='\t')
target_df = pd.read_csv(target_train_file_location, sep='\t')
gtex_df = pd.read_csv(gtex_train_file_location, sep='\t')

In [4]:
# Discard every row that contains at least one missing value, cohort by cohort.
tcga_df, target_df, gtex_df = (
    frame.dropna() for frame in (tcga_df, target_df, gtex_df)
)

In [13]:
# Pinned revision of the cognoma/genes repository
# (https://github.com/cognoma/genes) from which the gene table is fetched.
genes_commit = "ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee"

In [14]:
# Address of the gene annotation table at the pinned cognoma/genes commit.
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)

# Download the table and keep only the protein-coding genes in one step.
gene_df = pd.read_table(url).query("gene_type == 'protein-coding'")

# Report how many genes remain, then preview the first two rows.
print(gene_df.shape)
gene_df.head(2)


(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [15]:
# Lookup table from Entrez gene id (stringified) to gene symbol.
# Keys are strings so they can be compared with the matrix column headers
# -- presumably those headers are Entrez ids read as text; verify against the data.
gene_dict = dict(zip(map(str, gene_df['entrez_gene_id']), gene_df['symbol']))

In [16]:
# Columns of each matrix (the first column is skipped) whose header is a key
# of gene_dict, i.e. a protein-coding gene id.
tcga_gene_column_ids = [col for col in tcga_df.columns[1:] if col in gene_dict]
target_gene_column_ids = [col for col in target_df.columns[1:] if col in gene_dict]
gtex_gene_column_ids = [col for col in gtex_df.columns[1:] if col in gene_dict]

# Gene symbols for those columns, position-for-position with the id lists.
tcga_gene_column_names = [gene_dict[col] for col in tcga_gene_column_ids]
target_gene_column_names = [gene_dict[col] for col in target_gene_column_ids]
gtex_gene_column_names = [gene_dict[col] for col in gtex_gene_column_ids]

In [38]:
# Keep all rows but only the protein-coding gene columns of each matrix.
tcga_df_sorted = tcga_df.loc[:, tcga_gene_column_ids]
target_df_sorted = target_df.loc[:, target_gene_column_ids]
gtex_df_sorted = gtex_df.loc[:, gtex_gene_column_ids]

In [39]:
# Relabel the columns from gene ids to gene symbols, pairing each id with the
# symbol at the same position in the corresponding names list.
tcga_df_sorted = tcga_df_sorted.rename(
    columns=dict(zip(tcga_gene_column_ids, tcga_gene_column_names))
)
target_df_sorted = target_df_sorted.rename(
    columns=dict(zip(target_gene_column_ids, target_gene_column_names))
)
gtex_df_sorted = gtex_df_sorted.rename(
    columns=dict(zip(gtex_gene_column_ids, gtex_gene_column_names))
)

In [40]:
# Preview the first four TARGET rows after the columns were relabelled with gene symbols.
target_df_sorted.head(4)

Unnamed: 0,A1BG,A2M,NAT1,NAT2,SERPINA3,AADAC,AAMP,AANAT,AARS,ABAT,...,LINC00694,CH507-42P11.6,GAGE10,PRR33,POTEB3,TBC1D3I,CCNYL3,UPK3B,LRRC53,KLF18
0,4.69,7.37,0.455,-6.51,-3.03,-9.97,5.84,0.058,5.16,3.59,...,-3.17,-0.913,-9.97,-0.783,-9.97,1.47,-9.97,-0.619,-9.97,-9.97
1,4.52,-0.86,0.099,-9.97,-8.24,-9.97,4.15,-3.46,3.35,1.11,...,3.09,-9.97,-9.97,1.18,-9.97,-3.05,-9.97,-2.18,-9.97,-9.97
2,7.16,-0.913,1.2,-9.97,-9.97,-9.97,4.21,-9.97,3.88,0.527,...,-0.834,-9.97,-9.97,-3.46,-9.97,-9.97,-9.97,-9.97,-9.97,-9.97
3,5.19,6.63,-1.06,-4.61,-2.12,-5.57,6.55,-1.35,5.49,4.79,...,-5.01,-2.47,-3.46,-1.32,-9.97,-0.512,-9.97,0.228,-4.04,-9.97


Read the column names from the Tybalt TCGA data


In [41]:
# Load the tab-separated Tybalt TCGA matrix; its column headers drive the gene filter below.
tcga_df_tybalt = pd.read_csv(tcga_tybalt_file_location, sep='\t')

In [42]:
# Walk the Tybalt columns (first column skipped) and keep, in Tybalt order,
# those that are also a column of the respective cohort matrix.
tcga_df_columns_filterd = [
    gene for gene in tcga_df_tybalt.columns[1:] if gene in tcga_df_sorted.columns
]
target_df_columns_filterd = [
    gene for gene in tcga_df_tybalt.columns[1:] if gene in target_df_sorted.columns
]
gtex_df_columns_filterd = [
    gene for gene in tcga_df_tybalt.columns[1:] if gene in gtex_df_sorted.columns
]

In [43]:
# Restrict each matrix to the columns it shares with the Tybalt data,
# taking over the Tybalt column order.
tcga_df_sorted = tcga_df_sorted.loc[:, tcga_df_columns_filterd]
target_df_sorted = target_df_sorted.loc[:, target_df_columns_filterd]
gtex_df_sorted = gtex_df_sorted.loc[:, gtex_df_columns_filterd]

In [44]:
#tcga_df_sorted = tcga_df_sorted / tcga_df_sorted.std()
#tcga_df_sorted = np.log(tcga_df_sorted + 1)

In [45]:
#target_df_sorted = target_df_sorted / target_df_sorted.std()
#target_df_sorted = np.log(target_df_sorted + 1)

In [46]:
#gtex_df_sorted = gtex_df_sorted / gtex_df_sorted.std()
#gtex_df_sorted = np.log(gtex_df_sorted + 1)

In [47]:
# Rescale every gene column with min-max scaling (MinMaxScaler defaults).
# A separate scaler is fit on each cohort, so each column's minimum and
# maximum come from that cohort alone. fit_transform returns a bare array,
# hence the re-wrapping in a DataFrame carrying the original column labels.
tcga_df_sort = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(tcga_df_sorted),
    columns=tcga_df_sorted.columns,
)
target_df_sort = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(target_df_sorted),
    columns=target_df_sorted.columns,
)
gtex_df_sort = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(gtex_df_sorted),
    columns=gtex_df_sorted.columns,
)

In [48]:
# Remove any row of the scaled frames that still holds a missing value.
tcga_df_sort = tcga_df_sort.dropna(axis=0, how='any')
target_df_sort = target_df_sort.dropna(axis=0, how='any')
gtex_df_sort = gtex_df_sort.dropna(axis=0, how='any')

In [60]:
# Show (rows, columns) of each scaled frame, one per line.
print(tcga_df_sort.shape, target_df_sort.shape, gtex_df_sort.shape, sep='\n')

(9954, 4065)
(660, 4493)
(10519, 4479)


In [65]:
def intersection_of_three_lists(list1, list2, list3):
    """Return the items common to all three inputs, in a reproducible order.

    Converting a set straight to a list yields an order that depends on the
    hash of each item; for strings that order can differ from one interpreter
    run to the next. This version instead walks ``list1`` so the result is
    stable across runs.

    Parameters
    ----------
    list1, list2, list3 : iterable of hashable
        The collections to intersect (lists, pandas Index objects, ...).

    Returns
    -------
    list
        Every item found in all three inputs, each listed once, ordered by
        its first appearance in ``list1``. Empty if there is no common item.
    """
    # Sets give constant-time membership tests for the second and third inputs.
    in_second = set(list2)
    in_third = set(list3)

    # Track what was already emitted so repeated items in list1 appear once,
    # matching the de-duplication of a set intersection.
    already_seen = set()
    intersection_list = []
    for item in list1:
        if item in already_seen:
            continue
        already_seen.add(item)
        if item in in_second and item in in_third:
            intersection_list.append(item)

    return intersection_list

# Small worked example: only 2 and 4 occur in all three lists.
list1, list2, list3 = [1, 2, 3, 4, 5], [2, 4, 6, 8], [1, 2, 4, 7]

result = intersection_of_three_lists(list1, list2, list3)
print("Intersection:", result)


Intersection: [2, 4]


In [66]:
# Gene columns that are present in all three scaled frames.
common_genes = intersection_of_three_lists(tcga_df_sort.columns, target_df_sort.columns, gtex_df_sort.columns)

In [69]:
# Give all three frames the same columns, in the order of common_genes.
tcga_df_sort, target_df_sort, gtex_df_sort = (
    frame[common_genes]
    for frame in (tcga_df_sort, target_df_sort, gtex_df_sort)
)

In [70]:
# Write each scaled frame to a gzip-compressed, tab-separated file.
for frame, out_path in (
    (tcga_df_sort, 'data/rescaled_minmax_tcga_df_sort.tsv.gz'),
    (target_df_sort, 'data/rescaled_minmax_target_df_sort.tsv.gz'),
    (gtex_df_sort, 'data/rescaled_minmax_gtex_df_sort.tsv.gz'),
):
    frame.to_csv(out_path, sep='\t', compression='gzip')

In [71]:
# Spot-check the scaled values of one gene (KRT5) in the GTEx frame.
gtex_df_sort['KRT5'].head(4)

0    0.000040
1    0.000091
2    0.000163
3    0.307263
Name: KRT5, dtype: float64

In [72]:
# Preview the scaled TARGET frame restricted to the common genes.
target_df_sort.head()

Unnamed: 0,NUDT10,STOX2,TRIM10,SAA1,LRRC31,GRIN1,FAM72D,ATCAY,KCNT1,VSIG1,...,COL4A6,TCEAL5,LRIG1,GRM8,PKNOX2,FAM184A,SOD3,SPAG4,PCDHAC1,BAIAP2L2
0,0.845023,0.820495,0.322707,0.0,0.480807,0.961859,0.800974,0.899633,0.667855,0.737949,...,0.235214,0.878825,0.658215,0.9,0.719877,0.892072,0.72282,0.874555,0.681886,0.860394
1,0.683634,0.584594,0.637801,0.0,0.0,0.206198,0.76015,0.0,0.410888,0.469763,...,0.386132,0.30355,0.449493,0.0,0.299523,0.461026,0.378902,0.770563,0.0,0.613386
2,0.0,0.0,0.782043,0.0,0.0,0.0,0.802322,0.0,0.0,0.0,...,0.0,0.0,0.770791,0.0,0.0,0.453031,0.0,0.84747,0.0,0.875591
3,0.917168,0.825309,0.0,0.0,0.426522,0.766389,0.871161,0.97246,0.368114,0.825241,...,0.364378,0.955936,0.492901,0.836364,0.70531,0.936709,0.702368,0.867427,0.567365,0.945669
4,0.734803,0.741816,0.0,0.366508,0.519581,0.353397,0.786292,0.21175,0.384316,0.625767,...,0.851801,0.905141,0.883367,0.648606,0.957794,0.774151,0.919268,0.961511,0.693039,0.782858


In [73]:
# Preview the scaled TCGA frame restricted to the common genes.
tcga_df_sort.head()

Unnamed: 0,NUDT10,STOX2,TRIM10,SAA1,LRRC31,GRIN1,FAM72D,ATCAY,KCNT1,VSIG1,...,COL4A6,TCEAL5,LRIG1,GRM8,PKNOX2,FAM184A,SOD3,SPAG4,PCDHAC1,BAIAP2L2
0,0.010398,0.010045,6e-05,0.002066038,0.001961,0.001141,0.057238,3.7e-05,0.000686,0.000108,...,0.008247,0.027951,0.05061,0.000566,0.00241,0.027414,0.007775,0.0069,0.009055,0.002555
1,0.002684,0.108628,0.000299,6.730837e-06,0.053782,0.000321,0.011429,6.4e-05,0.0108,0.001179,...,0.002528,0.004978,0.067007,0.00291,0.011035,0.159884,0.020307,0.01578,0.028547,0.000908
2,0.000936,0.003245,0.001579,1.61327e-05,0.000261,3.8e-05,0.319048,1.8e-05,0.000299,0.000109,...,1.2e-05,0.001016,0.006065,0.00035,0.002278,0.091022,0.002926,0.00418,0.000963,0.000515
3,0.051981,0.052009,0.000283,7.373827e-07,0.000134,0.0,0.009714,0.0,0.00051,8e-06,...,0.060192,0.025673,0.042412,0.000295,0.017635,0.011193,0.026698,0.00244,0.006865,0.004009
4,0.001629,0.016776,0.00037,0.0003894035,3e-05,0.00025,0.049524,3.8e-05,0.000281,0.000445,...,5.5e-05,0.00811,0.01031,0.000412,0.157344,0.00167,0.008151,0.01986,0.02788,0.036184


In [74]:
# Preview the scaled GTEx frame restricted to the common genes.
gtex_df_sort.head()

Unnamed: 0,NUDT10,STOX2,TRIM10,SAA1,LRRC31,GRIN1,FAM72D,ATCAY,KCNT1,VSIG1,...,COL4A6,TCEAL5,LRIG1,GRM8,PKNOX2,FAM184A,SOD3,SPAG4,PCDHAC1,BAIAP2L2
0,0.010769,0.016115,0.067705,3e-06,0.000385,8e-06,0.001724,8.3e-05,7.3e-05,4.7e-05,...,0.002783,0.000271,0.008541,0.003669,0.019206,0.049302,0.030613,0.035231,0.0,0.359042
1,0.001487,0.068504,0.00019,3.4e-05,0.000351,0.000414,0.001177,0.000413,0.000626,8.5e-05,...,0.00022,0.000409,0.116403,0.001947,0.043968,0.032093,0.01829,0.191538,0.061562,0.007165
2,0.000897,0.025564,0.0,3.5e-05,0.0,6.9e-05,0.0,0.001183,0.029,9.6e-05,...,0.000248,0.000184,0.076454,0.0,0.098413,0.005837,0.001239,0.002138,0.0,0.001322
3,0.006103,0.233858,0.0,2.7e-05,0.0,0.001065,0.034764,0.000511,0.003078,0.000714,...,0.002379,0.001407,0.082116,0.003023,0.015095,0.021581,0.053871,0.012692,0.004288,0.009301
4,0.003214,0.035696,0.0,0.022475,0.000812,0.0,0.00815,0.0,0.000191,0.000123,...,0.001123,0.0,0.197245,0.0,0.017619,0.028837,0.112581,0.006077,0.046554,0.007722


In [75]:
# Final (rows, columns) of the TARGET frame; the column count should equal len(common_genes).
target_df_sort.shape

(660, 4046)

In [76]:
# Final (rows, columns) of the TCGA frame; the column count should equal len(common_genes).
tcga_df_sort.shape

(9954, 4046)

In [77]:
# Final (rows, columns) of the GTEx frame; the column count should equal len(common_genes).
gtex_df_sort.shape

(10519, 4046)