In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [37]:
# Locations of the gzip-compressed, tab-separated input matrices, relative to
# the notebook's working directory. The first three are the training
# expression matrices; the last is a pan-cancer RNA-seq matrix.
(
    tcga_train_file_location,
    target_train_file_location,
    gtex_train_file_location,
    tcga_tybalt_file_location,
) = (
    'data/train_tcga_expression_matrix_processed.tsv.gz',
    'data/train_target_expression_matrix_processed.tsv.gz',
    'data/train_gtex_expression_matrix_processed.tsv.gz',
    'data/pancan_scaled_zeroone_rnaseq.tsv.gz',
)

In [38]:
# Read the three training matrices into DataFrames, in the order
# TCGA, TARGET, GTEx. A generator expression is used so that no loop
# variable is left behind in the notebook namespace.
tcga_df, target_df, gtex_df = (
    pd.read_table(matrix_path)
    for matrix_path in (
        tcga_train_file_location,
        target_train_file_location,
        gtex_train_file_location,
    )
)

In [39]:
# Strip the leading column from each matrix, then discard every row that
# still contains a missing value.
# NOTE: the column is selected by position, so running this cell again
# without re-running the load cell removes one more column each time.
tcga_df = tcga_df.drop(columns=tcga_df.columns[0]).dropna()
target_df = target_df.drop(columns=target_df.columns[0]).dropna()
gtex_df = gtex_df.drop(columns=gtex_df.columns[0]).dropna()

In [40]:
# Standardizing (zero-mean, unit-variance) scaler instance.
# NOTE(review): none of the visible cells use `scaler` -- the scaling cell
# instantiates preprocessing.MinMaxScaler directly. Confirm it is needed
# elsewhere in the notebook before relying on or removing it.
scaler = StandardScaler()

In [41]:
def _minmax_scale_frame(expression_df):
    """Rescale every column of ``expression_df`` to the [0, 1] range.

    A fresh ``MinMaxScaler`` is fitted on the given frame, so scaling is
    relative to that frame's own per-column minimum and maximum.

    Returns a new DataFrame with the same column labels. The scaler returns a
    plain array by default, so the result gets a new 0..n-1 row index rather
    than the input's index.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(expression_df)
    return pd.DataFrame(scaled_values, columns=expression_df.columns)


# Min-max scale the RNAseq data, each dataset independently.
# (This is a [0, 1] rescale, not z-scoring.)
tcga_df_sort = _minmax_scale_frame(tcga_df)
target_df_sort = _minmax_scale_frame(target_df)
gtex_df_sort = _minmax_scale_frame(gtex_df)

In [42]:
# Number of highest-standard-deviation columns kept from each matrix.
N_TOP_COLUMNS = 5000

# Per-column standard deviation of each scaled matrix.
std_tcga = tcga_df_sort.std()
std_target = target_df_sort.std()
std_gtex = gtex_df_sort.std()

# Rank the columns from most to least variable.
std_sorted_tcga = std_tcga.sort_values(ascending=False)
std_sorted_target = std_target.sort_values(ascending=False)
std_sorted_gtex = std_gtex.sort_values(ascending=False)

# Keep only the N_TOP_COLUMNS most variable columns of each matrix, ordered by
# descending standard deviation (a frame with fewer columns keeps them all).
# NOTE(review): the ranking is done separately per dataset, so the three
# resulting frames may contain different column sets -- confirm this is intended.
tcga_df_sort = tcga_df_sort[std_sorted_tcga.index[:N_TOP_COLUMNS]]
target_df_sort = target_df_sort[std_sorted_target.index[:N_TOP_COLUMNS]]
gtex_df_sort = gtex_df_sort[std_sorted_gtex.index[:N_TOP_COLUMNS]]

In [43]:
# Write each reduced matrix to a gzip-compressed, tab-separated file.
# to_csv keeps its default of writing the row index as the first column.
for _frame, _destination in (
    (tcga_df_sort, 'data/rescaled_5000_tcga_df_sort.tsv.gz'),
    (target_df_sort, 'data/rescaled_5000_target_df_sort.tsv.gz'),
    (gtex_df_sort, 'data/rescaled_5000_gtex_df_sort.tsv.gz'),
):
    _frame.to_csv(_destination, sep='\t', compression='gzip')

In [44]:
# Preview the first rows of tcga_df.
# NOTE(review): tcga_df is the cleaned but unscaled frame; the scaled and
# column-filtered data lives in tcga_df_sort -- confirm which one was meant.
tcga_df.head()

Unnamed: 0,1,10,100,1000,10000,10001,10002,10003,100037417,10004,...,9987,9988,9989,999,9990,9991,9992,9993,9994,9997
0,202.0,28.5,329.0,84.5,492.0,448.0,4.59,14.7,337.0,129.0,...,3430.0,717.0,1800.0,6360.0,299.0,2310.0,10.6,3190.0,337.0,892.0
1,77.5,22.5,74.5,13.1,784.0,333.0,2.54,176.0,153.0,68.3,...,6050.0,923.0,2490.0,11300.0,1150.0,4030.0,9.08,2890.0,316.0,301.0
2,152.0,0.0,3020.0,26.6,486.0,497.0,0.0,8.47,348.0,91.6,...,4930.0,897.0,861.0,39.7,464.0,3320.0,0.0,1330.0,606.0,558.0
3,80.5,40.0,70.6,284.0,2420.0,325.0,1.2,91.4,231.0,241.0,...,3890.0,737.0,1410.0,10.9,1120.0,1990.0,5.24,3090.0,673.0,263.0
4,319.0,0.0,422.0,184.0,423.0,392.0,0.945,2.36,585.0,143.0,...,1930.0,328.0,1340.0,7010.0,450.0,563.0,10.9,3780.0,37.3,1120.0
