In [1]:
import pandas as pd 
import numpy as np 
from amlvae.data.ExprProcessor import ExprProcessor
from amlvae.data.ClinProcessor import ClinProcessor
from matplotlib import pyplot as plt
import torch 
import os 

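# Auto-reload imported modules before each cell runs, so edits to project code are picked up without restarting the kernel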
%load_ext autoreload
%autoreload 2

In [2]:
# Read the three expression tables (train / validation / test) in one pass;
# the tuple order fixes which file lands in which variable.
expr_long1, expr_long2, expr_long3 = map(
    pd.read_csv,
    (
        '../../data/aml_train.csv',
        '../../data/aml_validation.csv',
        '../../data/aml_test.csv',
    ),
)

In [3]:
# Build the expression processor from the training table only (expr_long1 is
# read from aml_train.csv); the validation and test tables are passed through
# this same object later via process_new().
# NOTE(review): presumably `target` names the expression-value column to use
# — confirm against ExprProcessor.
eproc = ExprProcessor(expr_long1, target='tpm_unstranded') # options: 'unstranded','stranded_first','stranded_second','tpm_unstranded','fpkm_unstranded','fpkm_uq_unstranded'
# Restrict to 5000 genes using the 'variance' method. The trailing underscore
# suggests the processor is modified in place — TODO confirm.
eproc.select_genes_('variance', top_n=5000) # options: 'tcga', 'variance' 
# Apply 'zscore' normalization; the recorded training output below shows an
# overall mean of ~0 and std of ~1 afterwards.
eproc.normalize_('zscore') # options: 'minmax', 'zscore'

In [4]:
# Pull the processed training matrix and its sample ids out of the processor,
# then hold the matrix as a float32 tensor.
X_train, train_ids = eproc.get_data()
X_train = torch.tensor(X_train, dtype=torch.float32)

# Sanity checks: one id per row, and no NaN / Inf entries anywhere.
assert len(train_ids) == X_train.size(0), 'X_train and train_ids do not match in length'
assert not X_train.isnan().any(), 'X_train contains NaN values'
assert not X_train.isinf().any(), 'X_train contains Inf values'

# Summary statistics over every entry of the matrix, one per line.
# `!s` keeps the `tensor(...)` rendering that print() would produce.
print('\n'.join(
    f'{label} {stat!s}'
    for label, stat in (
        ('min value:', X_train.min()),
        ('max value:', X_train.max()),
        ('mean value:', X_train.mean()),
        ('std value:', X_train.std()),
    )
))

min value: tensor(-11.8888)
max value: tensor(16.8045)
mean value: tensor(1.5289e-09)
std value: tensor(0.9990)


In [5]:
# Run the validation table through the processor built on the training table,
# then hold the resulting matrix as a float32 tensor.
X_val, val_ids = eproc.process_new(expr_long2)
X_val = torch.tensor(X_val, dtype=torch.float32)

# Sanity checks: one id per row, and no NaN / Inf entries anywhere.
assert len(val_ids) == X_val.size(0), 'X_val and val_ids do not match in length'
assert not X_val.isnan().any(), 'X_val contains NaN values'
assert not X_val.isinf().any(), 'X_val contains Inf values'

# Summary statistics over every entry of the matrix, one per line.
# `!s` keeps the `tensor(...)` rendering that print() would produce.
print('\n'.join(
    f'{label} {stat!s}'
    for label, stat in (
        ('min value:', X_val.min()),
        ('max value:', X_val.max()),
        ('mean value:', X_val.mean()),
        ('std value:', X_val.std()),
    )
))

min value: tensor(-9.8833)
max value: tensor(17.4198)
mean value: tensor(-0.2803)
std value: tensor(1.2015)


In [6]:
# Run the test table through the processor built on the training table,
# then hold the resulting matrix as a float32 tensor.
X_test, test_ids = eproc.process_new(expr_long3)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Sanity checks: one id per row, and no NaN / Inf entries anywhere.
assert len(test_ids) == X_test.size(0), 'X_test and test_ids do not match in length'
assert not X_test.isnan().any(), 'X_test contains NaN values'
assert not X_test.isinf().any(), 'X_test contains Inf values'

# Summary statistics over every entry of the matrix, one per line.
# `!s` keeps the `tensor(...)` rendering that print() would produce.
print('\n'.join(
    f'{label} {stat!s}'
    for label, stat in (
        ('min value:', X_test.min()),
        ('max value:', X_test.max()),
        ('mean value:', X_test.mean()),
        ('std value:', X_test.std()),
    )
))

min value: tensor(-5.6350)
max value: tensor(10.9079)
mean value: tensor(-0.0441)
std value: tensor(1.0665)


In [7]:
# Show the shape of each partition (train, validation, test), one per line.
print(*(split.shape for split in (X_train, X_val, X_test)), sep='\n')

torch.Size([511, 5000])
torch.Size([151, 5000])
torch.Size([55, 5000])


In [8]:
# Stack the three partitions row-wise in train -> validation -> test order,
# then hand the result to NumPy.
X = torch.cat((X_train, X_val, X_test), dim=0)
X = X.detach().numpy()

# Sample ids are joined in the same order so they stay aligned with the rows.
ids = np.concatenate((train_ids, val_ids, test_ids), axis=0)

# Rows are indexed by sample id; columns are labelled with the selected genes.
df = pd.DataFrame(data=X, index=ids, columns=eproc.selected_genes)
df.head()


Unnamed: 0,S100A9,MT-CO2,LYZ,MT-ND4,MT-CO3,MPO,MT-CO1,SRGN,HBB,MT-ND1,...,PANX2,AGPAT3,ZNF331,CDK11B,TBC1D14,E2F3,COX6C,KRT1,PAM,LINC02604
001454b2-aff9-4659-85a6-73fb8092589a,-0.652122,-0.51968,0.046059,0.274374,0.211972,0.125731,0.62899,-1.499215,-0.684449,-1.299649,...,-0.355553,1.248538,0.354875,-0.141903,1.289538,-0.448482,-0.152845,-0.292104,0.495891,1.081057
002cacd9-c03b-4526-a380-0701f41c4a9e,-2.163392,0.058464,-2.791249,0.044888,1.019568,-0.248765,1.458651,-2.953973,0.830195,0.014082,...,0.967753,0.604003,0.666343,-1.22423,-0.2439,-1.937637,0.51502,-0.048977,0.854153,1.226441
006e5777-2603-4db7-a1d6-8c8085c5e3e5,0.047106,-2.044368,0.424308,-1.311193,-1.24436,0.548423,-0.632209,0.137187,0.183577,-1.884045,...,0.601502,0.525237,0.001848,-0.197807,1.480494,0.918404,-0.077817,-0.341272,1.098997,1.517063
00870f33-cab3-4c23-bd0d-8903a5a9789e,0.841024,-0.328335,-0.066645,-0.536656,0.433866,-0.604071,-0.425134,0.571021,0.475116,0.485768,...,2.221498,-0.515301,1.342852,0.562208,-1.651345,0.345054,-0.544466,-0.59007,-1.12366,0.27441
00b535f6-064a-4dcf-ab14-387a54eedeee,0.688142,-0.855246,0.751074,-1.789957,-0.428534,-0.209685,-1.011842,1.19391,0.350485,-0.97611,...,0.102146,1.449877,-0.861223,0.470082,-1.445936,-1.334725,2.229457,-0.256749,-0.489086,0.624312


In [9]:
# Make sure the output directory exists before writing anything into it.
# NOTE(review): inputs were read from '../../data/' but outputs go to
# '../data/' — confirm the two locations are intentionally different.
os.makedirs('../data/', exist_ok=True)

# Persist which sample ids belong to each partition.
torch.save(
    obj={
        'train_ids': train_ids,
        'val_ids': val_ids,
        'test_ids': test_ids,
    },
    f='../data/aml_partitions.pt',
)

# Persist the combined expression matrix (index = sample ids).
df.to_csv(path_or_buf='../data/aml_expr.csv')