In [1]:
import os
import pandas as pd 
import numpy as np
import torch
import pytorch_lightning as L
import torch.utils.data as data
from data.preprocess import ConcatDataset
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")


In [2]:

# --- Reproducibility -------------------------------------------------------
# Seed NumPy, PyTorch and Lightning with the same value, then put cuDNN in
# deterministic mode with benchmark autotuning switched off.
np.random.seed(1235)
torch.manual_seed(1235)
L.seed_everything(1235)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# --- Paths -----------------------------------------------------------------
PATH_data = "Data"

# --- Load data -------------------------------------------------------------
# The two omics matrices are read as headerless numeric CSVs and converted to
# float32 tensors.
X1 = np.loadtxt(os.path.join(PATH_data, "TCGA", 'TCGA_mRNAs_processed.csv'), delimiter=",")
X2 = np.loadtxt(os.path.join(PATH_data, "TCGA", 'TCGA_miRNAs_processed.csv'), delimiter=",")
X1, X2 = (torch.from_numpy(arr).to(torch.float32) for arr in (X1, X2))

# Clinical table: skip the header row and keep file columns 1..5.
traits = np.loadtxt(
    os.path.join(PATH_data, "TCGA", 'TCGA_clinic.csv'),
    delimiter=",",
    skiprows=1,
    usecols=(1, 2, 3, 4, 5),
)
# Trait of interest: last of the selected clinical columns.
Y = traits[:, -1]
# Single confounder: index 1 of the selected columns (age, per the original
# note -- TODO confirm against the CSV header), min-max scaled to [0, 1].
conf = traits[:, 1]
conf_min, conf_max = np.min(conf), np.max(conf)
conf = (conf - conf_min) / (conf_max - conf_min)
print('Shape of confounders:', conf.shape)

# --- Train / validation / test split ----------------------------------------
# Shuffle all sample indices once, then cut the permutation at fixed offsets.
n_samples = X1.shape[0]
indices = np.random.permutation(n_samples)
train_end, val_end = 2100, 2700
train_idx = indices[:train_end]
val_idx = indices[train_end:val_end]
test_idx = indices[val_end:]

# TODO: the variant below evaluated clustering on the whole dataset. It is
# disabled because the test set is also needed for other metrics; revisit how
# to support both.
# # we test on the whole dataset for clustering
# train_idx = np.concatenate((train_idx, test_idx))
# test_idx = indices
split_indices = (train_idx, val_idx, test_idx)
X1_train, X1_val, X1_test = (X1[idx, :] for idx in split_indices)
X2_train, X2_val, X2_test = (X2[idx, :] for idx in split_indices)
conf_train, conf_val, conf_test = (conf[idx] for idx in split_indices)
Y_test = Y[test_idx]


Global seed set to 1235


Shape of confounders: (3024,)


In [3]:
# Load each trained checkpoint and measure, on the held-out test split, the
# absolute Pearson correlation between every latent dimension and the
# confounder. Results are collected per checkpoint label in `dic_res`.

from models.adversarial_XVAE import XVAE_w_advNet

dic_res = dict()
for epoch in ["epoch1", "epoch50"]:

    ckpt_xvae_path = os.path.join(
        os.getcwd(), "lightning_logs", "advTraining", "XVAE_adv_pingpong", epoch, "checkpoints"
    )
    # os.listdir() returns entries in arbitrary order and may contain files
    # that are not checkpoints, so select the checkpoint explicitly and
    # deterministically instead of taking whatever happens to be entry [0].
    ckpt_candidates = sorted(f for f in os.listdir(ckpt_xvae_path) if f.endswith(".ckpt"))
    if not ckpt_candidates:
        raise FileNotFoundError(f"No .ckpt file found in {ckpt_xvae_path}")
    ckpt_xvae_file = os.path.join(ckpt_xvae_path, ckpt_candidates[0])
    xvae = XVAE_w_advNet.load_from_checkpoint(ckpt_xvae_file)

    # Inference only: put the module in eval mode and disable gradient
    # tracking so the embedding does not depend on training-time behaviour
    # (harmless if generate_embedding already takes care of this).
    xvae.eval()
    with torch.no_grad():
        z = xvae.xvae_pre.generate_embedding(X1_test, X2_test).detach().numpy()

    # np.corrcoef stacks the latent dimensions (rows of z.T) with the
    # confounder as the final variable; the last column without its own
    # diagonal entry is corr(latent_i, confounder).
    # Kept as a one-row list, the layout the disabled heatmap cell hands to imshow.
    corr_conf = [np.abs(np.corrcoef(z.T, conf_test)[:-1, -1])]
    dic_res[epoch] = corr_conf[0]



 Training adv XVAE 




 Training adv XVAE 




In [8]:
# Fraction by which the mean absolute latent-confounder correlation drops
# between the "epoch1" and "epoch50" checkpoints (one row per latent dimension).
df_res = pd.DataFrame.from_dict(dic_res)
corr_drop = df_res["epoch1"] - df_res["epoch50"]
corr_drop.mean() / df_res["epoch1"].mean()

0.16443965247323153

In [32]:
# NOTE: this whole cell is disabled and kept for reference only.
# If re-enabled it draws a one-row heatmap of |corr(z, conf_test)|, where `z`
# is whatever the checkpoint-loading loop assigned last. The first commented
# corr_conf line below refers to `LF` and a 2-D `conf`, which do not match the
# variables used elsewhere in this notebook -- presumably left over from an
# earlier version.
# ''' Absolute correlation to confounding variables '''
# #corr_conf = [np.abs(np.corrcoef(LF.T, conf[:,i].T)[:-1,-1]) for i in range(conf.shape[1])]
# corr_conf = [np.abs(np.corrcoef(z.T, conf_test.T)[:-1,-1])]
# fig, ax = plt.subplots(figsize=(15,5))
# im = plt.imshow(corr_conf, cmap='hot', interpolation='nearest')
# labels = ['Age']
# labels_onehot = ['Age']
# ax.set_yticks(np.arange(1), labels=labels_onehot)
# ax.tick_params(axis='both', labelsize=10)
# plt.colorbar(im)