This notebook performs data augmentation using only patient data, with unconditional GANs.

Reference: https://forge.ibisc.univ-evry.fr/alacan/GANs-for-transcriptomics

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import sys

In [2]:
# Put the project's baseline and metrics source folders on the import path
# (presumably where the `gan` and `configs` modules imported below live).
sys.path.extend(["../src/baselines/", "../src/metrics/"])

In [3]:
from gan import GAN

In [4]:
from configs import CONFIG_WGAN as CONFIG

In [5]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU so the
# notebook can still be executed on machines without CUDA.
CONFIG["device"] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# Sample number substituted into the `..._sample{sample_id}.csv` file names that the
# notebook reads and writes further down.
sample_id: int = 2

In [7]:
# Print the full configuration dictionary (including the device set above) so the
# settings used for this run are recorded in the notebook output.
print(CONFIG)

{'latent_dim': 128, 'x_dim': 7776, 'embedded_dim': 2, 'numerical_dim': 3, 'hidden_dim1_g': 256, 'hidden_dim2_g': 512, 'hidden_dim3_g': 1024, 'hidden_dim1_d': 512, 'hidden_dim2_d': 256, 'hidden_dim3_d': None, 'output_dim': 1, 'vocab_size': 24, 'categorical': 'tissue_type', 'activation': 'leaky_relu', 'negative_slope': 0.05, 'optimizer': 'adam', 'lr_g': 0.0001, 'lr_d': 0.001, 'batch_size': 256, 'epochs': 800, 'iters_critic': 5, 'lambda_penalty': 10, 'nb_principal_components': 2000, 'prob_success': 0, 'norm_scale': 0.5, 'epochs_checkpoints': [], 'checkpoint_dir': '../src/baselines/gan/checkpoints', 'log_dir': '../src/baselines/gan/logs/', 'fig_dir': '../src/baselines/gan/figures', 'step': 100, 'pca_applied': True, 'device': device(type='cuda', index=0)}


In [8]:
# Build the GAN baseline from the configuration dictionary.
gan_model = GAN(CONFIG)

In [9]:
# Patient (TCGA) dataset: train/test CSVs for the selected sample; the first CSV column is the row index.
tcga_train_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_train_sample{sample_id}.csv", index_col=0)
tcga_test_df = pd.read_csv(f"../data/diffusion_pretraining/tcga_diffusion_test_sample{sample_id}.csv", index_col=0)
# Each dataset item is an (x, x) pair: the same values fill both tensor slots.
tcga_train_dataset = TensorDataset(torch.tensor(tcga_train_df.values), torch.tensor(tcga_train_df.values))
tcga_test_dataset = TensorDataset(torch.tensor(tcga_test_df.values), torch.tensor(tcga_test_df.values))
# Take the batch size from CONFIG rather than repeating the literal, so the loaders
# cannot drift out of sync with the configuration the model was built from.
tcga_train_dataloader = DataLoader(tcga_train_dataset, batch_size=CONFIG["batch_size"], shuffle=True)
tcga_test_dataloader = DataLoader(tcga_test_dataset, batch_size=CONFIG["batch_size"], shuffle=False)

In [10]:
# Train the GAN on the patient loaders with no categorical covariate (categorical=None).
# Every hyperparameter is read from CONFIG; CONFIG['iters_critic'] and
# CONFIG['lambda_penalty'] are not forwarded to train().
gan_model.train(
    TrainDataLoader=tcga_train_dataloader,
    ValDataLoader=tcga_test_dataloader,
    z_dim=CONFIG['latent_dim'],
    epochs=CONFIG['epochs'],
    categorical=None,
    step=CONFIG['step'],
    verbose=True,
    checkpoint_dir=CONFIG['checkpoint_dir'],
    log_dir=CONFIG['log_dir'],
    fig_dir=CONFIG['fig_dir'],
    prob_success=CONFIG['prob_success'],
    norm_scale=CONFIG['norm_scale'],
    optimizer=CONFIG['optimizer'],
    lr_g=CONFIG['lr_g'],
    lr_d=CONFIG['lr_d'],
    nb_principal_components=CONFIG['nb_principal_components'],
    config=CONFIG,
    hyperparameters_search=False,
)

Directory '../src/baselines/gan/logs/20240417-165204' created
Directory '../src/baselines/gan/checkpoints/20240417-165204' created
Directory '../src/baselines/gan/figures/20240417-165204' created
Time of training: 34.956 sec = 0.5826 minute(s) = 0.0097 hour(s)
--------------------
Discriminator saved at ../src/baselines/gan/checkpoints/20240417-165204/_disc.pt and generator saved at ../src/baselines/gan/checkpoints/20240417-165204/_gen.pt.


In [11]:
# Run the trained model over the patient training loader to obtain arrays of real and
# generated samples (per the method name), then display both shapes.
x_real, x_gen = gan_model.real_fake_data(
    tcga_train_dataloader,
    z_dim=CONFIG['latent_dim'],
)
x_gen.shape, x_real.shape

((476, 7776), (476, 7776))

In [12]:
# Wrap the generated samples in a frame that reuses the training frame's column names
# and row index, then write it out as CSV for the selected sample.
augmented_tcga_df = pd.DataFrame(x_gen, columns=tcga_train_df.columns, index=tcga_train_df.index)
augmented_tcga_df.to_csv(f"/data/ajayago/druid/intermediate/cs6220/baselines/augmented_gan_tcga_sample{sample_id}.csv")

Inference on cell lines (not very relevant to the patient-only augmentation above)

In [11]:
# Load the cell-line training CSV for the selected sample (first column as index)
# and display its dimensions.
cl_train_path = f"../data/diffusion_pretraining/cl_diffusion_train_sample{sample_id}.csv"
cl_train_df = pd.read_csv(cl_train_path, index_col=0)
cl_train_df.shape

(1569, 7776)

In [12]:
# Load the cell-line test CSV for the selected sample (first column as index)
# and display its dimensions.
cl_test_path = f"../data/diffusion_pretraining/cl_diffusion_test_sample{sample_id}.csv"
cl_test_df = pd.read_csv(cl_test_path, index_col=0)
cl_test_df.shape

(175, 7776)

In [13]:
# Cell-line training data as (x, x) pairs: the same values fill both tensor slots.
cl_train_dataset = TensorDataset(torch.tensor(cl_train_df.values), torch.tensor(cl_train_df.values))
# Batch size is read from CONFIG instead of a repeated literal so it stays in sync
# with the configuration the model was built from.
cl_train_dataloader = DataLoader(cl_train_dataset, batch_size=CONFIG["batch_size"], shuffle=True)

In [14]:
# Cell-line test data as (x, x) pairs: the same values fill both tensor slots.
cl_test_dataset = TensorDataset(torch.tensor(cl_test_df.values), torch.tensor(cl_test_df.values))
# The test loader is not shuffled, matching tcga_test_dataloader, so rows come back in
# the same order as cl_test_df.index. Batch size is read from CONFIG instead of a literal.
cl_test_dataloader = DataLoader(cl_test_dataset, batch_size=CONFIG["batch_size"], shuffle=False)

In [13]:
# Run the patient-trained model over the cell-line training loader to obtain arrays of
# real and generated samples (per the method name), then display both shapes.
x_real_cl, x_gen_cl = gan_model.real_fake_data(
    cl_train_dataloader,
    z_dim=CONFIG['latent_dim'],
)
x_gen_cl.shape, x_real_cl.shape

((1569, 7776), (1569, 7776))

In [22]:
# Same as for the cell-line training loader, but over the cell-line test loader.
x_real_cl_test, x_gen_cl_test = gan_model.real_fake_data(
    cl_test_dataloader,
    z_dim=CONFIG['latent_dim'],
)
x_gen_cl_test.shape, x_real_cl_test.shape

((175, 7776), (175, 7776))

In [25]:
# Sanity check: shape of the train and test generated samples stacked along the row axis.
np.concatenate([x_gen_cl, x_gen_cl_test], axis=0).shape

(1744, 7776)

In [29]:
# Row labels for the stacked generated samples: training index followed by test index.
new_idx = [*cl_train_df.index, *cl_test_df.index]
len(new_idx)

1744

In [30]:
# pd.DataFrame(np.concatenate((x_gen_cl, x_gen_cl_test)), columns = cl_train_df.columns, index=new_idx).to_csv("/data/ajayago/druid/intermediate/cs6220/baselines/augmented_gan.csv")