In [1]:
import random
from datetime import datetime
from random import sample

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from sklearn.preprocessing import StandardScaler
from torch.autograd import Variable
from torch.distributions.kl import kl_divergence
from torch.distributions.multivariate_normal import MultivariateNormal
from torch.utils.data import DataLoader, Dataset

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.utils import save_image

import VAE_tybalt
from VAE_tybalt import VAE

In [2]:
# Normalized TCGA data from the Tybalt GitHub repository (gzipped, tab-separated file).
# The path is relative, so it is resolved against the notebook's working directory.
tcga_tybalt_file_location = "data/pancan_scaled_zeroone_rnaseq.tsv.gz"

In [3]:
# Load the expression table, discard its first column, and keep only the rows
# that have no missing values.
tcga_rnaseq_df = pd.read_table(tcga_tybalt_file_location)
tcga_rnaseq_df = tcga_rnaseq_df.drop(columns=tcga_rnaseq_df.columns[0]).dropna()
print(tcga_rnaseq_df.shape)
tcga_rnaseq_df.head(2)

(10459, 5000)


Unnamed: 0,RPS4Y1,XIST,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,...,FAM129A,C8orf48,CDK5R1,FAM81A,C13orf18,GDPD3,SMAGP,C2orf85,POU5F1B,CHST2
0,0.678296,0.28991,0.03423,0.0,0.0,0.084731,0.031863,0.037709,0.746797,0.687833,...,0.44061,0.428782,0.732819,0.63434,0.580662,0.294313,0.458134,0.478219,0.168263,0.638497
1,0.200633,0.654917,0.181993,0.0,0.0,0.100606,0.050011,0.092586,0.103725,0.140642,...,0.620658,0.363207,0.592269,0.602755,0.610192,0.374569,0.72242,0.271356,0.160465,0.60256


In [4]:
# Define function to add Gaussian noise with a different variance per column of a pandas DataFrame
def add_gaussian_noise(df, variances):
    """Return a copy of ``df`` with zero-mean Gaussian noise added to every element.

    Args:
        df: DataFrame of numeric values. It is not modified.
        variances: One noise variance per column of ``df``, in column order.

    Returns:
        A new DataFrame with the same index and columns as ``df``.

    Raises:
        AssertionError: If ``len(variances)`` differs from the number of columns.
    """
    assert len(variances) == len(
        df.columns
    ), "Number of variances must match the number of columns in the DataFrame."

    # The per-column standard deviations broadcast across the rows of the
    # (n_rows, n_columns) sample. The noise frame must carry df's own index:
    # DataFrame addition aligns on labels, so a default RangeIndex would
    # produce NaN rows whenever df's index is not 0..n-1 (e.g. after dropna).
    noise = pd.DataFrame(
        np.random.normal(0, np.sqrt(variances), size=df.shape),
        index=df.index,
        columns=df.columns,
    )

    # Add the noise to the input DataFrame
    noisy_df = df + noise

    return noisy_df

In [5]:
# Seed NumPy's global generator so the variance assignment (and the noise drawn
# from the same generator afterwards) is reproducible under Restart & Run All.
np.random.seed(0)

# One noise variance per column, evenly spaced over [0, 0.2] and then shuffled
# in place so the variance is unrelated to column order.
variances = np.linspace(0, 0.2, len(tcga_rnaseq_df.columns))
np.random.shuffle(variances)

In [6]:
# Noisy copy of the expression table; the clean frame is left untouched
noisy_tcga_rnaseq_df = add_gaussian_noise(df=tcga_rnaseq_df, variances=variances)

In [7]:
# Preview the first five rows of the noisy table
noisy_tcga_rnaseq_df.head(n=5)

Unnamed: 0,RPS4Y1,XIST,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,...,FAM129A,C8orf48,CDK5R1,FAM81A,C13orf18,GDPD3,SMAGP,C2orf85,POU5F1B,CHST2
0,1.0029,0.07654,0.026738,-0.187752,0.170346,0.140029,0.103744,0.343436,0.796252,0.584724,...,0.588091,0.499679,0.795139,0.476082,0.332138,0.445158,0.753875,0.026667,-0.218744,0.820836
1,0.208271,0.493415,-0.511877,0.20762,0.167754,0.430211,0.020135,0.836289,0.079512,0.505806,...,0.734051,0.601302,0.720078,0.361085,0.284556,0.509014,0.426944,1.055141,-0.306562,0.33948
2,1.042233,0.569753,-0.194156,0.244873,-0.442123,0.126243,0.037735,-0.7001,0.655012,0.42245,...,0.308693,0.752764,0.684626,0.035813,0.203002,0.484434,0.346695,0.665758,0.403595,0.085154
3,0.70605,0.354596,0.181197,0.149144,-0.183885,-0.063603,-0.00754,-0.912117,0.477399,1.00524,...,0.462627,0.419944,0.932775,1.262112,0.98261,0.53377,0.729427,-0.173217,0.207326,0.202362
4,0.78777,0.017812,0.044124,-0.216968,-0.292206,-0.270384,0.115107,-0.120718,0.690189,0.949843,...,0.583401,0.166573,0.827972,0.216888,-0.244023,0.05073,-0.137345,0.165989,0.136734,1.402816


In [8]:
# Preview the first three rows of the clean table for comparison with the noisy one
tcga_rnaseq_df.head(n=3)

Unnamed: 0,RPS4Y1,XIST,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,...,FAM129A,C8orf48,CDK5R1,FAM81A,C13orf18,GDPD3,SMAGP,C2orf85,POU5F1B,CHST2
0,0.678296,0.28991,0.03423,0.0,0.0,0.084731,0.031863,0.037709,0.746797,0.687833,...,0.44061,0.428782,0.732819,0.63434,0.580662,0.294313,0.458134,0.478219,0.168263,0.638497
1,0.200633,0.654917,0.181993,0.0,0.0,0.100606,0.050011,0.092586,0.103725,0.140642,...,0.620658,0.363207,0.592269,0.602755,0.610192,0.374569,0.72242,0.271356,0.160465,0.60256
2,0.78598,0.140842,0.081082,0.0,0.0,0.0,0.0,0.0,0.730648,0.657189,...,0.437658,0.471489,0.868774,0.471141,0.487212,0.385521,0.466642,0.784059,0.160797,0.557074


In [9]:
# Hold out a random 20% of the rows for validation; the remaining rows form the training set
test_set_percent = 0.2
tcga_df_test = tcga_rnaseq_df.sample(frac=test_set_percent)
tcga_df_train = tcga_rnaseq_df.drop(index=tcga_df_test.index)

In [10]:
# Reuse the row labels chosen for the clean split instead of sampling again, so
# the clean and noisy experiments are trained and validated on exactly the same
# samples and their losses can be compared directly.
test_set_percent = 0.2
noisy_tcga_df_test = noisy_tcga_rnaseq_df.loc[tcga_df_test.index]
noisy_tcga_df_train = noisy_tcga_rnaseq_df.drop(index=noisy_tcga_df_test.index)

In [11]:
# Define custom dataset class
class CustomDataset(Dataset):
    """Dataset over a pandas DataFrame that yields one row per sample."""

    def __init__(self, data):
        # Keep a reference to the frame; rows are converted to tensors on access.
        self.data = data

    def __len__(self):
        # One sample per row of the wrapped frame.
        n_rows = len(self.data)
        return n_rows

    def __getitem__(self, idx):
        # Positional row lookup followed by conversion to a float32 tensor.
        row = self.data.iloc[idx]
        return torch.tensor(row.values, dtype=torch.float32)

In [12]:
# Wrap every split (clean and noisy, train and test) so it can be fed to a DataLoader
noisy_train_dataset = CustomDataset(data=noisy_tcga_df_train)
noisy_test_dataset = CustomDataset(data=noisy_tcga_df_test)
train_dataset = CustomDataset(data=tcga_df_train)
test_dataset = CustomDataset(data=tcga_df_test)

In [13]:
# Batches of 32 rows: training loaders reshuffle every epoch, validation loaders
# keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
noisy_train_loader = DataLoader(noisy_train_dataset, batch_size=32, shuffle=True)
noisy_validation_loader = DataLoader(noisy_test_dataset, batch_size=32, shuffle=False)

In [14]:
def train_one_epoch(epoch_index, tb_writer, train_loader, log_every=100):
    """Run one optimisation pass over ``train_loader`` and return a mean batch loss.

    Relies on the notebook-level globals ``model``, ``optimizer``, ``DEVICE``,
    ``input_dim`` and ``VAE``, which must be defined before the call.

    Args:
        epoch_index: Zero-based epoch counter, used only to compute the
            TensorBoard step.
        tb_writer: Object exposing ``add_scalar(tag, value, step)``.
        train_loader: Iterable of batches; each batch must support
            ``.to(DEVICE)``. ``len(train_loader)`` is needed only when a
            reporting window completes.
        log_every: Positive number of batches per reporting window.

    Returns:
        The mean per-batch loss of the last complete window of ``log_every``
        batches. If no window completed (the loader yielded fewer than
        ``log_every`` batches), the mean over the batches that were seen, or
        0.0 for an empty loader.
    """
    running_loss = 0.0
    last_loss = 0.0
    batches_in_window = 0
    completed_window = False

    # enumerate() gives the batch index for intra-epoch reporting
    for i, data in enumerate(train_loader):
        data = data.to(DEVICE)

        # Gradients accumulate by default, so reset them for every batch
        optimizer.zero_grad()

        # Forward pass
        recon_batch, latent = model(data)

        # Compute the loss and its gradients
        loss = VAE.loss_function_dist(recon_batch, data, latent, input_dim)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report once per window
        running_loss += loss.item()
        batches_in_window += 1
        if batches_in_window == log_every:
            last_loss = running_loss / log_every  # loss per batch
            print("  batch {} loss: {}".format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar("Loss/train", last_loss, tb_x)
            running_loss = 0.0
            batches_in_window = 0
            completed_window = True

    # Short loaders never complete a window; report what was seen rather than
    # the 0.0 placeholder.
    if not completed_window and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss

In [15]:
# Remove everything under runs/ (the directory the SummaryWriter logs to) before starting new runs
!rm -rf runs/*

In [16]:
# build model
input_dim = tcga_rnaseq_df.shape[1]

# Size the network from the data rather than repeating a literal, so the model
# and the loss (which also receives input_dim) always agree.
model = VAE(input_dim=input_dim, hidden_dim=[512, 256], z_dim=100)

# Training is pinned to the CPU.
DEVICE = "cpu"

model.to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.0005)


# One TensorBoard run directory per execution of this cell, keyed by start time
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
writer = SummaryWriter("runs/tcga_trainer_{}".format(timestamp))
epoch_number = 0

EPOCHS = 40

# Start from +inf so the first epoch always registers as the best so far
best_vloss = float("inf")

for epoch in range(EPOCHS):
    print("EPOCH {}:".format(epoch_number + 1))

    # Training mode on, then one pass over the clean training data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer, train_loader)

    # Evaluation mode for reporting
    model.train(False)

    running_vloss = 0.0
    # no_grad skips building autograd graphs during evaluation; .item() keeps
    # the accumulator a plain float so no graph stays alive through it.
    with torch.no_grad():
        for i, vdata in enumerate(validation_loader):
            vinputs = vdata.to(DEVICE)
            voutputs, latent = model(vinputs)

            vloss = VAE.loss_function_dist(voutputs, vinputs, latent, input_dim)
            running_vloss += vloss.item()

    avg_vloss = running_vloss / (i + 1)
    print("LOSS train {} valid {}".format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars(
        "Training vs. Validation Loss",
        {"Training": avg_loss, "Validation": avg_vloss},
        epoch_number + 1,
    )
    writer.flush()

    # Track best performance and the path a checkpoint would be written to
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = "model_{}_{}".format(timestamp, epoch_number)
        # Checkpointing is currently disabled; uncomment to write the weights:
        # torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 100 loss: 96026.9003125
  batch 200 loss: 91149.7521875
LOSS train 91149.7521875 valid 100409.96875
EPOCH 2:
  batch 100 loss: 90055.60265625
  batch 200 loss: 89582.620546875
LOSS train 89582.620546875 valid 94886.1875
EPOCH 3:
  batch 100 loss: 89173.324140625
  batch 200 loss: 88848.005078125
LOSS train 88848.005078125 valid 94675.2578125
EPOCH 4:
  batch 100 loss: 88708.456796875
  batch 200 loss: 88551.723359375
LOSS train 88551.723359375 valid 93217.359375
EPOCH 5:
  batch 100 loss: 88349.42546875
  batch 200 loss: 88439.39890625
LOSS train 88439.39890625 valid 94100.8125
EPOCH 6:
  batch 100 loss: 88335.243984375
  batch 200 loss: 88206.891875
LOSS train 88206.891875 valid 93082.2890625
EPOCH 7:
  batch 100 loss: 88062.640703125
  batch 200 loss: 88132.57484375
LOSS train 88132.57484375 valid 92943.3984375
EPOCH 8:
  batch 100 loss: 87861.62875
  batch 200 loss: 88076.188203125
LOSS train 88076.188203125 valid 93667.171875
EPOCH 9:
  batch 100 loss: 87952.724609

### Look at learning with the noisy dataset

In [17]:
# build model (a new VAE instance, so this run starts from fresh weights)
input_dim = tcga_rnaseq_df.shape[1]

# Size the network from the data rather than repeating a literal, so the model
# and the loss (which also receives input_dim) always agree.
model = VAE(input_dim=input_dim, hidden_dim=[512, 256], z_dim=100)

# Training is pinned to the CPU.
DEVICE = "cpu"

model.to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.0005)


# One TensorBoard run directory per execution of this cell, keyed by start time
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
writer = SummaryWriter("runs/tcga_trainer_{}".format(timestamp))
epoch_number = 0

EPOCHS = 40

# Start from +inf so the first epoch always registers as the best so far
best_vloss = float("inf")

for epoch in range(EPOCHS):
    print("EPOCH {}:".format(epoch_number + 1))

    # Training mode on, then one pass over the noisy training data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer, noisy_train_loader)

    # Evaluation mode for reporting
    model.train(False)

    running_vloss = 0.0
    # no_grad skips building autograd graphs during evaluation; .item() keeps
    # the accumulator a plain float so no graph stays alive through it.
    with torch.no_grad():
        for i, vdata in enumerate(noisy_validation_loader):
            vinputs = vdata.to(DEVICE)
            voutputs, latent = model(vinputs)

            vloss = VAE.loss_function_dist(voutputs, vinputs, latent, input_dim)
            running_vloss += vloss.item()

    avg_vloss = running_vloss / (i + 1)
    print("LOSS train {} valid {}".format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars(
        "Training vs. Validation Loss",
        {"Training": avg_loss, "Validation": avg_vloss},
        epoch_number + 1,
    )
    writer.flush()

    # Track best performance and the path a checkpoint would be written to
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = "model_{}_{}".format(timestamp, epoch_number)
        # Checkpointing is currently disabled; uncomment to write the weights:
        # torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
  batch 100 loss: 96568.7828125
  batch 200 loss: 92017.350078125
LOSS train 92017.350078125 valid 99313.8515625
EPOCH 2:
  batch 100 loss: 90339.232734375
  batch 200 loss: 90166.429921875
LOSS train 90166.429921875 valid 94253.2109375
EPOCH 3:
  batch 100 loss: 89617.91296875
  batch 200 loss: 89359.778125
LOSS train 89359.778125 valid 93638.671875
EPOCH 4:
  batch 100 loss: 89011.6009375
  batch 200 loss: 89024.31328125
LOSS train 89024.31328125 valid 93340.5625
EPOCH 5:
  batch 100 loss: 88667.4715625
  batch 200 loss: 88737.743515625
LOSS train 88737.743515625 valid 93178.9375
EPOCH 6:
  batch 100 loss: 88371.89921875
  batch 200 loss: 88544.034140625
LOSS train 88544.034140625 valid 92935.25
EPOCH 7:
  batch 100 loss: 88469.122890625
  batch 200 loss: 88161.65546875
LOSS train 88161.65546875 valid 92567.1796875
EPOCH 8:
  batch 100 loss: 88014.2909375
  batch 200 loss: 88287.259765625
LOSS train 88287.259765625 valid 92807.734375
EPOCH 9:
  batch 100 loss: 87938.80046875