# Testing DRPredICT validation

**Authorship:**
Adam Klie, *03/29/2022*
***
**Description:**
Notebook for testing the validation of DRPredICT architectures

<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li></li></b>
    </ul>
</div>

In [3]:
import numpy as np
import pandas as pd
import torch

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

import sys
sys.path.append("../drpredict")

# Random dataset
<div class="alert alert-info" role="alert">
  <b>Just test out the basic function of our eugene architectures</b>
</div>

## Instantiate DRPredICT architecture: VanillaAE

In [4]:
from VanillaAE import VanillaAE

In [5]:
encoder_args = {"activation": "sigmoid"}
decoder_args = {"dropout_rate": 0.2}

In [6]:
mutAE = VanillaAE(1000, 10, hidden_dims=[500], encoder_kwargs=encoder_args, decoder_kwargs=decoder_args)

In [7]:
x = torch.randn(10, 1000)
out = mutAE(x)
out.shape

torch.Size([10, 1000])

## Load data

In [8]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

### Training set

In [9]:
training_dataset = dataset.TensorDataset(torch.randn(1000, 1000))
training_dataloader = DataLoader(training_dataset, batch_size=32, num_workers=4)
training_dataset[0][0].shape

torch.Size([1000])

In [10]:
for i_batch, batch in enumerate(training_dataloader):
    x = batch[0]
    outs = mutAE(x)
    print(x.shape, outs.shape)
    if i_batch==3:
        break

torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([32, 1000]) torch.Size([32, 1000])


### Validation set

In [11]:
validation_dataset = dataset.TensorDataset(torch.randn(100, 1000))
validation_dataloader = DataLoader(validation_dataset, batch_size=32, num_workers=4)
validation_dataset[0][0].shape

torch.Size([1000])

In [12]:
for i_batch, batch in enumerate(validation_dataloader):
    x = batch[0]
    outs = mutAE(x)
    print(x.shape, outs.shape)
    if i_batch==3:
        break

torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([32, 1000]) torch.Size([32, 1000])
torch.Size([4, 1000]) torch.Size([4, 1000])


## Training with PyTorch Lightning

In [13]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [14]:
logger = TensorBoardLogger("random_test", name="VanillaAE")
trainer = pl.Trainer(gpus=1, max_epochs=10, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [19]:
trainer.fit(model=mutAE, train_dataloader=validation_dataloader, val_dataloaders=validation_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name    | Type                 | Params
-------------------------------------------------
0 | encoder | FullyConnectedModule | 505 K 
1 | decoder | FullyConnectedModule | 506 K 
-------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.048     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [26]:
trainer.predict(model=mutAE, dataloaders=validation_dataloader)[0].shape

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 4it [00:00, ?it/s]

torch.Size([32, 10])

# MPRADataset

## Instantiate EUGENE architecture: dsEUGENE

In [33]:
from nn_utils import init_weights
from dsEUGENE import dsEUGENE

cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)

eugene = dsEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)
init_weights(eugene)

## Load data

In [34]:
from load_data import load_csv, load_numpy
from MPRADataset import MPRADataset
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

TRAIN_SEQ = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/seqs/0.09-0.4_seqs-train-0.9.txt"
TRAIN_LABEL = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/binary/0.09-0.4_y-train-0.9_binary.txt"
train_seqs, train_targets = load_numpy(TRAIN_SEQ, TRAIN_LABEL, is_seq_text=True)
len(train_seqs), train_seqs[0], len(train_targets), train_targets[0]

VAL_SEQ = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/seqs/0.09-0.4_seqs-test-0.1.txt"
VAL_LABEL = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/binary/0.09-0.4_y-test-0.1_binary.txt"
val_seqs, val_targets = load_numpy(VAL_SEQ, VAL_LABEL, is_seq_text=True)
len(val_seqs), val_seqs[0], len(val_targets), val_targets[0]

# Compose different data transforms for this particular load
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=True)
])

# Instantiate a Dataset
train_dataset = MPRADataset(train_seqs, train_targets, transform=data_transform)
val_dataset = MPRADataset(val_seqs, val_targets, transform=data_transform)

# Instantiate a DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=4)

for i_batch, batch in enumerate(train_dataloader):
    x, x_rev_comp, y = batch["sequence"], batch["reverse_complement"], batch["target"]
    outs = eugene(x, x_rev_comp)
    print(x.shape, x_rev_comp.shape, y.shape, outs.shape)
    if i_batch==3:
        break
        
for i_batch, batch in enumerate(val_dataloader):
    x, x_rev_comp, y = batch["sequence"], batch["reverse_complement"], batch["target"]
    outs = eugene(x, x_rev_comp)
    print(x.shape, x_rev_comp.shape, y.shape, outs.shape)
    if i_batch==3:
        break

torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])
torch.Size([512, 4, 66]) torch.Size([512, 4, 66]) torch.Size([512]) torch.Size([512, 1])


## validation with PyTorch Lightning

In [88]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [89]:
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")
trainer = pl.Trainer(gpus=1, max_epochs=10, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [92]:
trainer.fit(model=eugene, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)

Set SLURM handle signals.

  | Name         | Type                 | Params
------------------------------------------------------
0 | convnet      | BasicConv1D          | 976   
1 | recurrentnet | BasicRecurrent       | 8 K   
2 | fcnet        | FullyConnectedModule | 33    
3 | accuracy     | Accuracy             | 0     
4 | auroc        | AUROC                | 0     


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

# MPRADataModule

In [3]:
from MPRADataModule import MPRADataModule

## Load data

In [4]:
from torchvision import transforms
from transforms import ReverseComplement, Augment, OneHotEncode, ToTensor

In [5]:
data_transform = transforms.Compose([
    Augment(randomize_linker_p=0.1, enhancer="WT-otx-a"), ReverseComplement(ohe_encoded=False), OneHotEncode(), ToTensor(transpose=True)
])

In [6]:
OLS_TSV = "/cellar/users/aklie/projects/EUGENE/data/2021_OLS_Library/2021_OLS_Library.tsv"
mod = MPRADataModule(seq_file=OLS_TSV,
                     transforms=data_transform,
                     num_workers=4,
                     batch_size=128,
                     load_kwargs=dict(target_col="ACTIVITY_SUMRNA_NUMDNA", low_thresh=0.18, high_thresh=0.4))

## Instantiate EUGENE architecture: dsEUGENE

In [7]:
from dsEUGENE import dsEUGENE
from nn_utils import init_weights

In [8]:
cnn=dict(input_len=66, channels=[4, 16], conv_kernels=[15, 5], pool_kernels=[1, 1])
rnn=dict(output_dim=32, batch_first=True)
fc=dict(output_dim=1)

In [9]:
eugene = dsEUGENE(conv_kwargs=cnn, rnn_kwargs=rnn, fc_kwargs=fc)
init_weights(eugene)
eugene



dsEUGENE(
  (convnet): BasicConv1D(
    (module): Sequential(
      (0): Conv1d(4, 16, kernel_size=(15,), stride=(1,))
      (1): ReLU(inplace=True)
      (2): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (recurrentnet): BasicRecurrent(
    (module): LSTM(32, 32, batch_first=True)
  )
  (fcnet): FullyConnectedModule(
    (module): Sequential(
      (0): Linear(in_features=32, out_features=1, bias=True)
    )
  )
  (accuracy): Accuracy()
  (auroc): AUROC()
)

## validation with PyTorch Lightning

In [10]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

In [11]:
logger = TensorBoardLogger("0.18-0.4_test", name="dsEUGENE")
trainer = pl.Trainer(gpus=1, max_epochs=10, logger=logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [12]:
trainer.fit(model=eugene, datamodule=mod)

  fn(*args, **kwargs)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name         | Type                 | Params
------------------------------------------------------
0 | convnet      | BasicConv1D          | 976   
1 | recurrentnet | BasicRecurrent       | 8.4 K 
2 | fcnet        | FullyConnectedModule | 33    
3 | accuracy     | Accuracy             | 0     
4 | auroc        | AUROC                | 0     
------------------------------------------------------
9.5 K     Trainable params
0         Non-trainable params
9.5 K     Total params
0.038     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# References