In [1]:
from tqdm import tqdm
import mpramnist
from mpramnist.malinoisdataset import MalinoisDataset

import mpramnist.transforms as t
import mpramnist.target_transforms as t_t

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torchsummary import summary
import lightning.pytorch as pl
from lightning.pytorch.callbacks import TQDMProgressBar

In [2]:
# Constant flank sequences shipped with the dataset class; they are handed to
# t.AddFlanks in the preprocessing pipeline.
left_flank, right_flank = MalinoisDataset.LEFT_FLANK, MalinoisDataset.RIGHT_FLANK

In [3]:
# Training hyperparameters.
NUM_EPOCHS = 5          # upper bound on training epochs (Trainer max_epochs)
BATCH_SIZE = 2 ** 10    # 1024 samples per batch in every DataLoader
lr = 3.3e-3             # learning rate

In [4]:
# --- preprocessing -----------------------------------------------------------
# Sequence pipeline: add the constant flanks, centre-crop to 600, convert to a tensor.
sequence_steps = [
    t.AddFlanks(left_flank, right_flank),
    t.CenterCrop(600),
    t.Seq2Tensor(),
]
transform = t.Compose(sequence_steps)

# Target pipeline: standardise the activities.
target_steps = [
    t_t.Normalize(mean = 0.500, std = 1.059),  # original for Malinois
]
target_transform = t_t.Compose(target_steps)


# --- load the data -----------------------------------------------------------
def _load_split(split):
    """Build the HepG2/SKNSH MalinoisDataset for one split with the shared transforms."""
    return MalinoisDataset(activity_columns = ['HepG2','SKNSH'],
                           split = split,
                           filtration = "original",
                           transform = transform,
                           target_transform = target_transform)


train_dataset = _load_split("train")
val_dataset = _load_split("val")
test_dataset = _load_split("test")

# --- wrap the datasets in DataLoaders ----------------------------------------
# Only the training loader shuffles; evaluation order stays fixed.
train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
# Show the train and test dataset summaries, separated by a ruler line.
print(train_dataset, "===================", test_dataset, sep="\n")

Dataset MalinoisDataset of size 627660 (MpraDaraset)
    Number of datapoints: 627660
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['1', '2', '3', '4', '5', '6', '8', '9', '10', '11', '12', '14', '15', '16', '17', '18', '20', '22', 'Y']
    Scalar features: {}
    Vector features: {}
    Cell types: ['HepG2', 'K562', 'SKNSH']
    Сell type used: ['HepG2_log2FC', 'SKNSH_log2FC']
    Target columns that can be used: {'K562_log2FC', 'HepG2_log2FC', 'SKNSH_log2FC'}
    Number of channels: 4
    Sequence size: 600
    Number of samples: {'train': 668946, 'val': 62406, 'test': 66712}
    Description: MalinoisDataset is based on 
Dataset MalinoisDataset of size 62582 (MpraDaraset)
    Number of datapoints: 62582
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['7', 

In [6]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [7]:
class Basset(nn.Module):
    """Basset-style 1D CNN: three convolutional stages, two dense stages, one linear read-out.

    The network is exposed in three pieces so callers can reuse the trunk:

    * ``encode``   - conv -> batch-norm -> ReLU -> max-pool, three times, then flatten;
    * ``decode``   - linear -> batch-norm -> ReLU -> dropout, twice (1000 units each);
    * ``classify`` - final linear layer producing ``output_dim`` values.

    The first convolution has 4 input channels, so inputs are expected as
    ``(batch, 4, length)`` tensors.
    """

    def __init__(self, output_dim = 280):
        """Create all layers; ``output_dim`` is the width of the final linear layer."""
        super().__init__()

        self.linear1_channels = 1000
        # Kept as a 1-tuple on purpose: the original assignment ended with a trailing
        # comma, and MPRA_Basset reads this attribute as ``linear2_channels[0]``.
        self.linear2_channels = (1000,)

        # Non-linearities, regularisation and shape helpers.
        self.activation1 = nn.ReLU()
        self.activation = nn.ReLU()

        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

        self.flatten = nn.Flatten()
        self.output_activation = nn.Sigmoid()  # not applied anywhere in forward()

        # Convolutional stage 1: 4 -> 300 channels, kernel 19, pool 3.
        self.conv1_filters = self._kaiming_filters(300, 4, 19)
        self.batchnorm1 = nn.BatchNorm1d(300)
        self.maxpool1 = nn.MaxPool1d(3)

        # Convolutional stage 2: 300 -> 200 channels, kernel 11, pool 4.
        self.conv2_filters = self._kaiming_filters(200, 300, 11)
        self.batchnorm2 = nn.BatchNorm1d(200)
        self.maxpool2 = nn.MaxPool1d(4)

        # Convolutional stage 3: 200 -> 200 channels, kernel 7, pool 2.
        self.conv3_filters = self._kaiming_filters(200, 200, 7)
        self.batchnorm3 = nn.BatchNorm1d(200)
        self.maxpool3 = nn.MaxPool1d(2)

        # Dense stage 1 (input width inferred lazily on the first forward pass).
        self.fc4 = nn.LazyLinear(1000, bias=True)
        self.batchnorm4 = nn.BatchNorm1d(1000)

        # Dense stage 2.
        self.fc5 = nn.LazyLinear(1000, bias=True)
        self.batchnorm5 = nn.BatchNorm1d(1000)

        # Read-out layer.
        self.fc6 = nn.LazyLinear(output_dim, bias=True)

    @staticmethod
    def _kaiming_filters(out_channels, in_channels, kernel_size):
        """Return a Kaiming-uniform initialised conv filter bank as a Parameter."""
        filters = nn.Parameter(torch.zeros(out_channels, in_channels, kernel_size))
        nn.init.kaiming_uniform_(filters)
        return filters

    def encode(self, x):
        """Run the three convolutional stages and flatten the result to ``(batch, features)``."""
        conv_stages = (
            (self.conv1_filters, self.batchnorm1, self.activation1, self.maxpool1),
            (self.conv2_filters, self.batchnorm2, self.activation, self.maxpool2),
            (self.conv3_filters, self.batchnorm3, self.activation, self.maxpool3),
        )
        features = x
        for filters, norm, act, pool in conv_stages:
            # "same" padding keeps the length; only the pooling shortens the sequence.
            features = torch.conv1d(features, filters, stride=1, padding="same")
            features = pool(act(norm(features)))
        return self.flatten(features)

    def decode(self, x):
        """Run the two dense stages on the flattened encoding; returns 1000 features per sample."""
        dense_stages = (
            (self.fc4, self.batchnorm4, self.dropout1),
            (self.fc5, self.batchnorm5, self.dropout2),
        )
        hidden = x
        for linear, norm, drop in dense_stages:
            hidden = drop(self.activation(norm(linear(hidden))))
        return hidden

    def classify(self, x):
        """Apply the final linear layer (no output activation) to the decoded features."""
        return self.fc6(x)

    def forward(self, x):
        """Full pass: ``classify(decode(encode(x)))``; returns raw, un-activated outputs."""
        return self.classify(self.decode(self.encode(x)))

In [8]:
def shannon_entropy(x):
    """Row-wise Shannon entropy (in nats) of ``softmax(x, dim=1)``.

    Args:
        x: tensor whose dimension 1 holds the scores to be soft-maxed.

    Returns:
        Tensor with dimension 1 reduced away, holding ``-sum(p * log(p))`` per row.

    The entropy is computed from ``log_softmax`` instead of ``log(softmax(x))``:
    when a probability underflows to exactly 0 the latter evaluates
    ``0 * log(0) = 0 * -inf = nan`` and poisons the loss, whereas the
    log-softmax stays finite and the term correctly contributes 0.
    """
    log_p = torch.log_softmax(x, dim=1)
    return -(log_p.exp() * log_p).sum(dim=1)
def pearson_correlation(x, y):
    """Column-wise Pearson correlation between ``x`` and ``y`` along dimension 0.

    Both inputs are centred over dimension 0; the covariance sum is divided by
    the product of the two centred norms, with ``1e-10`` added to the
    denominator to avoid division by zero.

    Returns:
        ``(per_column, mean)`` - the correlation for every column and the mean
        of those correlations.
    """
    x_centered = x - x.mean(dim=0)
    y_centered = y - y.mean(dim=0)

    covariance = (x_centered * y_centered).sum(dim=0)
    x_norm = x_centered.pow(2).sum(dim=0).sqrt()
    y_norm = y_centered.pow(2).sum(dim=0).sqrt()

    per_column = covariance / (x_norm * y_norm + 1e-10)
    return per_column, per_column.mean()

In [9]:
from torchmetrics import PearsonCorrCoef
class MPRA_Basset(pl.LightningModule):
    """Lightning module: a Basset trunk followed by up to three scalar regression heads.

    The trunk is ``Basset.encode`` + ``Basset.decode``; each head is
    ``Linear -> last_activation -> Linear(…, 1)``. The first ``output_dim``
    heads are concatenated along dimension 1 to form the prediction.

    Args:
        output_dim: number of predicted columns; must be 1, 2 or 3.
        learning_rate: optimizer learning rate.
        optimizer: name of a class in ``torch.optim`` (e.g. ``'Adam'``).
        scheduler: if truthy, attach a ``CosineAnnealingLR`` schedule.
        weight_decay: optimizer weight decay.
        epochs: ``T_max`` of the cosine schedule.
        extra_hidden_size: hidden width of every head.
        criterion: name of a loss class in ``torch.nn`` (e.g. ``'MSELoss'``).
        last_activation: name of an activation class in ``torch.nn``.
        sneaky_factor: weight of the entropy-matching term in the training loss.
        **kwargs: accepted and ignored.

    Raises:
        ValueError: if ``output_dim`` is not 1, 2 or 3.
    """

    def __init__(self,
                 output_dim = 3,
                 learning_rate=1e-4,
                 optimizer='Adam',
                 scheduler=False,
                 weight_decay=1e-6,
                 epochs=1,
                 extra_hidden_size = 100,
                 criterion = 'MSELoss',
                 last_activation='Tanh',
                 sneaky_factor=1,
                 **kwargs):

        super().__init__()

        # Only three heads exist; any other value used to yield a silent
        # one-column model whose loss would broadcast against the targets.
        if output_dim not in (1, 2, 3):
            raise ValueError(f"output_dim must be 1, 2 or 3, got {output_dim!r}")

        self.output_dim = output_dim

        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.extra_hidden_size = extra_hidden_size
        self.sneaky_factor = sneaky_factor

        self.criterion = getattr(nn, criterion)()
        self.last_activation = getattr(nn, last_activation)()

        self.basset_net = Basset()

        # Stored exactly as Basset exposes it (currently a 1-tuple).
        self.basset_last_hidden_width = self.basset_net.linear2_channels
        # Accept either a plain int or a 1-element tuple/list for the head input width.
        hidden_width = self.basset_last_hidden_width
        if isinstance(hidden_width, (tuple, list)):
            hidden_width = hidden_width[0]

        self.output_1 = self._build_head(hidden_width)
        self.output_2 = self._build_head(hidden_width)
        self.output_3 = self._build_head(hidden_width)

        # Separate metric objects so validation and test statistics never mix.
        self.val_pearson = PearsonCorrCoef()
        self.test_pearson = PearsonCorrCoef()
        self.example_input_array = torch.rand(1, 4, 600)

    def _build_head(self, in_features):
        """Create one ``Linear -> last_activation -> Linear(…, 1)`` head.

        The activation module instance is shared between all heads.
        """
        return nn.Sequential(
            nn.Linear(in_features, self.extra_hidden_size),
            self.last_activation,
            nn.Linear(self.extra_hidden_size, 1)
            )

    def _forward_batch(self, batch):
        """Unpack ``(inputs, targets)``, run the model and return ``(outputs, targets)``.

        Tensors are moved to ``self.device`` rather than to a notebook-level
        global so the module does not depend on outside state.
        """
        inputs, targets = batch
        outputs = self(inputs.to(self.device).float())
        # NOTE(review): squeeze(1) is a no-op unless dimension 1 has size 1;
        # the expected target layout should be confirmed against the dataset.
        targets = targets.squeeze(1).to(self.device)
        return outputs, targets

    def forward(self, x):
        """Return the predictions of the first ``output_dim`` heads, concatenated on dim 1."""
        basset_last_hidden = self.basset_net.decode(self.basset_net.encode(x))
        # Evaluate only the heads that contribute to the output.
        active_heads = (self.output_1, self.output_2, self.output_3)[: self.output_dim]
        return torch.cat([head(basset_last_hidden) for head in active_heads], dim=1)

    def training_step(self, batch, batch_idx):
        """Criterion on the predictions plus ``sneaky_factor`` x criterion on the row entropies."""
        outputs, targets = self._forward_batch(batch)

        regression_loss = self.criterion(outputs, targets)
        # Entropy-matching term: compare the softmax entropy of predictions and targets.
        entropy_loss = self.criterion(shannon_entropy(outputs), shannon_entropy(targets))
        loss = regression_loss + self.sneaky_factor * entropy_loss

        self.log('train_loss', loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and the epoch-level Pearson correlation of the first target column."""
        outputs, targets = self._forward_batch(batch)

        loss = self.criterion(outputs, targets)
        self.log('val_loss', loss, prog_bar=True)

        # Accumulate into the metric and log the metric object itself so that
        # Lightning computes the correlation over the whole epoch and resets
        # the state afterwards (instead of averaging per-batch correlations).
        self.val_pearson.update(outputs[:, 0].float(), targets[:, 0].float())
        self.log("val_pearson", self.val_pearson, on_epoch=True, prog_bar=True)
        return {'loss': loss, 'pred': outputs, 'target': targets}

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` and the epoch-level Pearson correlation of the first target column."""
        outputs, targets = self._forward_batch(batch)

        loss = self.criterion(outputs, targets)
        self.log('test_loss', loss)

        self.test_pearson.update(outputs[:, 0].float(), targets[:, 0].float())
        self.log("test_pearson",
                 self.test_pearson,
                 on_epoch=True,
                 prog_bar=True,
                 on_step=False,)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return predictions for a batch given either as ``(x, y)`` or as ``x`` alone."""
        if isinstance(batch, (tuple, list)):
            x, _ = batch
        else:
            x = batch
        # Cast like the other steps do so non-float32 inputs are accepted.
        return self(x.float())

    def configure_optimizers(self):
        """Build the optimizer named by ``self.optimizer`` and, optionally, a cosine LR schedule."""
        optimizer_cls = getattr(torch.optim, self.optimizer)
        optimizer = optimizer_cls(self.parameters(),
                                  lr=self.learning_rate,
                                  weight_decay=self.weight_decay)
        if not self.scheduler:
            return optimizer

        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                    T_max=self.epochs,
                                                                    eta_min=1e-6),
            'name': 'learning_rate',
        }
        return [optimizer], [lr_scheduler]

In [None]:
# Build the model with one head per target column. The hyperparameters from the
# config cell are passed explicitly: previously `lr` was never handed over, so
# the model silently trained with the constructor default (1e-4), and `epochs`
# (the cosine-schedule horizon) stayed at its default of 1.
seq_model = MPRA_Basset(output_dim = len(train_dataset[0][1]),
                        learning_rate = lr,
                        epochs = NUM_EPOCHS)

# Initialize a trainer
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=NUM_EPOCHS,
    gradient_clip_val=1,
    precision='16-mixed', 
    callbacks=[TQDMProgressBar(refresh_rate=50)]
)

# Train the model, validating on the validation split after every epoch
trainer.fit(seq_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

# Evaluate the trained model on the held-out test split
trainer.test(seq_model, dataloaders=test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/nios/miniconda3/envs/mpra/lib/python3.12/site-packages/lightning/pytorch/utilities/model_summary/model_summary.py:477: The total number of parameters detected may be inaccurate because the model contains an instance of `UninitializedParameter`. To get an accurate number, set `self.example_input_array` in your LightningModule.

  | Name            | Type            | Params | Mode  | In sizes  | Out sizes
------------------

Sanity Checking: |                                                                                | 0/? [00:00<?, ?it/s]

/home/nios/miniconda3/envs/mpra/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|                                                               | 0/2 [00:00<?, ?it/s]



                                                                                                                        

/home/nios/miniconda3/envs/mpra/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=103` in the `DataLoader` to improve performance.


Epoch 0:   0%|                                                                                  | 0/613 [00:00<?, ?it/s]