In [1]:
import mpramnist
from mpramnist.fluorescencedataset import FluorescenceDataset
from mpramnist import transforms as t
from mpramnist import target_transforms as t_t

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# Regression task

In [2]:
BATCH_SIZE = 32
NUM_WORKERS = 103
train_transform = t.Compose([
    t.Seq2Tensor()
])
val_test_transform = t.Compose([
    t.Seq2Tensor()
])
task = "regression"
activity_columns = ["JURKAT", "K562", "THP1"]
train_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "train",transform = train_transform)

val_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "val",transform = val_test_transform) 

test_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "test", transform = val_test_transform)

# encapsulate data into dataloader form
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [3]:
print(train_dataset)
print("="*20)
print(val_dataset)
print("="*20)
print(test_dataset)

Dataset FluorescenceDataset of size 12335 (MpraDaraset)
    Number of datapoints: 12335
    Default split folds: {}
    Used split fold: train
    Scalar features: {}
    Vector features: {}
    Cell types: ['JURKAT', 'K562', 'THP1']
    Сell type used: ['JURKAT', 'K562', 'THP1']
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 250
    Number of samples: {}
    Description: FluorescenceDataset is based on 
Dataset FluorescenceDataset of size 1416 (MpraDaraset)
    Number of datapoints: 1416
    Default split folds: {}
    Used split fold: val
    Scalar features: {}
    Vector features: {}
    Cell types: ['JURKAT', 'K562', 'THP1']
    Сell type used: ['JURKAT', 'K562', 'THP1']
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 250
    Number of samples: {}
    Description: FluorescenceDataset is based on 
Dataset FluorescenceDataset of size 3353 (MpraDaraset)
    Number of datapoints: 3353
    Default split folds: {

In [4]:
train_dataset[0]

(tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0.

In [17]:
class Simple_Net(nn.Module):
    
    def __init__(self, seq_len, block_sizes=[16, 24, 32, 40, 48], kernel_size=3):
        
        super().__init__()
        self.seq_len = seq_len
        out_ch = 64
        nn_blocks = []
      
        for in_bs, out_bs in zip([4] + block_sizes, block_sizes):
            
            block = nn.Sequential(
                nn.Conv1d(in_bs, out_bs, kernel_size=kernel_size, padding=kernel_size // 2), # Padding для сохранения размера
                nn.SiLU(),
                nn.BatchNorm1d(out_bs)
            )
            nn_blocks.append(block)

        final_feature_size = seq_len  # Размер последовательности остаётся неизменным из-за padding
        
        self.conv_net = nn.Sequential(
            *nn_blocks,
            nn.Flatten(),
            nn.Linear(block_sizes[-1] * final_feature_size, out_ch),
            nn.SiLU(),
        )
        
        self.head = nn.Sequential(nn.Linear(out_ch, out_ch),
                                  nn.SiLU(),
                                   nn.BatchNorm1d(out_ch),
                                   nn.Linear(out_ch, 3))
        
        self.output_activation = nn.Sigmoid().cuda()

    def forward(self, x):
        
        out = self.conv_net(x)
        out = self.head(out)
        
        #return self.output_activation(out)
        return out

In [18]:
import pytorch_lightning as L
from torch.nn import functional as F
def pearson_correlation(x, y):
    vx = x - torch.mean(x, dim=0)
    vy = y - torch.mean(y, dim=0)
    pearsons = torch.sum(vx * vy, dim=0) / (torch.sqrt(torch.sum(vx ** 2, dim=0)) * torch.sqrt(torch.sum(vy ** 2, dim=0)) + 1e-10)
    return torch.mean(pearsons)
class Seq1Model(L.LightningModule):
    
    def __init__(self, seq_len, lr=3e-4):
        super().__init__()
        self.model = Simple_Net(seq_len = seq_len)
        #self.model = Basset_Net(output_dim = out_ch)
        self.loss = nn.MSELoss() 
        self.lr = lr
        #self.pearson = PearsonCorrCoef()
        
        self.val_loss = []
        self.val_pears = []
        
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_nb):
        X, y = batch
        y_hat = self.model(X)
        loss = self.loss(y_hat, y)
        self.log("train_loss", loss, prog_bar=True,  on_step=True, on_epoch=True, logger = True)
        
        lr = self.optimizers().param_groups[0]['lr']  # Get current learning rate
        self.log('learning_rate', lr, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
        return loss
        
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)
        
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.val_loss.append(loss)

        corr = pearson_correlation(y_hat, y)
        self.val_pears.append(corr)
        self.log("val_pearson", corr, on_epoch=True, prog_bar=True, on_step = False)
        
    def on_validation_epoch_end(self):

        val_loss = torch.stack(self.val_loss, dim = 0).mean()
        val_pears = torch.stack(self.val_pears, dim = 0).mean()
        
        res_str = '|' + ' {}: {:.5f} |'.format("current_epoch", self.current_epoch) 
        res_str += ' {}: {:.5f} |'.format("val_loss", val_loss)
        res_str += ' {}: {:.5f} |'.format("val_pearson", val_pears)
        border = '-'*len(res_str)
        print("\n".join(['',border, res_str, border,'']))
        self.val_loss.clear()
        self.val_pears.clear()
        return None
        
    def test_step(self, batch, _):
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)
        
        self.log('test_loss', 
                 loss, 
                 prog_bar=True, 
                 on_step=False,
                 on_epoch=True)
        corr = pearson_correlation(y_hat, y)
        self.log("test_pearson", 
                 corr ,
                 on_epoch=True,
                 prog_bar=True,
                 on_step=False,)
        

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        if isinstance(batch, tuple) or isinstance(batch, list):
            x, _ = batch
        else:
            x = batch
        return self(x)

    def configure_optimizers(self):
        
        self.optimizer = torch.optim.AdamW(self.parameters(),
                                               lr=self.lr,
                                               weight_decay = 0.01)
        
        lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, # type: ignore
                                                        max_lr=self.lr,
                                                        three_phase=False, 
                                                        total_steps=self.trainer.estimated_stepping_batches, # type: ignore
                                                        pct_start=0.3,
                                                        cycle_momentum =False)
        lr_scheduler_config = {
                    "scheduler": lr_scheduler,
                    "interval": "step",
                    "frequency": 1,
                    "name": "cycle_lr"
            }
            
        return [self.optimizer], [lr_scheduler_config]
        
        #return self.optimizer

In [19]:
seq_model = Seq1Model(seq_len=len(train_dataset[0][0][0]), lr = 0.01)

# Initialize a trainer
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=5,
    gradient_clip_val=1,
    precision='16-mixed', 
    enable_progress_bar = False,
    #callbacks=[TQDMProgressBar(refresh_rate=55)]
    #logger = logger
)

# Train the model
trainer.fit(seq_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)
trainer.test(seq_model, dataloaders=test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Simple_Net | 786 K  | train
1 | loss  | MSELoss    | 0      | train
---------------------------------------------
786 K     Trainable params
0         Non-trainable params
786 K     Total params
3.145     Total estimated model params size (MB)
32        Modules in train mode
0         Modules in eval mode



---------------------------------------------------------------------
| current_epoch: 0.00000 | val_loss: 1.81389 | val_pearson: 0.01387 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 0.00000 | val_loss: 1.37569 | val_pearson: 0.28907 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 1.00000 | val_loss: 1.02884 | val_pearson: 0.38763 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 2.00000 | val_loss: 1.01533 | val_pearson: 0.43217 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 3.00000 | val_loss: 0.92594 | val_pearson: 0.46342 |
-----------

`Trainer.fit` stopped: `max_epochs=5` reached.



---------------------------------------------------------------------
| current_epoch: 4.00000 | val_loss: 0.94167 | val_pearson: 0.46206 |
---------------------------------------------------------------------



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[{'test_loss': 0.9071075916290283, 'test_pearson': 0.501738965511322}]

# Classification task

In [5]:
BATCH_SIZE = 32
NUM_WORKERS = 103
train_transform = t.Compose([
    t.Seq2Tensor()
])
val_test_transform = t.Compose([
    t.Seq2Tensor()
])
task = "classification"
activity_columns = ["JURKAT", "K562", "THP1"]
train_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "train",transform = train_transform)

val_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "val",transform = val_test_transform) 

test_dataset = FluorescenceDataset(task = task, activity_columns = activity_columns, split = "test", transform = val_test_transform)

# encapsulate data into dataloader form
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [6]:
print(train_dataset)
print("="*20)
print(val_dataset)
print("="*20)
print(test_dataset)

Dataset FluorescenceDataset of size 12335 (MpraDaraset)
    Number of datapoints: 12335
    Default split folds: {}
    Used split fold: train
    Scalar features: {}
    Vector features: {}
    Cell types: ['JURKAT', 'K562', 'THP1']
    Сell type used: ['JURKAT', 'K562', 'THP1']
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 250
    Number of samples: {}
    Description: FluorescenceDataset is based on 
Dataset FluorescenceDataset of size 1416 (MpraDaraset)
    Number of datapoints: 1416
    Default split folds: {}
    Used split fold: val
    Scalar features: {}
    Vector features: {}
    Cell types: ['JURKAT', 'K562', 'THP1']
    Сell type used: ['JURKAT', 'K562', 'THP1']
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 250
    Number of samples: {}
    Description: FluorescenceDataset is based on 
Dataset FluorescenceDataset of size 3353 (MpraDaraset)
    Number of datapoints: 3353
    Default split folds: {

In [8]:
train_dataset[1]

(tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
          0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
          0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
          1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
          1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
          0., 1.