In [1]:
import mpramnist
from mpramnist.Malinois.dataset import MalinoisDataset

from mpramnist.models import BassetBranched

from mpramnist.models import HumanLegNet
from mpramnist.models import initialize_weights
from mpramnist.trainers import LitModel_Malinois

from mpramnist import transforms as t
from mpramnist import target_transforms as t_t
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pytorch_lightning as L

# Initiate some parameters

In [2]:
left_flank = MalinoisDataset.LEFT_FLANK
right_flank = MalinoisDataset.RIGHT_FLANK
BATCH_SIZE = 1076
NUM_WORKERS = 103

# default parametes
activity_columns = ['HepG2','SKNSH', "K562"]
stderr = ['K562_lfcSE', 'HepG2_lfcSE', 'SKNSH_lfcSE']
seq = "sequence"
stderr_threshold = 1.0,
std_multiple_cut = 6.0,
up_cutoff_move = 3.0,
duplication_cutoff = 0.5

# You wanna test original parametes?

So use this code

In [4]:
train_dataset = MalinoisDataset(split = "train",
                                filtration = "original", # use "original for author's parameters"
                                duplication_cutoff = 0.5,
                                use_original_reverse_complement = True, # this parameter paddes sequences and does rev comp
                                root = "../data/"
                               )

val_dataset = MalinoisDataset(split = "val",
                              filtration = "original", root = "../data/") 

test_dataset = MalinoisDataset(split = "test",
                              filtration = "original", root = "../data/")

In [5]:
print(train_dataset)
print("="*50)
print(val_dataset)
print("="*50)
print(test_dataset)

Dataset MalinoisDataset of size 1864176 (MpraDaraset)
    Number of datapoints: 1864176
    Used split fold: ['1', '2', '3', '4', '5', '6', '8', '9', '10', '11', '12', '14', '15', '16', '17', '18', '20', '22', 'Y']
Dataset MalinoisDataset of size 58809 (MpraDaraset)
    Number of datapoints: 58809
    Used split fold: ['19', '21', 'X']
Dataset MalinoisDataset of size 62582 (MpraDaraset)
    Number of datapoints: 62582
    Used split fold: ['7', '13']


# Use your own parameters

In [6]:
# preprocessing
train_transform = t.Compose([
    t.AddFlanks(left_flank, right_flank),
    t.CenterCrop(600),
    t.ReverseComplement(0.5),
    t.Seq2Tensor()
])
val_test_transform = t.Compose([
    t.AddFlanks(left_flank, right_flank),
    t.CenterCrop(600),
    t.Seq2Tensor()
])

In [7]:
# load the data
train_dataset = MalinoisDataset( 
                              split = "train", 
                              transform = train_transform,
                              filtration = "own",
                              duplication_cutoff = 0.5,
                              root = "../data/") 
val_dataset = MalinoisDataset(
                              split = "val",
                              filtration = "own",
                              transform = val_test_transform,
                              root = "../data/") 
test_dataset = MalinoisDataset(
                              split = "test", 
                              filtration = "own",
                              transform = val_test_transform,
                              root = "../data/")

In [8]:
train_dataset[0]

(tensor([[0., 1., 0.,  ..., 0., 1., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 1.,  ..., 1., 0., 0.]]),
 tensor([ 0.3796,  0.0046, -0.2444]))

In [9]:
print(train_dataset)
print("="*50)
print(val_dataset)
print("="*50)
print(test_dataset)

Dataset MalinoisDataset of size 932088 (MpraDaraset)
    Number of datapoints: 932088
    Used split fold: ['1', '2', '3', '4', '5', '6', '8', '9', '10', '11', '12', '14', '15', '16', '17', '18', '20', '22', 'Y']
Dataset MalinoisDataset of size 58809 (MpraDaraset)
    Number of datapoints: 58809
    Used split fold: ['19', '21', 'X']
Dataset MalinoisDataset of size 62582 (MpraDaraset)
    Number of datapoints: 62582
    Used split fold: ['7', '13']


In [10]:
in_channels = len(train_dataset[0][0])
out_channels = len(activity_columns)

In [11]:
# encapsulate data into dataloader form
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)

val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [12]:
class L1KLmixed(nn.Module):
    
    def __init__(self, reduction='mean', alpha=1.0, beta=1.0):
        super().__init__()
        
        self.reduction = reduction
        self.alpha = alpha
        self.beta  = beta
        
        self.MSE = nn.L1Loss(reduction=reduction.replace('batch',''))
        self.KL  = nn.KLDivLoss(reduction=reduction, log_target=True)
        
    def forward(self, preds, targets):
        preds_log_prob  = preds   - torch.logsumexp(preds, dim=-1, keepdim=True)
        target_log_prob = targets - torch.logsumexp(targets, dim=-1, keepdim=True)
        
        MSE_loss = self.MSE(preds, targets)
        KL_loss  = self.KL(preds_log_prob, target_log_prob)
        
        combined_loss = MSE_loss.mul(self.alpha) + \
                        KL_loss.mul(self.beta)
        
        return combined_loss.div(self.alpha+self.beta)

In [9]:
model = BassetBranched()

seq_model = LitModel_Malinois(model = model, num_outputs = out_channels,
                           loss = L1KLmixed(),
                           weight_decay = 1e-1, lr = 1e-2, print_each = 1)

NameError: name 'Conv1dNorm' is not defined

In [13]:
model = HumanLegNet(in_ch=in_channels,
                     output_dim = out_channels,
                     stem_ch=64,
                     stem_ks=11,
                     ef_ks=9,
                     ef_block_sizes=[80, 96, 112, 128],
                     pool_sizes=[2,2,2,2],
                     resize_factor=4)
model.apply(initialize_weights)

seq_model = LitModel_Malinois(model = model,
                           loss = L1KLmixed(),
                           weight_decay = 1e-2, lr = 7.05e-3, print_each = 5)

In [14]:
# Initialize a trainer
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=1,
    gradient_clip_val=1,
    precision='16-mixed', 
    enable_progress_bar = True,
    num_sanity_val_steps=0
)

Using 16bit Automatic Mixed Precision (AMP)
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [15]:
# Train the model
trainer.fit(seq_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)
trainer.test(seq_model, dataloaders=test_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name          | Type            | Params | Mode 
----------------------------------------------------------
0 | model         | HumanLegNet     | 1.3 M  | train
1 | loss          | L1KLmixed       | 0      | train
2 | train_pearson | PearsonCorrCoef | 0      | train
3 | val_pearson   | PearsonCorrCoef | 0      | train
4 | test_pearson  | PearsonCorrCoef | 0      | train
----------------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.292  

Training: |                                                                                       | 0/? [00:00…



Validation: |                                                                                     | 0/? [00:00…

`Trainer.fit` stopped: `max_epochs=1` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: |                                                                                        | 0/? [00:00…

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.2234857976436615
      test_pearson          0.7964360117912292
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.2234857976436615, 'test_pearson': 0.7964360117912292}]