In [1]:
import mpramnist
from mpramnist.Malinois.dataset import MalinoisDataset

from mpramnist.models import Malinois
from mpramnist.trainers import LitModel_Malinois

from mpramnist import transforms as t
from mpramnist import target_transforms as t_t
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# Initialize some parameters

In [2]:
# Batch size and worker count shared by every DataLoader built in this notebook.
BATCH_SIZE = 1076
NUM_WORKERS = 103

# Flanking sequences exposed as class attributes of the dataset.
left_flank, right_flank = MalinoisDataset.LEFT_FLANK, MalinoisDataset.RIGHT_FLANK

# Default parameters
# NOTE(review): `stderr` lists the cell types in a different order than
# `activity_columns` — confirm whether any consumer pairs them by position.
activity_columns = ['HepG2', 'SKNSH', 'K562']
stderr = ['K562_lfcSE', 'HepG2_lfcSE', 'SKNSH_lfcSE']
seq = "sequence"
# These must be plain floats: a trailing comma after the value would silently
# turn each one into a one-element tuple such as (1.0,).
stderr_threshold = 1.0
std_multiple_cut = 6.0
up_cutoff_move = 3.0
duplication_cutoff = 0.5

# Want to test the original parameters?

Then use this code

In [5]:
# Datasets built with filtration="original" (the authors' parameters).
train_dataset = MalinoisDataset(
    split="train",
    filtration="original",
    duplication_cutoff=0.5,
    # Per the original notebook note: pads sequences and applies reverse complement.
    use_original_reverse_complement=True,
)
val_dataset = MalinoisDataset(split="val", filtration="original")
test_dataset = MalinoisDataset(split="test", filtration="original")

# Wrap every split in a DataLoader; only the training split is shuffled.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
train_loader = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
val_loader = data.DataLoader(dataset=val_dataset, shuffle=False, **loader_kwargs)
test_loader = data.DataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

# Use your own parameters

Use this part instead.

In [7]:
# Sequence preprocessing.
# Training pipeline: add flanks, center-crop to 600, reverse-complement
# (constructed with 0.5), then convert to a tensor.
train_transform = t.Compose(
    [
        t.AddFlanks(left_flank, right_flank),
        t.CenterCrop(600),
        t.ReverseComplement(0.5),
        t.Seq2Tensor(),
    ]
)
# Validation/test pipeline: same steps without the reverse-complement stage.
val_test_transform = t.Compose(
    [
        t.AddFlanks(left_flank, right_flank),
        t.CenterCrop(600),
        t.Seq2Tensor(),
    ]
)

# Datasets built with filtration="own" and the transforms defined above.
train_dataset = MalinoisDataset(
    split="train",
    filtration="own",
    transform=train_transform,
    duplication_cutoff=0.5,
)
val_dataset = MalinoisDataset(
    split="val",
    filtration="own",
    transform=val_test_transform,
)
test_dataset = MalinoisDataset(
    split="test",
    filtration="own",
    transform=val_test_transform,
)

# Wrap every split in a DataLoader; only the training split is shuffled.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
train_loader = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
val_loader = data.DataLoader(dataset=val_dataset, shuffle=False, **loader_kwargs)
test_loader = data.DataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

In [8]:
# Peek at the first training example; the saved output below shows a pair of tensors.
train_dataset[0]

(tensor([[0., 0., 1.,  ..., 1., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 1., 0.,  ..., 0., 1., 0.]]),
 tensor([-0.1929,  0.8782, -0.4892]))

In [9]:
# Print the three dataset summaries, separated by a 50-character rule.
print(train_dataset, val_dataset, test_dataset, sep="\n" + "=" * 50 + "\n")

Dataset MalinoisDataset of size 30 (MpraDaraset)
    Number of datapoints: 30
    Used split fold: [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 'Y']
Dataset MalinoisDataset of size 8858 (MpraDaraset)
    Number of datapoints: 8858
    Used split fold: [19, 21, 'X']
Dataset MalinoisDataset of size 0 (MpraDaraset)
    Number of datapoints: 0
    Used split fold: [7, 13]


In [None]:
# Input channels: length of the first element of the first training example.
first_example = train_dataset[0]
in_channels = len(first_example[0])
# Output channels: one per activity column.
out_channels = len(activity_columns)

In [9]:
class L1KLmixed(nn.Module):
    """Weighted blend of an L1 loss and a KL-divergence loss.

    The L1 term compares raw predictions with raw targets. The KL term
    compares them after each has been normalised along the last dimension
    into log-probabilities (``x - logsumexp(x)``). The returned value is
    ``(alpha * L1 + beta * KL) / (alpha + beta)``.

    Args:
        reduction: Reduction mode handed to ``nn.KLDivLoss``. Any ``'batch'``
            substring is stripped before it is handed to ``nn.L1Loss``, so
            ``'batchmean'`` becomes ``'mean'`` for the L1 term.
        alpha: Weight of the L1 term.
        beta: Weight of the KL term.

    Raises:
        ValueError: If ``alpha + beta`` is zero, because the final
            normalisation would divide by zero and yield NaN/inf losses.
    """

    def __init__(self, reduction='mean', alpha=1.0, beta=1.0):
        super().__init__()

        # Fail fast instead of producing NaN/inf on the first forward pass.
        if alpha + beta == 0:
            raise ValueError(
                "alpha + beta must be non-zero; the combined loss is divided by their sum"
            )

        self.reduction = reduction
        self.alpha = alpha
        self.beta = beta

        # NOTE: despite its name, `MSE` holds an L1 (mean absolute error) loss.
        # The attribute name is kept so existing code that accesses it still works.
        self.MSE = nn.L1Loss(reduction=reduction.replace('batch', ''))
        self.KL = nn.KLDivLoss(reduction=reduction, log_target=True)

    def forward(self, preds, targets):
        """Return the weighted, normalised combination of the L1 and KL terms.

        Args:
            preds: Predicted values; normalised over the last dimension for
                the KL term.
            targets: Target values with the same layout as ``preds``.

        Returns:
            ``(alpha * L1 + beta * KL) / (alpha + beta)``.
        """
        # Log-softmax over the last dimension, written via logsumexp.
        preds_log_prob = preds - torch.logsumexp(preds, dim=-1, keepdim=True)
        target_log_prob = targets - torch.logsumexp(targets, dim=-1, keepdim=True)

        l1_term = self.MSE(preds, targets)
        # KLDivLoss takes the prediction first and, with log_target=True,
        # expects both arguments as log-probabilities.
        kl_term = self.KL(preds_log_prob, target_log_prob)

        weighted_sum = l1_term.mul(self.alpha) + kl_term.mul(self.beta)
        return weighted_sum.div(self.alpha + self.beta)

TODO MALINOIS NET
# Network built with its default constructor arguments.
model = Malinois()

# Training wrapper: pairs the network with the mixed L1/KL loss and the
# optimisation settings.
seq_model = LitModel_Malinois(
    model=model,
    num_outputs=out_channels,
    loss=L1KLmixed(),
    weight_decay=1e-1,
    lr=1e-2,
    print_each=1,
)

In [13]:
# Initialize a trainer
# NOTE(review): `L` is not imported in any visible cell, so this cell raises
# NameError on a fresh kernel. Presumably it is the Lightning package alias
# (e.g. `import lightning as L`) — confirm which package LitModel_Malinois is
# built on and add that import to the import cell.
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=5,
    gradient_clip_val=1,
    precision='16-mixed', 
    enable_progress_bar = True,
    num_sanity_val_steps=0
)

# Train the model
trainer.fit(seq_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)
# Evaluate the trained model on the held-out test loader.
trainer.test(seq_model, dataloaders=test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-12-17 23:50:54.313526: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734468654.334549 1087034 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734468654.341053 1087034 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attemp


----------------------------------------------------------------------
| current_epoch: 0.00000 | val_loss: 0.15238 | val_pearson: -0.00240 |
----------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 0.00000 | val_loss: 0.11936 | val_pearson: 0.78624 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 1.00000 | val_loss: 0.10243 | val_pearson: 0.80358 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 2.00000 | val_loss: 0.09349 | val_pearson: 0.83577 |
---------------------------------------------------------------------


---------------------------------------------------------------------
| current_epoch: 3.00000 | val_loss: 0.09103 | val_pearson: 0.85425 |
--------

`Trainer.fit` stopped: `max_epochs=5` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[{'test_loss': 0.0783819854259491, 'test_pearson': 0.8191081881523132}]