In [1]:
import os

import numpy as np
import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import mpramnist
from mpramnist import target_transforms as t_t
from mpramnist import transforms as t
from mpramnist.DeepStarr.dataset import DeepStarrDataset
from mpramnist.models import DeepStarr
from mpramnist.trainers import LitModel_DeepStarr

In [2]:
# Number of examples per batch for every DataLoader in this notebook.
BATCH_SIZE = 1024
# DataLoader worker processes. The original fixed value of 103 only suits a
# many-core machine; cap it at the CPUs actually available so the notebook does
# not oversubscribe smaller hosts (os.cpu_count() may return None, hence `or 1`).
NUM_WORKERS = min(103, os.cpu_count() or 1)

# Activity columns exposed by the dataset class. Every dataset in this notebook
# is built with them, and their count later sets the number of model outputs.
activity_columns = DeepStarrDataset.ACTIVITY_COLUMNS

## Reverse-complement

To use the reverse-complement augmentation exactly as it was applied in the original study, use the *use_original_reverse_complement* argument (default: *None*).

**WARNING**: in the original study, the reverse complement was applied **only** to the training set.

For example:

In [3]:
# Only tensor encoding is applied here. The "train" split below is loaded without
# use_original_reverse_complement=False, and its printed length is twice that of
# the non-augmented training set, so no random t.ReverseComplement step is added.
train_transform = t.Compose([t.Seq2Tensor()])
val_test_transform = t.Compose([t.Seq2Tensor()])

orig_rev_comp_train_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="train",
    transform=train_transform,
    root="../data/",
)
# Validation split: loaded as-is.
val_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="val",
    transform=val_test_transform,
    root="../data/",
)
# Test split: loaded as-is.
test_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="test",
    transform=val_test_transform,
    root="../data/",
)

Note: The training set contains reverse-complement augmentation as implemented in the original study.  
• Dataset size: 2N (N original + N reverse-complemented sequences)  
• Label consistency: y_rc ≡ y_original  
• Do not reapply this transformation during preprocessing. 


In [4]:
# Number of examples in the train, validation and test splits, one per line.
print(len(orig_rev_comp_train_dataset), len(val_dataset), len(test_dataset), sep="\n")

402278
40570
41186


Instead, we suggest applying the reverse-complement transformation by adding **transforms.ReverseComplement(0.5)** to the training transforms. The value *0.5* means that each sequence is reverse-complemented with a probability of 0.5.

For example:

In [5]:
# Training sequences get a random reverse-complement step with argument 0.5
# (described in the notebook text as the probability of applying it).
train_transform = t.Compose([t.Seq2Tensor(), t.ReverseComplement(0.5)])
# Validation/test use the same pipeline with the argument set to 0.
val_test_transform = t.Compose([t.Seq2Tensor(), t.ReverseComplement(0)])

# use_original_reverse_complement=False: the printed training-set length for this
# configuration is half of the one obtained when the argument is left unset.
train_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    use_original_reverse_complement=False,
    split="train",
    transform=train_transform,
    root="../data/",
)

val_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="val",
    transform=val_test_transform,
    root="../data/",
)

test_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="test",
    transform=val_test_transform,
    root="../data/",
)

In [6]:
# Number of examples in the train, validation and test splits, one per line.
print(len(train_dataset), len(val_dataset), len(test_dataset), sep="\n")

201139
40570
41186


## Data splitting

Sequences from the **first** and **second** half of chr2R were held out for validation and testing, respectively.

The remaining chromosomes are used for the training set.

You can use *"train"*, *"val"*, *"test"* to define the training, validation or test set respectively **using the same approach as the original study**

For example:

In [7]:
# Plain tensor encoding for every split. Note that this rebinds train_transform
# and val_test_transform for all cells that run after this one.
train_transform = t.Compose([t.Seq2Tensor()])
val_test_transform = t.Compose([t.Seq2Tensor()])

# The named splits "train" / "val" / "test" follow the original study's partitioning.
orig_train_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="train",
    transform=train_transform,
    root="../data/",
)

val_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="val",
    transform=val_test_transform,
    root="../data/",
)

test_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split="test",
    transform=val_test_transform,
    root="../data/",
)

Note: The training set contains reverse-complement augmentation as implemented in the original study.  
• Dataset size: 2N (N original + N reverse-complemented sequences)  
• Label consistency: y_rc ≡ y_original  
• Do not reapply this transformation during preprocessing. 


In [8]:
# Number of examples in the train, validation and test splits, one per line.
print(len(orig_train_dataset), len(val_dataset), len(test_dataset), sep="\n")

402278
40570
41186


Alternatively, you can pass a list of specific chromosomes to use as the training, validation or test set.

For example:

In [9]:
# Chromosome names exposed by the dataset class, printed for reference when
# choosing a custom chromosome-based split.
list_of_chr = DeepStarrDataset.LIST_OF_CHR
print(list_of_chr)

['chr2L', 'chr2LHet', 'chr2RHet', 'chr3L', 'chr3LHet', 'chr3R', 'chr3RHet', 'chr4', 'chrX', 'chrXHet', 'chrYHet', 'chr2R']


In [10]:
# Custom split: `split` takes a list of chromosome names (or a single name).
# Reverse complement transformation is disabled for chromosome list splits.
# Set use_original_reverse_complement=True to apply original paper augmentation.
my_train_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split=[
        'chr2L', 'chr2LHet', 'chr2RHet',
        'chr3L', 'chr3LHet', 'chr3R',
        'chr3RHet', 'chr4',
    ],
    use_original_reverse_complement=False,
    transform=train_transform,
    root="../data/",
)

my_val_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split=['chrX', 'chrXHet', 'chrYHet'],
    transform=val_test_transform,
    root="../data/",
)

my_test_dataset = DeepStarrDataset(
    activity_column=activity_columns,
    split='chr2R',
    transform=val_test_transform,
    root="../data/",
)

In [11]:
# Wrap the datasets in DataLoaders; only the training loader shuffles its data.
train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

val_loader = data.DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

In [12]:
# Number of examples in the custom train, validation and test splits, one per line.
print(len(my_train_dataset), len(my_val_dataset), len(my_test_dataset), sep="\n")

151641
49498
81756


## Regression task

In [13]:
# Length of the first element of the first training example
# (presumably the channel count of the encoded sequence — TODO confirm).
in_channels = len(train_dataset[0][0])
# One model output per activity column.
out_channels = len(activity_columns)

## Trainer

In [14]:
import pytorch_lightning as L
from torch.nn import functional as F

def pearson_correlation(x, y):
    """Return the mean Pearson correlation between ``x`` and ``y``.

    Both tensors are centred along dim 0 and a correlation coefficient is
    computed along that dimension for every remaining position; the result
    is the mean of those coefficients.

    Args:
        x: Tensor of predictions (statistics are taken along dim 0).
        y: Tensor of targets with a shape compatible with ``x``.

    Returns:
        A 0-dim tensor holding the averaged correlation.
    """
    x_centered = x - x.mean(dim=0)
    y_centered = y - y.mean(dim=0)

    covariance = (x_centered * y_centered).sum(dim=0)
    x_norm = x_centered.pow(2).sum(dim=0).sqrt()
    y_norm = y_centered.pow(2).sum(dim=0).sqrt()

    # The small constant keeps the ratio finite when either input is constant.
    per_column_r = covariance / (x_norm * y_norm + 1e-10)
    return per_column_r.mean()
    
class Seq1Model(L.LightningModule):
    """Lightning module that trains a ``HumanLegNet`` regressor with an MSE loss.

    Per-batch losses and validation/test predictions are accumulated in tensor
    buffers so that epoch-level summaries (mean loss, Pearson correlation) can
    be computed in the epoch-end hooks.

    NOTE(review): ``HumanLegNet`` and ``initialize_weights`` are neither imported
    nor defined in the visible part of this notebook; they must be in scope
    before this class is instantiated. The dataloader hooks read the
    notebook-level ``train_dataset`` / ``val_dataset`` / ``test_dataset``,
    ``BATCH_SIZE`` and ``NUM_WORKERS``.

    Args:
        in_ch: Number of input channels passed to ``HumanLegNet``.
        out_ch: Number of regression outputs (``output_dim`` of the network).
        lr: Learning rate for AdamW, also used as ``max_lr`` of the OneCycleLR schedule.
    """

    def __init__(self, in_ch, out_ch, lr=3e-4):
        super().__init__()

        self.model = HumanLegNet(in_ch=in_ch,
                                 output_dim=out_ch,
                                 stem_ch=64,
                                 stem_ks=11,
                                 ef_ks=9,
                                 ef_block_sizes=[80, 96, 112, 128],
                                 pool_sizes=[2, 2, 2, 2],
                                 resize_factor=4)
        self.model.apply(initialize_weights)

        self.loss = nn.MSELoss()
        self.lr = lr

        # Buffers start empty on the CPU and are moved to the device of the first
        # batch they receive; the previous code depended on an undefined global
        # `device` and failed with a NameError at construction time.
        self.train_loss = torch.empty(0)
        self.val_loss = torch.empty(0)
        self.pred = torch.empty(0)
        self.target = torch.empty(0)

    @staticmethod
    def _extend(buffer, values):
        """Return ``buffer`` with detached ``values`` appended along dim 0 on ``values``' device."""
        return torch.cat([buffer.to(values.device), values.detach()], dim=0)

    def forward(self, x):
        """Run the wrapped network on ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss and current learning rate for one batch."""
        X, y = batch
        y_hat = self.model(X)

        loss = self.loss(y_hat, y)
        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True, logger=True)
        # Stored detached so the autograd graph of every step is not kept alive
        # until the end of the epoch.
        self.train_loss = self._extend(self.train_loss, loss.unsqueeze(0))

        lr = self.optimizers().param_groups[0]['lr']
        self.log('learning_rate', lr, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and collect predictions/targets for the epoch summary."""
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        self.val_loss = self._extend(self.val_loss, loss.unsqueeze(0))

        self.pred = self._extend(self.pred, y_hat)
        self.target = self._extend(self.target, y)

    def on_validation_epoch_end(self):
        """Print an epoch summary, log the validation Pearson r and clear all buffers."""
        train_loss = torch.mean(self.train_loss)
        val_loss = torch.mean(self.val_loss)

        res_str = '|' + ' {}: {:.5f} |'.format("current_epoch", self.current_epoch)
        res_str += ' {}: {:.5f} |'.format("val_loss", val_loss)

        corr = pearson_correlation(self.pred, self.target)
        self.log("val_pearson", corr, on_epoch=True, prog_bar=True, on_step=False)
        res_str += ' {}: {:.5f} |'.format("val_pearson_r", corr)

        res_str += ' {}: {:.5f} |'.format("train_loss", train_loss)

        border = '-' * 100
        print("\n".join(['', border, res_str, border, '']))

        self.val_loss = torch.empty(0)
        self.pred = torch.empty(0)
        self.target = torch.empty(0)
        self.train_loss = torch.empty(0)

    def on_test_epoch_start(self):
        """Clear the prediction buffers so repeated test runs do not mix in stale data."""
        self.pred = torch.empty(0)
        self.target = torch.empty(0)

    def test_step(self, batch, _):
        """Log the test loss and collect predictions/targets for the epoch summary."""
        x, y = batch
        y_hat = self.model(x)
        loss = self.loss(y_hat, y)

        self.log('test_loss',
                 loss,
                 prog_bar=True,
                 on_step=False,
                 on_epoch=True)

        self.pred = self._extend(self.pred, y_hat)
        self.target = self._extend(self.target, y)

    def on_test_epoch_end(self):
        """Log the Pearson correlation over all collected test predictions.

        The buffers are left filled so the predictions stay available after
        testing; they are cleared when the next test epoch starts.
        """
        corr = pearson_correlation(self.pred, self.target)
        self.log('test_pearson_r', corr, on_step=False, on_epoch=True, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return the targets and predictions of one batch as detached float CPU tensors."""
        x, y = batch
        y_hat = self.model(x)

        return {"y": y.cpu().detach().float(), "pred": y_hat.cpu().detach().float()}

    def train_dataloader(self):
        """Shuffled loader over the notebook-level ``train_dataset``."""
        return data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)

    def val_dataloader(self):
        """Unshuffled loader over the notebook-level ``val_dataset``."""
        return data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    def test_dataloader(self):
        """Unshuffled loader over the notebook-level ``test_dataset``."""
        return data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    def configure_optimizers(self):
        """Build AdamW with a per-step OneCycleLR schedule spanning the whole run."""
        self.optimizer = torch.optim.AdamW(self.parameters(),
                                           lr=self.lr,
                                           weight_decay=1e-6)

        lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer,  # type: ignore
                                                           max_lr=self.lr,
                                                           three_phase=False,
                                                           total_steps=self.trainer.estimated_stepping_batches,  # type: ignore
                                                           pct_start=0.3,
                                                           cycle_momentum=False)
        lr_scheduler_config = {
            "scheduler": lr_scheduler,
            # The scheduler is stepped after every optimizer step, not every epoch.
            "interval": "step",
            "frequency": 1,
            "name": "cycle_lr"
        }

        return [self.optimizer], [lr_scheduler_config]
        

In [16]:
# DeepStarr network with one output per activity column.
model = DeepStarr(out_channels)

# Lightning wrapper from mpramnist.trainers that holds the network, the MSE loss
# and the optimisation hyper-parameters.
seq_model = LitModel_DeepStarr(
    model=model,
    num_outputs=out_channels,
    loss=nn.MSELoss(),
    weight_decay=1e-6,
    lr=2e-3,
    print_each=1,
)

In [17]:
# Trainer on GPU 0: a single epoch, gradient clipping value 1, 16-bit mixed
# precision, and no sanity validation steps before training starts.
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=1,
    gradient_clip_val=1,
    precision='16-mixed',
    enable_progress_bar=True,
    num_sanity_val_steps=0,
)

# Fit with the training and validation loaders, then evaluate on the test loader.
trainer.fit(seq_model, train_dataloaders=train_loader, val_dataloaders=val_loader)
trainer.test(seq_model, dataloaders=test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2025-04-12 16:26:15.504634: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-12 16:26:15.518016: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one 

Training: |                                                                                       | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

`Trainer.fit` stopped: `max_epochs=1` reached.



-------------------------------------------------------------------------------
| Epoch: 0 | Val Loss: 1.77629 | Val Pearson: 0.54958 | Train Pearson: 0.40323 
-------------------------------------------------------------------------------



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: |                                                                                        | 0/? [00:00…

[{'test_loss': 1.785520315170288, 'test_pearson': 0.5570375323295593}]