In [1]:
import mpramnist
from mpramnist.Sharpr.dataset import SharprDataset

from mpramnist.models import HumanLegNet
from mpramnist.models import initialize_weights
from mpramnist.trainers import LitModel_Sharpr

from mpramnist import transforms as t
from mpramnist import target_transforms as t_t

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F
import lightning.pytorch as L
from lightning.pytorch.callbacks import ModelCheckpoint

from torchmetrics import PearsonCorrCoef

from torch_lr_finder import LRFinder
import matplotlib.pyplot as plt

# Initiate some parameters

In [2]:
BATCH_SIZE = 1096
NUM_WORKERS = 8

In [3]:
SharprDataset.ACTIVITY_COLUMNS

['k562_minp_rep1',
 'k562_minp_rep2',
 'k562_minp_avg',
 'k562_sv40p_rep1',
 'k562_sv40p_rep2',
 'k562_sv40p_avg',
 'hepg2_minp_rep1',
 'hepg2_minp_rep2',
 'hepg2_minp_avg',
 'hepg2_sv40p_rep1',
 'hepg2_sv40p_rep2',
 'hepg2_sv40p_avg']

In [9]:
# preprocessing
train_transform = t.Compose([
    t.ReverseComplement(0.5),
    t.Seq2Tensor(),
])
test_transform = t.Compose([
    t.Seq2Tensor(), 
])

# Train

In [10]:
# load the data
activity_columns = [ 'k562_minp_avg',
                     'k562_sv40p_avg',
                     'hepg2_minp_avg',
                     'hepg2_sv40p_avg' ]
train_dataset = SharprDataset(split="train", activity_columns=activity_columns, transform=train_transform, root = "../data/")# for needed folds

val_dataset = SharprDataset( split="val",activity_columns=activity_columns, transform=test_transform, root = "../data/") # use "val" for default validation set

test_dataset = SharprDataset( split="test",activity_columns=activity_columns, transform=test_transform, root = "../data/") # use "test" for default test set

In [11]:
print(train_dataset)
print("="*50)
print(val_dataset)
print("="*50)
print(test_dataset)

Dataset SharprDataset of size 457174 (MpraDaraset)
    Number of datapoints: 457174
    Used split fold: train
Dataset SharprDataset of size 10130 (MpraDaraset)
    Number of datapoints: 10130
    Used split fold: val
Dataset SharprDataset of size 10130 (MpraDaraset)
    Number of datapoints: 10130
    Used split fold: test


In [12]:
# encapsulate data into dataloader form
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)

val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [13]:
in_channels = len(train_dataset[0][0])
out_channels = len(activity_columns)

In [14]:
model = HumanLegNet(in_ch=in_channels,
                     output_dim = out_channels,
                     stem_ch=64,
                     stem_ks=11,
                     ef_ks=9,
                     ef_block_sizes=[80, 96, 112, 128],
                     pool_sizes=[2,2,2,2],
                     resize_factor=4)
model.apply(initialize_weights)

seq_model = LitModel_Sharpr(model = model,
                           loss = nn.MSELoss(), 
                            num_outputs = out_channels,
                           activity_columns = activity_columns,
                           weight_decay = 1e-1, lr = 1e-2, print_each = 1)

In [15]:
checkpoint_callback = ModelCheckpoint(
        monitor='k562_minp_avg', 
        mode='max',  
        save_top_k=1,
        save_last=False
    )
# Initialize a trainer
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=35,
    gradient_clip_val=1,
    precision='16-mixed', 
    enable_progress_bar = False,
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback]
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [16]:
# Train the model
trainer.fit(seq_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name          | Type            | Params | Mode 
----------------------------------------------------------
0 | model         | HumanLegNet     | 1.3 M  | train
1 | loss          | MSELoss         | 0      | train
2 | train_pearson | PearsonCorrCoef | 0      | train
3 | val_pearson   | PearsonCorrCoef | 0      | train
4 | test_pearson  | PearsonCorrCoef | 0      | train
----------------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.293  


----------------------------------------------------------------------------------------------------
| Epoch: 0 | Val Loss: 0.96289 | Val Pearson: k562_minp_avg : 0.2937263548374176, k562_sv40p_avg : 0.13820180296897888, hepg2_minp_avg : 0.19366231560707092, hepg2_sv40p_avg : 0.20273815095424652, | Train Pearson: 0.25051 
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
| Epoch: 1 | Val Loss: 1.18851 | Val Pearson: k562_minp_avg : 0.18446898460388184, k562_sv40p_avg : 0.11782119423151016, hepg2_minp_avg : 0.17350295186042786, hepg2_sv40p_avg : 0.22807031869888306, | Train Pearson: 0.32296 
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
| Epoch: 2 | Val Loss: 0.96305 | Val Pearso

`Trainer.fit` stopped: `max_epochs=35` reached.



----------------------------------------------------------------------------------------------------
| Epoch: 34 | Val Loss: 0.93954 | Val Pearson: k562_minp_avg : 0.3709847927093506, k562_sv40p_avg : 0.1886509656906128, hepg2_minp_avg : 0.300819456577301, hepg2_sv40p_avg : 0.2899799048900604, | Train Pearson: 0.52925 
----------------------------------------------------------------------------------------------------



In [17]:
best_model_path = checkpoint_callback.best_model_path
seq_model = LitModel_Sharpr.load_from_checkpoint(
    best_model_path,
    model = model,
    loss = nn.MSELoss(), 
    num_outputs = out_channels,
    activity_columns = activity_columns,
    weight_decay = 1e-1, lr = 1e-2, print_each = 1)
trainer.test(seq_model, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_loss               0.937516987323761
test_pearson_hepg2_minp_avg      0.303612619638443
test_pearson_hepg2_sv40p_avg    0.29538536071777344
 test_pearson_k562_minp_avg      0.3816615641117096
test_pearson_k562_sv40p_avg     0.20246057212352753
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.937516987323761,
  'test_pearson_k562_minp_avg': 0.3816615641117096,
  'test_pearson_k562_sv40p_avg': 0.20246057212352753,
  'test_pearson_hepg2_minp_avg': 0.303612619638443,
  'test_pearson_hepg2_sv40p_avg': 0.29538536071777344}]

In [18]:
def meaned_prediction(forw, rev, trainer, seq_model, name):

    predictions_forw = trainer.predict(seq_model, dataloaders = forw)
    targets = torch.cat([pred["target"] for pred in predictions_forw])
    y_preds_forw = torch.cat([pred["predicted"] for pred in predictions_forw])
    
    predictions_rev = trainer.predict(seq_model, dataloaders = rev)
    y_preds_rev = torch.cat([pred["predicted"] for pred in predictions_rev])
    
    mean_forw = torch.mean(torch.stack([y_preds_forw, y_preds_rev]), dim=0)
    
    pears = PearsonCorrCoef(num_outputs = out_channels)
    print(name + " Pearson correlation")
    
    return pears(mean_forw, targets)

In [22]:
forw_transform = t.Compose([
    t.Seq2Tensor()
])
rev_transform = t.Compose([
    t.ReverseComplement(1),
    t.Seq2Tensor(),
])

test_forw = SharprDataset(split = "test", activity_columns=activity_columns, transform = forw_transform, root = "../data/")
test_rev = SharprDataset(split = "test",activity_columns=activity_columns, transform = rev_transform, root = "../data/")

forw = data.DataLoader(dataset = test_forw, batch_size = BATCH_SIZE, shuffle = False, num_workers = NUM_WORKERS, pin_memory = True)
rev = data.DataLoader(dataset = test_rev, batch_size = BATCH_SIZE, shuffle = False, num_workers = NUM_WORKERS, pin_memory = True)

meaned_prediction(forw, rev, trainer, seq_model, "Sharpr")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Sharpr Pearson correlation


tensor([0.4016, 0.2175, 0.3214, 0.3212])

In [23]:
best_model_path = checkpoint_callback.best_model_path
seq_model_finetune = LitModel_Sharpr.load_from_checkpoint(
    best_model_path,
    model = model,
    loss = nn.MSELoss(), 
    num_outputs = out_channels,
    activity_columns = activity_columns,
    weight_decay = 1e-5, lr = 1e-4, print_each = 1)

In [24]:
checkpoint_callback_finetune = ModelCheckpoint(
        monitor='k562_minp_avg', 
        mode='max',  
        save_top_k=1,
        save_last=False
    )
# Initialize a trainer
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=5,
    gradient_clip_val=1,
    precision='16-mixed', 
    enable_progress_bar = False,
    num_sanity_val_steps=0,
    callbacks=[checkpoint_callback_finetune]
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [25]:
# Train the model
trainer.fit(seq_model_finetune,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name          | Type            | Params | Mode 
----------------------------------------------------------
0 | model         | HumanLegNet     | 1.3 M  | eval 
1 | loss          | MSELoss         | 0      | train
2 | train_pearson | PearsonCorrCoef | 0      | train
3 | val_pearson   | PearsonCorrCoef | 0      | train
4 | test_pearson  | PearsonCorrCoef | 0      | train
----------------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.293     Total estimated model params size (MB)
4         Modules in train mode
116       Modules in eval mode



----------------------------------------------------------------------------------------------------
| Epoch: 0 | Val Loss: 0.90574 | Val Pearson: k562_minp_avg : 0.3833366334438324, k562_sv40p_avg : 0.20157833397388458, hepg2_minp_avg : 0.32867172360420227, hepg2_sv40p_avg : 0.33414044976234436, | Train Pearson: 0.47960 
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
| Epoch: 1 | Val Loss: 0.91019 | Val Pearson: k562_minp_avg : 0.38144758343696594, k562_sv40p_avg : 0.1973334103822708, hepg2_minp_avg : 0.3172764778137207, hepg2_sv40p_avg : 0.3268129229545593, | Train Pearson: 0.48596 
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
| Epoch: 2 | Val Loss: 0.91329 | Val Pearson: 

`Trainer.fit` stopped: `max_epochs=5` reached.



----------------------------------------------------------------------------------------------------
| Epoch: 4 | Val Loss: 0.91263 | Val Pearson: k562_minp_avg : 0.3841629922389984, k562_sv40p_avg : 0.1978197544813156, hepg2_minp_avg : 0.31945309042930603, hepg2_sv40p_avg : 0.3190837502479553, | Train Pearson: 0.49434 
----------------------------------------------------------------------------------------------------



In [27]:
best_model_path = checkpoint_callback_finetune.best_model_path
seq_model_finetune = LitModel_Sharpr.load_from_checkpoint(
    best_model_path,
    model = model,
    loss = nn.MSELoss(), 
    num_outputs = out_channels,
    activity_columns = activity_columns,
    weight_decay = 1e-5, lr = 1e-4, print_each = 1)
trainer.test(seq_model_finetune, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        Test metric                 DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         test_loss               0.9126330614089966
test_pearson_hepg2_minp_avg     0.31945309042930603
test_pearson_hepg2_sv40p_avg     0.3190837502479553
 test_pearson_k562_minp_avg      0.3841629922389984
test_pearson_k562_sv40p_avg      0.1978197544813156
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.9126330614089966,
  'test_pearson_k562_minp_avg': 0.3841629922389984,
  'test_pearson_k562_sv40p_avg': 0.1978197544813156,
  'test_pearson_hepg2_minp_avg': 0.31945309042930603,
  'test_pearson_hepg2_sv40p_avg': 0.3190837502479553}]

In [28]:
forw_transform = t.Compose([
    t.Seq2Tensor()
])
rev_transform = t.Compose([
    t.ReverseComplement(1),
    t.Seq2Tensor(),
])

test_forw = SharprDataset(split = "test", activity_columns=activity_columns, transform = forw_transform, root = "../data/")
test_rev = SharprDataset(split = "test",activity_columns=activity_columns, transform = rev_transform, root = "../data/")

forw = data.DataLoader(dataset = test_forw, batch_size = BATCH_SIZE, shuffle = False, num_workers = NUM_WORKERS, pin_memory = True)
rev = data.DataLoader(dataset = test_rev, batch_size = BATCH_SIZE, shuffle = False, num_workers = NUM_WORKERS, pin_memory = True)

meaned_prediction(forw, rev, trainer, seq_model_finetune, "Sharpr")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Sharpr Pearson correlation


tensor([0.4083, 0.2182, 0.3536, 0.3561])