In [1]:
import mpramnist
from mpramnist.Dream.dataset import DreamDataset

import mpramnist.transforms as t
import mpramnist.target_transforms as t_t

from mpramnist.models import HumanLegNet
from mpramnist.models import initialize_weights
from mpramnist.trainers import LitModel_Dream

import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as data

import pytorch_lightning as L

In [2]:
# Run configuration: loader sizing and the global random seed.
BATCH_SIZE = 1024  # samples per batch, shared by the train/val/test DataLoaders
NUM_WORKERS = 8    # worker processes per DataLoader
SEED = 42          # fixed seed so a fresh "Restart & Run All" repeats the same run

# Training here is stochastic (random weight init, shuffled training loader,
# random reverse-complement augmentation). Seed Python, NumPy and PyTorch once,
# up front; workers=True also asks Lightning to derive per-worker seeds for the
# DataLoader processes. Some GPU kernels may still be non-deterministic.
# The trailing semicolon keeps the returned seed out of the cell output.
L.seed_everything(SEED, workers=True);

In [3]:
# Display the data-type labels defined on DreamDataset.
# The val/test datasets below pass data_type=["all"]; presumably that argument
# selects from these labels — confirm against the DreamDataset documentation.
DreamDataset.TYPES

['high',
 'low',
 'yeast',
 'challenging',
 'random',
 'all',
 'snv',
 'perturbation',
 'tiling']

In [4]:
# Build the flanking sequences that are attached to every insert.
# `length` is both the number of plasmid bases taken as the left flank and the
# value passed to LeftCrop in the transform pipeline.
length = 120
plasmid = DreamDataset.PLASMID.upper()

# The insert position is marked in the plasmid by a run of 80 "N" characters.
insert_start = plasmid.find("N" * 80)

# str.find returns -1 when the marker is absent; slicing with that value would
# silently yield a wrong flank instead of failing, so check explicitly.
if insert_start < 0:
    raise ValueError("DreamDataset.PLASMID does not contain a run of 80 'N' characters")
# A negative slice start would wrap around to the end of the plasmid string.
if insert_start < length:
    raise ValueError(
        f"Only {insert_start} bases precede the insert in DreamDataset.PLASMID; "
        f"need at least {length} for the left flank"
    )

right_flank = DreamDataset.RIGHT_FLANK
# The `length` bases immediately upstream of the insert marker.
left_flank = plasmid[insert_start - length : insert_start]

In [5]:
def _make_transform(rc_prob):
    """Build the sequence pipeline shared by every split.

    Steps, in order: add the left/right flanks, LeftCrop with `length`,
    convert the sequence to a tensor, then ReverseComplement(rc_prob).
    `rc_prob` is presumably the probability of reverse-complementing a
    sequence — confirm against mpramnist.transforms.
    """
    steps = [
        t.AddFlanks(left_flank, right_flank),
        t.LeftCrop(length, length),
        t.Seq2Tensor(),
        t.ReverseComplement(rc_prob),
    ]
    return t.Compose(steps)


# Preprocessing: the training pipeline uses ReverseComplement(0.5),
# the validation/test pipeline uses ReverseComplement(0).
train_transform = _make_transform(0.5)
val_test_transform = _make_transform(0)

# Load the three splits from the same data directory.
_data_root = "../data"
train_dataset = DreamDataset(split="train", transform=train_transform, root=_data_root)
val_dataset = DreamDataset(
    split="val", data_type=["all"], transform=val_test_transform, root=_data_root
)
test_dataset = DreamDataset(
    split="test", data_type=["all"], transform=val_test_transform, root=_data_root
)

In [6]:
# Print the training split's summary (dataset size and the split fold in use).
print(train_dataset)

Dataset DreamDataset of size 6739258 (MpraDaraset)
    Number of datapoints: 6739258
    Used split fold: train


In [7]:
# Print the validation and test split summaries, separated by a dashed line.
print(val_dataset, "------------", test_dataset, sep="\n")

Dataset DreamDataset of size 9045 (MpraDaraset)
    Number of datapoints: 9045
    Used split fold: public
------------
Dataset DreamDataset of size 62058 (MpraDaraset)
    Number of datapoints: 62058
    Used split fold: private


In [9]:
# Wrap each split in a DataLoader. Batch size and worker count are the same
# for all three; only the training loader shuffles.
_loader_kwargs = {"batch_size": BATCH_SIZE, "num_workers": NUM_WORKERS}

train_loader = data.DataLoader(dataset=train_dataset, shuffle=True, **_loader_kwargs)
val_loader = data.DataLoader(dataset=val_dataset, shuffle=False, **_loader_kwargs)
test_loader = data.DataLoader(dataset=test_dataset, shuffle=False, **_loader_kwargs)

In [10]:
# Derive the model's input width from the data: take the first training sample,
# then its first element (presumably the encoded sequence tensor — confirm),
# and use the length of its leading axis as the channel count.
_first_sample = train_dataset[0]
_first_sequence = _first_sample[0]
in_channels = len(_first_sequence)

# One output value per sequence.
out_channels = 1

In [12]:
# HumanLegNet hyper-parameters, collected in one place for readability.
_legnet_config = dict(
    in_ch=in_channels,
    output_dim=out_channels,
    stem_ch=64,
    stem_ks=11,
    ef_ks=9,
    ef_block_sizes=[80, 96, 112, 128],
    pool_sizes=[2, 2, 2, 2],
    resize_factor=4,
)
model = HumanLegNet(**_legnet_config)

# Apply mpramnist's weight initialisation to every submodule.
model.apply(initialize_weights)

# Lightning wrapper that trains `model` with an MSE loss.
# NOTE(review): print_each presumably controls how often metrics are printed
# (in epochs?) — confirm against LitModel_Dream.
seq_model = LitModel_Dream(
    model=model,
    loss=nn.MSELoss(),
    weight_decay=1e-1,
    lr=1e-2,
    print_each=5,
)

In [13]:
# Trainer settings: one epoch on GPU 0 with 16-bit mixed precision, gradient
# clipping at 1, and no validation sanity check before training starts.
_trainer_options = dict(
    accelerator="gpu",
    devices=[0],
    max_epochs=1,
    gradient_clip_val=1,
    precision='16-mixed',
    enable_progress_bar=True,
    num_sanity_val_steps=0,
)
trainer = L.Trainer(**_trainer_options)

# Fit on the training loader, validating on the validation loader.
trainer.fit(
    seq_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

# Evaluate on the test loader. This stays the last expression of the cell so
# the returned metrics are displayed as the cell output.
trainer.test(seq_model, dataloaders=test_loader)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2025-04-10 21:00:46.850394: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-10 21:00:46.867159: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one 

Training: |                                                                                       | 0/? [00:00…

Validation: |                                                                                     | 0/? [00:00…

`Trainer.fit` stopped: `max_epochs=1` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: |                                                                                        | 0/? [00:00…

[{'test_loss': 134.0399932861328, 'test_pearson': 0.8943555355072021}]