In [1]:
import mpramnist
from mpramnist.malinoisdataset import MalinoisDataset
from mpramnist.vikramdataset import VikramDataset
from mpramnist import transforms as t
from mpramnist import target_transforms as t_t
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [2]:
# Flank constants exposed as class attributes on MalinoisDataset. In the visible
# cells they are referenced only by the disabled t.AddFlanks(...) step.
left_flank, right_flank = MalinoisDataset.LEFT_FLANK, MalinoisDataset.RIGHT_FLANK

# Batch size shared by the train/val/test DataLoaders built from MalinoisDataset.
BATCH_SIZE = 1076

In [5]:
# --- Preprocessing ---
# Sequence pipeline: Seq2Tensor is the only active step.
# Disabled alternatives kept for reference:
#   t.AddFlanks(left_flank, right_flank)
#   t.CenterCrop(600)
transform = t.Compose([t.Seq2Tensor()])

# Target pipeline: composed with no steps.
# Disabled alternative (original for Malinois):
#   t_t.Normalize(mean = 0.500, std = 1.059)
# NOTE(review): target_transform is not passed to any dataset in this cell;
# confirm whether that is intended.
target_transform = t_t.Compose([])

# --- Datasets ---
# `split` is given in three forms here: a str, an int, and a mixed str/int list.
# NOTE(review): the recorded dataset summary lists fold "X" under the default
# *val* folds, and fold 1 is requested for both val (int 1) and test (str "1").
# Presumably this is API exploration rather than a real train/val/test
# protocol; verify before training with these loaders.
train_dataset = MalinoisDataset(
    split="X",
    transform=transform,
    # duplication_cutoff = 0.5,   (disabled)
    use_original_reverse_complement=False,
)
val_dataset = MalinoisDataset(split=1, transform=transform)
test_dataset = MalinoisDataset(split=["1", 2], transform=transform)

# --- DataLoaders ---
# All three loaders share BATCH_SIZE; only the training loader shuffles.
train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = data.DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [6]:
# Show the three dataset summaries, separated by a 50-character rule.
print(train_dataset, val_dataset, test_dataset, sep="\n" + "=" * 50 + "\n")

Dataset MalinoisDataset of size 8858 (MpraDaraset)
    Number of datapoints: 8858
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['X']
    Scalar features: {}
    Vector features: {}
    Cell types: ['HepG2', 'K562', 'SKNSH']
    Сell type used: ['K562_log2FC', 'HepG2_log2FC', 'SKNSH_log2FC']
    Target columns that can be used: {'K562_log2FC', 'HepG2_log2FC', 'SKNSH_log2FC'}
    Number of channels: 4
    Sequence size: 200
    Number of samples: {'train': 668946, 'val': 62406, 'test': 66712}
    Description: MalinoisDataset is based on 
Dataset MalinoisDataset of size 69951 (MpraDaraset)
    Number of datapoints: 69951
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['1']
    Scalar features: {}
    Vector features: {}
    Cell types: ['HepG2', 'K562', 'SKNSH'

In [1]:
import os

import mpramnist
from mpramnist.massivestarrdataset import MassiveStarrDataset
from mpramnist import transforms as t
from mpramnist import target_transforms as t_t

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [2]:
# Task names that can be passed as `task=` to MassiveStarrDataset.
# The trailing number on each line is that entry's index in the list.
tasks = [
    "RandomEnhancer",          # 0
    "GenomicPromoter",         # 1
    "CapturePromoter",         # 2

    "GenomicEnhancer",         # 3
    "AtacSeq",                 # 4

    "differentialExpression",  # 5

    "Binary",                  # 6
]

In [5]:
# --- Configuration ---
BATCH_SIZE = 2048
# The original value was a hard-coded 103 workers, which oversubscribes any
# machine with fewer CPUs. Keep 103 as the upper bound but never request more
# workers than the host reports. os.cpu_count() may return None, hence `or 1`.
NUM_WORKERS = min(103, os.cpu_count() or 1)

# --- Preprocessing ---
# Train and val/test pipelines are currently identical (Seq2Tensor only). They
# are kept as separate objects so train-only steps can be added later.
train_transform = t.Compose([
    t.Seq2Tensor()
])
val_test_transform = t.Compose([
    t.Seq2Tensor()
])

# --- Datasets ---
# Index 2 of `tasks` is "CapturePromoter".
task = tasks[2]
train_dataset = MassiveStarrDataset(task = task, split = "train", transform = train_transform)

val_dataset = MassiveStarrDataset(task = task, split = "val", transform = val_test_transform)

test_dataset = MassiveStarrDataset(task = task, split = "test", transform = val_test_transform)

# --- DataLoaders ---
# Only the training loader shuffles; all three share BATCH_SIZE and NUM_WORKERS.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [6]:
# Show the three dataset summaries, separated by a 50-character rule.
print(train_dataset, val_dataset, test_dataset, sep="\n" + "=" * 50 + "\n")

Dataset MassiveStarrDataset of size 79732 (MpraDaraset)
    Number of datapoints: 79732
    Default split folds: {}
    Used split fold: train
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 120
    Number of samples: {}
    Description: MassiveStarrDataset is based on 
Dataset MassiveStarrDataset of size 13290 (MpraDaraset)
    Number of datapoints: 13290
    Default split folds: {}
    Used split fold: val
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 120
    Number of samples: {}
    Description: MassiveStarrDataset is based on 
Dataset MassiveStarrDataset of size 13290 (MpraDaraset)
    Number of datapoints: 13290
    Default split folds: {}
    Used split fold: test
    Scalar features: {}
    Vector features: {}
    Cell

In [11]:
# Build a test set from an explicit list of chromosome names instead of the
# named "test" split, then show its summary.
# NOTE(review): this rebinds test_dataset only; a test_loader created earlier
# still wraps the previous dataset object.
test_dataset = MassiveStarrDataset(
    task=task,
    split=["chr2", "chr10", "chr11"],
    transform=val_test_transform,
)
print(test_dataset)

Dataset MassiveStarrDataset of size 99363 (MpraDaraset)
    Number of datapoints: 99363
    Default split folds: {}
    Used split fold: ['chr2', 'chr10', 'chr11']
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 170
    Number of samples: {}
    Description: MassiveStarrDataset is based on 


In [12]:
# Same as the multi-chromosome cell, restricted to a single chromosome.
# NOTE(review): this rebinds test_dataset only; a test_loader created earlier
# still wraps the previous dataset object.
test_dataset = MassiveStarrDataset(
    task=task,
    split=["chr11"],
    transform=val_test_transform,
)
print(test_dataset)

Dataset MassiveStarrDataset of size 27537 (MpraDaraset)
    Number of datapoints: 27537
    Default split folds: {}
    Used split fold: ['chr11']
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 170
    Number of samples: {}
    Description: MassiveStarrDataset is based on 


In [15]:
# Total of three counts. What each operand counts is not recorded in the
# notebook. TODO: label them or derive them from the datasets.
1_441_994 + 496_525 + 455_146

2393665

In [22]:
# Toy list plus its length and midpoint (floor division); the half-split loop
# in the following cell reads `arr` and `m`.
arr = list(range(1, 7))
l = len(arr)
m = l // 2

In [28]:
# Print 1 for each element in the first half of `arr` (positions below `m`)
# and 0 for each element in the second half.
i = 0  # running position within arr
for j in arr:
    print(1 if i < m else 0)
    i += 1

1
1
1
0
0
0
