Currently, operations with genomic regions are implemented for the Agarwal, AgarwalJoint, Malinois, and DeepSTARR datasets.

# Agarwal dataset

Agarwal data uses hg38

In [1]:
import mpramnist
from mpramnist.Agarwal.dataset import AgarwalDataset

## Common use

In [2]:
# Build the train/val/test splits for HepG2 without any genomic-region
# filter, then print a summary of each one separated by a dashed line.
train, val, test = (
    AgarwalDataset(split=split_name, cell_type="HepG2", root="../data/")
    for split_name in ("train", "val", "test")
)
print(train, val, test, sep="\n------------\n")

Dataset AgarwalDataset of size 98336 (MpraDaraset)
    Number of datapoints: 98336
    Used split fold: [1, 2, 3, 4, 5, 6, 7, 8]
------------
Dataset AgarwalDataset of size 12292 (MpraDaraset)
    Number of datapoints: 12292
    Used split fold: [9]
------------
Dataset AgarwalDataset of size 12298 (MpraDaraset)
    Number of datapoints: 12298
    Used split fold: [10]


## Exclude genomic regions

In [3]:
# Regions reserved for the test set: one on chromosome 10 and one on
# chromosome 1. Each region is a dict with "chrom", "start" and "end" keys.
test_regions = [
    dict(chrom="10", start=1, end=1_000_000_000),
    dict(chrom="1", start=1, end=89_032_143),
]

# Test split built from the listed regions.
test_dataset = AgarwalDataset(
    split="test",
    cell_type="WTC11",
    genomic_regions=test_regions,
    root="../data/",
)

# Train split built from the same list, but with exclude_regions=True
# (presumably keeps only the data outside the listed regions, as the
# section title suggests).
train_dataset = AgarwalDataset(
    genomic_regions=test_regions,
    exclude_regions=True,
    split="train",
    cell_type="WTC11",
    root="../data/",
)

In [4]:
# Summaries of the region-based train and test datasets, separated by a rule.
print(train_dataset, test_dataset, sep="\n===================\n")

Dataset AgarwalDataset of size 41878 (MpraDaraset)
    Number of datapoints: 41878
    Used split fold: genomic region
Dataset AgarwalDataset of size 4307 (MpraDaraset)
    Number of datapoints: 4307
    Used split fold: genomic region


## Include genomic regions

In [5]:
# Every region below uses the same start (1) and the same very large end
# coordinate; only the chromosome name differs between entries.
test_regions = [
    {"chrom": name, "start": 1, "end": 1_000_000_000} for name in ("10", "1", "20")
]
val_regions = [
    {"chrom": name, "start": 1, "end": 1_000_000_000}
    for name in ("2", "3", "4", "Y")
]
train_regions = [
    {"chrom": name, "start": 1, "end": 1_000_000_000}
    for name in ("5", "6", "7", "X")
]

# Each split receives its own explicit list of regions.
test = AgarwalDataset(
    genomic_regions=test_regions,
    split="test",
    cell_type="K562",
    root="../data/",
)

val = AgarwalDataset(
    genomic_regions=val_regions,
    split="val",
    cell_type="K562",
    root="../data/",
)

train = AgarwalDataset(
    genomic_regions=train_regions,
    split="train",
    cell_type="K562",
    root="../data/",
)

  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')
  df = pd.read_csv(file_path, sep='\t')


In [6]:
# Summaries of the three region-based splits, separated by a rule.
print(train, val, test, sep="\n===================\n")

Dataset AgarwalDataset of size 22044 (MpraDaraset)
    Number of datapoints: 22044
    Used split fold: genomic region
Dataset AgarwalDataset of size 4674 (MpraDaraset)
    Number of datapoints: 4674
    Used split fold: genomic region
Dataset AgarwalDataset of size 10814 (MpraDaraset)
    Number of datapoints: 10814
    Used split fold: genomic region


## Load regions from a BED file

In [7]:
# One tab-separated record per region: chromosome, start, end.
test_regions = [
    "10\t1\t1000000000",
    "1\t1\t1000000000",
    "20\t1\t1000000000",
]

# Write the records to a BED file in the working directory, one per line.
with open("test_regions.bed", "w") as bed_file:
    bed_file.writelines(f"{record}\n" for record in test_regions)

In [8]:
# genomic_regions is given here as a path to a BED file instead of a list
# of region dicts.
test_dataset = AgarwalDataset(
    genomic_regions="./test_regions.bed",
    split="test",
    cell_type="HepG2",
    root="../data/",
)

# Same BED file, this time combined with exclude_regions=True for the
# train split.
train_dataset = AgarwalDataset(
    genomic_regions="./test_regions.bed",
    exclude_regions=True,
    split="train",
    cell_type="HepG2",
    root="../data/",
)

In [9]:
# Summaries of the BED-based train and test datasets, separated by a rule.
print(train_dataset, test_dataset, sep="\n===================\n")

Dataset AgarwalDataset of size 99736 (MpraDaraset)
    Number of datapoints: 99736
    Used split fold: genomic region
Dataset AgarwalDataset of size 23190 (MpraDaraset)
    Number of datapoints: 23190
    Used split fold: genomic region


# AgarwalJoint dataset

AgarwalJoint data uses hg38

In [1]:
import mpramnist
from mpramnist.AgarwalJoint.dataset import AgarwalJointDataset

In [6]:
# Regions on chromosomes 10, 1 and 20, all with start 1 and the same very
# large end coordinate.
test_regions = [
    {"chrom": name, "start": 1, "end": 1_000_000_000} for name in ("10", "1", "20")
]

# Test split built from the listed regions, for three cell types at once.
test_dataset = AgarwalJointDataset(
    genomic_regions=test_regions,
    split="test",
    cell_type=["HepG2", "K562", "WTC11"],
    root="../data/",
)

# Train split built from the same list with exclude_regions=True.
train_dataset = AgarwalJointDataset(
    genomic_regions=test_regions,
    exclude_regions=True,
    split="train",
    cell_type=["HepG2", "K562", "WTC11"],
    root="../data/",
)

In [7]:
# Summaries of the joint train and test datasets, separated by a dashed line.
print(train_dataset, test_dataset, sep="\n------------\n")

Dataset AgarwalJointDataset of size 45527 (MpraDaraset)
    Number of datapoints: 45527
    Used split fold: genomic region
------------
Dataset AgarwalJointDataset of size 9811 (MpraDaraset)
    Number of datapoints: 9811
    Used split fold: genomic region


# Malinois dataset

Malinois data uses hg19

In [1]:
import mpramnist
from mpramnist.Malinois.dataset import MalinoisDataset

In [12]:
# Regions on chromosomes 10, 1 and X, all with start 1 and the same very
# large end coordinate.
test_regions = [
    {"chrom": name, "start": 1, "end": 1_000_000_000} for name in ("10", "1", "X")
]

# Train split: the listed regions are passed together with
# exclude_regions=True.
train_dataset = MalinoisDataset(
    genomic_regions=test_regions,
    exclude_regions=True,
    split="train",
    filtration="none",
    root="../data/",
)

# Test split: built from the listed regions themselves.
test_dataset = MalinoisDataset(
    genomic_regions=test_regions,
    split="test",
    filtration="none",
    root="../data/",
)

In [13]:
# Summaries of the Malinois train and test datasets, separated by a dashed line.
print(train_dataset, test_dataset, sep="\n------------\n")

Dataset MalinoisDataset of size 665160 (MpraDaraset)
    Number of datapoints: 665160
    Used split fold: genomic region
------------
Dataset MalinoisDataset of size 118440 (MpraDaraset)
    Number of datapoints: 118440
    Used split fold: genomic region


In [14]:
# Same train/test construction as with region dicts, but the regions are
# read from the BED file at ./test_regions.bed.
train_dataset = MalinoisDataset(
    genomic_regions="./test_regions.bed",
    exclude_regions=True,
    split="train",
    filtration="none",
    root="../data/",
)

test_dataset = MalinoisDataset(
    genomic_regions="./test_regions.bed",
    split="test",
    filtration="none",
    root="../data/",
)

In [15]:
# Summaries of the BED-based Malinois datasets, separated by a dashed line.
print(train_dataset, test_dataset, sep="\n------------\n")

Dataset MalinoisDataset of size 654074 (MpraDaraset)
    Number of datapoints: 654074
    Used split fold: genomic region
------------
Dataset MalinoisDataset of size 129526 (MpraDaraset)
    Number of datapoints: 129526
    Used split fold: genomic region


# DeepSTARR dataset

DeepSTARR data uses BDGP R5/dm3

In [1]:
import mpramnist
from mpramnist.DeepStarr.dataset import DeepStarrDataset

In [2]:
# chr2R is cut at the integer midpoint of 21144049: the first half is used
# for validation and the second half for testing.
# NOTE(review): the midpoint coordinate is both the end of the validation
# region and the start of the test region — confirm whether region bounds
# are inclusive, in which case the two regions share that position.
val_regions = [dict(chrom="chr2R", start=1, end=21144049 // 2)]
test_regions = [dict(chrom="chr2R", start=21144049 // 2, end=21144049)]

# The train split is given both held-out regions together with
# exclude_regions=True.
val_test = [*val_regions, *test_regions]
train_dataset = DeepStarrDataset(
    genomic_regions=val_test,
    exclude_regions=True,
    split="train",
    root="../data/",
    use_original_reverse_complement=True,
)

# Validation and test splits each receive their own half of chr2R.
val_dataset = DeepStarrDataset(
    genomic_regions=val_regions,
    split="val",
    root="../data/",
)
test_dataset = DeepStarrDataset(
    genomic_regions=test_regions,
    split="test",
    root="../data/",
)

In [3]:
# Summaries of the three DeepSTARR splits, separated by a rule.
print(train_dataset, val_dataset, test_dataset, sep="\n===================\n")

Dataset DeepStarrDataset of size 402322 (MpraDaraset)
    Number of datapoints: 402322
    Used split fold: genomic region
Dataset DeepStarrDataset of size 40562 (MpraDaraset)
    Number of datapoints: 40562
    Used split fold: genomic region
Dataset DeepStarrDataset of size 41172 (MpraDaraset)
    Number of datapoints: 41172
    Used split fold: genomic region
