In [1]:
import mpramnist
from mpramnist.malinoisdataset import MalinoisDataset
from mpramnist.vikramdataset import VikramDataset
from mpramnist import transforms as t
from mpramnist import target_transforms as t_t
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [2]:
# Flank sequences read from MalinoisDataset class attributes. In the visible
# cells they are only referenced by a commented-out t.AddFlanks(...) transform.
left_flank = MalinoisDataset.LEFT_FLANK
right_flank = MalinoisDataset.RIGHT_FLANK
# Mini-batch size passed to every DataLoader built in this notebook.
BATCH_SIZE = 1076

In [5]:
# --- Preprocessing ---------------------------------------------------------
# Sequence pipeline: only the sequence-to-tensor step is active. Flank
# addition and centre cropping are kept as disabled options.
transform = t.Compose(
    [
        # t.AddFlanks(left_flank, right_flank),
        # t.CenterCrop(600),
        t.Seq2Tensor(),
    ]
)

# Target pipeline: currently empty (normalisation is disabled).
# NOTE(review): `target_transform` is not passed to any dataset below.
target_transform = t_t.Compose(
    [
        # t_t.Normalize(mean = 0.500, std = 1.059) # original for Malinois
    ]
)

# --- Datasets --------------------------------------------------------------
# `split` is given as a string, an int and a mixed list respectively.
# NOTE(review): fold 1 is requested by both the validation and the test set,
# so the two overlap; confirm this is intentional before using the loaders
# for evaluation.
train_dataset = MalinoisDataset(
    split="X",
    transform=transform,
    # duplication_cutoff = 0.5,
    use_original_reverse_complement=False,
)
val_dataset = MalinoisDataset(split=1, transform=transform)
test_dataset = MalinoisDataset(split=["1", 2], transform=transform)

# --- DataLoaders -----------------------------------------------------------
# Only the training loader shuffles.
train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader = data.DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [6]:
# Print the three dataset summaries, separated by a 50-character "=" rule.
print(train_dataset, val_dataset, test_dataset, sep="\n" + "=" * 50 + "\n")

Dataset MalinoisDataset of size 8858 (MpraDaraset)
    Number of datapoints: 8858
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['X']
    Scalar features: {}
    Vector features: {}
    Cell types: ['HepG2', 'K562', 'SKNSH']
    Сell type used: ['K562_log2FC', 'HepG2_log2FC', 'SKNSH_log2FC']
    Target columns that can be used: {'K562_log2FC', 'HepG2_log2FC', 'SKNSH_log2FC'}
    Number of channels: 4
    Sequence size: 200
    Number of samples: {'train': 668946, 'val': 62406, 'test': 66712}
    Description: MalinoisDataset is based on 
Dataset MalinoisDataset of size 69951 (MpraDaraset)
    Number of datapoints: 69951
    Default split folds: {'train': '1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, Y', 'val': '19, 21, X', 'test': '7, 13'}
    Used split fold: ['1']
    Scalar features: {}
    Vector features: {}
    Cell types: ['HepG2', 'K562', 'SKNSH'

In [1]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import mpramnist
from mpramnist import target_transforms as t_t
from mpramnist import transforms as t
from mpramnist.massivestarrdataset import MassiveStarrDataset

In [2]:
# Candidate task names. One entry is picked by position and passed as the
# `task=` argument of MassiveStarrDataset; the trailing numbers are list indices.
tasks = [
    "RandomEnhancer",          # 0
    "GenomicPromoter",         # 1
    "CapturePromoter",         # 2
    "GenomicEnhancer",         # 3
    "AtacSeq",                 # 4
    "differentialExpression",  # 5
    "Binary",                  # 6
]

In [5]:
BATCH_SIZE = 2048
# Upper bound of 103 loader workers, capped at the number of CPUs the host
# reports so the cell also runs on smaller machines (DataLoader warns and can
# stall when asked for more workers than there are cores).
NUM_WORKERS = min(103, os.cpu_count() or 1)

# Both pipelines currently perform only the sequence-to-tensor conversion;
# they are kept separate so training-only steps can be added later.
train_transform = t.Compose([
    t.Seq2Tensor()
])
val_test_transform = t.Compose([
    t.Seq2Tensor()
])

# Select the task by its position in `tasks` (index 2 is "CapturePromoter").
task = tasks[2]
train_dataset = MassiveStarrDataset(task = task, split = "train", transform = train_transform)

val_dataset = MassiveStarrDataset(task = task, split = "val", transform = val_test_transform)

test_dataset = MassiveStarrDataset(task = task, split = "test", transform = val_test_transform)

# encapsulate data into dataloader form; only the training loader shuffles
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers = NUM_WORKERS)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers = NUM_WORKERS)

In [6]:
# Print the three dataset summaries, separated by a 50-character "=" rule.
print(train_dataset, val_dataset, test_dataset, sep="\n" + "=" * 50 + "\n")

Dataset MassiveStarrDataset of size 79732 (MpraDaraset)
    Number of datapoints: 79732
    Default split folds: {}
    Used split fold: train
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 120
    Number of samples: {}
    Description: MassiveStarrDataset is based on 
Dataset MassiveStarrDataset of size 13290 (MpraDaraset)
    Number of datapoints: 13290
    Default split folds: {}
    Used split fold: val
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 120
    Number of samples: {}
    Description: MassiveStarrDataset is based on 
Dataset MassiveStarrDataset of size 13290 (MpraDaraset)
    Number of datapoints: 13290
    Default split folds: {}
    Used split fold: test
    Scalar features: {}
    Vector features: {}
    Cell

In [11]:
# Rebuild the test set from an explicit list of chromosome names instead of
# the named "test" split, then print its summary.
# NOTE(review): `test_loader` is not rebuilt here, so it still wraps the
# previously constructed test dataset.
test_dataset = MassiveStarrDataset(task = task, split = ['chr2', 'chr10','chr11'], transform = val_test_transform)
print(test_dataset)

Dataset MassiveStarrDataset of size 99363 (MpraDaraset)
    Number of datapoints: 99363
    Default split folds: {}
    Used split fold: ['chr2', 'chr10', 'chr11']
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 170
    Number of samples: {}
    Description: MassiveStarrDataset is based on 


In [12]:
# Same check with a single-chromosome split list; again only the dataset is
# replaced and printed, no DataLoader is rebuilt.
test_dataset = MassiveStarrDataset(task = task, split = ['chr11'], transform = val_test_transform)
print(test_dataset)

Dataset MassiveStarrDataset of size 27537 (MpraDaraset)
    Number of datapoints: 27537
    Default split folds: {}
    Used split fold: ['chr11']
    Scalar features: {}
    Vector features: {}
    Cell types: None
    Сell type used: None
    Target columns that can be used: {}
    Number of channels: 4
    Sequence size: 170
    Number of samples: {}
    Description: MassiveStarrDataset is based on 


In [15]:
1441994 +496525 +455146

2393665

In [22]:
# Six-element toy list, its length, and its integer midpoint.
arr = list(range(1, 7))
l = len(arr)
m = l // 2

In [28]:
# Walk `arr` by position: print 1 while the position lies in the first half
# (position < m) and 0 from the midpoint onwards.
i = 0
for i, j in enumerate(arr):
    print(1 if i < m else 0)
i = len(arr)  # leave the counter where a manual `i += 1` loop would end

1
1
1
0
0
0


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [23]:
def quick_split(df, split_frac=0.8, verbose=False, seed=None):
    """Randomly partition the rows of ``df`` into two DataFrames.

    Row positions are shuffled, the first ``int(n_rows * split_frac)`` shuffled
    positions go to the first part and the rest to the second. Inside each
    part the rows keep their original relative order, and both parts get a
    fresh 0..n-1 index.

    Args:
        df: DataFrame to split. Its own index is ignored and not returned.
        split_frac: Fraction of rows assigned to the first (train) part.
        verbose: If True, print the number of rows in each part.
        seed: Optional seed for a private ``random.Random`` so the split is
            reproducible. With ``None`` (default) the module-level ``random``
            generator is used, i.e. the result depends on global RNG state.

    Returns:
        Tuple ``(train_df, test_df)`` restricted to the columns of ``df``.
    """
    cols = df.columns

    # drop=True discards the old index instead of inserting it as a column.
    # Inserting it ("index"/"level_0") collides with input columns of the same
    # name and makes a second reset_index() raise ValueError.
    df = df.reset_index(drop=True)

    # shuffle row positions, privately when a seed is given
    idxs = list(range(df.shape[0]))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(idxs)

    # the first `split` shuffled positions form the train part
    split = int(len(idxs) * split_frac)
    train_mask = df.index.isin(idxs[:split])

    # every position is in exactly one part, so the complement is the test part
    train_df = df.loc[train_mask, cols].reset_index(drop=True)
    test_df = df.loc[~train_mask, cols].reset_index(drop=True)

    if verbose:
        print(f"quick_split: {len(train_df)} train rows, {len(test_df)} test rows")

    return train_df, test_df

In [11]:
# Load the comma-separated 20long.csv table and display it.
# NOTE(review): this rebinds `data`, which the import cell of this notebook
# bound to the torch.utils.data module.
data = pd.read_csv('../datasets/EvfratovDataset/20long.csv')
data

Unnamed: 0,seq,F1 probability,F2 probability,F3 probability,F4 probability,F5 probability,F6 probability,F7 probability,F8 probability
0,UGAAUUAGGAGGGUAUAGAAAUG,0.0,0.0,0.0,0.0,0.0,0.0,0.073984,0.926016
1,AAUACGAGAGGAGGAAGGCAAUG,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000
2,CACAUAACUGGAGACACAGCAUG,0.0,0.0,0.0,0.0,0.0,0.0,0.008088,0.991912
3,AUGUAAUAGGGAGGAGAAGAAUG,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000
4,UUACUACGUGGAGAAAAGAGAUG,0.0,0.0,0.0,0.0,0.0,0.0,0.016631,0.983369
...,...,...,...,...,...,...,...,...,...
11687,AUUCAUGGGUGAGAAAAUAUAUG,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
11688,GUUAUUAAAAGAGAGCAAACAUG,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
11689,UACUCACAGAGAGUAAUGAUAUG,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
11690,UGCUACAGAAUAAUUACAAGAUG,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000


In [7]:
# Load the tab-separated 23_long_seqs.tsv table, replacing the previous
# contents of `data`, and display it.
data = pd.read_csv('../datasets/EvfratovDataset/23_long_seqs.tsv',sep = "\t")
data

Unnamed: 0,sequence,F1,F2,F3,F4,F5,F6,F7,F8
0,UGAAUUAGGAGGGUAUAGAAAUG,0,0,0,0,0,0,244,3054
1,AAUACGAGAGGAGGAAGGCAAUG,0,0,0,0,0,0,0,2942
2,CACAUAACUGGAGACACAGCAUG,0,0,0,0,0,0,22,2698
3,AUGUAAUAGGGAGGAGAAGAAUG,0,0,0,0,0,0,0,2316
4,UUACUACGUGGAGAAAAGAGAUG,0,0,0,0,0,0,39,2306
...,...,...,...,...,...,...,...,...,...
11687,AAAAAACGAGGAUUGAAGAAAUG,0,0,0,0,0,21,0,0
11688,AAAAAAAUAGCUCCAAAUGAAUG,0,18,0,0,0,0,0,0
11689,AAAAAAAGAGGACUAUGAAAAUG,0,0,0,0,0,0,27,0
11690,AAAAAAAGAAGGGAAAAAGGAUG,0,0,35,0,0,0,0,0


In [9]:
# Separate the sequence column from the numeric count columns.
sequences = data.loc[:, 'sequence']
df_counts = data.drop(columns='sequence')

# Normalise every row of counts by its own total so each row becomes a
# distribution. Rows whose total is 0 yield NaN (0/0) and are set to 0.
row_sums = df_counts.sum(axis='columns')
df = df_counts.divide(row_sums, axis='index').fillna(0)

# Only the last expression of a cell is rendered, so `row_sums` is evaluated
# here but not shown; `df` is the displayed output.
row_sums
df

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.073984,0.926016
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.008088,0.991912
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.016631,0.983369
...,...,...,...,...,...,...,...,...
11687,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
11688,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000
11689,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.000000
11690,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000


In [28]:
# Seed the module-level RNG used for shuffling so that the partition written
# to disk later can be regenerated exactly on a fresh kernel.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Two nested splits with quick_split's default fraction (0.8): first hold out
# the test part, then carve the validation part out of the remaining rows.
# NOTE(review): the shapes recorded after this cell total 11,889 rows, which
# does not match the table displayed earlier (its last row label is 11690).
# `data` was presumably loaded from another file in a cell that is no longer
# present; confirm the input before re-running.
full_train_df, test_df = quick_split(data)
train_df, val_df = quick_split(full_train_df)

In [29]:
# Report (rows, columns) of each partition, one labelled line per split.
for label, frame in (("Train:", train_df), ("Val:", val_df), ("Test:", test_df)):
    print(label, frame.shape)

Train: (7608, 9)
Val: (1903, 9)
Test: (2378, 9)


In [30]:
# Write each partition to its own tab-separated file, without the index column.
split_outputs = (
    ('../datasets/EvfratovDataset/33_train.tsv', train_df),
    ('../datasets/EvfratovDataset/33_val.tsv', val_df),
    ('../datasets/EvfratovDataset/33_test.tsv', test_df),
)
for out_path, frame in split_outputs:
    frame.to_csv(out_path, sep="\t", index=False)

In [41]:
data = pd.read_csv('../datasets/DeepStarrDataset/train.tsv', sep = "\t")

In [42]:
len(data)

402278

In [55]:
import pyfastx
def get_names_train(name, split, data_dir='../datasets/DeepStarrDataset'):
    """Annotate a split table with FASTA record names, matched by sequence.

    Reads ``{data_dir}/{split}.tsv`` (tab-separated, must contain a
    ``sequence`` column) and the FASTA file ``{data_dir}/{name}``, then adds
    three columns to the table:

    * ``name``  - name of the FASTA record whose sequence equals the row's
      sequence (NaN when no record matches; if several records share a
      sequence, the last one read wins),
    * ``chr``   - the part of ``name`` before the first underscore,
    * ``split`` - the ``split`` argument, repeated on every row.

    Args:
        name: File name of the FASTA file inside ``data_dir``.
        split: Split label; also the stem of the TSV file to read.
        data_dir: Directory holding both files.

    Returns:
        The annotated DataFrame.
    """
    data = pd.read_csv(f'{data_dir}/{split}.tsv', sep="\t")

    # Distinct loop names keep the `name` parameter (the FASTA file name)
    # from being overwritten by record names while iterating.
    seq_to_name = {
        seq: record_name
        for record_name, seq in pyfastx.Fastx(f'{data_dir}/{name}')
    }

    data["name"] = data["sequence"].map(seq_to_name)
    data["chr"] = data["name"].str.split('_').str[0]
    data["split"] = split
    return data

In [56]:
train_all = get_names_train("Sequences_Train.fa", "train")

# Keep only the first half of the training table. The cut point is derived
# from the table length rather than hard-coded; for the 402,278-row train.tsv
# it is row 201,139.
# NOTE(review): the displayed names suggest the first half holds "+" strand
# records and the second half the matching "-" strand records; confirm that
# this ordering is guaranteed by the source files.
train = train_all[:len(train_all) // 2]

In [57]:
train_all

Unnamed: 0,sequence,Dev_log2,Hk_log2,name,chr,split
0,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...,5.711541,1.362522,chr2L_5587_5835_+_positive_peaks,chr2L,train
1,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...,5.153053,1.671419,chr2L_5778_6026_+_positive_peaks,chr2L,train
2,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...,2.537589,0.290201,chr2L_14226_14474_+_positive_peaks,chr2L,train
3,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...,1.608880,4.097828,chr2L_18618_18866_+_positive_peaks,chr2L,train
4,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...,2.767123,0.393657,chr2L_34121_34369_+_positive_peaks,chr2L,train
...,...,...,...,...,...,...
402273,ATGCTTTGAGACAAAACATTGCTAAAAATTAATAAGAAAAATCAAT...,-1.077962,0.168792,chrYHet_310301_310549_-_negative,chrYHet,train
402274,AGTTCCAATGGATTTTTGGGAGCTGCACTTTTTTCTAGCTTCACGT...,-0.618530,1.198438,chrYHet_337801_338049_-_negative,chrYHet,train
402275,TGCTGCTGCCGATATTGCTGTTGTTGCTGCTCCTGGTGTGAGTGTA...,1.229467,-1.708453,chrYHet_341501_341749_-_negative,chrYHet,train
402276,AAACTAATCCATGCTTCTTTCTTTCTTCCAGGGCTAAACGCCAGGG...,1.507001,-1.527881,chrYHet_341601_341849_-_negative,chrYHet,train


In [58]:
train

Unnamed: 0,sequence,Dev_log2,Hk_log2,name,chr,split
0,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...,5.711541,1.362522,chr2L_5587_5835_+_positive_peaks,chr2L,train
1,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...,5.153053,1.671419,chr2L_5778_6026_+_positive_peaks,chr2L,train
2,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...,2.537589,0.290201,chr2L_14226_14474_+_positive_peaks,chr2L,train
3,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...,1.608880,4.097828,chr2L_18618_18866_+_positive_peaks,chr2L,train
4,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...,2.767123,0.393657,chr2L_34121_34369_+_positive_peaks,chr2L,train
...,...,...,...,...,...,...
201134,TTTTTGTATATTTGTATATTTATATTTATTATGACAAAGTGTCACA...,-1.077962,0.168792,chrYHet_310301_310549_+_negative,chrYHet,train
201135,GTTAAGATTGAGCACGTTACCGGTCGCATTATAGTCAGATCCGACT...,-0.618530,1.198438,chrYHet_337801_338049_+_negative,chrYHet,train
201136,CAGTATCGGTCCCTTTGGCGATGCGATGCCATAGGGACCGCCTAGA...,1.229467,-1.708453,chrYHet_341501_341749_+_negative,chrYHet,train
201137,GTAGCCATGCGCAGTCAATCGGCCGGTGACAACAATCTTGCCATTC...,1.507001,-1.527881,chrYHet_341601_341849_+_negative,chrYHet,train


In [59]:
import pyfastx
def get_names_val_test(name, split, data_dir='../datasets/DeepStarrDataset'):
    """Annotate a split table with FASTA record names, matched by position.

    Reads ``{data_dir}/{split}.tsv`` (tab-separated) and the FASTA file
    ``{data_dir}/{name}``, then adds three columns to the table:

    * ``name``  - name of the FASTA record at the same position as the row,
    * ``chr``   - the part of ``name`` before the first underscore,
    * ``split`` - the ``split`` argument, repeated on every row.

    Rows and records are paired purely by order; sequences are not compared.
    NOTE(review): this assumes both files list the same entries in the same
    order - verify against how the files were produced.

    Args:
        name: File name of the FASTA file inside ``data_dir``.
        split: Split label; also the stem of the TSV file to read.
        data_dir: Directory holding both files.

    Returns:
        The annotated DataFrame.

    Raises:
        ValueError: If the FASTA file and the table differ in length.
    """
    data = pd.read_csv(f'{data_dir}/{split}.tsv', sep="\t")

    # Distinct loop names keep the `name` parameter (the FASTA file name)
    # from being overwritten by record names while iterating.
    record_names = [
        record_name for record_name, _ in pyfastx.Fastx(f'{data_dir}/{name}')
    ]

    # Positional pairing is only meaningful when both sources have the same
    # number of entries; fail with an explicit message otherwise.
    if len(record_names) != len(data):
        raise ValueError(
            f"{name} has {len(record_names)} records but {split}.tsv has "
            f"{len(data)} rows; cannot pair them by position"
        )

    data["name"] = record_names
    data["chr"] = [record_name.split("_")[0] for record_name in record_names]
    data["split"] = split
    return data

In [60]:
# Annotate the validation and test tables with record names taken, by
# position, from their FASTA files.
val = get_names_val_test("Sequences_Val.fa", "val")
test = get_names_val_test("Sequences_Test.fa", "test")

In [61]:
# Stack the three annotated tables row-wise; ignore_index=True gives the
# result a fresh 0..n-1 index. Each row keeps its origin in the `split` column.
# NOTE(review): this rebinds `df`, which an earlier cell used for the
# normalised count distributions.
df = pd.concat([train, val, test], axis=0, ignore_index=True)

In [62]:
df

Unnamed: 0,sequence,Dev_log2,Hk_log2,name,chr,split
0,ATTCAGATTGCCTCTCATTGTCTCACCCATATTATGGGAACCAAAT...,5.711541,1.362522,chr2L_5587_5835_+_positive_peaks,chr2L,train
1,AAATGGCCGCTCAAGAAAAGGCTCGAATATATATTGCCTGCCTCTC...,5.153053,1.671419,chr2L_5778_6026_+_positive_peaks,chr2L,train
2,ATAAGGATCAAAAAGTCCTGATTTCCGAAATGGCGGTTCTCCTTCA...,2.537589,0.290201,chr2L_14226_14474_+_positive_peaks,chr2L,train
3,TTTCCATGACTGACTGGAATGGGTGGAGAACATCGCTTTGGGAGTG...,1.608880,4.097828,chr2L_18618_18866_+_positive_peaks,chr2L,train
4,TCTATCGACCCATAGCCGTAGTCGCTAGACCCGCCCTTCGGAGCAT...,2.767123,0.393657,chr2L_34121_34369_+_positive_peaks,chr2L,train
...,...,...,...,...,...,...
282890,CGGGATTGTCTATTTAAGTCACTCAGCTCCCTTGCTATACCCAAGA...,0.104630,-0.644837,chr2R_21141601_21141849_-_negative,chr2R,test
282891,GCACTAGCTGAGTAACAGGTATTTGATCGTTGGGGAACTCTCGTTT...,-1.318970,0.663313,chr2R_21141901_21142149_-_negative,chr2R,test
282892,TGAAAGTGTGTGCGTTCTGTTCTCTGTACTTTTCGGTGTAAAAGTA...,0.681030,-2.151505,chr2R_21142401_21142649_-_negative,chr2R,test
282893,GCGCCGTGTTAAACACAAGTTTTTTGGCGGAATGCCTATTTAATCT...,1.144430,-1.877330,chr2R_21142501_21142749_-_negative,chr2R,test


In [63]:
df.chr.unique()

array(['chr2L', 'chr2LHet', 'chr2RHet', 'chr3L', 'chr3LHet', 'chr3R',
       'chr3RHet', 'chr4', 'chrX', 'chrXHet', 'chrYHet', 'chr2R'],
      dtype=object)

In [64]:
# Persist the combined table as a tab-separated file without the index column.
df.to_csv('../datasets/DeepStarrDataset/all_chr.tsv', sep="\t", index = False) 