In [1]:
import pandas as pd
import os
# from torchtext.data import Dataset, BucketIterator
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch

In [2]:
#Modified from https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/more_advanced/image_captioning/get_loader.py
class Vocabulary:
    """Token <-> index lookup tables for nucleotide sequences.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. ``build_vocabulary`` then registers the bases
    ``a``, ``c``, ``g``, ``u`` and ``t`` at indices 4-8.
    """

    def __init__(self, freq_thres=1):
        """Create the tables holding only the special tokens.

        Args:
            freq_thres: Stored as ``freq_threshold``; nothing in this class
                reads it at the moment.
        """
        special_tokens = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.itos = dict(enumerate(special_tokens))
        self.stoi = {token: position for position, token in enumerate(special_tokens)}
        self.freq_threshold = freq_thres

    def __len__(self):
        """Number of entries currently registered (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenizer_seq(fasta_seq):
        """Split a sequence into a list of one-character string tokens."""
        return list(map(str, fasta_seq))

    def build_vocabulary(self, seq_list):
        """Register the fixed nucleotide alphabet after the special tokens.

        ``seq_list`` is accepted but not consulted: the alphabet is fixed to
        ``acgut`` instead of being derived from token frequencies.
        """
        first_base_index = 4  # slots 0-3 belong to the special tokens
        for position, base in enumerate("acgut", start=first_base_index):
            self.stoi[base] = position
            self.itos[position] = base

    def numericalize(self, fasta_seq):
        """Convert a sequence to a list of indices.

        The sequence is lower-cased before lookup; any token missing from the
        vocabulary maps to the ``<UNK>`` index.
        """
        unknown_index = self.stoi["<UNK>"]
        tokens = self.tokenizer_seq(fasta_seq.lower())
        return [self.stoi.get(token, unknown_index) for token in tokens]

In [3]:
class SequenceDataset(Dataset):
    """Map-style dataset of (miRNA, mRNA, relative score) samples read from a CSV.

    The CSV is read without a header row and its columns are addressed by
    position: column 2 is used as the miRNA sequence, column 3 as the target
    mRNA sequence and the last column as the relative score.

    Args:
        filename: Path of the CSV file, passed to ``pandas.read_csv``.
        freq_threshold: Forwarded to the ``Vocabulary`` constructor.
    """

    def __init__(self, filename, freq_threshold=5):
        self.df = pd.read_csv(filename, header=None)

        # Get Sequences (miRNA and Target mRNA)
        # Dataset Column Positions - miRNA, mRNA, miRNA_Seq, mRNA_Seq, Relative_score
        self.mirna = self.df.iloc[:, 2]
        self.mrna = self.df.iloc[:, 3]
        self.rel_score = self.df.iloc[:, -1]

        # Element-wise join of each row's two sequences, so that one shared
        # vocabulary is built over both miRNA and mRNA.
        all_seq = self.mirna + self.mrna

        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(all_seq.tolist())

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def numericalize_seq(self, seq):
        """Encode ``seq`` as a list of indices wrapped in ``<SOS>`` ... ``<EOS>``."""
        numericalized_seq = [self.vocab.stoi["<SOS>"]]
        numericalized_seq += self.vocab.numericalize(seq)
        numericalized_seq.append(self.vocab.stoi["<EOS>"])
        return numericalized_seq

    def get_vocabulary(self):
        """Return the token -> index mapping of the dataset's vocabulary."""
        return self.vocab.stoi

    def __getitem__(self, index):
        """Return sample ``index`` as a ``(mirna, mrna, score)`` tuple of tensors.

        Rows are looked up by position (``.iloc``) rather than by index label,
        so negative indices count from the end and an out-of-range index
        raises ``IndexError`` instead of ``KeyError``.
        """
        mirna = torch.tensor(self.numericalize_seq(self.mirna.iloc[index]))
        mrna = torch.tensor(self.numericalize_seq(self.mrna.iloc[index]))
        score = torch.tensor(self.rel_score.iloc[index])
        return mirna, mrna, score


In [4]:
class MyCollate:
    """Collate callable that pads variable-length sequence pairs into a batch.

    Each batch item is indexed as ``item[0]`` (miRNA tensor), ``item[1]``
    (mRNA tensor) and ``item[2]`` (score).
    """

    def __init__(self, pad_idx):
        # Value used to fill the tail of sequences shorter than the batch max.
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Right-pad ``sequences`` to equal length, batch dimension first."""
        return pad_sequence(sequences, batch_first=True, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Return ``(padded_mirna, padded_mrna, scores)`` for ``batch``.

        The two sequence groups are padded independently, so their padded
        lengths may differ. Scores are returned as a plain list of the
        per-item values; they are not stacked into a tensor.
        """
        mirna_seqs = [sample[0] for sample in batch]
        mrna_seqs = [sample[1] for sample in batch]
        padded = [self._pad(seqs) for seqs in (mirna_seqs, mrna_seqs)]
        scores = [sample[2] for sample in batch]
        return padded[0], padded[1], scores

In [5]:
# Returns a ready Loader and the Dataset Class for the Sequence
def get_loader(
    seq_csv,
    batch_size=5,
    num_workers=8,
    shuffle=True,
    pin_memory=True
):
    """Build a padded-batch ``DataLoader`` over the sequences in ``seq_csv``.

    Args:
        seq_csv: CSV path passed to ``SequenceDataset``.
        batch_size, num_workers, shuffle, pin_memory: Passed through to
            ``DataLoader`` unchanged.

    Returns:
        ``(loader, dataset)``: the loader and the ``SequenceDataset`` behind it.
    """
    sequence_data = SequenceDataset(filename=seq_csv)

    # Pad batches with the vocabulary's own <PAD> index.
    collate = MyCollate(pad_idx=sequence_data.vocab.stoi["<PAD>"])

    loader_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
    )
    batch_loader = DataLoader(sequence_data, collate_fn=collate, **loader_options)

    return batch_loader, sequence_data

In [6]:
# Build the loader and its backing dataset with get_loader's default settings.
loader, dataset = get_loader(seq_csv="dogtest.csv")

In [9]:
# Smoke-test the loader: print the first miRNA/mRNA pair of batches 0 and 1.
for idx, batch in enumerate(loader):
    mirna, mrna, score = batch
    if idx > 1:
        break
    print(idx, mirna[0], "targets", mrna[0])
# data = next(iter(loader))

0 tensor([1, 4, 7, 4, 7, 4, 5, 4, 6, 6, 6, 6, 6, 4, 6, 4, 5, 7, 5, 7, 7, 4, 7, 2]) targets tensor([1, 8, 6,  ..., 0, 0, 0])
1 tensor([1, 4, 7, 4, 7, 4, 5, 4, 6, 6, 6, 6, 6, 4, 6, 4, 5, 7, 5, 7, 7, 4, 7, 2]) targets tensor([1, 6, 6,  ..., 0, 0, 0])


In [None]:
# Fetch one batch inside this cell so it runs on a fresh kernel; `data` is not
# assigned by any earlier cell. Element 1 of the batch is the padded mRNA tensor.
data = next(iter(loader))
data[1].size()