In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:

def adjust_string(s, length, fill_char="N"):
    """Truncate or right-pad ``s`` so it is ``length`` characters long.

    Args:
        s: Input string.
        length: Target length.
        fill_char: Single character appended on the right when ``s`` is
            not longer than ``length``.

    Returns:
        ``s[:length]`` when ``s`` is longer than ``length``; otherwise ``s``
        left-justified and padded with ``fill_char``.
    """
    too_long = len(s) > length
    # fill_char is only consulted on the padding path.
    return s[:length] if too_long else s.ljust(length, fill_char)


# Four-channel one-hot row for each nucleotide. "U" and "T" share a channel,
# and "N" maps to an all-zero row.
nt_map = {
    "A": [1, 0, 0, 0],
    "U": [0, 1, 0, 0],
    "T": [0, 1, 0, 0],
    "C": [0, 0, 1, 0],
    "G": [0, 0, 0, 1],
    "N": [0, 0, 0, 0]
}


def one_hot_encode_rna(seq):
    """One-hot encode a nucleotide string into a flat 1-D array.

    The sequence is upper-cased first. Each character contributes the four
    values stored for it in ``nt_map``; characters that are not keys of
    ``nt_map`` contribute the all-zero ``"N"`` row.

    Args:
        seq: Nucleotide string (any case).

    Returns:
        ``np.ndarray`` with ``4 * len(seq.upper())`` entries, rows concatenated
        in sequence order.
    """
    zero_row = nt_map["N"]
    flat = [
        bit
        for symbol in seq.upper()
        for bit in nt_map.get(symbol, zero_row)
    ]
    return np.array(flat)


# Integer id for each base; "U" and "T" share id 4. get_tokens emits 0 for
# every character that is not a key here.
base_number_dict = {"A": 1, "G": 2, "C": 3, "U": 4, "T": 4}


def get_tokens(seq):
    """Convert a nucleotide string into a tensor of integer token ids.

    The sequence is upper-cased, then each character is looked up in
    ``base_number_dict``; characters without an entry become ``0``.

    Args:
        seq: Nucleotide string (any case).

    Returns:
        ``torch.LongTensor`` with one id per character of the upper-cased
        sequence.
    """
    ids = [base_number_dict.get(base, 0) for base in seq.upper()]
    return torch.LongTensor(ids)


class PairedDataset(Dataset):
    """Dataset of labelled sequence pairs, one-hot encoded on access.

    Each item is built by truncating/padding the two sequences to fixed
    lengths, concatenating them, and one-hot encoding the result with
    ``one_hot_encode_rna``.

    Args:
        seqs1: Indexable collection of first sequences (strings).
        seqs2: Indexable collection of second sequences (strings).
        labels: Indexable collection of numeric labels, aligned with the
            sequences.
        seq1_len: Fixed length applied to each first sequence (default 30).
        seq2_len: Fixed length applied to each second sequence (default 40).

    Raises:
        ValueError: If the three collections do not have the same length.
    """

    def __init__(self, seqs1, seqs2, labels, seq1_len=30, seq2_len=40):
        # Fail early: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore trailing entries.
        if not (len(seqs1) == len(seqs2) == len(labels)):
            raise ValueError(
                "seqs1, seqs2 and labels must have the same length, got "
                f"{len(seqs1)}, {len(seqs2)} and {len(labels)}"
            )
        self.seqs1 = seqs1
        self.seqs2 = seqs2
        self.labels = labels
        self.seq1_len = seq1_len
        self.seq2_len = seq2_len

    def __len__(self):
        """Return the number of sequence pairs."""
        return len(self.seqs1)

    def __getitem__(self, i):
        """Return ``(features, label)`` for pair ``i`` as float tensors.

        ``features`` has ``4 * (seq1_len + seq2_len)`` entries.
        """
        seq1 = adjust_string(self.seqs1[i], self.seq1_len)
        seq2 = adjust_string(self.seqs2[i], self.seq2_len)
        x = one_hot_encode_rna(seq1 + seq2)
        return torch.as_tensor(x).float(), torch.as_tensor(self.labels[i]).float()

In [None]:

class MLP(nn.Module):
    """Nine-layer fully connected binary classifier.

    Expects 280 input features, i.e. the (30 + 40) nucleotides x 4 channels
    produced by ``PairedDataset``. Every hidden layer is followed by ReLU and
    dropout (p=0.2); the final layer is followed by a sigmoid, so outputs are
    probabilities in [0, 1].
    """

    def __init__(self):
        super().__init__()

        # Attribute names and creation order are kept stable so state_dict keys
        # and seeded initialisation do not change.
        self.linear1 = nn.Linear(280, 400)
        self.linear2 = nn.Linear(400, 350)
        self.linear3 = nn.Linear(350, 300)
        self.linear4 = nn.Linear(300, 150)
        self.linear5 = nn.Linear(150, 100)
        self.linear6 = nn.Linear(100, 100)
        self.linear7 = nn.Linear(100, 50)
        self.linear8 = nn.Linear(50, 50)
        self.linear9 = nn.Linear(50, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Run the network.

        Args:
            x: Float tensor whose last dimension is 280, e.g. ``(batch, 280)``.

        Returns:
            Tensor of probabilities with the trailing singleton dimension
            removed: ``(batch,)`` for batched input, 0-d for a single
            unbatched vector.
        """
        hidden_layers = (
            self.linear1, self.linear2, self.linear3, self.linear4,
            self.linear5, self.linear6, self.linear7, self.linear8,
        )
        for layer in hidden_layers:
            x = self.dropout(self.relu(layer(x)))

        # torch.sigmoid is the same op as F.sigmoid, without the deprecation
        # warning that some PyTorch releases emit for the functional alias.
        x = torch.sigmoid(self.linear9(x))

        # Drop only the feature dimension. A bare squeeze() would also drop the
        # batch dimension for a batch of one, producing a 0-d tensor whose size
        # no longer matches the (1,) target expected by binary_cross_entropy.
        return x.squeeze(-1)


In [None]:

# Run configuration, seeding, and data-loader construction.
#
# Alternative local (non-Colab) configuration, left disabled:
# data_dir = "mirna-data"
# train_file = os.path.join(data_dir, "miRAW/train_seed_1234_40.txt")
# val_file = os.path.join(data_dir, "miRAW/valid_seed_1234_20.txt")
# epochs = 10

# Colab-only: this import and the mount below require the Google Colab runtime.
from google.colab import drive

drive.mount('/content/drive')
# NOTE(review): data_dir is a relative path, so it only reaches the mount point
# above when the working directory is /content — confirm.
data_dir = "drive/MyDrive/mirna-data"
train_file = os.path.join(data_dir, "miRAW/train_seed_1234.txt")
val_file = os.path.join(data_dir, "miRAW/valid_seed_1234.txt")
epochs = 100

# Optimisation hyper-parameters.
batch_size = 32
lr = 0.001
# NOTE(review): filters and kernel_size are not referenced by any cell visible
# here (the model is a plain MLP) — presumably left over; confirm before removing.
filters = 320
kernel_size = 12

# Seed every RNG in use (torch, numpy, random) and ask cuDNN for deterministic
# behaviour so runs are repeatable.
seed = 1234
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Training data: tab-separated file without a header row. Columns 0 and 1 are
# passed as the two sequences and column 2 as the label.
train_df = pd.read_csv(train_file, sep="\t", header=None)
train_dataset = PairedDataset(train_df[0].to_list(), train_df[1].to_list(),
                              train_df[2].to_list())
# Only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Validation data: same file layout as the training file, iterated in file order.
val_df = pd.read_csv(val_file, sep="\t", header=None)
val_dataset = PairedDataset(val_df[0].to_list(), val_df[1].to_list(),
                            val_df[2].to_list())
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size
)


In [None]:
# Instantiate the network directly on the selected device (Module.to returns
# the module itself), then attach an Adam optimizer to its parameters.
model = MLP().to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [None]:

# Train for `epochs` passes over train_loader; after each pass evaluate on
# val_loader and print per-sample mean losses plus validation accuracy.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_seen = 0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)

        # Clear gradients before the backward pass so nothing left over from a
        # previous (possibly interrupted) run is accumulated.
        optimizer.zero_grad()

        # Align predictions with the target shape: a one-sample batch may come
        # back 0-d from the model, which binary_cross_entropy rejects.
        y_pred = model(x).reshape(y.shape)

        batch_loss = F.binary_cross_entropy(y_pred, y)
        batch_loss.backward()
        optimizer.step()

        # batch_loss is a mean over the batch; weight it by the batch size so
        # a smaller final batch is not over-represented.
        n_batch = y.size(0)
        train_loss += batch_loss.item() * n_batch
        train_seen += n_batch
    # Per-sample mean, comparable with val_loss regardless of dataset sizes.
    train_loss /= max(train_seen, 1)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            y_pred = model(x).reshape(y.shape)
            batch_loss = F.binary_cross_entropy(y_pred, y)
            val_loss += batch_loss.item() * y.size(0)

            # Threshold probabilities at 0.5 to obtain hard class predictions.
            predicted = (y_pred > 0.5).float()
            total += y.size(0)
            correct += (predicted == y).sum().item()

    val_loss /= max(total, 1)
    # Report NaN rather than raising ZeroDivisionError on an empty validation set.
    val_acc = correct / total if total else float("nan")

    print(
        f"Epoch: {epoch + 1} -- loss: {train_loss:.4f}, val_loss: {val_loss:.4f}, accuracy: {val_acc:.4f}")
