In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:

def adjust_string(s, length, fill_char="N"):
    """Force ``s`` to be exactly ``length`` characters long.

    A string longer than ``length`` is cut off on the right; any other string
    is left-justified and padded on the right with ``fill_char``.
    """
    too_long = len(s) > length
    return s[:length] if too_long else s.ljust(length, fill_char)


# Integer code for each recognised nucleotide letter. "U" and "T" share a
# code; 0 is used by get_tokens for every character not listed here.
base_number_dict = {"A": 1, "G": 2, "C": 3, "U": 4, "T": 4}


def get_tokens(seq):
    """Encode a sequence string as a 1-D ``torch.LongTensor`` of integer codes.

    The string is upper-cased before lookup, so the encoding is
    case-insensitive. Characters absent from ``base_number_dict`` map to 0.
    """
    codes = [base_number_dict.get(letter, 0) for letter in seq.upper()]
    return torch.LongTensor(codes)


class PairedDataset(Dataset):
    """Dataset of (first sequence, second sequence, label) triples.

    Each item is built by padding/truncating the two sequences to fixed
    lengths with ``adjust_string``, concatenating them, and encoding the
    result with ``get_tokens``. The label is returned as a float tensor.

    Args:
        seqs1: Indexable collection of first sequences (strings).
        seqs2: Indexable collection of second sequences (strings).
        labels: Indexable collection of labels, one per sequence pair.
        seq1_len: Fixed length applied to every first sequence (default 30).
        seq2_len: Fixed length applied to every second sequence (default 40).

    Raises:
        ValueError: If the three collections do not have the same length.
    """

    def __init__(self, seqs1, seqs2, labels, seq1_len=30, seq2_len=40):
        # Fail fast: a length mismatch would otherwise either silently drop
        # rows or surface as an IndexError in the middle of an epoch.
        if not (len(seqs1) == len(seqs2) == len(labels)):
            raise ValueError(
                "seqs1, seqs2 and labels must have the same length, got "
                f"{len(seqs1)}, {len(seqs2)} and {len(labels)}")
        self.seqs1 = seqs1
        self.seqs2 = seqs2
        self.labels = labels
        self.seq1_len = seq1_len
        self.seq2_len = seq2_len

    def __len__(self):
        """Return the number of sequence pairs."""
        return len(self.seqs1)

    def __getitem__(self, i):
        """Return ``(tokens, label)`` for the pair at index ``i``.

        ``tokens`` has ``seq1_len + seq2_len`` entries; ``label`` is a float
        tensor built from ``labels[i]``.
        """
        seq1 = adjust_string(self.seqs1[i], self.seq1_len)
        seq2 = adjust_string(self.seqs2[i], self.seq2_len)
        # get_tokens already returns a tensor, so no extra conversion is needed.
        x = get_tokens(seq1 + seq2)
        y = torch.as_tensor(self.labels[i]).float()
        return x, y

In [None]:

class CNNBiRNN(nn.Module):
    """Embedding -> Conv1d -> max-pool -> bidirectional LSTM -> dense head.

    The network maps a batch of integer token sequences to one sigmoid
    probability per sequence.

    Args:
        vocab_size: Number of distinct token ids accepted by the embedding.
        embed_dim: Size of each embedding vector (Conv1d input channels).
        filters: Number of Conv1d output channels (LSTM input size).
        kernel_size: Width of the Conv1d kernel.
        lstm_units: Hidden size of each LSTM direction.
        dense_units: Width of the hidden dense layer.
        dropout: Drop probability shared by every dropout application.
    """

    def __init__(self, vocab_size=5, embed_dim=5, filters=320, kernel_size=12,
                 lstm_units=32, dense_units=16, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1d = nn.Conv1d(embed_dim, filters, kernel_size)
        self.dropout = nn.Dropout(dropout)
        self.pooling = nn.MaxPool1d(2)
        self.lstm = nn.LSTM(filters, lstm_units, batch_first=True,
                            bidirectional=True)
        self.dense1 = nn.Linear(lstm_units * 2, dense_units)
        self.dense2 = nn.Linear(dense_units, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input sequence.

        Args:
            x: Integer token ids of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size,) with values in [0, 1].
        """
        x = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch_size, embed_dim, seq_len)
        x = self.relu(self.conv1d(x))
        x = self.dropout(x)
        x = self.pooling(x)
        x = self.dropout(x)
        x = x.permute(0, 2, 1)  # (batch_size, new_seq_len, filters)
        # Summarise the sequence with the final hidden state of each
        # direction. Slicing the output at the last time step would give the
        # reverse direction a state that has only seen a single step.
        _, (h_n, _) = self.lstm(x)
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (batch_size, 2 * lstm_units)
        x = self.dropout(x)
        x = self.relu(self.dense1(x))
        x = self.dropout(x)
        x = self.sigmoid(self.dense2(x))  # (batch_size, 1)
        # Drop only the trailing feature axis; a bare squeeze() would also
        # remove the batch axis when the batch holds a single sample.
        return x.squeeze(-1)

In [None]:

# Alternative local configuration, currently disabled:
# data_dir = "mirna-data"
# train_file = os.path.join(data_dir, "miRAW/train_seed_1234_40.txt")
# val_file = os.path.join(data_dir, "miRAW/valid_seed_1234_20.txt")
# epochs = 10

# Colab-only import, kept next to its single use.
from google.colab import drive

# Derive the data directory from the mount point so the paths stay valid
# regardless of the notebook's current working directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
data_dir = os.path.join(drive_mount_point, "MyDrive/mirna-data")
train_file = os.path.join(data_dir, "miRAW/train_seed_1234.txt")
val_file = os.path.join(data_dir, "miRAW/valid_seed_1234.txt")
epochs = 100

# Reproducibility: seed the stdlib, NumPy and torch RNGs with one shared
# value, and ask cuDNN for deterministic behaviour with benchmarking off.
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Optimisation settings.
batch_size, lr = 32, 0.001
# Convolution settings passed to the model constructor.
filters, kernel_size = 320, 12
# NOTE(review): no visible cell reads this variable.
dropout = 0.2

# Run on the GPU whenever torch reports one as available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Both files are read as header-less, tab-separated tables. Columns 0 and 1
# supply the two sequences of each pair and column 2 supplies the label.
train_df = pd.read_csv(train_file, sep="\t", header=None)
train_dataset = PairedDataset(
    seqs1=train_df[0].to_list(),
    seqs2=train_df[1].to_list(),
    labels=train_df[2].to_list(),
)
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_df = pd.read_csv(val_file, sep="\t", header=None)
val_dataset = PairedDataset(
    seqs1=val_df[0].to_list(),
    seqs2=val_df[1].to_list(),
    labels=val_df[2].to_list(),
)
# Validation batches keep the file order.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Build the network from the notebook-level convolution settings and place it
# on the selected device (Module.to returns the module itself).
model = CNNBiRNN(
    vocab_size=5,
    embed_dim=5,
    filters=filters,
    kernel_size=kernel_size,
    lstm_units=32,
    dense_units=16,
).to(device)

# Adam over every model parameter at the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [None]:

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    train_seen = 0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        # Clear gradients before the forward/backward pass so that gradients
        # left behind by an interrupted earlier run of this cell cannot leak
        # into the first optimizer step.
        optimizer.zero_grad()
        # reshape(-1) restores the batch axis if a single-sample batch comes
        # back as a 0-d tensor, keeping predictions and targets aligned.
        y_pred = model(x).reshape(-1)
        batch_loss = F.binary_cross_entropy(y_pred, y)
        batch_loss.backward()
        optimizer.step()
        # binary_cross_entropy returns the batch mean; weight it by the batch
        # size so the epoch figure is a true per-sample mean.
        train_loss += batch_loss.item() * y.size(0)
        train_seen += y.size(0)
    train_loss = train_loss / train_seen if train_seen else float("nan")

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            y_pred = model(x).reshape(-1)
            batch_loss = F.binary_cross_entropy(y_pred, y)
            val_loss += batch_loss.item() * y.size(0)

            # Threshold the probabilities at 0.5 to obtain hard predictions.
            predicted = (y_pred > 0.5).float()
            total += y.size(0)
            correct += (predicted == y).sum().item()

    # An empty validation loader yields NaN instead of a ZeroDivisionError.
    val_loss = val_loss / total if total else float("nan")
    val_acc = correct / total if total else float("nan")

    print(
        f"Epoch: {epoch + 1} -- loss: {train_loss:.4f}, val_loss: {val_loss:.4f}, accuracy: {val_acc:.4f}")
