In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

from tec import EmbeddingTransform, ContactCNN, ModelInteraction
from tec import PairedDataset
from tec import collate_paired_sequences

In [None]:
# data_dir = "mirna-data"
# train_file = os.path.join(data_dir, "miRAW/train_seed_1234_40.txt")
# val_file = os.path.join(data_dir, "miRAW/valid_seed_1234_20.txt")
# epochs = 10

from google.colab import drive

drive.mount('/content/drive')
data_dir = "drive/MyDrive/mirna-data"
train_file = os.path.join(data_dir, "miRAW/train_seed_1234.txt")
val_file = os.path.join(data_dir, "miRAW/valid_seed_1234.txt")
epochs = 50

# Embedding model
input_dim = 256  # dimension of bert (per base of RNA)
projection_dim = 128  # dimension of embedding projection layer
dropout_p = 0.2  # parameter p for embedding dropout layer
nhead = 2  # number of heads for Transformer Encoder
num_layers = 2  # number of layers for Transformer Encoder

# Training
seed = 1234
batch_size = 32
weight_decay = 0
lr = 0.001

device = "cuda" if torch.cuda.is_available() else "cpu"

save_prefix = True
out_dir = "result"  # output file path

In [None]:
# set the random seed
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = True

In [None]:

train_df = pd.read_csv(train_file, sep="\t", header=None)
train_dataset = PairedDataset(train_df[0].to_list(), train_df[1].to_list(),
                              train_df[2].to_list())
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=collate_paired_sequences,
    shuffle=True,
)

val_df = pd.read_csv(val_file, sep="\t", header=None)
val_dataset = PairedDataset(val_df[0].to_list(), val_df[1].to_list(),
                            val_df[2].to_list())
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_paired_sequences,
    shuffle=False,
)

In [None]:

embedding_transform = EmbeddingTransform(
    input_dim, projection_dim, dropout=dropout_p,
    nhead=nhead, num_layers=num_layers
)

# Create contact model
contact_model = ContactCNN(projection_dim)

# Create the full model
model = ModelInteraction(embedding_transform, contact_model)

model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)


In [None]:
from sklearn.metrics import accuracy_score

losses = []
val_losses = []

for epoch in range(epochs):
    batch_losses = []
    for x1, x2, y in train_loader:
        model.train()
        x1 = x1.to(device)
        x2 = x2.to(device)
        y = y.to(device)

        yhat = model(x1, x2)
        batch_loss = F.binary_cross_entropy(yhat, y)
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        batch_losses.append(batch_loss.item())

    loss = np.mean(batch_losses)
    losses.append(loss)

    model.eval()
    with torch.no_grad():
        batch_losses = []
        n = 0
        accuracy_average = 0
        for x1, x2, y in val_loader:
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            yhat = model(x1, x2)
            batch_loss = F.binary_cross_entropy(yhat, y)
            batch_losses.append(batch_loss.item())
            accuracy = accuracy_score(y.cpu(), (yhat.cpu() > 0.5).int())
            accuracy_average = (accuracy_average * n + accuracy * x1.shape[0]) / (n + x1.shape[0])
            n = n + x1.shape[0]

        val_loss = np.mean(batch_losses)
        val_losses.append(val_loss)

    print(
        f"Epoch: {epoch + 1} -- loss: {loss}, val_loss: {val_loss}, accuracy: {accuracy_average:.4f}")