In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanNontataPromoters

import importlib
import src.dataset
importlib.reload(src.dataset)
from src.dataset import collate_fn

from src.model import DNA_Classifier

# ---------------------------------------------------------------------------
# Train the DNA promoter classifier.
#
# Steps: pick a device, load the HumanNontataPromoters train/test splits,
# build the model/optimizer/loss, then run NUM_EPOCH passes over the training
# data while reporting the per-sample mean loss and the training accuracy.
# ---------------------------------------------------------------------------

# Seed torch's global RNG so torch-driven randomness (e.g. the DataLoader
# shuffle order) repeats from run to run.
SEED = 42
torch.manual_seed(SEED)

# Tunable settings gathered in one place instead of being scattered as literals.
BATCH_SIZE = 32        # number of sequences fed to the model per optimisation step
LEARNING_RATE = 0.001
SEQ_LENGTH = 251       # sequence length for this dataset, per the original author's note
NUM_EPOCH = 10         # kept small for a quick test; raise for better results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when present, else CPU
print(f"Using device: {device}")

# Load dataset
print("Downloading dataset")
train_dataset = HumanNontataPromoters(split="train", version=0)
# The dataset's "test" split is what this notebook calls the validation set.
val_dataset = HumanNontataPromoters(split="test", version=0)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
# NOTE(review): val_loader is built here but not consumed in this cell; the
# metrics printed below are training metrics only.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

model = DNA_Classifier(seq_length=SEQ_LENGTH).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Binary cross entropy on probabilities: the task is a yes/no classification.
# NOTE(review): BCELoss expects the model to emit values in [0, 1] — confirm
# DNA_Classifier ends in a sigmoid.
criterion = nn.BCELoss()

print("Training is started")

for epochs in range(NUM_EPOCH):
    model.train()
    running_loss = 0.0  # sum of per-sample losses over the epoch
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        # Move the batch to the same device as the model.
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        outputs = model(inputs)  # expected shape per the original note: [batch, 1]
        # Flatten to [batch]. A bare .squeeze() would also drop the batch
        # dimension when a batch holds a single sample, producing a 0-d tensor
        # that no longer matches `labels`; reshape(-1) always keeps one axis.
        probs = outputs.reshape(-1)
        loss = criterion(probs, labels)

        loss.backward()   # backpropagate
        optimizer.step()  # update the weights

        batch_size = labels.size(0)
        # criterion returns the batch MEAN; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        running_loss += loss.item() * batch_size

        predicted = (probs > 0.5).float()  # threshold probabilities into 0/1 predictions
        total += batch_size
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / total  # mean loss per training sample
    epoch_acc = 100 * correct / total
    print(f"Epoch [{epochs+1}/{NUM_EPOCH}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")
print("Training is finished")

Using device: cpu
Downloading dataset
Training is started
Epoch [1/10], Loss: 0.4537, Accuracy: 78.28%
Epoch [2/10], Loss: 0.3699, Accuracy: 83.54%
Epoch [3/10], Loss: 0.3418, Accuracy: 85.06%
Epoch [4/10], Loss: 0.3260, Accuracy: 86.05%
Epoch [5/10], Loss: 0.3127, Accuracy: 86.62%
Epoch [6/10], Loss: 0.3048, Accuracy: 87.05%
Epoch [7/10], Loss: 0.2974, Accuracy: 87.38%
Epoch [8/10], Loss: 0.2910, Accuracy: 87.88%
Epoch [9/10], Loss: 0.2898, Accuracy: 87.72%
Epoch [10/10], Loss: 0.2850, Accuracy: 88.07%
Training is finished


In [12]:
# Persist the trained weights: only the parameter dictionary (state_dict) is
# written, so loading later requires re-creating the model class first.
model_path = "dna_classifier.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)
print(f"Model saved to {model_path}")

Model saved to dna_classifier.pth
