In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device:",device)

from tqdm.notebook import tqdm

Device: cuda


In [2]:
!pip install torchsummaryX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from torchsummaryX import summary

In [4]:
class AdomainDataset(Dataset):
    def __init__(self, filename, padding_token='X', missing_token='-'):
        sequences, labels = self.load_file(filename)
        self.sequences = sequences
        self.labels = labels
        self.padding_token = padding_token
        self.missing_token = missing_token
        self.label_map = self.build_label_map(labels)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        sequence = self.sequences[index]
        label = self.labels[index]
        return sequence, label

    def load_file(self, filename):
        sequences = []
        labels = []
        with open(filename, 'r') as file:
            for line in file:
                sequence, label = line.strip().split('\t')
                sequences.append(sequence)
                labels.append(label)
        return sequences, labels

    def collate_fn(self,batch):
      sequences, labels = zip(*batch)

    # Converting sequences to numerical representation (one-hot encoding)
      char_to_index = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11,
                     'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20, 'X':21}

    
      numerical_labels = [self.label_map[label] for label in labels]

    # Padding sequences to a fixed length and handle missing amino acids
      num_chars = len(char_to_index)
      encoded_sequences = [[char_to_index[char] if char in char_to_index else char_to_index[self.missing_token] for char in sequence] for sequence in sequences]
      padded_sequences = torch.nn.utils.rnn.pad_sequence([torch.tensor(encoded_sequence) for encoded_sequence in encoded_sequences], batch_first=True, padding_value=char_to_index[self.padding_token])
      padded_sequences = torch.nn.functional.one_hot(padded_sequences, num_classes=num_chars)

      numerical_labels = torch.tensor(numerical_labels)

      return padded_sequences, numerical_labels


    def build_label_map(self, labels):
      unique_labels = sorted(set(labels))
      label_map = {label: index for index, label in enumerate(unique_labels)}
      return label_map  

In [17]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Define the file path for the dataset
filename = '/content/a_domains.tsv'

# Create an instance of the AdomainDataset
dataset = AdomainDataset(filename)

## Split the dataset into train, val, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size]
)


train_loader = torch.utils.data.DataLoader(
    dataset     = train_dataset,
    collate_fn= dataset.collate_fn, 
    num_workers = 4,
    batch_size  = 32, 
    pin_memory  = True,
    shuffle     = True
)

val_loader = torch.utils.data.DataLoader(
    dataset     = val_dataset, 
    collate_fn = dataset.collate_fn,
    num_workers = 2,
    batch_size  = 32,
    pin_memory  = True,
    shuffle     = False
)

test_loader = torch.utils.data.DataLoader(
    dataset     = test_dataset,
    collate_fn =  dataset.collate_fn,
    num_workers = 2, 
    batch_size  = 32, 
    pin_memory  = True, 
    shuffle     = False
)


In [22]:
filename = '/content/a_domains.tsv'
dataset = AdomainDataset(filename)

# Iterating through the dataset
for sequence, label in dataset:
    print(sequence, label)

sequence amino_acid
LFTTFDVCYQESSLITAGEHNHYGPSETHVVTTC pro
SWNLFDAFALTTVFMLGGEMNAYGPTESSVMATY phe
LVFAFDASVWDGTLITAGSVNGYGPTESTVCATL phe
YWASFDLTVTSTKLIVGGEFNEYGPTETVVGCMI asn
HWMTFDASVWELQMFCGGEINLYGPTETTIDATY gln
NWKLFDAFVLSTTGTLGGEVNEYGPTESSVVATW tyr
LHTGFDAMTFEGWLIVGGEWNGYGPTENTTFSTC val
VAWAFDVSTGDREAILGADINSYGVTEACIDTSY orn
CWRFFDGFVASAYGTLGGENNEYGPTENSVVTTI leu
LNSGFDAVTFEGFLYVGGEHNIYGPTENTTFSTF ile
VNTSFDGSVFDGFILFGGEIHVYGPTESTVYATY ile
TDISFDLSVYDGNSLLSGDISLGGATEGSIWSIY cys
LWHAFDAFVWEPFLLTGGDVNNYGPTENTVVATS leu
LGLAFDASVKQADMIVGGDTNVYGPTECCVDAAS glu
YAFVFDAFSEEPSCISGGDYNSYGPTEATVCATY lys
ATWAFDVFAGDRESIMGSDINSYGVTEASVDSGY orn
VNTSFDGSVFDGFIFLGGEIHVYGPTESTVYATY ile
LAVAFDASAFEPTLVSAGSINAYGPTETTVCATA phe
YWFSFDLGYTSSKLVLGGEMNHYGPTETTIGSVF asp
YAVSADLGNTTAKIILGGEFNHYGPTETTIGVMV asp
YWASFDLTVTSTKLIVGGEYNEYGPTETVVGCMI asn
LFFAFDASVWEMTLITAGSINAYGPTETSICATI phe
LFFAFDASVWEMTLITAGSINAYGPTETTICATT phe
LFTTFDVCYQESSFITAGEHNHYGPSETHVVTTY pro
LNTGFDALTFEGWLIVGGDWNGYGPTENTTFSTC val
VAWAF

In [21]:
print("Train Data:")
for batch_idx, batch in enumerate(train_loader):
    sequences, labels = batch
    print("Batch Sequences:", sequences.shape)
    print("Batch Labels:", labels.shape)

    
    first_sequence = sequences[0]
    first_label = labels[0]

    print("First Sequence:", first_sequence)
    print("First Label:", first_label)
    break  

Train Data:
Batch Sequences: torch.Size([32, 34, 22])
Batch Labels: torch.Size([32])
First Sequence: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
class ADomainModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size,num_layers, dropout):
        super(ADomainModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size,num_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        input = input.float()
        output, _ = self.lstm(input)
        output = output[:, -1, :]  
        output = self.fc(output)
        return output

In [13]:
input_size = 22  # Number of amino acid types including the missing token and padding token
hidden_size = 256
batch_size = 32
num_layers = 2
dropout = 0.2
output_size = len(dataset.label_map)
model = ADomainModel(input_size, hidden_size, output_size,num_layers, dropout).to(device)
print(model)

ADomainModel(
  (lstm): LSTM(22, 256, num_layers=2, dropout=0.2)
  (fc): Linear(in_features=256, out_features=37, bias=True)
)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the criterion (loss function)
criterion = nn.CrossEntropyLoss()
batch_size = 32

# Define the optimizer
learning_rate = 0.01
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Function for training the model
def train(model, dataloader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in dataloader:
        optimizer.zero_grad()
        #print(inputs.shape)
        inputs = inputs.to(device)
        #inputs = inputs.view(batch_size, -1)
        
        labels = labels.to(device)
        #print(labels.shape)
        outputs = model(inputs)
        #print(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, predicted = torch.max(outputs.data, 1)
        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

    train_loss = running_loss / len(dataloader)
    train_accuracy = correct_predictions / total_predictions

    return train_loss, train_accuracy

# Function for validating the model
def validate(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    val_loss = running_loss / len(dataloader)
    val_accuracy = correct_predictions / total_predictions

    return val_loss, val_accuracy

# Training loop
num_epochs = 500

for epoch in range(num_epochs):
    train_loss, train_accuracy = train(model, train_loader, criterion, optimizer)
    val_loss, val_accuracy = validate(model, val_loader, criterion)

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

Epoch 1/500:
Train Loss: 3.3608, Train Accuracy: 0.1152
Validation Loss: 3.3288, Validation Accuracy: 0.0423
Epoch 2/500:
Train Loss: 3.1278, Train Accuracy: 0.1257
Validation Loss: 3.5327, Validation Accuracy: 0.0423
Epoch 3/500:
Train Loss: 3.0641, Train Accuracy: 0.1361
Validation Loss: 3.3664, Validation Accuracy: 0.0423
Epoch 4/500:
Train Loss: 3.0272, Train Accuracy: 0.1501
Validation Loss: 3.3132, Validation Accuracy: 0.0563
Epoch 5/500:
Train Loss: 2.9335, Train Accuracy: 0.2042
Validation Loss: 2.9800, Validation Accuracy: 0.2113
Epoch 6/500:
Train Loss: 2.8690, Train Accuracy: 0.2496
Validation Loss: 2.8587, Validation Accuracy: 0.2535
Epoch 7/500:
Train Loss: 2.8161, Train Accuracy: 0.2496
Validation Loss: 2.8922, Validation Accuracy: 0.2535
Epoch 8/500:
Train Loss: 2.7634, Train Accuracy: 0.2548
Validation Loss: 2.8365, Validation Accuracy: 0.2535
Epoch 9/500:
Train Loss: 2.7054, Train Accuracy: 0.2688
Validation Loss: 2.9687, Validation Accuracy: 0.2394
Epoch 10/500:
Train

In [18]:
def test(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

    test_loss = running_loss / len(dataloader)
    test_accuracy = correct_predictions / total_predictions

    return test_loss, test_accuracy


# Test loop
test_loss, test_accuracy = test(model, test_loader, criterion)

print("Test Results:")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Results:
Test Loss: 2.8182, Test Accuracy: 0.2877


In [19]:
def test(model, dataloader, criterion):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            inputs, labels = batch
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

            # Append the predicted labels to the predictions list
            predictions.extend(predicted.tolist())

    test_loss = running_loss / len(dataloader)
    test_accuracy = correct_predictions / total_predictions

    return test_loss, test_accuracy, predictions



test_loss, test_accuracy, predictions = test(model, test_loader, criterion)

print("Test Results:")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


print("Predictions:")
print(predictions[:batch_size])


Test Results:
Test Loss: 2.8182, Test Accuracy: 0.2877
Predictions:
[7, 29, 33, 3, 3, 3, 19, 18, 18, 29, 19, 33, 29, 29, 3, 29, 22, 33, 3, 3, 29, 14, 19, 29, 29, 3, 3, 33, 3, 8, 33, 3]
