In [None]:
%pip install torch
%pip install pandas
%pip install matplotlib

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader


In [None]:
# Lookup tables for encoding residues and secondary-structure labels.
# The 20 standard amino acids (one-letter codes, alphabetical) take indices 1..20;
# index 0 is reserved for 'X', the padding symbol.
amino_acid_vocab = {residue: index for index, residue in enumerate('ACDEFGHIKLMNPQRSTVWY', start=1)}
amino_acid_vocab['X'] = 0
# Three-state secondary structure classes: C = coil, E = beta-sheet, H = alpha-helix.
sst3_vocab = dict(zip('CEH', range(3)))


# One-hot encoding of a residue string
def one_hot_encode(sequence):
    """Return the one-hot matrix of ``sequence``.

    Args:
        sequence: String of one-letter amino acid codes.

    Returns:
        Integer array of shape ``(len(sequence), len(amino_acid_vocab))`` in
        which row ``i`` holds a single 1 in the column given by
        ``amino_acid_vocab`` for residue ``i``. Residues that are not in the
        vocabulary fall back to column 0.
    """
    length = len(sequence)
    # Resolve every residue to its column first, then set all ones in one step.
    columns = np.fromiter(
        (amino_acid_vocab.get(residue, 0) for residue in sequence),
        dtype=np.intp,
        count=length,
    )
    encoded = np.zeros((length, len(amino_acid_vocab)), dtype=int)
    encoded[np.arange(length), columns] = 1
    return encoded


# Sliding-window feature extraction; windows touching the sequence edges are dropped.
def extract_features_and_labels(sequences, structures, window_size=15):
    """Build one flattened one-hot window per usable residue.

    Each sequence is padded with ``window_size // 2`` 'X' characters on both
    sides, and a window of ``window_size`` residues is taken for every position
    of the original sequence. Any window that contains an 'X' is skipped; this
    removes the windows that overlap the padding and also any window covering a
    literal 'X' in the sequence itself.

    Args:
        sequences: Iterable of amino acid strings.
        structures: Iterable of label strings, paired with ``sequences``
            position by position.
        window_size: Number of residues per window.

    Returns:
        ``(X, y)`` as NumPy arrays. ``X`` has one row per kept window, each row
        being the flattened output of ``one_hot_encode``; ``y`` holds the label
        character of the residue at the window's centre. Both arrays are empty
        and one-dimensional when no window is kept.
    """
    flank = window_size // 2
    pad = 'X' * flank
    features, labels = [], []

    for residues, states in zip(sequences, structures):
        padded = pad + residues + pad
        for center in range(len(residues)):
            window = padded[center:center + window_size]
            if 'X' in window:
                # Window overlaps padding (or an unknown residue): not usable.
                continue
            features.append(one_hot_encode(window).flatten())
            labels.append(states[center])

    return np.array(features), np.array(labels)




In [None]:
# Dataset class to load sequences and labels
class ProteinDataset(Dataset):
    """Windowed protein dataset read from a CSV file.

    The CSV must provide a ``seq`` column (amino acid strings) and an ``sst3``
    column (per-residue label strings). All windows are extracted once at
    construction time and kept in ``self.X`` / ``self.y``.
    """

    def __init__(self, data_file, window_size=15):
        """Read ``data_file`` with pandas and pre-compute every window.

        Args:
            data_file: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            window_size: Number of residues per window.
        """
        self.data = pd.read_csv(data_file)
        self.window_size = window_size
        self.X, self.y = self.prepare_data()

    def __len__(self):
        """Number of extracted windows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for window ``idx``.

        ``features`` is the stored feature row as float32; ``label`` is the
        ``sst3_vocab`` index of the stored label character as a long scalar.
        """
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        class_index = sst3_vocab[self.y[idx]]
        label = torch.tensor(class_index, dtype=torch.long)
        return features, label

    def prepare_data(self):
        """Extract ``(X, y)`` from the ``seq`` and ``sst3`` columns of ``self.data``."""
        seq_column = self.data['seq'].tolist()
        sst3_column = self.data['sst3'].tolist()
        features, labels = extract_features_and_labels(seq_column, sst3_column, self.window_size)
        return features, labels

# Model definition
class ProteinSecondaryStructureModel(nn.Module):
    """Linear embedding -> Transformer encoder -> linear classifier.

    Every input row (one flattened window) is embedded and passed through the
    encoder as a sequence of length 1, so each sample is processed
    independently of the other samples in its mini-batch.

    Args:
        input_dim: Size of one flattened input vector.
        embedding_dim: Encoder model width (``d_model``).
        n_heads: Number of attention heads; must divide ``embedding_dim``.
        num_encoder_layers: Number of stacked encoder layers.
        hidden_dim: Width of the feed-forward sub-layer in each encoder layer.
        output_dim: Number of output classes.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_dim, embedding_dim, n_heads, num_encoder_layers, hidden_dim, output_dim, dropout=0):
        super(ProteinSecondaryStructureModel, self).__init__()

        self.fc1 = nn.Linear(input_dim, embedding_dim)  # Input layer to embedding
        # batch_first=True makes the encoder read its input as (batch, seq, emb).
        # With the default (seq, batch, emb) layout, the (batch, 1, emb) tensor
        # built in forward() would be interpreted as ONE sequence whose tokens
        # are the samples of the mini-batch, letting unrelated samples attend
        # to each other and making predictions depend on batch composition.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.fc2 = nn.Linear(embedding_dim, output_dim)  # Output layer

    def forward(self, src):
        """Compute class logits.

        Args:
            src: Float tensor of shape ``(batch, input_dim)``.

        Returns:
            Tensor of shape ``(batch, 1, output_dim)``.
        """
        embedded = self.fc1(src)           # (batch, embedding_dim)
        embedded = embedded.unsqueeze(1)   # (batch, 1, embedding_dim): length-1 sequence per sample
        encoded = self.transformer_encoder(embedded)
        return self.fc2(encoded)


In [None]:


# Function for training the model
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, output_dim):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    Args:
        model: Module called as ``model(inputs)``; its output is reshaped to
            ``(-1, output_dim)`` before the loss is computed.
        train_loader: Sized iterable of ``(inputs, targets)`` batches.
        test_loader: Iterable handed to ``evaluate_model`` each epoch.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.
        output_dim: Number of classes.

    Returns:
        ``(train_losses, train_accuracies, test_accuracies)``: one entry per
        epoch. The loss is the mean of the per-batch losses; accuracies are
        percentages.
    """
    train_losses, train_accuracies, test_accuracies = [], [], []

    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for batch_inputs, batch_targets in train_loader:
            optimizer.zero_grad()

            # Flatten predictions and targets to (N, output_dim) and (N,).
            logits = model(batch_inputs).view(-1, output_dim)
            labels = batch_targets.view(-1)

            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            # Book-keeping for the epoch statistics.
            loss_sum += batch_loss.item()
            n_correct += (logits.max(dim=1).indices == labels).sum().item()
            n_seen += labels.size(0)

        avg_train_loss = loss_sum / len(train_loader)
        train_accuracy = 100 * n_correct / n_seen
        train_losses.append(avg_train_loss)
        train_accuracies.append(train_accuracy)

        # Held-out accuracy after this epoch; the loss slot of the result is unused.
        _, test_accuracy = evaluate_model(model, test_loader, criterion, output_dim)
        test_accuracies.append(test_accuracy)

        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")
        print(f"Test Accuracy: {test_accuracy:.2f}%")

    return train_losses, train_accuracies, test_accuracies

# Function for evaluating the model
def evaluate_model(model, test_loader, criterion, output_dim):
    """Measure classification accuracy of ``model`` on ``test_loader``.

    The model is switched to evaluation mode and gradients are disabled for
    the duration of the pass.

    Args:
        model: Module called as ``model(inputs)``; its output is reshaped to
            ``(-1, output_dim)``.
        test_loader: Iterable of ``(inputs, targets)`` batches. May be empty.
        criterion: Accepted for signature symmetry with ``train_model``; it is
            not used because no test loss is computed.
        output_dim: Number of classes.

    Returns:
        ``(None, accuracy)`` where ``accuracy`` is a percentage. The first slot
        is always ``None`` (no test loss is tracked). When the loader yields no
        samples the accuracy is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for input_seq, target_sst3 in test_loader:
            # reshape (rather than view) also accepts non-contiguous outputs.
            logits = model(input_seq).reshape(-1, output_dim)
            targets = target_sst3.reshape(-1)

            predicted = logits.argmax(dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

    # An empty loader (e.g. no window survived filtering) has no defined
    # accuracy; report 0.0 so the per-epoch training report keeps working.
    accuracy = 100 * correct / total if total else 0.0
    return None, accuracy  # Returning `None` for loss since we don't track test loss

# Function to plot training and test accuracy
def plot_metrics(train_losses, train_accuracies, test_accuracies):
    """Show the training loss and the train/test accuracy curves side by side.

    Args:
        train_losses: Per-epoch training loss values; its length sets the
            number of epochs on the x-axis.
        train_accuracies: Per-epoch training accuracy in percent.
        test_accuracies: Per-epoch test accuracy in percent.
    """
    epochs = range(1, len(train_losses) + 1)

    _, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: training loss.
    loss_ax.plot(epochs, train_losses, label='Training Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training Loss over Epochs')
    loss_ax.legend()

    # Right panel: training and test accuracy on shared axes.
    for series, series_label in ((train_accuracies, 'Training Accuracy'), (test_accuracies, 'Test Accuracy')):
        accuracy_ax.plot(epochs, series, label=series_label)
    accuracy_ax.set_xlabel('Epochs')
    accuracy_ax.set_ylabel('Accuracy (%)')
    accuracy_ax.set_title('Accuracy over Epochs')
    accuracy_ax.legend()

    plt.show()

In [None]:
# Function to initialize and run the model
def run_experiment(train_file, test_file, num_epochs=10, batch_size=32, input_dim=315, embedding_dim=128, n_heads=8, num_encoder_layers=4, hidden_dim=256, dropout=0.1, window_size=15):
    """Load the data, build the model, train it and plot the learning curves.

    Args:
        train_file: CSV file used to build the training ``ProteinDataset``.
        test_file: CSV file used to build the test ``ProteinDataset``.
        num_epochs: Number of training epochs.
        batch_size: Mini-batch size of both data loaders.
        input_dim: Length of one flattened window. Must equal the feature
            width produced for ``window_size`` (the default 315 matches the
            default window of 15 residues).
        embedding_dim: Encoder model width.
        n_heads: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        hidden_dim: Feed-forward width inside each encoder layer.
        dropout: Dropout probability inside the encoder layers.
        window_size: Residues per window, used for both datasets.

    Raises:
        ValueError: If no training window could be extracted, or if
            ``input_dim`` does not match the width of the extracted features.
    """
    output_dim = len(sst3_vocab)

    # Both datasets must use the same window so their feature widths agree.
    train_dataset = ProteinDataset(train_file, window_size=window_size)
    test_dataset = ProteinDataset(test_file, window_size=window_size)

    # Fail fast with an actionable message instead of a shape error raised
    # deep inside the first forward pass.
    if len(train_dataset) == 0:
        raise ValueError(
            f"No training windows could be extracted from {train_file!r} "
            f"with window_size={window_size}."
        )
    feature_dim = train_dataset.X.shape[1]
    if feature_dim != input_dim:
        raise ValueError(
            f"input_dim={input_dim} does not match the extracted feature width "
            f"{feature_dim} for window_size={window_size}."
        )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = ProteinSecondaryStructureModel(
        input_dim=input_dim,
        embedding_dim=embedding_dim,
        n_heads=n_heads,
        num_encoder_layers=num_encoder_layers,
        hidden_dim=hidden_dim,
        output_dim=output_dim,
        dropout=dropout,
    )

    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Train the model
    print("Starting training...")
    train_losses, train_accuracies, test_accuracies = train_model(
        model, train_loader, test_loader, criterion, optimizer, num_epochs, output_dim
    )

    plot_metrics(train_losses, train_accuracies, test_accuracies)

In [None]:
# Example usage of the run_experiment function

# Input data: CSV files passed to run_experiment.
train_file, test_file = 'training_data.csv', 'test_data.csv'

# Set hyperparameters
num_epochs, batch_size = 50, 32
# One flattened window: 15 residues, each one-hot encoded over the amino acid vocabulary.
input_dim = len(amino_acid_vocab) * 15
embedding_dim, hidden_dim = 128, 256
n_heads, num_encoder_layers = 8, 4
dropout = 0

run_experiment(
    train_file,
    test_file,
    num_epochs=num_epochs, batch_size=batch_size,
    input_dim=input_dim, embedding_dim=embedding_dim,
    n_heads=n_heads, num_encoder_layers=num_encoder_layers,
    hidden_dim=hidden_dim, dropout=dropout,
)