DEEP LEARNING COURSEWORK
NAME- ANJALI GEDAM
GUID- 2925297G

IMPORT LIBRARIES

In [1]:
import numpy as np 
import pandas as pd 
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import os
import re

FILES PATH

In [2]:
seqs_train_path = '/kaggle/input/deep-learning-for-msc-202324/seqs_train.csv'
labels_train_path = '/kaggle/input/deep-learning-for-msc-202324/labels_train.csv'
train_path = '/kaggle/input/deep-learning-for-msc-202324/train'
seqs_test_path = '/kaggle/input/deep-learning-for-msc-202324/seqs_test.csv'
test_path = '/kaggle/input/deep-learning-for-msc-202324/test'

MAPPING

In [3]:
amino_acid_mapping = {
    'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
    'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19,
    'X': 20, 'B': 21, 'Z': 22, 'J': 23, '-': 24
}

sec_struct_mapping = {'H': 0, 'E': 1, 'C': 2}  

PROTIEN DATASET

In [4]:
class ProteinDataset(Dataset):
    def __init__(self, seq_file, data_dir, label_file=None, normalize='min-max'):
        self.seqs = pd.read_csv(seq_file)
        
        self.data = {}
        for filename in os.listdir(data_dir):
            if filename.endswith(".csv"):
                protein_id = re.split(r'_train|_test', filename)[0]
                self.data[protein_id] = pd.read_csv(os.path.join(data_dir, filename))
        
        if label_file:
            self.labels = pd.read_csv(label_file)
        else:
            self.labels = None
        
        self.amino_acid_map = amino_acid_mapping
        self.normalize_method = normalize
        
    def encode_sequence(self, seq):
        encoded_seq = np.zeros((len(seq), len(self.amino_acid_map)), dtype=int)
        for i, aa in enumerate(seq):
            index = self.amino_acid_map.get(aa, self.amino_acid_map['X'])
            encoded_seq[i, index] = 1
        return encoded_seq
    
    def normalize_data(self, data):
        numeric_cols = data[:, 2:]
        data_numeric = numeric_cols.astype(np.float32)

        if self.normalize_method == 'min-max':
            min_vals = data_numeric.min(axis=0)
            max_vals = data_numeric.max(axis=0)
            data_range = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
            normalized_data = (data_numeric - min_vals) / data_range
        elif self.normalize_method == 'z-score':
            mean_vals = data_numeric.mean(axis=0)
            std_vals = data_numeric.std(axis=0)
            std_vals = np.where(std_vals == 0, 1, std_vals)
            normalized_data = (data_numeric - mean_vals) / std_vals
        else:
            normalized_data = data_numeric

        return normalized_data
    
    def __len__(self):
        return len(self.seqs)
    
    def __getitem__(self, idx):
        protein_id = self.seqs.iloc[idx]['PDB_ID']
        sequence = self.seqs.iloc[idx]['SEQUENCE']
        encoded_sequence = self.encode_sequence(sequence)
        data = self.data[protein_id].values
        normalized_data = self.normalize_data(data)

        if self.labels is not None:
            label_seq = self.labels.iloc[idx]['SEC_STRUCT']
            label_numeric = [sec_struct_mapping[char] for char in label_seq]
            label_tensor = torch.tensor(label_numeric, dtype=torch.long)
            return (
                protein_id,
                torch.tensor(encoded_sequence, dtype=torch.float32),
                torch.tensor(normalized_data, dtype=torch.float32),
                label_tensor
            )

        return (
            protein_id,
            torch.tensor(encoded_sequence, dtype=torch.float32),
            torch.tensor(normalized_data, dtype=torch.float32)
        )

COLLATE FUNCTION WITH AND WITHOUT LABELS

In [5]:
def collate_fn_no(batch):
    ids, seqs, data = zip(*batch)
    seqs_padded = pad_sequence([seq.clone().detach() for seq in seqs], batch_first=True)
    data_padded = torch.tensor(data)
    return ids, seqs_padded, data_padded

def collate_fn(batch):
    _, seqs, data, labels = zip(*batch)
    seqs_padded = pad_sequence([seq.clone().detach() for seq in seqs], batch_first=True)
    data_padded = pad_sequence([d.clone().detach() for d in data], batch_first=True)
    if labels[0] is not None:
        labels_padded = pad_sequence([label.clone().detach() for label in labels], batch_first=True)
    else:
        labels_padded = None
    
    mask = [torch.ones(len(label), dtype=torch.uint8) for label in labels]
    mask_padded = pad_sequence(mask, batch_first=True, padding_value=0)
    
    return seqs_padded, data_padded, labels_padded, mask_padded

FULLY CONVOLUTIONAL NETWORK

In [6]:
class FullyConvolutionalNetwork(nn.Module):
    def __init__(self, num_classes=3, input_channels=20):
        super(FullyConvolutionalNetwork, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.final_conv = nn.Conv1d(in_channels=256, out_channels=num_classes, kernel_size=1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.final_conv(x)
        x = x.transpose(1, 2)
        return x

TRAINING MODEL

In [7]:
def train_model(model, criterion, optimizer, train_loader, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0

        for seqs, data, labels, _ in train_loader:
            inputs = data.permute(0, 2, 1).to(device)
            
            optimizer.zero_grad()

            outputs = model(inputs)
            labels = labels.to(device)
            loss = criterion(outputs.transpose(1, 2), labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs, 2)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.numel()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = correct_preds / total_preds
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

VALIDATION MODEL

In [8]:
def validate_model(model, criterion, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    running_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        for seqs, data, labels, _ in val_loader:
            inputs = data.permute(0, 2, 1).to(device)

            outputs = model(inputs)
            labels = labels.to(device)
            loss = criterion(outputs.transpose(1, 2), labels)
            running_loss += loss.item() * inputs.size(0)

            _, predicted = torch.max(outputs, 2)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.numel()

    val_loss = running_loss / len(val_loader.dataset)
    val_acc = correct_preds / total_preds
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

In [9]:
dataset = ProteinDataset(seq_file=seqs_train_path, data_dir=train_path, label_file=labels_train_path)

train_loader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

model = FullyConvolutionalNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0)

num_epochs = 10

train_model(model, criterion, optimizer, train_loader, num_epochs)

Epoch 1/10, Loss: 0.3962, Accuracy: 0.8416
Epoch 2/10, Loss: 0.3512, Accuracy: 0.8615
Epoch 3/10, Loss: 0.3389, Accuracy: 0.8666
Epoch 4/10, Loss: 0.3316, Accuracy: 0.8696
Epoch 5/10, Loss: 0.3264, Accuracy: 0.8716
Epoch 6/10, Loss: 0.3222, Accuracy: 0.8732
Epoch 7/10, Loss: 0.3187, Accuracy: 0.8746
Epoch 8/10, Loss: 0.3156, Accuracy: 0.8757
Epoch 9/10, Loss: 0.3129, Accuracy: 0.8767
Epoch 10/10, Loss: 0.3104, Accuracy: 0.8776


TESTING MODEL

In [10]:
def test_model(model, test_dataset, output_file='submission.csv'):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    predictions = []
    with torch.no_grad():
     
        for i in range(len(test_dataset)):
            pdb_id, _, data = test_dataset[i]
            input_data = data.unsqueeze(0).permute(0, 2, 1).to(device)

            outputs = model(input_data)
            _, predicted = torch.max(outputs, 2)

          
            seq_len = data.shape[0]
            for j in range(seq_len):
                test_id = f"{pdb_id}_{j + 1}"
                structure_label = ['H', 'E', 'C'][predicted[0, j].item()]
                
                predictions.append([test_id, structure_label])

    pd.DataFrame(predictions, columns=['ID', 'STRUCTURE']).to_csv(output_file, index=False)
    print(f'Submission file saved to {output_file}')

In [11]:
test_dataset = ProteinDataset(seq_file=seqs_test_path, data_dir=test_path)
test_model(model, test_dataset)

Submission file saved to submission.csv
