In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/CE7412-Project-main

/content/drive/MyDrive/CE7412-Project-main


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
import pandas as pd

In [5]:
import os

In [6]:
batch_size = 4
device = 'cuda:0'

In [7]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Conv + LSTM 

In [8]:
class ConvLSTM(nn.Module):
    def __init__(self, n_features, n_hidden, seq_len, n_layers):
        super(ConvLSTM, self).__init__()
        self.n_hidden = n_hidden
        self.seq_len = seq_len
        self.n_layers = n_layers
        self.c1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size = 2, stride = 1) # Add a 1D CNN layer
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers
        )
        self.linear = nn.Linear(in_features=n_hidden, out_features=1)
    def reset_hidden_state(self):
        self.hidden = (
            torch.zeros(self.n_layers, self.seq_len-1, self.n_hidden),
            torch.zeros(self.n_layers, self.seq_len-1, self.n_hidden)
        )
    def forward(self, sequences):
        sequences = self.c1(sequences.view(len(sequences), 1, -1))
        self.hidden = self.reset_hidden_state()
        lstm_out, self.hidden = self.lstm(
            sequences.view(len(sequences), self.seq_len-1, -1),
            self.hidden
        )
        last_time_step = lstm_out.view(self.seq_len-1, len(sequences), self.n_hidden)[-1]
        y_pred = self.linear(last_time_step)
        outputs.squeeze(1)
        return y_pred

# Read Data

In [9]:
import pickle
import numpy as np

In [11]:
#train data
virus_data = []
for idx, file in enumerate(os.listdir('data/training/')):
    array = pd.read_pickle(os.path.join('data/training',file))
    for i in array:
        virus_data.append(np.array([idx,np.array(i)]))

#val data

# virus_data_val = []
# for idx, file in enumerate(os.listdir('data/validation/')):
#     array = pd.read_pickle(os.path.join('data/validation',file))
#     for i in array:
#         virus_data_val.append([idx,i])

# encodings_ = []
# labels_ = []
# for label, fasta_filename in enumerate(os.listdir('data/training/')):
#     with open(os.path.join('data/training',fasta_filename), 'rb') as pkfile:
#         file_encodings = pickle.load(pkfile)
#         encodings_.extend([np.array(enc) for enc in file_encodings])  # Convert nested lists to NumPy arrays
#         labels_.extend([label] * len(file_encodings))
#         del file_encodings

# encodings = np.array(encodings_, dtype=object)
# labels = np.array(labels_)
# del encodings_
# del labels_
# # Find the maximum sequence length
# max_seq_len = max([enc.shape[0] for enc in encodings])

# # Pad encodings with zeros to create tensors
# encodings_padded = []
# for enc in encodings:
#     pad_rows = max_seq_len - enc.shape[0]
#     enc_padded = np.pad(enc, (0, pad_rows), mode='constant', constant_values=0)
#     encodings_padded.append(enc_padded)
# del encodings
# encodings = np.stack(encodings_padded)
# del encodings_padded

  virus_data.append(np.array([idx,np.array(i)]))
  virus_data.append(np.array([idx,np.array(i)]))
  virus_data.append(np.array([idx,np.array(i)]))
  virus_data.append(np.array([idx,np.array(i)]))
  virus_data.append(np.array([idx,np.array(i)]))
  virus_data.append(np.array([idx,np.array(i)]))


In [21]:
virus_data = np.array(virus_data)

In [None]:
max_seq_len = max([enc.shape[0] for enc in virus_data[...,1]])

# Pad encodings with zeros to create tensors
encodings_padded = []
for enc in  virus_data[...,1]:
    pad_rows = max_seq_len - enc.shape[0]
    enc_padded = np.pad(enc, (0, pad_rows), mode='constant', constant_values=0)
    encodings_padded.append(enc_padded)


In [None]:
def normalize_encodings(encodings):
    mean = np.mean(encodings)
    std = np.std(encodings)
    return (encodings - mean) / std

encodings = normalize_encodings(encodings_padded)

# Dataset

In [13]:
class VirusDataset(Dataset):
    def __init__(self,virus_data):
        self.data = [i[1] for i in virus_data]
        self.label = [i[0] for i in virus_data]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return torch.Tensor(self.data[idx]),self.label[idx]

In [14]:
train_dataset = VirusDataset(virus_data)
# val_dataset = VirusDataset(virus_data_val)

In [15]:
train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size,shuffle=True)

# Training

In [20]:
model = ConvLSTM(n_features = 1 , n_hidden = 64, seq_len = max_len, n_layers = 2)

In [21]:
model.to(device)

ConvLSTM(
  (c1): Conv1d(1, 1, kernel_size=(2,), stride=(1,))
  (lstm): LSTM(1, 64, num_layers=2)
  (linear): Linear(in_features=64, out_features=1, bias=True)
)

In [22]:
learning_rate = 0.00001
num_epochs = 20
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
def train_epoch(model, dataloader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, targets in dataloader:
        # Move tensors to the correct device
        inputs = inputs.to(device)
        targets = targets.type(torch.FloatTensor).to(device)

        # Forward pass
        outputs = model(inputs)
        
        # print(outputs.squeeze().shape)
        # print(targets.shape)
        loss = criterion(outputs, targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Calculate the number of correct predictions
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == targets).sum().item()
        total_predictions += targets.size(0)

    average_loss = running_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions

    return average_loss, accuracy

In [36]:
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_epoch(model, train_loader, criterion, optimizer)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {train_loss:.4f}")

IndexError: ignored

In [29]:
for input,label in train_loader:
  input = input.to(device)

RuntimeError: ignored

In [18]:
a = torch.rand((4, 30483))

In [19]:
a = a.to(device)

In [34]:
input.shape

torch.Size([4, 30483])