<a href="https://colab.research.google.com/github/anushahulbatte/deeplearning/blob/main/Debugging_for_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

# Make sure the notebook runs on the exact PyTorch build it was written for.
# The comparison is an exact string match, so any other version or CUDA
# suffix triggers the install branch.
if torch.__version__ != '2.5.1+cu124':
    # Installs the matching torch / torchvision / torchaudio builds from the
    # cu124 wheel index.
    # NOTE(review): torch was already imported above, so the running kernel
    # presumably keeps the old version until it is restarted — confirm.
    !pip install torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124 -U --quiet
    print("PyTorch version updated to 2.5.1.")
else:
    print("PyTorch is already at the correct version (2.5.1).")

PyTorch is already at the correct version (2.5.1).


In [2]:
# Extra packages for this notebook. Only d2l is pinned (1.0.3); scipy and
# torchmetrics are installed at whatever version pip resolves.
!pip install d2l==1.0.3 --quiet
!pip install scipy --quiet
!pip install torchmetrics --quiet

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import RandomSampler, random_split

from tqdm.auto import tqdm, trange
from matplotlib import pyplot as plt

# Choose the compute device by availability instead of hard-coding "mps":
# CUDA first (the notebook installs cu124 wheels), then Apple MPS, and finally
# the CPU so the notebook still runs on machines without an accelerator.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

In [4]:
import random
import os
import matplotlib.pyplot as plt
import csv
import numpy as np


In [5]:
import os
# Fetch the project data repository into the current working directory and
# then make the cloned directory the working directory.
# NOTE(review): after the chdir, re-running this cell in the same kernel would
# execute from inside the cloned directory — consider guarding the clone/chdir.
! git clone https://git.wur.nl/bioinformatics/grs34806-deep-learning-project-data.git
os.chdir("grs34806-deep-learning-project-data")

fatal: destination path 'grs34806-deep-learning-project-data' already exists and is not an empty directory.


Define a function that reads the simulated data and returns two parallel lists: the sequences, and a list of binary labels (1 if the sequence identifier is listed in the positives file, 0 otherwise).

In [6]:
def read(seqfile, posfile):
  """
  Read sequences and derive a binary label for each of them.

  Every non-empty line of `seqfile` that starts with "seq" is split on
  whitespace: the first field is the sequence identifier, the second field is
  the sequence. A sequence is labelled 1 when its identifier appears as a
  (stripped, non-empty) line of `posfile`, and 0 otherwise.

  Args:
    seqfile: path to the file holding "<identifier> <sequence>" lines.
    posfile: path to the file listing one positive identifier per line.

  Returns:
    (datalist, labellist): the sequences and their 0/1 labels, index-aligned.

  Raises:
    IndexError: if a line starting with "seq" has no second field.
  """
  seq_ids = []
  datalist = []
  with open(seqfile, 'r') as f:
    for raw_line in f:
      line = raw_line.strip()
      # Blank lines and lines that are not sequence records are skipped.
      if not line.startswith("seq"):
        continue
      parts = line.split()
      seq_ids.append(parts[0])
      datalist.append(parts[1])

  # A set gives constant-time membership tests; the previous list lookup made
  # the labelling step quadratic in the number of sequences.
  with open(posfile, 'r') as f:
    annotated_ids = {line.strip() for line in f if line.strip()}

  labellist = [1 if seq_id in annotated_ids else 0 for seq_id in seq_ids]
  assert len(datalist) == len(labellist) == len(seq_ids), "Lengths of lists don't match"

  # Short summary plus a preview of the first few records as a sanity check.
  print("Length of the datalist: ", len(datalist))
  print("Length of labellist: ", len(labellist))
  print("The number of positives in labellist: ", sum(labellist))
  for i in range(min(5, len(datalist))):
    print(f"ID: {seq_ids[i]}, Has annotation: {labellist[i]}, Sequence length: {len(datalist[i])}")

  return datalist, labellist


In [7]:
# Load the simulated data set (sequences of length 100-200, n = 1000 per the
# file name) together with its list of positive identifiers.
datalist, labellist = read(
    seqfile="/content/grs34806-deep-learning-project-data/len100_200_n1000.seq",
    posfile="/content/grs34806-deep-learning-project-data/len100_200_n1000.pos",
)

Length of the datalist:  1000
Length of labellist:  1000
The number of positives in labellist:  507
ID: seq1, Has annotation: 1, Sequence length: 113
ID: seq2, Has annotation: 0, Sequence length: 166
ID: seq3, Has annotation: 0, Sequence length: 197
ID: seq4, Has annotation: 1, Sequence length: 150
ID: seq5, Has annotation: 0, Sequence length: 134


Split the sequence and label lists into training and test sets using the `train_test_split` function from scikit-learn.

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
def generate_train_test(datalist, labellist, test_size=0.2, random_state=42):
  """
  Split sequences and labels into aligned training and test subsets.

  Args:
    datalist: the sequences.
    labellist: the labels, index-aligned with `datalist`.
    test_size: forwarded to `train_test_split`; defaults to the previously
      hard-coded 0.2.
    random_state: forwarded to `train_test_split` as the shuffling seed;
      defaults to the previously hard-coded 42.

  Returns:
    (traindatalist, testdatalist, trainlabellist, testlabellist)
  """
  traindatalist, testdatalist, trainlabellist, testlabellist = train_test_split(
      datalist, labellist, test_size=test_size, random_state=random_state
  )
  return traindatalist, testdatalist, trainlabellist, testlabellist

In [10]:
# Perform the split, then report how many items ended up in each part.
traindatalist, testdatalist, trainlabellist, testlabellist = generate_train_test(datalist, labellist)
for _caption, _items in (
    ("Training data size:", traindatalist),
    ("Training labels size:", trainlabellist),
    ("Test data size:", testdatalist),
    ("Test labels size:", testlabellist),
):
    print(_caption, len(_items))

Training data size: 800
Training labels size: 800
Test data size: 200
Test labels size: 200


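Define the provided helper functions for tokenizing the sequences, truncating/padding them to a fixed length, and converting them to tensors.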

In [11]:
def tokenize(dat, map2num, non_aa_num=20):
  """
  Translate amino-acid sequences into lists of integer tokens.

  Each character of every sequence in `dat` is looked up in the `map2num`
  dictionary; characters that are not in the mapping are encoded as
  `non_aa_num` (20 by default). The result holds one list of integers per
  input sequence, in the same order.
  """
  return [
      [map2num.get(residue, non_aa_num) for residue in sequence]
      for sequence in dat
  ]

In [12]:
def build_seq_array(lines, num_steps, non_aa_num=20):
  """
  Bring every tokenized sequence to length `num_steps` and stack the result.

  Each entry of `lines` is passed through truncate_pad, with `non_aa_num`
  used as the padding token, and the equal-length rows are then turned into
  a single tensor.
  """
  fixed_length_rows = []
  for tokens in lines:
    fixed_length_rows.append(truncate_pad(tokens, num_steps, non_aa_num))
  return torch.tensor(fixed_length_rows)

In [13]:
def truncate_pad(line, num_steps, padding_token):
  """
  Return `line` (a tokenized sequence) cut or padded to `num_steps` items.

  Sequences longer than `num_steps` keep only their first `num_steps`
  tokens; shorter ones are extended on the right with `padding_token`.
  """
  missing = num_steps - len(line)
  if missing < 0:
    # Too long: keep only the leading num_steps tokens.
    return line[:num_steps]
  # Short enough: append as many padding tokens as are missing (maybe none).
  return line + [padding_token] * missing

In [14]:
from d2l import torch as d2l

The `load_data` function below also converts the tokenized sequences into a one-hot encoded representation so that they can be fed as input to the CNN.

In [17]:
def load_data(batch_size, num_steps, dataset, trainlabels = None):
  """
  Turn raw amino-acid sequences (and optional labels) into a batch iterator.

  The sequences are tokenized with the 20-letter alphabet, truncated/padded
  to `num_steps`, one-hot encoded and permuted to channels-first layout
  (n_sequences, 20, num_steps). Padding positions and residues outside the
  alphabet are encoded as all-zero columns.

  Args:
    batch_size: number of examples per batch.
    num_steps: fixed sequence length after truncation/padding.
    dataset: iterable of amino-acid sequence strings.
    trainlabels: labels aligned with `dataset`; when None the iterator is
      built from the encoded sequences only.

  Returns:
    The iterator created by d2l.load_array over the encoded sequences (and
    the labels, when given).
  """
  # Map each of the 20 amino acids to a unique integer 0..19.
  mapaa2num = {aa: i for (i, aa)
                      in enumerate(list("ACDEFGHIKLMNPQRSTVWY"))}
  vocab_size = len(mapaa2num)
  # Unknown residues and padding share the first index after the alphabet.
  pad_index = vocab_size

  seq = tokenize(dataset, mapaa2num, pad_index)
  seq_array = build_seq_array(seq, num_steps, pad_index) # Shape = n_sequences, num_steps

  # F.one_hot requires every index to be < num_classes, so encode with one
  # extra class for pad/unknown and then drop that channel. The output keeps
  # `vocab_size` channels and no longer raises on padded or unknown positions.
  seq_onehotencoding = F.one_hot(seq_array, num_classes=vocab_size + 1)
  seq_onehotencoding = seq_onehotencoding[..., :vocab_size].float()
  seq_onehotencoding = seq_onehotencoding.permute(0,2,1)
  # Shape = n_sequences, vocab_size, num_steps

  # torch.tensor(None) raises, so only attach labels when they were supplied.
  if trainlabels is None:
    data_arrays = (seq_onehotencoding,)
  else:
    data_arrays = (seq_onehotencoding, torch.tensor(trainlabels))
  data_iter = d2l.load_array(data_arrays, batch_size)
  print("Shape of seq_array: ", seq_array.shape)
  print("Shape of data_arrays, which now considers \none-hot encoded sequence: ",
        seq_onehotencoding.shape)
  return data_iter

In [18]:
# Wrap both splits in batch iterators: 25 sequences per batch, each sequence
# cut or padded to 50 positions.
train_iter = load_data(
    batch_size=25,
    num_steps=50,
    dataset=traindatalist,
    trainlabels=trainlabellist,
)
test_iter = load_data(
    batch_size=25,
    num_steps=50,
    dataset=testdatalist,
    trainlabels=testlabellist,
)

Shape of seq_array:  torch.Size([800, 50])
Shape of data_arrays, which now considers 
one-hot encoded sequence:  torch.Size([800, 20, 50])
Shape of seq_array:  torch.Size([200, 50])
Shape of data_arrays, which now considers 
one-hot encoded sequence:  torch.Size([200, 20, 50])


Building the 1D-CNN model.

In [21]:
class ProteinCNN1D(nn.Module):
  """
  Two-layer 1D CNN that produces one logit per one-hot encoded sequence.

  The network is two blocks of Conv1d(kernel 3, 'same' padding) -> ReLU ->
  MaxPool1d(2, 2), followed by a flatten and a single-output linear layer.

  Args:
    vocab_size: number of input channels (size of the one-hot alphabet).
    context_size: length of the input sequences; must be even.
    conv_channels: number of filters in both convolution layers.
    use_bias: whether the convolution and linear layers use a bias term.
  """
  def __init__(self, vocab_size:int, context_size:int, conv_channels:int=128,
                use_bias:bool=False):
      super().__init__()
      assert context_size % 2 == 0, f'Invalid block_size, {context_size} is not an even number'
      self.vocab_size = vocab_size
      self.context_size = context_size
      # Each MaxPool1d(2, 2) halves the length with floor division, so the
      # flattened size must be computed with two integer halvings.
      # context_size / 4 is wrong when context_size is not a multiple of 4
      # (e.g. 50 -> 25 -> 12, not 12.5).
      pooled_length = (context_size // 2) // 2
      self.cnn = nn.Sequential(
          nn.Conv1d(in_channels=self.vocab_size,
                    out_channels=conv_channels,
                    kernel_size=3,
                    padding='same',
                    bias=use_bias),
          nn.ReLU(),
          nn.MaxPool1d(kernel_size = 2, stride = 2),

          nn.Conv1d(in_channels=conv_channels,
                    out_channels=conv_channels,
                    kernel_size = 3,
                    padding = 'same',
                    bias = use_bias),
          nn.ReLU(),
          nn.MaxPool1d(kernel_size = 2, stride = 2),

          nn.Flatten(1, -1),
          nn.Linear(in_features = conv_channels * pooled_length,
                    out_features = 1,
                    bias = use_bias)

      )

  # forward must live at class level: when it was nested inside __init__ it
  # was only a local function and nn.Module never saw it.
  def forward(self, X:torch.Tensor, targets: torch.Tensor=None) -> tuple[torch.Tensor, torch.Tensor]:
      """
      Compute logits and, when targets are given, the binary cross-entropy loss.

      Args:
        X: input of shape (batch, vocab_size, context_size).
        targets: optional 0/1 labels of shape (batch,).

      Returns:
        (logits, loss): logits has shape (batch,); loss is None when
        `targets` is None, otherwise the BCE-with-logits loss.
      """
      # The linear layer outputs (batch, 1); drop the trailing dimension so
      # the logits line up with the (batch,) targets.
      logits = self.cnn(X).squeeze(1)
      loss = None if targets is None else F.binary_cross_entropy_with_logits(
          logits, targets.float())
      return logits, loss

