<a href="https://colab.research.google.com/github/alirezash97/Pattern-Recognition-Course/blob/main/Anticancer_peptides.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00589/Anticancer_Peptides.zip'

In [None]:
!unzip '/content/Anticancer_Peptides.zip'

In [1]:
import numpy as np
from math import ceil, log2

In [2]:
import pandas as pd 

lung_data = pd.read_csv('/content/ACPs_Lung_cancer.csv')
print(len(lung_data))
lung_data.head()

901


Unnamed: 0,ID,sequence,class
0,1,AIGKFLHSAKKFGKAFVGEIMNS,mod. active
1,2,FAKALAKLAKKLL,mod. active
2,3,FAKALKALLKALKAL,mod. active
3,4,FAKFLAKFLKKAL,mod. active
4,5,FAKIIAKIAKIAKKIL,inactive - exp


In [3]:
# shuffle dataset rows

lung_data = lung_data.sample(frac=1)
lung_data.head()

Unnamed: 0,ID,sequence,class
454,455,KKEELQRSLNILTAF,inactive - virtual
820,821,TIVELVVRVWRWFLVIWVLW,inactive - virtual
304,305,EHEIQEWYKGFLRD,inactive - virtual
274,275,DTRAWALAEKLFG,inactive - virtual
8,9,FAKKLAKKLAKLAL,inactive - exp


In [4]:
lung_data['class'].unique()

array(['inactive - virtual', 'inactive - exp', 'very active',
       'mod. active'], dtype=object)

In [5]:
# find the maximum number of chars

max = 0
for i in lung_data['sequence']:
  if len(i) > max:
    max = len(i)
print(max)

38


In [6]:
# preprocessing

def binary_encoder(dataset):
  
  
  inputs = np.zeros((len(dataset), max*5)) # 5 bits for each char
  for sequence_index, sequence in enumerate(dataset['sequence']):
    for char_index, char in enumerate(sequence):
      binary_char = '{0:05b}'.format(ord(char) - 64)
      inputs[ sequence_index, char_index*5 : (char_index+1)*5 ] = [i for i in binary_char]

  

  number_of_classes = len(lung_data['class'].unique())
  labels = np.zeros(( len(dataset), number_of_classes))
  for row_index, row in enumerate(dataset['class']):
    

    if row  == 'mod. active':
      labels[row_index] = [1, 0, 0, 0]
    elif row  == 'inactive - exp':
      labels[row_index] = [0, 1, 0, 0]
    elif row  == 'very active':
      labels[row_index] = [0, 0, 1, 0]
    elif row  == 'inactive - virtual':
      labels[row_index] = [0, 0, 0, 1]
    else:
      labels[row_index] = None

  return inputs, labels



In [7]:
x, y = binary_encoder(lung_data)

In [8]:
print(x[0, :])
print(y[0])

[0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1.
 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1.]


In [9]:
print('This is inputs shape: ', x.shape)
print('This is outputs shape: ', y.shape)

This is inputs shape:  (901, 190)
This is outputs shape:  (901, 4)


In [10]:
# as you can see there is no missing values in this dataset

lung_data.loc[pd.isna(lung_data['sequence']), :].index

Int64Index([], dtype='int64')

In [11]:
# split dataset

number_of_samples = x.shape[0]



x_train, y_train = x[:int(number_of_samples*0.6), :], y[:int(number_of_samples*0.6)]

x_validation, y_validation = x[int(number_of_samples*0.6):int(number_of_samples*0.8), :], y[int(number_of_samples*0.6):int(number_of_samples*0.8)]

x_test, y_test = x[int(number_of_samples*0.8):number_of_samples, :], y[int(number_of_samples*0.8):number_of_samples]


print('trainset shapes: ', x_train.shape, y_train.shape)
print('validation shapes: ', x_validation.shape, y_validation.shape)
print('test shapes: ', x_test.shape, y_test.shape)

trainset shapes:  (540, 190) (540, 4)
validation shapes:  (180, 190) (180, 4)
test shapes:  (181, 190) (181, 4)


In [17]:
from torch.utils.data import DataLoader



trainXLoader = DataLoader(x_train, batch_size=4, shuffle=False)
trainYLoader = DataLoader(y_train, batch_size=4, shuffle=False)

validationXLoader = DataLoader(x_validation, batch_size=4, shuffle=False)
validationYLoader = DataLoader(y_validation, batch_size=4, shuffle=False)

testXLoader = DataLoader(x_test, batch_size=4, shuffle=False)
testYLoader = DataLoader(y_test, batch_size=4, shuffle=False)


In [86]:
import torch.nn as nn
import torch.nn.functional as F



class Network(nn.Module):

    def __init__(self):
        super().__init__()

        self.fc1 = nn.Linear(190, 256)
        self.b1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 512)
        self.b2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512,1024)
        self.b3 = nn.BatchNorm1d(1024)
        self.fc4 = nn.Linear(1024,512)
        self.b4 = nn.BatchNorm1d(512)
        self.fc5 = nn.Linear(512,256)
        self.b5 = nn.BatchNorm1d(256)
        self.fc6 = nn.Linear(256,128)
        self.b6 = nn.BatchNorm1d(128)
        self.fc7 = nn.Linear(128,64)
        self.b7 = nn.BatchNorm1d(64)
        self.fc8 = nn.Linear(64, 4)

    def forward(self,x):

        x = F.sigmoid(self.fc1(x))
        x = self.b1(x)
        x = F.sigmoid(self.fc2(x))
        x = self.b2(x)
        x = F.sigmoid(self.fc3(x))
        x = self.b3(x)
        x = F.sigmoid(self.fc4(x))
        x = self.b4(x)
        x = F.sigmoid(self.fc5(x))
        x = self.b5(x)
        x = F.sigmoid(self.fc6(x))
        x = self.b6(x)
        x = F.sigmoid(self.fc7(x))
        x = self.b7(x)
        x = F.softmax(self.fc8(x))

        return x

net = Network()

In [87]:
import torch.optim as optim
import torch


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.00025, momentum=0.85)

In [88]:
for epoch in range(200):  # loop over the dataset multiple times

    labeliter_train = iter(trainYLoader)
    running_loss = 0.0

    for i, data in enumerate(trainXLoader, 0):
        
        # get the inputs; data is a list of [inputs, labels]
        inputs = data.float()
        labels = labeliter_train.next()
        

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

    # validation 
    labeliter_validation = iter(validationYLoader)
    batch_counter = 0
    for i, data in enumerate(validationXLoader, 0):
        
        batch_counter += 1
        # get the inputs; data is a list of [inputs, labels]
        inputs = data.float()
        labels = labeliter_validation.next()
        

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
          # print every 2000 mini-batches
    print('////////////////  validation loss: %.3f  //////////////// ' %
      (running_loss / batch_counter))
    running_loss = 0.0 

print('Finished Training')



[1,    10] loss: 0.136
[1,    20] loss: 0.137
[1,    30] loss: 0.137
[1,    40] loss: 0.138
[1,    50] loss: 0.136
[1,    60] loss: 0.137
[1,    70] loss: 0.133
[1,    80] loss: 0.134
[1,    90] loss: 0.135
[1,   100] loss: 0.133
[1,   110] loss: 0.134
[1,   120] loss: 0.130
[1,   130] loss: 0.134
////////////////  validation loss: 1.374  //////////////// 
[2,    10] loss: 0.122
[2,    20] loss: 0.118
[2,    30] loss: 0.116
[2,    40] loss: 0.119
[2,    50] loss: 0.118
[2,    60] loss: 0.115
[2,    70] loss: 0.115
[2,    80] loss: 0.114
[2,    90] loss: 0.119
[2,   100] loss: 0.115
[2,   110] loss: 0.119
[2,   120] loss: 0.115
[2,   130] loss: 0.119
////////////////  validation loss: 1.236  //////////////// 
[3,    10] loss: 0.121
[3,    20] loss: 0.115
[3,    30] loss: 0.119
[3,    40] loss: 0.112
[3,    50] loss: 0.123
[3,    60] loss: 0.118
[3,    70] loss: 0.114
[3,    80] loss: 0.126
[3,    90] loss: 0.122
[3,   100] loss: 0.118
[3,   110] loss: 0.112
[3,   120] loss: 0.119
[3,   

In [89]:
input_iter_test = iter(testXLoader)
label_iter_test = iter(testYLoader)

correct = 0
for i in range(int(x_test.shape[0]/4)):
  test_inputs = input_iter_test.next()
  test_labels = label_iter_test.next()

  test_outputs = net(test_inputs.float())
  correct += int(torch.sum(torch.max(test_labels, 1)[1] == torch.max(test_outputs, 1)[1]))

  print('actual value: ', torch.max(test_labels, 1)[1])
  print('predicted value: ',torch.max(test_outputs, 1)[1])
  print('-------------------------------')
print('Final Accuracy on testset : ', correct / (x_test.shape[0]) )



actual value:  tensor([3, 3, 3, 0])
predicted value:  tensor([3, 3, 3, 0])
-------------------------------
actual value:  tensor([3, 2, 3, 3])
predicted value:  tensor([3, 3, 3, 0])
-------------------------------
actual value:  tensor([3, 3, 3, 3])
predicted value:  tensor([3, 3, 0, 3])
-------------------------------
actual value:  tensor([3, 3, 3, 3])
predicted value:  tensor([3, 3, 1, 0])
-------------------------------
actual value:  tensor([3, 3, 3, 1])
predicted value:  tensor([3, 3, 3, 1])
-------------------------------
actual value:  tensor([3, 2, 3, 3])
predicted value:  tensor([3, 2, 3, 3])
-------------------------------
actual value:  tensor([3, 3, 1, 3])
predicted value:  tensor([1, 3, 1, 3])
-------------------------------
actual value:  tensor([0, 3, 0, 1])
predicted value:  tensor([0, 3, 1, 3])
-------------------------------
actual value:  tensor([3, 3, 3, 3])
predicted value:  tensor([2, 3, 0, 3])
-------------------------------
actual value:  tensor([3, 3, 3, 3])
p