### CLASSIFYING ENZYMES TO FUNCTIONAL CLASSES

In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as opt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data for SVM, kNN and RF
aac = pd.read_csv('../dataset/aac.csv', index_col=0)
ctd = pd.read_csv('../dataset/ctd.csv', index_col=0)
data = pd.read_csv('../dataset/ngram.csv', index_col=0)
data = data.join(ctd[ctd.columns[0:-1]])
data = data.join(aac[aac.columns[0:-1]])

In [3]:
y = data['class']
data = data[data.columns[0:-1]]

In [4]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3)

In [5]:
# Normalize the data
mean = X_train.mean()
std = X_train.std()
X_train = (X_train-mean)/std
X_test = (X_test-mean)/std

MAJORITY CLASSIFIER

In [68]:
from collections import Counter
major = Counter(y_train).most_common()[0][0]
major_arr = np.full(len(y_test), major)
print(sum(major_arr == y_test)/len(y_test))

0.3363216517250747


SVM CLASSIFIER

In [6]:
clf_svm = svm.SVC().fit(X_train, y_train)
results_svm = clf_svm.predict(X_test)

In [7]:
acc_svm = print(sum(results_svm == y_test)/len(y_test))

0.7677261613691931


In [8]:
confusion_matrix(y_test, results_svm)

array([[ 949,  263,    5,    0,    0,    0,    0],
       [ 108, 1645,  221,    0,    0,    0,    0],
       [   0,  132, 2344,    0,    0,    0,    0],
       [   0,    1,  513,   29,    4,    0,    0],
       [   0,    0,  146,   54,  120,   54,    0],
       [   0,    8,   46,   26,   42,  439,    1],
       [   0,    0,   28,    8,    8,   42,  126]])

kNN CLASSIFIER

In [9]:
clf_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
results_knn = clf_knn.predict(X_test)

In [10]:
acc_knn = print(sum(results_knn == y_test)/len(y_test))

0.3928280358598207


In [11]:
confusion_matrix(y_test, results_knn)

array([[ 748,  236,  189,   15,    7,   21,    1],
       [ 627,  739,  514,   24,   15,   54,    1],
       [ 633,  629, 1072,   39,   30,   70,    3],
       [ 178,  143,  139,   47,   10,   30,    0],
       [  98,   83,   94,   19,   61,   18,    1],
       [  82,  118,  151,    9,   17,  181,    4],
       [  24,   68,   54,    4,    3,   15,   44]])

RANDOM FOREST

In [12]:
clf_rf = RandomForestClassifier().fit(X_train, y_train)
results_rf = clf_rf.predict(X_test)

In [13]:
acc_rf = print(sum(results_rf == y_test)/len(y_test))

0.7599837000814996


In [14]:
confusion_matrix(y_test, results_rf)

array([[1061,  154,    2,    0,    0,    0,    0],
       [   9, 1947,   18,    0,    0,    0,    0],
       [   0,    1, 2474,    0,    0,    1,    0],
       [   0,    8,  534,    2,    1,    2,    0],
       [   0,    6,  360,    1,    2,    5,    0],
       [   1,   24,  470,    1,    0,   66,    0],
       [   1,    9,  156,    0,    0,    3,   43]])


NEURAL NETWORK

In [15]:
with open('../dataset/ID_bin.txt') as handle:
    names_bin = handle.read().strip().split('\n')
with open('../dataset/ID_pssm.txt') as handle:
    names_pssm = handle.read().strip().split('\n')
    
truths = pd.read_csv('../dataset/IdEc.csv')
truths['class'] = truths['EC'].map(lambda x: x.split('.')[0])
labels_bin = [int(t) for t in truths.loc[truths['ID'].isin(names_bin), 'class'].to_list()]
labels_pssm = [int(t) for t in truths.loc[truths['ID'].isin(names_pssm), 'class'].to_list()]

In [16]:
print(len(names_bin))
print(len(names_pssm))

24615
20099


In [39]:
def tt(arr):
    return [torch.tensor([a]) for a in arr]

In [40]:
# Read data for NN

binary = [np.loadtxt('../dataset/binary/'+name+'.txt', dtype='double') for name in names_bin]
#pssm = [np.loadtxt('../dataset/pssm_parsed/'+name+'.txt', dtype='double') for name in names_pssm]

bin_train, bin_test, ybin_train, ybin_test = train_test_split(binary, labels_bin, test_size=0.3)
#pssm_train, pssm_test, ypssm_train, ypssm_test = train_test_split(pssm, labels_pssm, test_size=0.3)

bin_train = torch.utils.data.DataLoader(tt(bin_train), shuffle=False, num_workers=2)
bin_test = torch.utils.data.DataLoader(tt(bin_test), shuffle=False, num_workers=2)

#pssm_train = torch.utils.data.DataLoader(tt(pssm_train), shuffle=False, num_workers=2)
#pssm_test = torch.utils.data.DataLoader(tt(pssm_test), shuffle=False, num_workers=2)

In [91]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(20, 16, 5)
        self.fc_gap = nn.Linear(16, 10)
        self.fc2 = nn.Linear(10, 7)
        self.fc3 = nn.Softmax()

    def forward(self, x):
        x = self.pool(func.relu(self.conv1(x)))
        x = self.pool(func.relu(self.conv2(x)))
        x = x.mean(dim=(-1, -2))
        x = func.relu(self.fc_gap(x))
        x = func.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

In [None]:
class CN(nn.Module):
    def __init__(self):

In [42]:
criterion = nn.CrossEntropyLoss()
optimizer = opt.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [92]:
def train_nn(data, labels):
    for epoch in [0]:  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(data):
            inputs = data.float()
            optimizer.zero_grad()

            outputs = net(inputs)
            label = np.zeros([7])
            label[labels[i]-1] = 1
            loss = criterion(outputs, torch.Tensor(label))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

    print('Finished Training')

In [93]:
net_bin = train_nn(bin_train.dataset, ybin_train)


[1,  2000] loss: 1.947
[1,  4000] loss: 1.947
[1,  6000] loss: 1.947
[1,  8000] loss: 1.947
[1, 10000] loss: 1.947
[1, 12000] loss: 1.947
[1, 14000] loss: 1.947
[1, 16000] loss: 1.947
Finished Training


In [59]:
PATH = './model_NNbin.pth'
torch.save(net.state_dict(), PATH)

In [None]:
pred = []
for test in bin_test:
    pred.append(net(test.float()))