In [1]:
import copy
import pickle
import random
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from Bio import Align

from dataset import *
from utils import *
from models import *

In [2]:
# Location of the CoV-AbDab extract, relative to the notebook's working directory.
COV_ABDAB_CSV = "../SARS-SAbDab_Shaun/CoV-AbDab_extract.csv"

# Read the whole table; the bare name on the last line renders it as the cell output.
data = pd.read_csv(COV_ABDAB_CSV)
data

Unnamed: 0,Index,AB_name,Class,Paratope,Epitope
0,1,m396,0,SYTWGGITILIANYQRDTVMGGSKWDSSDY,FSTFRGDEVRNNGQPTYGVGYQY
1,2,80R,0,SYAVISYDRDRSYDRVRSNDSTRSGSGSDFTTSRSW,RDVGGYNLYLFKGFNCYFLQSYGQPTYGVYQ
2,3,F26G19,0,YVSYTTYRYPGNDTTTEGIPQL,AFSTGDRWNNNPTYGVGYQY
3,4,1AF10,0,NWRENRYDPYYAMDIQHYQQTDSWPT,FIVKNKCVFTGVLTDIATDLDITPCSFGGSYQGVNRAGCIG
4,5,6D3,1,WHEDDYINYEYGLLAQNNFYTY,NVFAEHVNNSS
...,...,...,...,...,...
305,306,scFv_E4,1,VSGVTSSNYYPGGSYRDLSEKDVIQGISYSGQNS,RDQTGNDYYLFRKSNYQAGSTFNYFPLQSYGYGVGY
306,307,N3113,1,YEALEWSYHSRTYNPSWASGSTGDW,VATRFASYAWNKSKGGNYNYLRITEINGVKFLQS
307,308,N3113.1,1,QWYSRYNPSWASGST,ATRFASYANKDKVGNYNYLYITEINGVKFPLQSYG
308,309,RBD-chAb-15,1,GHWTTWGGSTNGYYV,YASFSTFKRGDRAWNTYGVGYQPY


In [3]:
class Cov_Dataset_infer(torch.utils.data.Dataset):
    """Inference dataset serving ``(paratope, epitope, label)`` triples from a DataFrame.

    Columns are read by *position*: column 2 is the label, column 3 the
    paratope sequence and column 4 the epitope sequence.
    NOTE(review): presumably these are the ``Class``, ``Paratope`` and
    ``Epitope`` columns of the CoV-AbDab extract - verify against the CSV.

    Rows are also addressed by position, so the dataset works with any
    DataFrame index (e.g. a filtered or shuffled frame whose index labels no
    longer run 0..N-1).

    Args:
        df: DataFrame with at least five columns laid out as described above.
    """

    def __init__(self, df):
        self.df = df
        # Convert through NumPy so the tensor is built from the values only and
        # never from the frame's index labels; float32 matches torch.Tensor(...).
        self.label = torch.tensor(self.df.iloc[:, 2].to_numpy(dtype="float32"))
        self.paratope = self.df.iloc[:, 3]
        self.epitope = self.df.iloc[:, 4]

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(paratope, epitope, label)`` for the row at position ``idx``."""
        # .iloc is positional; plain Series[idx] would be a label lookup and
        # break (or return the wrong row) on a non-default index.
        return self.paratope.iloc[idx], self.epitope.iloc[idx], self.label[idx]

def collate_fn(batch):
    """Collate ``(paratope, epitope, label)`` samples into padded string batches.

    Each sequence has leading/trailing ``#`` characters stripped, is wrapped as
    ``"+" + sequence + "-"`` and is then right-padded with ``#``. The pad width
    is derived from the longest raw sequence (paratope or epitope) in the
    batch, so every returned string has length ``max_len + 2``.

    Args:
        batch: sequence of samples indexable as ``sample[0]`` (paratope
            string), ``sample[1]`` (epitope string) and ``sample[2]`` (label
            tensor).

    Returns:
        A list ``[paratopes, epitopes, labels]`` where the first two entries
        are lists of padded strings and ``labels`` is the ``torch.hstack`` of
        the per-sample labels.
    """

    def _frame(sequence, width):
        # Drop any existing '#' padding, add start/end markers, then re-pad.
        core = sequence.strip("#")
        return "+" + core + "-" + "#" * (width - len(core))

    para_seqs = [sample[0] for sample in batch]
    epi_seqs = [sample[1] for sample in batch]
    labels = torch.hstack([sample[2] for sample in batch])

    # Width is taken from the raw (unstripped) lengths across both sequence lists.
    longest_para = max(len(seq) for seq in para_seqs)
    longest_epi = max(len(seq) for seq in epi_seqs)
    max_len = max(longest_para, longest_epi)

    padded_paras = [_frame(seq, max_len) for seq in para_seqs]
    padded_epis = [_frame(seq, max_len) for seq in epi_seqs]

    return [padded_paras, padded_epis, labels]

In [4]:
# Evaluate the 10 per-fold InteractTransformerLSTM checkpoints on the full table
# and report the mean accuracy / F1 / ROC-AUC across folds.
test_dataset = Cov_Dataset_infer(df=data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

acc_buf = []
f1_buf = []
auc_buf = []

for fold in range(10):

    # NOTE(review): torch.load unpickles a complete model object, which can run
    # arbitrary code - only load checkpoints that come from a trusted source.
    model = torch.load("./results/SAbDab/seq1_neg0/InteractTransformerLSTM/model_{}_best.pth".format(fold))
    model.eval()

    preds = []
    labels = []
    with torch.no_grad():
        # The fold index has its own name, so the batch loop cannot clobber it.
        for para, epi, label in test_loader:
            pred = model(para, epi)

            preds.append(pred.detach().cpu().view(-1))
            labels.append(label.view(-1))

    # Hand plain NumPy arrays to scikit-learn instead of relying on implicit
    # tensor -> array conversion.
    y_score = torch.hstack(preds).view(-1)
    y_true = torch.hstack(labels).view(-1).numpy()
    # Rounding is a 0.5 cut-off; assumes the model outputs probabilities in
    # [0, 1] - TODO confirm against the model definition.
    y_pred = torch.round(y_score).numpy()
    y_score = y_score.numpy()

    acc_buf.append(accuracy_score(y_true=y_true, y_pred=y_pred))
    f1_buf.append(f1_score(y_true=y_true, y_pred=y_pred))
    auc_buf.append(roc_auc_score(y_true=y_true, y_score=y_score))

print("avg results for k-fold models")
# The first value is the mean accuracy, so it is labelled "acc" (it was printed as "auc").
print("acc:\t{:.4}\nf1:\t{:.4}\nauc:\t{:.4}".format(np.mean(acc_buf), np.mean(f1_buf), np.mean(auc_buf)))

avg results for k-fold models
auc:	0.2694
f1:	0.02143
auc:	0.4817


In [17]:
# Evaluate the 10 per-fold "demo" checkpoints on the full table and report the
# mean accuracy / F1 / ROC-AUC across folds.
test_dataset = Cov_Dataset_infer(df=data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

acc_buf = []
f1_buf = []
auc_buf = []

for fold in range(10):

    # NOTE(review): torch.load unpickles a complete model object, which can run
    # arbitrary code - only load checkpoints that come from a trusted source.
    model = torch.load("./results/SAbDab/seq1_neg0/demo/model_{}_best.pth".format(fold))
    model.eval()

    preds = []
    labels = []
    with torch.no_grad():
        # The fold index has its own name, so the batch loop cannot clobber it.
        for para, epi, label in test_loader:
            pred = model(para, epi)

            preds.append(pred.detach().cpu().view(-1))
            labels.append(label.view(-1))

    # Hand plain NumPy arrays to scikit-learn instead of relying on implicit
    # tensor -> array conversion.
    y_score = torch.hstack(preds).view(-1)
    y_true = torch.hstack(labels).view(-1).numpy()
    # Rounding is a 0.5 cut-off; assumes the model outputs probabilities in
    # [0, 1] - TODO confirm against the model definition.
    y_pred = torch.round(y_score).numpy()
    y_score = y_score.numpy()

    acc_buf.append(accuracy_score(y_true=y_true, y_pred=y_pred))
    f1_buf.append(f1_score(y_true=y_true, y_pred=y_pred))
    auc_buf.append(roc_auc_score(y_true=y_true, y_score=y_score))

print("avg results for k-fold models")
# The first value is the mean accuracy, so it is labelled "acc" (it was printed as "auc").
print("acc:\t{:.4}\nf1:\t{:.4}\nauc:\t{:.4}".format(np.mean(acc_buf), np.mean(f1_buf), np.mean(auc_buf)))

avg results for k-fold models
auc:	0.2726
f1:	0.0257
auc:	0.4606


In [14]:
# Evaluate the SetModel checkpoint(s) on the full table and report accuracy /
# F1 / ROC-AUC.
test_dataset = Cov_Dataset_infer(df=data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

acc_buf = []
f1_buf = []
auc_buf = []

# NOTE(review): range(1) evaluates only fold 0, so the "avg" printed below is
# the result of a single model - widen the range if more checkpoints exist.
for fold in range(1):

    # NOTE(review): torch.load unpickles a complete model object, which can run
    # arbitrary code - only load checkpoints that come from a trusted source.
    model = torch.load("./results/SAbDab/seq1_neg0/SetModel/model_{}_best.pth".format(fold))
    model.eval()

    preds = []
    labels = []
    with torch.no_grad():
        # The fold index has its own name, so the batch loop cannot clobber it.
        for para, epi, label in test_loader:
            pred = model(para, epi)

            preds.append(pred.detach().cpu().view(-1))
            labels.append(label.view(-1))

    # Hand plain NumPy arrays to scikit-learn instead of relying on implicit
    # tensor -> array conversion.
    y_score = torch.hstack(preds).view(-1)
    y_true = torch.hstack(labels).view(-1).numpy()
    # Rounding is a 0.5 cut-off; assumes the model outputs probabilities in
    # [0, 1] - TODO confirm against the model definition.
    y_pred = torch.round(y_score).numpy()
    y_score = y_score.numpy()

    acc_buf.append(accuracy_score(y_true=y_true, y_pred=y_pred))
    f1_buf.append(f1_score(y_true=y_true, y_pred=y_pred))
    auc_buf.append(roc_auc_score(y_true=y_true, y_score=y_score))

print("avg results for k-fold models")
print("acc:\t{:.4}\nf1:\t{:.4}\nauc:\t{:.4}".format(np.mean(acc_buf), np.mean(f1_buf), np.mean(auc_buf)))

avg results for k-fold models
acc:	0.3097
f1:	0.1705
auc:	0.4674


In [13]:
# Debug view of one collated batch. `para`, `epi` and `label` are not defined in
# this cell: they are whatever an evaluation loop over `test_loader` last
# assigned, so this only works after such a loop has run in the same kernel.
para, epi, label

(['+NWTSLSWYDDTRYQYGYNYGYF-##############',
  '+VSGVTSSNYYPGGSYRDLSEKDVIQGISYSGQNS-##',
  '+YEALEWSYHSRTYNPSWASGSTGDW-###########',
  '+QWYSRYNPSWASGST-#####################',
  '+GHWTTWGGSTNGYYV-#####################',
  '+VEFISANYYSGGSFRLPTYDSISYHDTTP-#######'],
 ['+FGFNTRVLSASFWNSNNLDSKVNPTR-##########',
  '+RDQTGNDYYLFRKSNYQAGSTFNYFPLQSYGYGVGY-',
  '+VATRFASYAWNKSKGGNYNYLRITEINGVKFLQS-##',
  '+ATRFASYANKDKVGNYNYLYITEINGVKFPLQSYG-#',
  '+YASFSTFKRGDRAWNTYGVGYQPY-############',
  '+RDERQTGNDYYLFRKSNYQAGSTFNYQYGY-######'],
 tensor([1., 1., 1., 1., 1., 1.]))