### Import libraries

In [None]:
!pip install lightgbm
import numpy as np
from tensorflow import keras
from datetime import datetime
import pandas as pd
import tensorflow as tf
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/seq2tensor.py
from seq2tensor import s2t
import os
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.layers import concatenate, multiply
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2, l1_l2
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef,accuracy_score, precision_score,recall_score
from sklearn.manifold import TSNE
from lightgbm import LGBMClassifier
from keras.callbacks import ModelCheckpoint


from xgboost import XGBClassifier
import lightgbm as lgb
import time

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-12-21 16:29:36--  https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/seq2tensor.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1104 (1.1K) [text/plain]
Saving to: ‘seq2tensor.py’


2022-12-21 16:29:36 (51.9 MB/s) - ‘seq2tensor.py’ saved [1104/1104]



### Conjoint Triad

In [None]:
"""Implementation of CT coding method
"""

__all__ = ['ct_code_of']

# AAC: the seven residue classes used by the CT coder, keyed by class digit.
AAC = {
    '1': ['A', 'G', 'V'],
    '2': ['I', 'L', 'F', 'P'],
    '3': ['Y', 'M', 'T', 'S'],
    '4': ['H', 'N', 'Q', 'W'],
    '5': ['R', 'K'],
    '6': ['D', 'E'],
    '7': ['C']
}

# AAC_R: inverse lookup of AAC, residue letter -> class digit.
AAC_R = {
    residue: class_id
    for class_id, members in AAC.items()
    for residue in members
}

def classification_of(AA):
    """Return the class digit (a one-character string) of residue ``AA``.

    Raises KeyError when ``AA`` is not one of the residues listed in AAC.
    """
    class_id = AAC_R[AA]
    return class_id

def classification_sequence_of(PS):
    """Translate a protein sequence into its string of class digits.

    'U' and 'X' have no class in AAC and are dropped, which is the same
    rule the Local Descriptor version of this function applies.  Any other
    character missing from AAC raises KeyError (from classification_of).

    PS: protein sequence, an iterable of one-letter residue codes.
    Returns a str with one digit per kept residue, in sequence order.
    """
    # join() builds the result in one pass instead of repeated str + str.
    return ''.join(classification_of(CH) for CH in PS if CH not in ('U', 'X'))

def ct_code_of(PS):
    """Get CT Code of protein sequence.

    Every window of three consecutive class digits (a "triad") is counted,
    and the counts are divided by the total number of triads.

    PS: protein sequence (one-letter residue codes).
    Returns a list of 343 (= 7*7*7) floats.  When the classified sequence
    has fewer than three residues there is no triad to count; the result is
    then all zeros instead of a division by zero.
    """
    CT_Code = [0]*343
    CS = classification_sequence_of(PS)
    for I in range(len(CS)-2):
        FIRST, SECOND, THIRD = (int(DIGIT) for DIGIT in CS[I:I+3])
        # Base-7 position of the triad; the first digit varies fastest.
        CT_Code[(FIRST-1) + (SECOND-1)*7 + (THIRD-1)*7*7] += 1
    SUM = sum(CT_Code)
    if SUM == 0:
        return [0.0]*343
    return [N*1.0/SUM for N in CT_Code]



### Local Descriptor

In [None]:
"""Implementation of LD coding method
"""

__all__ = ['ld_code_of']

# AAC: residue classes used by the LD coder, keyed by class digit.
AAC = {
    # '0': ['X'],
    '1': ['A', 'G', 'V'],
    '2': ['I', 'L', 'F', 'P'],
    '3': ['Y', 'M', 'T', 'S'],
    '4': ['H', 'N', 'Q', 'W'],
    '5': ['R', 'K'],
    '6': ['D', 'E'],
    '7': ['C']
}

# AAC_R: inverse of AAC -- every listed residue mapped to its class digit.
AAC_R = {}
for class_id, members in AAC.items():
    AAC_R.update(dict.fromkeys(members, class_id))

def classification_of(AA):
    """Look up the class digit of residue ``AA`` in AAC_R.

    Returns the digit as a one-character string; raises KeyError for a
    residue that AAC does not list.
    """
    class_digit = AAC_R[AA]
    return class_digit

def classification_sequence_of(PS):
    """Convert a protein sequence into a string of class digits.

    'X' and 'U' are left out of the result; every other character is
    translated with classification_of (KeyError if it has no class).
    """
    digits = []
    for CH in PS:
        if CH in ('X', 'U'):
            # No class is defined for these symbols.
            continue
        digits.append(classification_of(CH))
    return ''.join(digits)

def ld_info_of(CS):
    """Collect the raw statistics of a class-digit sequence.

    Returns a tuple ``(L, C, T)``:
      L -- length of CS;
      C -- dict mapping each symbol to the list of its 1-based positions;
      T -- dict counting, for each unordered pair of different digits that
           appear next to each other, how often that happens.  The key is
           the two digits concatenated with the numerically smaller first.
    """
    positions = {}
    for pos, symbol in enumerate(CS, start=1):
        positions.setdefault(symbol, []).append(pos)

    transitions = {}
    for previous, current in zip(CS, CS[1:]):
        if previous == current:
            continue
        if int(previous) < int(current):
            key = previous + current
        else:
            key = current + previous
        transitions[key] = transitions.get(key, 0) + 1

    return len(CS), positions, transitions

def ld_code_of_0(CS):
    """Compute the 63 local-descriptor values of one class-digit region.

    The result is RC + RT + RD:
      RC (7)  -- fraction of the region occupied by each class;
      RT (21) -- for each unordered pair of classes, number of adjacent
                 transitions divided by L-1;
      RD (35) -- for each class, the relative position (position / L) of
                 its first, 25%, 50%, 75% and last occurrence.
    An empty region yields 63 zeros.
    """
    RC = [0]*7
    RT = [0]*21
    RD = [0]*35
    L, C, T = ld_info_of(CS)
    for Class, Indexs in C.items():
        Len = len(Indexs)
        RC[int(Class)-1] = Len*1.0/L
        # Occurrence ranks (1-based).  int(Len*0.25) etc. is 0 when the class
        # has few members; without the clamp, Indexs[0-1] would wrap around
        # and report the LAST occurrence for those slots.
        Ranks = [max(1, R) for R in
                 (1, int(Len*0.25), int(Len*0.5), int(Len*0.75), Len)]
        RD[(int(Class)-1)*5:int(Class)*5] = [Indexs[R-1]*1.0/L for R in Ranks]
    for Trans, Frequency in T.items():
        PI, I = int(Trans[0])-1, int(Trans[1])-1
        # Flattened index of the pair (PI < I) in the upper triangle of a
        # 7x7 matrix, row by row.
        Index = int((21-(6-PI)*(6-PI+1)/2)+(I-PI-1))
        RT[Index] = Frequency*1.0/(L-1)
    return RC+RT+RD

def ld_code_of(PS):
    """Get LD Code of protein sequence.

    The class-digit sequence is cut into ten regions (four quarters, two
    halves, the middle half, the first and last three quarters, and the
    central 75%), each region is coded with ld_code_of_0, and the ten
    codes are concatenated in that order.
    """
    CS = classification_sequence_of(PS)
    L = len(CS)
    Q1, Q2, Q3 = int(L*0.25), int(L*0.50), int(L*0.75)
    REGIONS = [
        (0, Q1), (Q1, Q2), (Q2, Q3), (Q3, L),   # quarters
        (0, Q2), (Q2, L),                       # halves
        (Q1, Q3),                               # middle half
        (0, Q3), (Q1, L),                       # first / last three quarters
        (int(L*0.125), int(L*0.875)),           # central 75%
    ]
    CODE = []
    for START, STOP in REGIONS:
        CODE += ld_code_of_0(CS[START:STOP])
    return CODE


### Auto Covariance

In [None]:
'''Implementation of AC coding method'''

__all__ = ['ac_code_of']

# PCPNS: Physicochemical property names
# NOTE(review): presumably H1 = hydrophobicity, H2 = hydrophilicity,
# NCI = net charge index, P1 = polarity, P2 = polarizability,
# SASA = solvent-accessible surface area, V = side-chain volume -- confirm
# against the source of the table.
PCPNS = ['H1', 'H2', 'NCI', 'P1', 'P2', 'SASA', 'V']

# AAPCPVS: Physicochemical property values of amino acid
# Corrections applied to the table:
#   * 'Y': the NCI and V values were stored in each other's slot (NCI=117.3,
#     V=0.023599), which is outside the range of every other row and skews
#     the z-score normalization of both columns.
#   * 'E' H1 and 'N' H2 now agree with the H1/H2 values used by paac() in
#     this file (-0.74 and 0.2).
AAPCPVS = {
    'A': { 'H1': 0.62, 'H2':-0.5, 'NCI': 0.007187, 'P1': 8.1, 'P2':0.046, 'SASA':1.181, 'V': 27.5 },
    'C': { 'H1': 0.29, 'H2':-1.0, 'NCI':-0.036610, 'P1': 5.5, 'P2':0.128, 'SASA':1.461, 'V': 44.6 },
    'D': { 'H1':-0.90, 'H2': 3.0, 'NCI':-0.023820, 'P1':13.0, 'P2':0.105, 'SASA':1.587, 'V': 40.0 },
    'E': { 'H1':-0.74, 'H2': 3.0, 'NCI': 0.006802, 'P1':12.3, 'P2':0.151, 'SASA':1.862, 'V': 62.0 },
    'F': { 'H1': 1.19, 'H2':-2.5, 'NCI': 0.037552, 'P1': 5.2, 'P2':0.290, 'SASA':2.228, 'V':115.5 },
    'G': { 'H1': 0.48, 'H2': 0.0, 'NCI': 0.179052, 'P1': 9.0, 'P2':0.000, 'SASA':0.881, 'V':  0.0 },
    'H': { 'H1':-0.40, 'H2':-0.5, 'NCI':-0.010690, 'P1':10.4, 'P2':0.230, 'SASA':2.025, 'V': 79.0 },
    'I': { 'H1': 1.38, 'H2':-1.8, 'NCI': 0.021631, 'P1': 5.2, 'P2':0.186, 'SASA':1.810, 'V': 93.5 },
    'K': { 'H1':-1.50, 'H2': 3.0, 'NCI': 0.017708, 'P1':11.3, 'P2':0.219, 'SASA':2.258, 'V':100.0 },
    'L': { 'H1': 1.06, 'H2':-1.8, 'NCI': 0.051672, 'P1': 4.9, 'P2':0.186, 'SASA':1.931, 'V': 93.5 },
    'M': { 'H1': 0.64, 'H2':-1.3, 'NCI': 0.002683, 'P1': 5.7, 'P2':0.221, 'SASA':2.034, 'V': 94.1 },
    'N': { 'H1':-0.78, 'H2': 0.2, 'NCI': 0.005392, 'P1':11.6, 'P2':0.134, 'SASA':1.655, 'V': 58.7 },
    'P': { 'H1': 0.12, 'H2': 0.0, 'NCI': 0.239531, 'P1': 8.0, 'P2':0.131, 'SASA':1.468, 'V': 41.9 },
    'Q': { 'H1':-0.85, 'H2': 0.2, 'NCI': 0.049211, 'P1':10.5, 'P2':0.180, 'SASA':1.932, 'V': 80.7 },
    'R': { 'H1':-2.53, 'H2': 3.0, 'NCI': 0.043587, 'P1':10.5, 'P2':0.291, 'SASA':2.560, 'V':105.0 },
    'S': { 'H1':-0.18, 'H2': 0.3, 'NCI': 0.004627, 'P1': 9.2, 'P2':0.062, 'SASA':1.298, 'V': 29.3 },
    'T': { 'H1':-0.05, 'H2':-0.4, 'NCI': 0.003352, 'P1': 8.6, 'P2':0.108, 'SASA':1.525, 'V': 51.3 },
    'V': { 'H1': 1.08, 'H2':-1.5, 'NCI': 0.057004, 'P1': 5.9, 'P2':0.140, 'SASA':1.645, 'V': 71.5 },
    'W': { 'H1': 0.81, 'H2':-3.4, 'NCI': 0.037977, 'P1': 5.4, 'P2':0.409, 'SASA':2.663, 'V':145.5 },
    'Y': { 'H1': 0.26, 'H2':-2.3, 'NCI': 0.023599, 'P1': 6.2, 'P2':0.298, 'SASA':2.368, 'V':117.3 },
}

import math

def avg_sd(NUMBERS):
    """Return ``(mean, standard deviation)`` of NUMBERS.

    The standard deviation is the population form: the squared deviations
    are divided by the number of values, not by that number minus one.
    """
    COUNT = len(NUMBERS)
    MEAN = sum(NUMBERS)/COUNT
    VARIANCE = sum([pow(VALUE-MEAN, 2) for VALUE in NUMBERS])/COUNT
    return (MEAN, math.sqrt(VARIANCE))

# PCPVS: Physicochemical property values
# One list per property name, filled with that property's raw value for every
# amino acid in AAPCPVS (in AAPCPVS iteration order).
PCPVS = {'H1':[], 'H2':[], 'NCI':[], 'P1':[], 'P2':[], 'SASA':[], 'V':[]}
# NOTE(review): the loop variables in this section (AA, PCPS, PCPN, PCP, VS,
# V, NORMALIZED_PCPVS) stay bound at module level after the loops finish.
# Check for readers elsewhere in the file before renaming or scoping them.
for AA, PCPS in AAPCPVS.items():
    for PCPN in PCPNS:
        PCPVS[PCPN].append(PCPS[PCPN])

# PCPASDS: Physicochemical property avg and sds
# Maps property name -> (mean, standard deviation) as returned by avg_sd.
PCPASDS = {}
for PCP, VS in PCPVS.items():
    PCPASDS[PCP] = avg_sd(VS)

# NORMALIZED_AAPCPVS
# Same layout as AAPCPVS, with every value z-scored per property:
# (value - property mean) / property standard deviation.
NORMALIZED_AAPCPVS = {}
for AA, PCPS in AAPCPVS.items():
    NORMALIZED_PCPVS = {}
    for PCP, V in PCPS.items():
        NORMALIZED_PCPVS[PCP] = (V-PCPASDS[PCP][0])/PCPASDS[PCP][1]
    NORMALIZED_AAPCPVS[AA] = NORMALIZED_PCPVS

def pcp_value_of(AA, PCP):
    """Return the normalized value of property ``PCP`` for amino acid ``AA``.

    Raises KeyError when either the residue or the property name is not in
    NORMALIZED_AAPCPVS.
    """
    PROPERTIES = NORMALIZED_AAPCPVS[AA]
    return PROPERTIES[PCP]

def pcp_sequence_of(PS, PCP):
    """Make physicochemical properties sequence of protein sequence.

    PS:  protein sequence (one-letter residue codes).
    PCP: property name, one of the keys of the per-residue property table.

    Returns the list of normalized property values of the residues, shifted
    so that the list has zero mean.  'X' and 'U' are skipped, as in the
    CT/LD coders of this file, because the property table has no row for
    them.  If nothing is left after skipping, an empty list is returned
    rather than dividing by zero.
    """
    VALUES = [pcp_value_of(CH, PCP) for CH in PS if CH not in ('X', 'U')]
    if not VALUES:
        return VALUES
    # Centralization
    AVG = sum(VALUES)/len(VALUES)
    return [VALUE - AVG for VALUE in VALUES]

def ac_values_of(PS, PCP, LAG):
    """Get ac values of protein sequence.

    For every lag LG in 1..LAG the result holds the mean of
    ``value[i] * value[i+LG]`` over all positions i that have a partner LG
    steps ahead, where ``value`` is the centered property sequence from
    pcp_sequence_of.

    Returns a list of LAG floats.  A lag with no residue pair at that
    distance (sequence not longer than the lag) contributes 0.0; dividing
    by ``len - LG`` there would hit zero or a negative count.
    """
    AVS = []
    PCPS = pcp_sequence_of(PS, PCP)
    for LG in range(1, LAG+1):
        PAIRS = len(PCPS)-LG
        if PAIRS <= 0:
            AVS.append(0.0)
            continue
        SUM = sum(PCPS[I]*PCPS[I+LG] for I in range(PAIRS))
        AVS.append(SUM/PAIRS)
    return AVS

def all_ac_values_of(PS, LAG):
    """Get all ac values of protein sequence.

    Concatenates ac_values_of(PS, name, LAG) for every property name in
    PCPNS, in PCPNS order, giving len(PCPNS) * LAG values.
    """
    AAVS = []
    # PCPNS is the declared list of property names.  Do not iterate the
    # module-level name PCPS here: it only exists as a leftover loop variable
    # of the table-normalization code and would silently change meaning if
    # that code were reordered.
    for PCP in PCPNS:
        AAVS.extend(ac_values_of(PS, PCP, LAG))
    return AAVS

def ac_code_of(PS):
    """Get ac code of protein sequence.

    Thin wrapper: returns all_ac_values_of(PS, 30), i.e. the values for
    lags 1..30 of every property, without further normalization.
    """
    MAX_LAG = 30
    return all_ac_values_of(PS, MAX_LAG)



### Pseudo amino acid composition

In [None]:
def paac(str_, lambda_=0, weight=0.05):
  """Pseudo amino acid composition of a protein sequence.

  Args:
    str_: protein sequence as a string of one-letter residue codes.
    lambda_: number of sequence-order correlation factors (lags 1..lambda_)
      appended to the composition.  With the default 0 only the 20
      composition values are returned.
    weight: weighting factor of the correlation factors (0.05 is the value
      that used to be hard-coded).

  Returns:
    A list of 20 + lambda_ floats.  The first 20 are the residue counts
    divided by len(str_) (order: A R N D C E Q G H I L K M F P S T W Y V),
    the rest are the weighted correlation factors; both parts are divided
    by ``1 + weight * sum(correlation factors)``.

  Characters outside the 20 standard residues count towards len(str_) but
  not towards any composition entry, and residue pairs that involve such a
  character are left out of the correlation factors (looking them up would
  index the property table at -1, i.e. use V's values).  A lag with no
  usable pair, or an empty sequence, contributes 0.0 instead of NaN.
  """
  amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
                 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
  # Three property values per residue.
  # NOTE(review): columns look like hydrophobicity, hydrophilicity and
  # side-chain mass -- confirm against the source of the table.
  properties = {
      'A': [0.62, -0.5, 15],
      'R': [-2.53, 3, 101],
      'N': [-0.78, 0.2, 58],
      'D': [-0.9, 3, 59],
      'C': [0.29, -1, 47],
      'E': [-0.74, 3, 73],
      'Q': [-0.85, 0.2, 72],
      'G': [0.48, 0, 1],
      'H': [-0.4, -0.5, 82],
      'I': [1.38, -1.8, 57],
      'L': [1.06, -1.8, 57],
      'K': [-1.5, 3, 73],
      'M': [0.64, -1.3, 75],
      'F': [1.19, -2.5, 91],
      'P': [0.12, 0, 42],
      'S': [-0.18, 0.3, 31],
      'T': [-0.05, -0.4, 45],
      'W': [0.81, -3.4, 130],
      'Y': [0.26, -2.3, 107],
      'V': [1.08, -1.5, 43],
  }
  len_ = len(str_)

  # Normalization: z-score each property across the 20 residues.
  rows = []
  for p in range(3):
    column = np.array([properties[aa][p] for aa in amino_acids], dtype=float)
    rows.append((column - np.mean(column)) / np.std(column))
  prop_matrix = np.vstack(rows)  # shape (3, 20)

  # Residue index per position, -1 for anything outside the 20 residues.
  index_of = {aa: k for k, aa in enumerate(amino_acids)}
  idx = np.array([index_of.get(ch, -1) for ch in str_], dtype=int)

  counts = np.bincount(idx[idx >= 0], minlength=20).astype(float)
  f = counts / len_ if len_ else counts

  # theta[lag-1]: mean over all position pairs (j, j+lag) of the mean squared
  # property difference between the two residues.
  theta = np.zeros(lambda_)
  for lag in range(1, lambda_ + 1):
    left, right = idx[:-lag], idx[lag:]
    both_known = (left >= 0) & (right >= 0)
    if both_known.any():
      diff = prop_matrix[:, left[both_known]] - prop_matrix[:, right[both_known]]
      theta[lag - 1] = np.mean(np.mean(diff ** 2, axis=0))

  denom = 1 + weight * np.sum(theta)
  return np.hstack((f / denom, weight * theta / denom)).tolist()


# 23 dimension paac vector
# paac(seq, 3)

### Amino acid composition

In [None]:
def aac(seq):
  """Amino acid composition of a sequence.

  Returns a list of 20 floats: the count of each standard residue (order
  A R N D C E Q G H I L K M F P S T W Y V) divided by the number of
  standard residues in ``seq``.  Other characters are ignored.  A sequence
  with no standard residue at all yields 20 zeros instead of a division by
  zero.
  """
  aa_list = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',  'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
  # Count every residue once and reuse the counts for total and ratios.
  counts = [seq.count(aa) for aa in aa_list]
  total = sum(counts)
  if total == 0:
    return [0.0] * len(aa_list)
  return [count / total for count in counts]

### Concatenate features

In [None]:
def encode_seq(seq):
  """Encode one protein sequence as a flat feature vector.

  The vector is paac(seq) followed by ct_code_of(seq), returned as a 1-D
  numpy array.
  """
  features = np.array(paac(seq) + ct_code_of(seq))
  return features.reshape(-1, )

### Download datasets and embeddings

In [None]:
# Pan-2010 pair list and id -> sequence dictionary.
# NOTE(review): not read by any cell visible in this part of the notebook.
!wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Golden-standard-datasets/Pan-2010/pan_pairs.tsv
!wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Golden-standard-datasets/Pan-2010/pan_dict.tsv

# Download file seq2tensor.py for converting protein sequences to tensors
# (the import cell at the top of the notebook fetches the same file).
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/seq2tensor.py

# D-SCRIPT human pairs and sequence dictionary; these are the files the
# training cell reads as ds_file / id2seq_file.
!wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Dscript-data/pairs/human_train.tsv
!wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Dscript-data/seqs/human_dict.tsv

# Download file ac5_aph.txt for ac5_aph embedding
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/ac5_aph.txt



--2022-12-21 16:29:38--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Golden-standard-datasets/Pan-2010/pan_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 990256 (967K) [text/plain]
Saving to: ‘pan_pairs.tsv’


2022-12-21 16:29:38 (16.4 MB/s) - ‘pan_pairs.tsv’ saved [990256/990256]

--2022-12-21 16:29:39--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Golden-standard-datasets/Pan-2010/pan_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5012967 (4.8M) [text/plain]
Saving to: ‘pan_

### Architecture of FSNN

In [None]:
def fsnn():
    """Build and compile the FSNN pair classifier.

    The network has two inputs, ``Protein_a`` and ``Protein_b``, each of width
    ``FEATURE_NUM`` (a module-level value that must be set before calling).
    Each input goes through three 256-unit Dense layers -- one on the raw
    features, one on ``cos(pi * x)`` and one on ``sin(pi * x)`` -- whose
    dropped-out outputs are summed and passed to a 128-unit Dense layer.
    The three 256-unit layers are the same layer objects for both inputs, so
    their weights are shared; the 128-unit layers are separate per input.

    The two 128-d branch outputs are multiplied element-wise, min-max
    scaled, and fed to a 64-unit Dense layer (``Merged_feature_1``) followed
    by a single sigmoid unit.

    Returns:
        A compiled ``Model`` (binary cross-entropy loss, SGD optimizer,
        accuracy metric).
    """
    # Created once and applied to both inputs below => shared weights, even
    # though the layer names carry the "ProA" prefix.
    d1 = Dense(256, activation='relu', kernel_initializer='glorot_normal', name='ProA_feature_1', kernel_regularizer=l2(0.01))
    d2 = Dense(256, activation='relu', kernel_initializer='glorot_normal', name='ProA_feature_1_cos', kernel_regularizer=l2(0.01))
    d3 = Dense(256, activation='relu', kernel_initializer='glorot_normal', name='ProA_feature_1_sin', kernel_regularizer=l2(0.01))

    ########################################################"Channel-1" ########################################################
    input_1 = Input(shape=(FEATURE_NUM, ), name='Protein_a')
    p1 = d1(input_1)
    p1 = Dropout(.2)(p1)
    p1_cos = d2(tf.math.cos(math.pi*input_1))
    p1_cos = Dropout(.2)(p1_cos)
    p1_sin = d3(tf.math.sin(math.pi*input_1))
    p1_sin = Dropout(.2)(p1_sin)
    p1 = p1 + p1_cos + p1_sin
    p1 = Dense(128, activation='relu', kernel_initializer='glorot_normal', name='ProA_feature_12', kernel_regularizer=l2(0.01))(p1)
    p1 = Dropout(.2)(p1)

    ########################################################"Channel-2" ########################################################
    input_2 = Input(shape=(FEATURE_NUM, ), name='Protein_b')
    p2 = d1(input_2)
    p2 = Dropout(.2)(p2)
    p2_cos = d2(tf.math.cos(math.pi*input_2))
    p2_cos = Dropout(.2)(p2_cos)
    p2_sin = d3(tf.math.sin(math.pi*input_2))
    p2_sin = Dropout(.2)(p2_sin)
    p2 = p2 + p2_sin + p2_cos
    p2 = Dense(128, activation='relu', kernel_initializer='glorot_normal', name='ProB_feature_12', kernel_regularizer=l2(0.01))(p2)
    p2 = Dropout(.2)(p2)

    ##################################### Merge Abstraction features ##################################################

    # Hadamard multiplication
    merged = tf.keras.layers.multiply([p1,p2], name='merged_protein1_2')

    # Min-max scaling.  reduce_min / reduce_max are called without an axis,
    # so they reduce over the whole tensor, batch dimension included.
    # NOTE(review): a sample's scaled features therefore depend on the other
    # samples in the same batch; left unchanged because trained weights
    # expect this scaling.
    # divide_no_nan yields 0 where max == min (all merged values equal)
    # instead of the NaN a plain 0/0 division would feed into the loss.
    merged = tf.math.divide_no_nan(
        tf.subtract(merged, tf.reduce_min(merged)),
        tf.subtract(tf.reduce_max(merged), tf.reduce_min(merged)),
    )

    ##################################### Prediction Module ##########################################################

    pre_output = Dense(64, activation='relu', kernel_initializer='glorot_normal', name='Merged_feature_1')(merged)
    pre_output=Dropout(0.2)(pre_output)

    output = Dense(1, activation='sigmoid', name='output')(pre_output)
    model = Model(inputs=[input_1, input_2], outputs=output)

    # NOTE(review): the `decay` argument is not accepted by every Keras
    # optimizer implementation -- confirm against the installed version.
    sgd = SGD(learning_rate=0.01, momentum=0.9, decay=0.001)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model


def dnn():
    """Build and compile the plain two-channel DNN baseline.

    Each input (``Protein_a`` / ``Protein_b``, width ``FEATURE_NUM`` read
    from module level) runs through its own stack of Dense layers with 512,
    256, 128 and 64 units, each followed by Dropout(0.2).  The two 64-d
    outputs are concatenated and passed through Dense layers of 64, 32 and
    16 units, one Dropout(0.2), and a sigmoid output unit.

    Returns:
        A compiled ``Model`` (binary cross-entropy loss, SGD optimizer,
        accuracy metric).
    """

    def build_channel(tensor, layer_specs):
        # One Dense + Dropout pair per (units, layer name) entry, in order.
        for units, layer_name in layer_specs:
            tensor = Dense(units, activation='relu', kernel_initializer='glorot_normal',
                           name=layer_name, kernel_regularizer=l2(0.01))(tensor)
            tensor = Dropout(0.2)(tensor)
        return tensor

    ########################################################"Channel-1" ########################################################
    input_1 = Input(shape=(FEATURE_NUM, ), name='Protein_a')
    channel_a = build_channel(input_1, [
        (512, 'ProA_feature_1'),
        (256, 'ProA_feature_2'),
        (128, 'ProA_feature_3'),
        (64, 'ProA_feature_4'),
    ])

    ########################################################"Channel-2" ########################################################
    input_2 = Input(shape=(FEATURE_NUM, ), name='Protein_b')
    channel_b = build_channel(input_2, [
        (512, 'ProB_feature_1'),
        (256, 'ProB_feature_2'),
        (128, 'ProB_feature_3'),
        (64, 'ProB_feature_4'),
    ])

    ##################################### Merge Abstraction features ##################################################
    merged = concatenate([channel_a, channel_b], name='merged_protein1_2')

    ##################################### Prediction Module ##########################################################
    pre_output = merged
    for units, layer_name in [
        (64, 'Merged_feature_1'),
        (32, 'Merged_feature_2'),
        (16, 'Merged_feature_3'),
    ]:
        pre_output = Dense(units, activation='relu', kernel_initializer='glorot_normal',
                           name=layer_name)(pre_output)
    pre_output = Dropout(0.2)(pre_output)

    output = Dense(1, activation='sigmoid', name='output')(pre_output)
    model = Model(inputs=[input_1, input_2], outputs=output)

    sgd = SGD(learning_rate=0.01, momentum=0.9, decay=0.001)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

### Training phase 

#### Read training dataset (D-SCRIPT human pairs: `human_train.tsv`, `human_dict.tsv`)

In [None]:
from tqdm import tqdm
import pdb

# Input files and column layout of the pair file:
#   ds_file      -- tab-separated pairs: protein id 1, protein id 2, label
#   id2seq_file  -- tab-separated dictionary: protein id, sequence
ds_file = 'human_train.tsv'
label_index = 2
id2seq_file = 'human_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1

# seqs collects every sequence of the dictionary file in file order;
# id2index maps a protein id to the position of its sequence in seqs.
# The file is opened in a `with` block so the handle is closed afterwards.
with open(id2seq_file) as id2seq_handle:
    for line in id2seq_handle:
        line = line.strip().split('\t')
        id2index[line[0]] = index
        seqs.append(line[1])
        index += 1

seq_array = []
id2_aid = {}
sid = 0
max_data = -1
limit_data = max_data > 0   # max_data <= 0 means "read every pair"
raw_data = []
skip_head = False           # set to True if the pair file has a header row
x = None
count = 0

# seq_array: sequences of the proteins that occur in the pair file, in order
#            of first appearance; id2_aid maps protein id -> index in seq_array.
# raw_data:  one row per pair with the two ids replaced by those indices;
#            pairs with an id missing from the dictionary are skipped.
with open(ds_file) as ds_handle:
    for line in tqdm(ds_handle):
        # pdb.set_trace()
        if skip_head:
            skip_head = False
            continue
        line = line.rstrip('\n').rstrip('\r').split('\t')
        if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
            continue
        if id2_aid.get(line[sid1_index]) is None:
            id2_aid[line[sid1_index]] = sid
            sid += 1
            seq_array.append(seqs[id2index[line[sid1_index]]])
        line[sid1_index] = id2_aid[line[sid1_index]]
        if id2_aid.get(line[sid2_index]) is None:
            id2_aid[line[sid2_index]] = sid
            sid += 1
            seq_array.append(seqs[id2index[line[sid2_index]]])
        line[sid2_index] = id2_aid[line[sid2_index]]
        raw_data.append(line)
        if limit_data:
            count += 1
            if count >= max_data:
                break

# NOTE(review): len(line.split()) counts whitespace-separated tokens, which
# is 1 for a sequence without spaces -- confirm whether len(line) was meant.
# These statistics are not used by the rest of this cell.
len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)

# Shuffle the pair rows with a fixed seed
np.random.seed(42)
np.random.shuffle(raw_data)
# One feature vector per distinct protein
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Create class labels
class_labels = np.zeros(len(raw_data))
for i in range(len(raw_data)):
    class_labels[i] = float(raw_data[i][label_index])
class_labels

421792it [00:01, 294052.93it/s]
100%|██████████| 15816/15816 [00:40<00:00, 393.64it/s]
100%|██████████| 421792/421792 [00:00<00:00, 1977832.69it/s]
100%|██████████| 421792/421792 [00:00<00:00, 2025575.97it/s]


array([0., 0., 0., ..., 0., 0., 0.])

#### Train FSNN

In [None]:
##################################### Assemble pair matrix and train FSNN ##########################################################

FEATURE_NUM = seq_tensor.shape[1]
pair_width = FEATURE_NUM * 2

# One row per pair: features of protein 1, features of protein 2, label.
# The rows are then shuffled.
df = pd.DataFrame(
    np.hstack([
        seq_tensor[seq_index1],
        seq_tensor[seq_index2],
        class_labels.reshape(-1, 1),
    ])
).sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:, :pair_width].values
y_train = df.iloc[:, pair_width:].values

# Standardize all pair features; the fitted scaler is kept for later reuse
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

# First half of the columns belongs to protein 1, second half to protein 2
X1_train, X2_train = X[:, :FEATURE_NUM], X[:, FEATURE_NUM:]

model = fsnn()
model.fit([X1_train, X2_train], y_train, epochs=50, batch_size=64, verbose=1)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8a0f2d5f40>

In [None]:
# Score the trained network on the data it was trained on and report
# threshold metrics (at 0.5, via np.round) plus AUROC / AUPRC.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_pred = model.predict([X1_train, X2_train])
y_true = y_train
# labels=[0, 1] pins the matrix to 2x2 (rows: true 0/1, columns: predicted
# 0/1) even when one class is absent from both y_true and the predictions;
# otherwise the matrix could be 1x1 and cm1[1,1] would raise IndexError.
cm1 = confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec = (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec = cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec = cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  # roc_auc_score raises ValueError when y_true holds a single class
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 0.9851751574235642, precision: 0.9439694529758446, recall: 0.8897350302524515, specificity: 0.9947189710208425, f1-score: 0.9160502114519702, mcc: 0.9084057698303128, auroc: 0.9976192139095166, auprc: 0.979436682225439 
0.9851751574235642	0.9439694529758446	0.8897350302524515	0.9947189710208425	0.9160502114519702	0.9084057698303128	0.9976192139095166	0.979436682225439



#### Extract hidden layer of FSNN

In [None]:
################################Intermediate Layer prediction (Abstraction features extraction)######################################

# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)

# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Width of the extracted features, taken from the layer output so the column
# split below follows the layer size instead of a hard-coded 64.
n_abstract_features = p_merge.shape[1]

# create dataframe use transformed pairs matrix outputs and labels
X_train_feat=pd.concat((p_merge,pd.DataFrame(y_train)),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels, then shuffle rows
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Feature columns first, label column last
X=Train.iloc[:,0:n_abstract_features].values
y=Train.iloc[:,n_abstract_features:].values.ravel()

extracted_df=X_train_feat

# Scale the extracted features; the fitted scaler is kept in robust_scaler
robust_scaler=RobustScaler()
# scaler=MinMaxScaler()
# scaler = StandardScaler()
X=robust_scaler.fit_transform(X)





#### Train LGBM

In [None]:
# Fit the gradient-boosting classifier on the scaled hidden-layer features
# and report its metrics on the same (training) data.
# NOTE(review): `gamma` is passed through as an extra keyword; confirm that
# LightGBM actually uses a parameter of that name.
model_=LGBMClassifier(learning_rate=.2, gamma=0, max_depth=10, n_estimators=1000)
model_.fit(X, y)
print("============================= INFER BY TRAINED HYBRID MODEL ON TRAINING SET ==============================")
y_true=y
y_pred = model_.predict(X)
# AUROC / AUPRC are ranking metrics and need continuous scores; feeding them
# the hard labels from predict() collapses each curve to a single operating
# point.  Column 1 of predict_proba is the probability of model_.classes_[1].
y_score = model_.predict_proba(X)[:, 1]
# labels=[0, 1] keeps the confusion matrix 2x2 even if a class is missing.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

try:
  # roc_auc_score raises ValueError when y_true holds a single class
  auc = metrics.roc_auc_score(y_true, y_score)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 1.0, precision: 1.0, recall: 1.0, specificity: 1.0, f1-score: 1.0, mcc: 1.0, auroc: 1.0, auprc: 1.0 
1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0



## Inference phase

## Evaluation on intra-species datasets

### HPRD

#### Read dataset

In [None]:
id2seq_file = 'hprd_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'hprd_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/Human-sets/HPRD/hprd_pairs.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/Human-sets/HPRD/hprd_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:08:33--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/Human-sets/HPRD/hprd_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56256 (55K) [text/plain]
Saving to: ‘hprd_pairs.tsv’


2022-12-17 17:08:34 (70.7 MB/s) - ‘hprd_pairs.tsv’ saved [56256/56256]

--2022-12-17 17:08:34--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/Human-sets/HPRD/hprd_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2428937 (2.3M) [text/plain]
Saving to: ‘

3516it [00:00, 380474.02it/s]
100%|██████████| 2747/2747 [00:08<00:00, 330.81it/s]
100%|██████████| 3516/3516 [00:00<00:00, 1734349.39it/s]
100%|██████████| 3516/3516 [00:00<00:00, 1806144.87it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained network `model` on the pair set prepared by the preceding
# "Read dataset" cell (seq_tensor, seq_index1, seq_index2, class_labels).

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner says "TRAINING SET" although this cell scores whichever
# pair set the preceding read cell loaded; text kept unchanged so logs stay comparable.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.36888509670079633, precision: 1.0, recall: 0.36888509670079633, specificity: nan, f1-score: 0.5389569914814045, mcc: 0.0, auroc: nan, auprc: 1.0 
0.36888509670079633	1.0	0.36888509670079633	nan	0.5389569914814045	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the second-stage model `model_` on features extracted by the trained
# network `model` from the pair set prepared by the preceding read cell.

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Transform the pair matrix into that layer's activations
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through 'X_train.csv' (kept from the original workflow).
# NOTE(review): this overwrites any existing X_train.csv with this evaluation set.
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Width of the extracted representation, taken from the data instead of a hard-coded 64
n_merged = p_merge.shape[1]
X=Train.iloc[:,:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted features
# NOTE(review): the code below treats model_.predict output as scores to be rounded;
# confirm model_ returns probabilities rather than hard class labels.
y_true=y
y_pred = model_.predict(X)
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.41439135381114905, precision: 1.0, recall: 0.41439135381114905, specificity: nan, f1-score: 0.5859642067162679, mcc: 0.0, auroc: nan, auprc: 1.0 
0.41439135381114905	1.0	0.41439135381114905	nan	0.5859642067162679	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


### DIP

#### Read dataset

In [None]:
id2seq_file = 'dip_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'dip_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/DIP/dip_pairs.tsv
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/DIP/dip_dict.tsv



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:08:44--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/DIP/dip_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23488 (23K) [text/plain]
Saving to: ‘dip_pairs.tsv’


2022-12-17 17:08:44 (88.2 MB/s) - ‘dip_pairs.tsv’ saved [23488/23488]

--2022-12-17 17:08:45--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/DIP/dip_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1262993 (1.2M) [text/plain]
Saving to: ‘dip_

1468it [00:00, 199327.88it/s]
100%|██████████| 1312/1312 [00:04<00:00, 321.62it/s]
100%|██████████| 1468/1468 [00:00<00:00, 1072315.97it/s]
100%|██████████| 1468/1468 [00:00<00:00, 1159992.14it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained network `model` on the pair set prepared by the preceding
# "Read dataset" cell (seq_tensor, seq_index1, seq_index2, class_labels).

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner says "TRAINING SET" although this cell scores whichever
# pair set the preceding read cell loaded; text kept unchanged so logs stay comparable.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")




  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


accuracy: 0.44618528610354224, precision: 1.0, recall: 0.44618528610354224, specificity: nan, f1-score: 0.6170513424399435, mcc: 0.0, auroc: nan, auprc: 1.0 
0.44618528610354224	1.0	0.44618528610354224	nan	0.6170513424399435	0.0	 nan	1.0



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the second-stage model `model_` on features extracted by the trained
# network `model` from the pair set prepared by the preceding read cell.

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Transform the pair matrix into that layer's activations
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through 'X_train.csv' (kept from the original workflow).
# NOTE(review): this overwrites any existing X_train.csv with this evaluation set.
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Width of the extracted representation, taken from the data instead of a hard-coded 64
n_merged = p_merge.shape[1]
X=Train.iloc[:,:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted features
# NOTE(review): the code below treats model_.predict output as scores to be rounded;
# confirm model_ returns probabilities rather than hard class labels.
y_true=y
y_pred = model_.predict(X)
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.4925068119891008, precision: 1.0, recall: 0.4925068119891008, specificity: nan, f1-score: 0.6599726152441807, mcc: 0.0, auroc: nan, auprc: 1.0 
0.4925068119891008	1.0	0.4925068119891008	nan	0.6599726152441807	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


### HIPPIE HQ

#### Read dataset

In [None]:
id2seq_file = 'hiphq_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'hiphq_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/HQ/hiphq_dict.tsv
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/HQ/hiphq_pairs.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:08:50--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/HQ/hiphq_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4151340 (4.0M) [text/plain]
Saving to: ‘hiphq_dict.tsv’


2022-12-17 17:08:50 (303 MB/s) - ‘hiphq_dict.tsv’ saved [4151340/4151340]

--2022-12-17 17:08:50--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/HQ/hiphq_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 247824 (242K) [text/

15489it [00:00, 419370.83it/s]
100%|██████████| 5517/5517 [00:21<00:00, 253.76it/s]
100%|██████████| 15489/15489 [00:00<00:00, 1826620.22it/s]
100%|██████████| 15489/15489 [00:00<00:00, 2163571.94it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained network `model` on the pair set prepared by the preceding
# "Read dataset" cell (seq_tensor, seq_index1, seq_index2, class_labels).

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner says "TRAINING SET" although this cell scores whichever
# pair set the preceding read cell loaded; text kept unchanged so logs stay comparable.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.4125508425334108, precision: 1.0, recall: 0.4125508425334108, specificity: nan, f1-score: 0.5841217605923488, mcc: 0.0, auroc: nan, auprc: 1.0 
0.4125508425334108	1.0	0.4125508425334108	nan	0.5841217605923488	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the second-stage model `model_` on features extracted by the trained
# network `model` from the pair set prepared by the preceding read cell.

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Transform the pair matrix into that layer's activations
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through 'X_train.csv' (kept from the original workflow).
# NOTE(review): this overwrites any existing X_train.csv with this evaluation set.
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Width of the extracted representation, taken from the data instead of a hard-coded 64
n_merged = p_merge.shape[1]
X=Train.iloc[:,:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted features
# NOTE(review): the code below treats model_.predict output as scores to be rounded;
# confirm model_ returns probabilities rather than hard class labels.
y_true=y
y_pred = model_.predict(X)
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")



### HIPPIE LQ

#### Read dataset

In [None]:
id2seq_file = 'hiplq_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'hiplq_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/LQ/hiplq_dict.tsv
  !wget https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/LQ/hiplq_pairs.tsv



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:09:16--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/LQ/hiplq_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6837915 (6.5M) [text/plain]
Saving to: ‘hiplq_dict.tsv’


2022-12-17 17:09:17 (377 MB/s) - ‘hiplq_dict.tsv’ saved [6837915/6837915]

--2022-12-17 17:09:17--  https://raw.githubusercontent.com/anhvt00/S-HNBM/master/data/Independent-testsets/Human-sets/HIPPIE/LQ/hiplq_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1626980 (1.6M) [text

101684it [00:00, 210856.33it/s]
100%|██████████| 10011/10011 [00:38<00:00, 257.39it/s]
100%|██████████| 101684/101684 [00:00<00:00, 1743294.89it/s]
100%|██████████| 101684/101684 [00:00<00:00, 1820288.55it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained network `model` on the pair set prepared by the preceding
# "Read dataset" cell (seq_tensor, seq_index1, seq_index2, class_labels).

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner says "TRAINING SET" although this cell scores whichever
# pair set the preceding read cell loaded; text kept unchanged so logs stay comparable.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.34825537941072343, precision: 1.0, recall: 0.34825537941072343, specificity: nan, f1-score: 0.5166015055143841, mcc: 0.0, auroc: nan, auprc: 1.0 
0.34825537941072343	1.0	0.34825537941072343	nan	0.5166015055143841	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the second-stage model `model_` on features extracted by the trained
# network `model` from the pair set prepared by the preceding read cell.

def _ratio(num, den):
  """Return num / den, or NaN when den is zero (the metric is undefined)."""
  return num / den if den else float('nan')

FEATURE_NUM = seq_tensor.shape[1]
# Each row: [features of 1st protein | features of 2nd protein | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Split the frame back into the pair matrix and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Apply the scaler fitted in another cell; it is not re-fitted here
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Transform the pair matrix into that layer's activations
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through 'X_train.csv' (kept from the original workflow).
# NOTE(review): this overwrites any existing X_train.csv with this evaluation set.
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Width of the extracted representation, taken from the data instead of a hard-coded 64
n_merged = p_merge.shape[1]
X=Train.iloc[:,:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted features
# NOTE(review): the code below treats model_.predict output as scores to be rounded;
# confirm model_ returns probabilities rather than hard class labels.
y_true=y
y_pred = model_.predict(X)
y_hard = np.round(y_pred)  # thresholded predictions, computed once

# labels=[0, 1] pins the matrix to 2x2 even when the data or the predictions
# contain a single class; without it cm1[1,1] can raise IndexError.
cm1 = confusion_matrix(y_true, y_hard, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()

acc = _ratio(tn + tp, tn + fp + fn + tp)
spec = _ratio(tn, tn + fp)   # NaN (no RuntimeWarning) when there are no negatives
sens = _ratio(tp, fn + tp)
prec = _ratio(tp, tp + fp)
rec = sens                   # recall and sensitivity are the same quantity
f1 = _ratio(2 * (prec * rec), prec + rec)
mcc = matthews_corrcoef(y_true, y_hard)

prc = metrics.average_precision_score(y_true, y_pred)

try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  auc_field = str(auc)
except ValueError:
  # AUROC is undefined with a single class; reset `auc` so a value left over
  # from an earlier cell is not mistaken for this run's result.
  auc = float('nan')
  auc_field = " nan"  # leading space kept to match the original tab-separated output

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.3803744935289721, precision: 1.0, recall: 0.3803744935289721, specificity: nan, f1-score: 0.551117823912455, mcc: 0.0, auroc: nan, auprc: 1.0 
0.3803744935289721	1.0	0.3803744935289721	nan	0.551117823912455	0.0	 nan	1.0



  spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])


## Evaluation on cross-species datasets

### E. coli

#### Read dataset

In [None]:
id2seq_file = 'ecoli_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'ecoli_test.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/ecoli_test.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/ecoli_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:10:20--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/ecoli_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 768468 (750K) [text/plain]
Saving to: ‘ecoli_test.tsv’


2022-12-17 17:10:21 (111 MB/s) - ‘ecoli_test.tsv’ saved [768468/768468]

--2022-12-17 17:10:21--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/ecoli_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5395789 (5.1M) [text/plain]
S

22000it [00:00, 404300.38it/s]
100%|██████████| 7138/7138 [00:13<00:00, 535.10it/s]
100%|██████████| 22000/22000 [00:00<00:00, 1751707.35it/s]
100%|██████████| 22000/22000 [00:00<00:00, 1977893.98it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained neural network (`model`) on the E. coli pairs.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET" although ds_file for this
# section is a *_test.tsv file; the text is left unchanged on purpose.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Hard 0/1 decisions obtained by rounding the network output.
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.8594545454545455, precision: 0.20613562970936491, recall: 0.1915, specificity: 0.92625, f1-score: 0.19854847071021253, mcc: 0.12173536460387874, auroc: 0.6732481000000001, auprc: 0.15631856627686794 
0.8594545454545455	0.20613562970936491	0.1915	0.92625	0.19854847071021253	0.12173536460387874	0.6732481000000001	0.15631856627686794



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the FSNN-LGBM pipeline on the E. coli pairs: the neural network
# (`model`) produces the 'Merged_feature_1' representation of every pair and
# the second-stage model (`model_`) predicts from that representation.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Despite the name, these are the labels of the pairs evaluated in this cell.
Trainlabels = y

# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Representation of every pair at that layer
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the representation; the label is stored right after these columns.
n_merged = p_merge.shape[1]

# Representation columns followed by one label column
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Round-trip through a CSV file.
# NOTE(review): the name 'X_train.csv' suggests training features, yet this
# cell writes evaluation features to it - confirm nothing else relies on the
# file's previous contents.
X_train_feat.to_csv('X_train.csv',header=False, index=False)

Train = pd.read_csv("X_train.csv",header=None)
Train = Train.sample(frac=1)

# Split by the actual width of the representation instead of a literal
# column count, so the label is always exactly the trailing column.
X = Train.iloc[:,:n_merged].values
y = Train.iloc[:,n_merged:].values.ravel()

extracted_df = X_train_feat

# robust_scaler is defined outside this cell; it is only applied here.
X = robust_scaler.transform(X)

y_true = y
# NOTE(review): rounding assumes model_.predict returns scores in [0, 1]
# rather than class labels - confirm against where model_ is built.
y_pred = model_.predict(X)
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.8146818181818182, precision: 0.15021892893230043, recall: 0.223, specificity: 0.87385, f1-score: 0.17951298047896957, mcc: 0.08148801096905704, auroc: 0.548425, auprc: 0.10413518478826664 
0.8146818181818182	0.15021892893230043	0.223	0.87385	0.17951298047896957	0.08148801096905704	0.548425	0.10413518478826664



### Fly

#### Read dataset

In [None]:
id2seq_file = 'fly_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'fly_test.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/fly_test.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/fly_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:10:40--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/fly_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1980000 (1.9M) [text/plain]
Saving to: ‘fly_test.tsv’


2022-12-17 17:10:41 (276 MB/s) - ‘fly_test.tsv’ saved [1980000/1980000]

--2022-12-17 17:10:41--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/fly_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7839133 (7.5M) [text/plain]
Saving

55000it [00:00, 357257.80it/s]
100%|██████████| 19213/19213 [00:44<00:00, 427.85it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1847000.91it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1982440.77it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained neural network (`model`) on the fly pairs.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET" although ds_file for this
# section is a *_test.tsv file; the text is left unchanged on purpose.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Hard 0/1 decisions obtained by rounding the network output.
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.8688363636363636, precision: 0.2612165660051769, recall: 0.2422, specificity: 0.9315, f1-score: 0.25134910751349104, mcc: 0.17973721227928793, auroc: 0.707428478, auprc: 0.20583744604505744 
0.8688363636363636	0.2612165660051769	0.2422	0.9315	0.25134910751349104	0.17973721227928793	0.707428478	0.20583744604505744



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the FSNN-LGBM pipeline on the fly pairs: the neural network (`model`)
# produces the 'Merged_feature_1' representation of every pair and the
# second-stage model (`model_`) predicts from that representation.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Despite the name, these are the labels of the pairs evaluated in this cell.
Trainlabels = y

# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Representation of every pair at that layer
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the representation; the label is stored right after these columns.
n_merged = p_merge.shape[1]

# Representation columns followed by one label column
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Round-trip through a CSV file.
# NOTE(review): the name 'X_train.csv' suggests training features, yet this
# cell writes evaluation features to it - confirm nothing else relies on the
# file's previous contents.
X_train_feat.to_csv('X_train.csv',header=False, index=False)

Train = pd.read_csv("X_train.csv",header=None)
Train = Train.sample(frac=1)

# Split by the actual width of the representation instead of a literal
# column count, so the label is always exactly the trailing column.
X = Train.iloc[:,:n_merged].values
y = Train.iloc[:,n_merged:].values.ravel()

extracted_df = X_train_feat

# robust_scaler is defined outside this cell; it is only applied here.
X = robust_scaler.transform(X)

y_true = y
# NOTE(review): rounding assumes model_.predict returns scores in [0, 1]
# rather than class labels - confirm against where model_ is built.
y_pred = model_.predict(X)
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.8284909090909091, precision: 0.20458483273357322, recall: 0.307, specificity: 0.88064, f1-score: 0.2455410701431656, mcc: 0.15716102969112408, auroc: 0.5938199999999999, auprc: 0.125807543649207 
0.8284909090909091	0.20458483273357322	0.307	0.88064	0.2455410701431656	0.15716102969112408	0.5938199999999999	0.125807543649207



### Mouse

#### Read dataset

In [None]:
id2seq_file = 'mouse_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'mouse_test.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/mouse_test.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/mouse_dict.tsv

# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:11:39--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/mouse_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2860000 (2.7M) [text/plain]
Saving to: ‘mouse_test.tsv’


2022-12-17 17:11:40 (252 MB/s) - ‘mouse_test.tsv’ saved [2860000/2860000]

--2022-12-17 17:11:40--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/mouse_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16989035 (16M) [text/plain

55000it [00:00, 148263.96it/s]
100%|██████████| 37497/37497 [01:28<00:00, 425.19it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1730790.79it/s]
100%|██████████| 55000/55000 [00:00<00:00, 2011042.80it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained neural network (`model`) on the mouse pairs.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET" although ds_file for this
# section is a *_test.tsv file; the text is left unchanged on purpose.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Hard 0/1 decisions obtained by rounding the network output.
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.8713090909090909, precision: 0.346301775147929, recall: 0.4682, specificity: 0.91162, f1-score: 0.39812925170068025, mcc: 0.33256063154399146, auroc: 0.819519356, auprc: 0.3519240403914037 
0.8713090909090909	0.346301775147929	0.4682	0.91162	0.39812925170068025	0.33256063154399146	0.819519356	0.3519240403914037



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Score the FSNN-LGBM pipeline on the mouse pairs: the neural network
# (`model`) produces the 'Merged_feature_1' representation of every pair and
# the second-stage model (`model_`) predicts from that representation.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

# Despite the name, these are the labels of the pairs evaluated in this cell.
Trainlabels = y

# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Representation of every pair at that layer
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the representation; the label is stored right after these columns.
n_merged = p_merge.shape[1]

# Representation columns followed by one label column
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Round-trip through a CSV file.
# NOTE(review): the name 'X_train.csv' suggests training features, yet this
# cell writes evaluation features to it - confirm nothing else relies on the
# file's previous contents.
X_train_feat.to_csv('X_train.csv',header=False, index=False)

Train = pd.read_csv("X_train.csv",header=None)
Train = Train.sample(frac=1)

# Split by the actual width of the representation instead of a literal
# column count, so the label is always exactly the trailing column.
X = Train.iloc[:,:n_merged].values
y = Train.iloc[:,n_merged:].values.ravel()

extracted_df = X_train_feat

# robust_scaler is defined outside this cell; it is only applied here.
X = robust_scaler.transform(X)

y_true = y
# NOTE(review): rounding assumes model_.predict returns scores in [0, 1]
# rather than class labels - confirm against where model_ is built.
y_pred = model_.predict(X)
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.8395272727272727, precision: 0.2879627577033917, recall: 0.5196, specificity: 0.87152, f1-score: 0.3705605477107402, mcc: 0.3036362221199512, auroc: 0.69556, auprc: 0.1932981761754096 
0.8395272727272727	0.2879627577033917	0.5196	0.87152	0.3705605477107402	0.3036362221199512	0.69556	0.1932981761754096



### Worm

#### Read dataset

In [None]:
id2seq_file = 'worm_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'worm_test.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/worm_test.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/worm_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:13:22--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/worm_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1688849 (1.6M) [text/plain]
Saving to: ‘worm_test.tsv’


2022-12-17 17:13:22 (187 MB/s) - ‘worm_test.tsv’ saved [1688849/1688849]

--2022-12-17 17:13:22--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/worm_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9519727 (9.1M) [text/plain]
Sa

55000it [00:00, 328470.38it/s]
100%|██████████| 25429/25429 [00:54<00:00, 468.26it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1707765.86it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1904314.22it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the trained neural network (`model`) on the worm pairs.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Row order does not influence any of the metrics computed below.
df = df.sample(frac=1)

# Split the frame back into the pair features and the label column
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# Only transform here: standard_scaler is defined outside this cell and is
# not refitted on the evaluation data.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET" although ds_file for this
# section is a *_test.tsv file; the text is left unchanged on purpose.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Hard 0/1 decisions obtained by rounding the network output.
y_label = np.round(y_pred)

# labels=[0, 1] pins the matrix to 2x2 even if a class never shows up in
# y_true or in the rounded predictions; without it the cell lookups below
# could raise IndexError.
cm1 = confusion_matrix(y_true, y_label, labels=[0, 1])
true_neg, false_pos, false_neg, true_pos = cm1.ravel()

# A ratio whose denominator is zero is undefined; it is reported as nan
# without emitting a RuntimeWarning.
with np.errstate(divide='ignore', invalid='ignore'):
    acc = (true_neg + true_pos) / (true_neg + false_pos + false_neg + true_pos)
    spec = true_neg / (true_neg + false_pos)
    sens = true_pos / (false_neg + true_pos)
    prec = true_pos / (true_pos + false_pos)
    rec = sens  # recall and sensitivity are the same quantity
    f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
    auc_field = str(auc)
except ValueError:
    # roc_auc_score refused the input; store nan explicitly so `auc` never
    # keeps a stale value from an earlier cell.
    auc = float('nan')
    # The leading space keeps the tab-separated row identical to earlier runs.
    auc_field = " nan"

print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
# Same numbers as one tab-separated row
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.8801272727272728, precision: 0.3017176997759522, recall: 0.2424, specificity: 0.9439, f1-score: 0.26882555173561046, mcc: 0.20583501234419635, auroc: 0.734948372, auprc: 0.21792906924331168 
0.8801272727272728	0.3017176997759522	0.2424	0.9439	0.26882555173561046	0.20583501234419635	0.734948372	0.21792906924331168



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Two-stage evaluation of the currently loaded pairs: the output of the
# network's 'Merged_feature_1' layer is scaled with `robust_scaler` and scored
# by `model_` (both defined in earlier cells).

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

Trainlabels = y

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Predict representation from trained neural network: a sub-model that stops
# at the 'Merged_feature_1' layer.
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train, X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used for the feature/label split
# below instead of a hard-coded column count.
n_extracted = p_merge.shape[1]

# Extracted features followed by the label as the last column.
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Write the table to X_train.csv and read it back; the round-trip is kept so
# the file and the values fed to `model_` stay as before.
X_train_feat.to_csv('X_train.csv', header=False, index=False)
Train = pd.read_csv("X_train.csv", header=None)
Train = Train.sample(frac=1)

X = Train.iloc[:, 0:n_extracted].values
y = Train.iloc[:, n_extracted:].values.ravel()

extracted_df = X_train_feat

X = robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true = y
# AUROC/AUPRC need continuous scores. `predict` on a scikit-learn style
# classifier returns hard labels, which collapses AUROC to
# (recall + specificity) / 2, so use the positive-class probability when the
# estimator offers predict_proba and fall back to predict() otherwise.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  # Last column is taken as the positive class; 1-D output is used as is.
  y_pred = proba[:, -1] if proba.ndim == 2 else proba.ravel()
else:
  y_pred = np.ravel(model_.predict(X))
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

prc = metrics.average_precision_score(y_true, y_pred)

# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 0.8452, precision: 0.2292758089368259, recall: 0.2976, specificity: 0.89996, f1-score: 0.2590078328981723, mcc: 0.17604797587956456, auroc: 0.59878, auprc: 0.13208702619414484 
0.8452	0.2292758089368259	0.2976	0.89996	0.2590078328981723	0.17604797587956456	0.59878	0.13208702619414484



### Yeast

#### Read dataset

In [None]:
id2seq_file = 'yeast_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'yeast_test.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/yeast_test.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/yeast_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:14:30--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/pairs/yeast_test.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1556676 (1.5M) [text/plain]
Saving to: ‘yeast_test.tsv’


2022-12-17 17:14:31 (253 MB/s) - ‘yeast_test.tsv’ saved [1556676/1556676]

--2022-12-17 17:14:31--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/cross-species/seqs/yeast_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2014626 (1.9M) [text/plain

55000it [00:00, 169654.40it/s]
100%|██████████| 5664/5664 [00:11<00:00, 486.35it/s]
100%|██████████| 55000/55000 [00:00<00:00, 1741912.67it/s]
100%|██████████| 55000/55000 [00:00<00:00, 2076743.28it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Evaluate the neural network `model` on the pairs currently held in
# seq_tensor / seq_index1 / seq_index2 / class_labels (built by the preceding
# "Read dataset" cell) and print threshold and ranking metrics.

# Number of feature columns per encoded sequence.
FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
# Row shuffle; the metrics computed below do not depend on row order.
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

# Split the scaled matrix back into the two inputs of the network.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner text says TRAINING SET; confirm that matches the
# dataset currently loaded before quoting these numbers.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent from both
# y_true and y_hat; without it the indexing below would raise IndexError.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

# Ranking metric on the raw (un-rounded) scores.
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")


accuracy: 0.8780363636363636, precision: 0.25641756988020536, recall: 0.1798, specificity: 0.94786, f1-score: 0.21138020221020457, mcc: 0.15022443324352652, auroc: 0.699082788, auprc: 0.18664255060274748 
0.8780363636363636	0.25641756988020536	0.1798	0.94786	0.21138020221020457	0.15022443324352652	0.699082788	0.18664255060274748



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Two-stage evaluation of the currently loaded pairs: the output of the
# network's 'Merged_feature_1' layer is scaled with `robust_scaler` and scored
# by `model_` (both defined in earlier cells).

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

Trainlabels = y

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Predict representation from trained neural network: a sub-model that stops
# at the 'Merged_feature_1' layer.
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train, X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used for the feature/label split
# below instead of a hard-coded column count.
n_extracted = p_merge.shape[1]

# Extracted features followed by the label as the last column.
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Write the table to X_train.csv and read it back; the round-trip is kept so
# the file and the values fed to `model_` stay as before.
X_train_feat.to_csv('X_train.csv', header=False, index=False)
Train = pd.read_csv("X_train.csv", header=None)
Train = Train.sample(frac=1)

X = Train.iloc[:, 0:n_extracted].values
y = Train.iloc[:, n_extracted:].values.ravel()

extracted_df = X_train_feat

X = robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true = y
# AUROC/AUPRC need continuous scores. `predict` on a scikit-learn style
# classifier returns hard labels, which collapses AUROC to
# (recall + specificity) / 2, so use the positive-class probability when the
# estimator offers predict_proba and fall back to predict() otherwise.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  # Last column is taken as the positive class; 1-D output is used as is.
  y_pred = proba[:, -1] if proba.ndim == 2 else proba.ravel()
else:
  y_pred = np.ravel(model_.predict(X))
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

prc = metrics.average_precision_score(y_true, y_pred)

# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 0.8470727272727273, precision: 0.20457301229863156, recall: 0.2362, specificity: 0.90816, f1-score: 0.21925183328692102, mcc: 0.1353986749086399, auroc: 0.5721799999999999, auprc: 0.11775650914130043 
0.8470727272727273	0.20457301229863156	0.2362	0.90816	0.21925183328692102	0.1353986749086399	0.5721799999999999	0.11775650914130043



### Guo-2008

#### Read dataset

In [None]:
id2seq_file = 'guo_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'guo_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Guo-2008/guo_pairs.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Guo-2008/guo_dict.tsv


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:14:56--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Guo-2008/guo_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 179008 (175K) [text/plain]
Saving to: ‘guo_pairs.tsv’


2022-12-17 17:14:56 (52.1 MB/s) - ‘guo_pairs.tsv’ saved [179008/179008]

--2022-12-17 17:14:56--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Guo-2008/guo_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1415079 (1.3M) [text/plain]
Saving to: ‘guo_di

11188it [00:00, 288355.82it/s]
100%|██████████| 2497/2497 [00:08<00:00, 306.32it/s]
100%|██████████| 11188/11188 [00:00<00:00, 1722998.83it/s]
100%|██████████| 11188/11188 [00:00<00:00, 2319962.09it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Evaluate the neural network `model` on the pairs currently held in
# seq_tensor / seq_index1 / seq_index2 / class_labels (built by the preceding
# "Read dataset" cell) and print threshold and ranking metrics.

# Number of feature columns per encoded sequence.
FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
# Row shuffle; the metrics computed below do not depend on row order.
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

# Split the scaled matrix back into the two inputs of the network.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner text says TRAINING SET; confirm that matches the
# dataset currently loaded before quoting these numbers.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent from both
# y_true and y_hat; without it the indexing below would raise IndexError.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

# Ranking metric on the raw (un-rounded) scores.
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")


accuracy: 0.5076868072935288, precision: 0.5458422174840085, recall: 0.09152663568108688, specificity: 0.9238469789059707, f1-score: 0.15676668707899571, mcc: 0.0277354425138481, auroc: 0.5420829387275733, auprc: 0.5323118008797516 
0.5076868072935288	0.5458422174840085	0.09152663568108688	0.9238469789059707	0.15676668707899571	0.0277354425138481	0.5420829387275733	0.5323118008797516



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Two-stage evaluation of the currently loaded pairs: the output of the
# network's 'Merged_feature_1' layer is scaled with `robust_scaler` and scored
# by `model_` (both defined in earlier cells).

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

Trainlabels = y

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Predict representation from trained neural network: a sub-model that stops
# at the 'Merged_feature_1' layer.
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train, X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used for the feature/label split
# below instead of a hard-coded column count.
n_extracted = p_merge.shape[1]

# Extracted features followed by the label as the last column.
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Write the table to X_train.csv and read it back; the round-trip is kept so
# the file and the values fed to `model_` stay as before.
X_train_feat.to_csv('X_train.csv', header=False, index=False)
Train = pd.read_csv("X_train.csv", header=None)
Train = Train.sample(frac=1)

X = Train.iloc[:, 0:n_extracted].values
y = Train.iloc[:, n_extracted:].values.ravel()

extracted_df = X_train_feat

X = robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true = y
# AUROC/AUPRC need continuous scores. `predict` on a scikit-learn style
# classifier returns hard labels, which collapses AUROC to
# (recall + specificity) / 2, so use the positive-class probability when the
# estimator offers predict_proba and fall back to predict() otherwise.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  # Last column is taken as the positive class; 1-D output is used as is.
  y_pred = proba[:, -1] if proba.ndim == 2 else proba.ravel()
else:
  y_pred = np.ravel(model_.predict(X))
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

prc = metrics.average_precision_score(y_true, y_pred)

# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 0.5136753664640686, precision: 0.5524331734064428, recall: 0.14408294601358598, specificity: 0.8832677869145513, f1-score: 0.22855522472706652, mcc: 0.04060969083429286, auroc: 0.5136753664640686, auprc: 0.5075547260932415 
0.5136753664640686	0.5524331734064428	0.14408294601358598	0.8832677869145513	0.22855522472706652	0.04060969083429286	0.5136753664640686	0.5075547260932415



### Martin-2005

#### Read dataset

In [None]:
id2seq_file = 'martin_dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'martin_pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Martin-2005/martin_pairs.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Martin-2005/martin_dict.tsv



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:15:09--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Martin-2005/martin_pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39040 (38K) [text/plain]
Saving to: ‘martin_pairs.tsv’


2022-12-17 17:15:10 (156 MB/s) - ‘martin_pairs.tsv’ saved [39040/39040]

--2022-12-17 17:15:10--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Martin-2005/martin_dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 508100 (496K) [text/plain]
Saving

2878it [00:00, 279704.50it/s]
100%|██████████| 1414/1414 [00:02<00:00, 520.56it/s]
100%|██████████| 2878/2878 [00:00<00:00, 1320700.98it/s]
100%|██████████| 2878/2878 [00:00<00:00, 2013545.77it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Evaluate the neural network `model` on the pairs currently held in
# seq_tensor / seq_index1 / seq_index2 / class_labels (built by the preceding
# "Read dataset" cell) and print threshold and ranking metrics.

# Number of feature columns per encoded sequence.
FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
# Row shuffle; the metrics computed below do not depend on row order.
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

# Split the scaled matrix back into the two inputs of the network.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# NOTE(review): the banner text says TRAINING SET; confirm that matches the
# dataset currently loaded before quoting these numbers.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent from both
# y_true and y_hat; without it the indexing below would raise IndexError.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

# Ranking metric on the raw (un-rounded) scores.
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")


accuracy: 0.4742876997915219, precision: 0.33568904593639576, recall: 0.06690140845070422, specificity: 0.8710562414266118, f1-score: 0.11156782149148563, mcc: -0.10417158582534025, auroc: 0.42339061805676304, auprc: 0.43704281323156763 
0.4742876997915219	0.33568904593639576	0.06690140845070422	0.8710562414266118	0.11156782149148563	-0.10417158582534025	0.42339061805676304	0.43704281323156763



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Two-stage evaluation of the currently loaded pairs: the output of the
# network's 'Merged_feature_1' layer is scaled with `robust_scaler` and scored
# by `model_` (both defined in earlier cells).

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: [protein-1 features | protein-2 features | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1, 1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:, 0:FEATURE_NUM * 2].values
y = df.iloc[:, FEATURE_NUM * 2:].values

Trainlabels = y

# Scale with `standard_scaler` from an earlier cell (transform only; it is not re-fitted here).
X = standard_scaler.transform(X)

X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]

# Predict representation from trained neural network: a sub-model that stops
# at the 'Merged_feature_1' layer.
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train, X2_train])
p_merge = pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used for the feature/label split
# below instead of a hard-coded column count.
n_extracted = p_merge.shape[1]

# Extracted features followed by the label as the last column.
X_train_feat = pd.concat((p_merge, pd.DataFrame(Trainlabels)), axis=1, ignore_index=True)

# Write the table to X_train.csv and read it back; the round-trip is kept so
# the file and the values fed to `model_` stay as before.
X_train_feat.to_csv('X_train.csv', header=False, index=False)
Train = pd.read_csv("X_train.csv", header=None)
Train = Train.sample(frac=1)

X = Train.iloc[:, 0:n_extracted].values
y = Train.iloc[:, n_extracted:].values.ravel()

extracted_df = X_train_feat

X = robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true = y
# AUROC/AUPRC need continuous scores. `predict` on a scikit-learn style
# classifier returns hard labels, which collapses AUROC to
# (recall + specificity) / 2, so use the positive-class probability when the
# estimator offers predict_proba and fall back to predict() otherwise.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  # Last column is taken as the positive class; 1-D output is used as is.
  y_pred = proba[:, -1] if proba.ndim == 2 else proba.ravel()
else:
  y_pred = np.ravel(model_.predict(X))
# Round the scores to hard labels once and reuse them for every threshold metric.
y_hat = np.round(y_pred)
# labels=[0, 1] pins the matrix to 2x2 even when a class is absent.
cm1 = confusion_matrix(y_true, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm1.ravel()
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_hat)

prc = metrics.average_precision_score(y_true, y_pred)

# Only the AUROC call sits in the try body; if it raises ValueError the report
# is printed with "nan" in the AUROC slot instead.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")
else:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")

accuracy: 0.48054204308547604, precision: 0.41418764302059496, recall: 0.12746478873239436, specificity: 0.8244170096021948, f1-score: 0.19493807215939685, mcc: -0.0670359416015423, auroc: 0.47594089916729454, auprc: 0.4833016371470315 
0.48054204308547604	0.41418764302059496	0.12746478873239436	0.8244170096021948	0.19493807215939685	-0.0670359416015423	0.47594089916729454	0.4833016371470315



### Chen-2019 multispecies

#### Read dataset

In [None]:
id2seq_file = 'dict.tsv'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'pairs.tsv'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/multi-species/dict.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/multi-species/pairs.tsv



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:15:14--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/multi-species/dict.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5878871 (5.6M) [text/plain]
Saving to: ‘dict.tsv’


2022-12-17 17:15:15 (389 MB/s) - ‘dict.tsv’ saved [5878871/5878871]

--2022-12-17 17:15:15--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Independent-testsets/multi-species/pairs.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1346871 (1.3M) [text/plain]
Saving to: ‘pairs.tsv’


2022-12

65918it [00:00, 495058.25it/s]
100%|██████████| 11529/11529 [00:32<00:00, 349.61it/s]
100%|██████████| 65918/65918 [00:00<00:00, 1854600.48it/s]
100%|██████████| 65918/65918 [00:00<00:00, 2199681.21it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the dataset loaded by the preceding cell with the trained pairwise network `model`
# (defined in another cell) and print threshold-based and ranking metrics.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label travel together in each row
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard_scaler comes from another cell; only transform() is applied here, nothing is re-fitted
X = standard_scaler.transform(X)


# Split back into the two per-protein inputs the network expects.
# NOTE(review): the *_train names and the banner below say "training set", but this data is
# the dataset loaded by the previous cell - confirm the wording before reporting results.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0.0, 1.0] pins the matrix to 2x2 so the cm1[1,1] lookups below cannot raise
# IndexError when labels and rounded predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Ranking metrics use the raw (unrounded) network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.49214175187353987, precision: 0.46588514225500527, recall: 0.10731514912467005, specificity: 0.8769683546224096, f1-score: 0.17444699267588964, mcc: -0.024616174578581812, auroc: 0.45783036257878673, auprc: 0.47437603893757857 
0.49214175187353987	0.46588514225500527	0.10731514912467005	0.8769683546224096	0.17444699267588964	-0.024616174578581812	0.45783036257878673	0.47437603893757857



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Evaluate the two-stage pipeline on the dataset loaded above: the trained network `model`
# supplies the 'Merged_feature_1' representation, which the second-stage classifier `model_`
# then scores. `model`, `model_`, `standard_scaler` and `robust_scaler` come from other cells.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Only transform() is applied: the scaler is not re-fitted on this data
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through X_train.csv (this overwrites any existing file of that name)
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Columns 0..63 are read as features and the rest as the label; this assumes the
# 'Merged_feature_1' layer emits 64 values per pair - TODO confirm against the model definition
X=Train.iloc[:,0:64].values
y=Train.iloc[:,64:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Hard class predictions feed the confusion-matrix based metrics
y_true=y
y_pred = model_.predict(X)

# AUROC/AUPRC need continuous scores. The captured outputs of this cell showed
# auroc == (recall + specificity) / 2, which is what hard 0/1 predictions produce,
# so use the positive-class probability whenever the classifier exposes predict_proba.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  if proba.ndim == 2 and proba.shape[1] == 2:
    y_score = proba[:, 1]
  else:
    y_score = proba.ravel()
else:
  y_score = np.ravel(y_pred)

# labels=[0.0, 1.0] pins the matrix to 2x2 so cm1[1,1] cannot raise IndexError
# when labels and predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.4919445371522194, precision: 0.4767858704205648, recall: 0.16544798082466095, specificity: 0.8184410934797779, f1-score: 0.24565276151004597, mcc: -0.021272372303364186, auroc: 0.4919445371522194, auprc: 0.4961592691344804 
0.4919445371522194	0.4767858704205648	0.16544798082466095	0.8184410934797779	0.24565276151004597	-0.021272372303364186	0.4919445371522194	0.4961592691344804



## Evaluation on inter-species datasets

### DENV

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/DENV/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/DENV/pro_seq.txt


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:16:03--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/DENV/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 316639 (309K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:16:05 (100 MB/s) - ‘protein_pair_label.txt’ saved [316639/316639]

--2022-12-17 17:16:05--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/DENV/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4

10197it [00:00, 385253.64it/s]
100%|██████████| 8028/8028 [00:24<00:00, 325.86it/s]
100%|██████████| 10197/10197 [00:00<00:00, 1621892.98it/s]
100%|██████████| 10197/10197 [00:00<00:00, 2274600.75it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the dataset loaded by the preceding cell with the trained pairwise network `model`
# (defined in another cell) and print threshold-based and ranking metrics.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label travel together in each row
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard_scaler comes from another cell; only transform() is applied here, nothing is re-fitted
X = standard_scaler.transform(X)


# Split back into the two per-protein inputs the network expects.
# NOTE(review): the *_train names and the banner below say "training set", but this data is
# the dataset loaded by the previous cell - confirm the wording before reporting results.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0.0, 1.0] pins the matrix to 2x2 so the cm1[1,1] lookups below cannot raise
# IndexError when labels and rounded predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Ranking metrics use the raw (unrounded) network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8248504462096695, precision: 0.2002791346824843, recall: 0.30960086299892126, specificity: 0.8763754045307444, f1-score: 0.24322033898305082, mcc: 0.15383777212932226, auroc: 0.6891447862227389, auprc: 0.18495294942295404 
0.8248504462096695	0.2002791346824843	0.30960086299892126	0.8763754045307444	0.24322033898305082	0.15383777212932226	0.6891447862227389	0.18495294942295404



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Evaluate the two-stage pipeline on the dataset loaded above: the trained network `model`
# supplies the 'Merged_feature_1' representation, which the second-stage classifier `model_`
# then scores. `model`, `model_`, `standard_scaler` and `robust_scaler` come from other cells.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Only transform() is applied: the scaler is not re-fitted on this data
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through X_train.csv (this overwrites any existing file of that name)
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Columns 0..63 are read as features and the rest as the label; this assumes the
# 'Merged_feature_1' layer emits 64 values per pair - TODO confirm against the model definition
X=Train.iloc[:,0:64].values
y=Train.iloc[:,64:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Hard class predictions feed the confusion-matrix based metrics
y_true=y
y_pred = model_.predict(X)

# AUROC/AUPRC need continuous scores. The captured outputs of this cell showed
# auroc == (recall + specificity) / 2, which is what hard 0/1 predictions produce,
# so use the positive-class probability whenever the classifier exposes predict_proba.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  if proba.ndim == 2 and proba.shape[1] == 2:
    y_score = proba[:, 1]
  else:
    y_score = proba.ravel()
else:
  y_score = np.ravel(y_pred)

# labels=[0.0, 1.0] pins the matrix to 2x2 so cm1[1,1] cannot raise IndexError
# when labels and predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8036677454153183, precision: 0.18214074512123005, recall: 0.33225458468176916, specificity: 0.8508090614886732, f1-score: 0.23529411764705885, mcc: 0.14149703819261353, auroc: 0.5915318230852211, auprc: 0.12122122628917598 
0.8036677454153183	0.18214074512123005	0.33225458468176916	0.8508090614886732	0.23529411764705885	0.14149703819261353	0.5915318230852211	0.12122122628917598



### HIV

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/HIV/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/HIV/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:16:33--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/HIV/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1797825 (1.7M) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:16:34 (308 MB/s) - ‘protein_pair_label.txt’ saved [1797825/1797825]

--2022-12-17 17:16:34--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/HIV/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

108680it [00:00, 256156.82it/s]
100%|██████████| 20464/20464 [01:02<00:00, 327.02it/s]
100%|██████████| 108680/108680 [00:00<00:00, 1771293.07it/s]
100%|██████████| 108680/108680 [00:00<00:00, 2037824.82it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the dataset loaded by the preceding cell with the trained pairwise network `model`
# (defined in another cell) and print threshold-based and ranking metrics.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label travel together in each row
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard_scaler comes from another cell; only transform() is applied here, nothing is re-fitted
X = standard_scaler.transform(X)


# Split back into the two per-protein inputs the network expects.
# NOTE(review): the *_train names and the banner below say "training set", but this data is
# the dataset loaded by the previous cell - confirm the wording before reporting results.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0.0, 1.0] pins the matrix to 2x2 so the cm1[1,1] lookups below cannot raise
# IndexError when labels and rounded predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Ranking metrics use the raw (unrounded) network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8250736105999263, precision: 0.18692998697113078, recall: 0.27591093117408905, specificity: 0.8799898785425101, f1-score: 0.22286718718064016, mcc: 0.13149038251945297, auroc: 0.6549280802832369, auprc: 0.15326659553790065 
0.8250736105999263	0.18692998697113078	0.27591093117408905	0.8799898785425101	0.22286718718064016	0.13149038251945297	0.6549280802832369	0.15326659553790065



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Evaluate the two-stage pipeline on the dataset loaded above: the trained network `model`
# supplies the 'Merged_feature_1' representation, which the second-stage classifier `model_`
# then scores. `model`, `model_`, `standard_scaler` and `robust_scaler` come from other cells.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Only transform() is applied: the scaler is not re-fitted on this data
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Sub-model that stops at the 'Merged_feature_1' layer of the trained network
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)

# Extracted features followed by the label as the last column
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Round-trip through X_train.csv (this overwrites any existing file of that name)
X_train_feat.to_csv('X_train.csv',header=False, index=False)
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

# Columns 0..63 are read as features and the rest as the label; this assumes the
# 'Merged_feature_1' layer emits 64 values per pair - TODO confirm against the model definition
X=Train.iloc[:,0:64].values
y=Train.iloc[:,64:].values.ravel()

extracted_df=X_train_feat

X=robust_scaler.transform(X)

# Hard class predictions feed the confusion-matrix based metrics
y_true=y
y_pred = model_.predict(X)

# AUROC/AUPRC need continuous scores. The captured outputs of this cell showed
# auroc == (recall + specificity) / 2, which is what hard 0/1 predictions produce,
# so use the positive-class probability whenever the classifier exposes predict_proba.
if hasattr(model_, "predict_proba"):
  proba = np.asarray(model_.predict_proba(X))
  if proba.ndim == 2 and proba.shape[1] == 2:
    y_score = proba[:, 1]
  else:
    y_score = proba.ravel()
else:
  y_score = np.ravel(y_pred)

# labels=[0.0, 1.0] pins the matrix to 2x2 so cm1[1,1] cannot raise IndexError
# when labels and predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8033768862716231, precision: 0.17678501097169863, recall: 0.3180161943319838, specificity: 0.8519129554655871, f1-score: 0.22724478356778646, mcc: 0.13208265669426533, auroc: 0.5849645748987855, auprc: 0.11821902419215907 
0.8033768862716231	0.17678501097169863	0.3180161943319838	0.8519129554655871	0.22724478356778646	0.13208265669426533	0.5849645748987855	0.11821902419215907



### Hepatitis

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Hepatitis/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Hepatitis/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:18:03--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Hepatitis/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 342539 (335K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:18:03 (90.6 MB/s) - ‘protein_pair_label.txt’ saved [342539/342539]

--2022-12-17 17:18:03--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Hepatitis/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 O

14300it [00:00, 181878.84it/s]
100%|██████████| 10287/10287 [00:32<00:00, 315.30it/s]
100%|██████████| 14300/14300 [00:00<00:00, 1676033.85it/s]
100%|██████████| 14300/14300 [00:00<00:00, 2134012.21it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Score the dataset loaded by the preceding cell with the trained pairwise network `model`
# (defined in another cell) and print threshold-based and ranking metrics.

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: features of protein 1, features of protein 2, then the label
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label travel together in each row
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard_scaler comes from another cell; only transform() is applied here, nothing is re-fitted
X = standard_scaler.transform(X)


# Split back into the two per-protein inputs the network expects.
# NOTE(review): the *_train names and the banner below say "training set", but this data is
# the dataset loaded by the previous cell - confirm the wording before reporting results.
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0.0, 1.0] pins the matrix to 2x2 so the cm1[1,1] lookups below cannot raise
# IndexError when labels and rounded predictions both contain a single class.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0.0, 1.0])
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])  # same quantity as rec below
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Ranking metrics use the raw (unrounded) network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# roc_auc_score raises ValueError when y_true holds a single class; report nan in that case
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8296503496503497, precision: 0.1570048309178744, recall: 0.2, specificity: 0.8926153846153846, f1-score: 0.17591339648173207, mcc: 0.0832058912999136, auroc: 0.6169234319526626, auprc: 0.1321527721144974 
0.8296503496503497	0.1570048309178744	0.2	0.8926153846153846	0.17591339648173207	0.0832058912999136	0.6169234319526626	0.1321527721144974



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Pushes the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels through the network's 'Merged_feature_1' layer and scores the
# resulting features with `model_`. `model`, `model_`, `standard_scaler` and
# `robust_scaler` are not defined in this cell; they must already exist.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Number of extracted feature columns. Derived from the layer output instead of a
# hard-coded 64 so the feature/label split below stays correct if the layer width changes.
n_merged = p_merge.shape[1]

# create dataframe use transformed pairs matrix outputs and labels
# (both frames carry a default RangeIndex, so rows line up positionally)
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,0:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

# only transform() is called here, the scaler is not refitted
X=robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true=y
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. A scikit-learn style
# classifier returns hard labels from predict(), and with hard 0/1 labels AUROC
# collapses to (sensitivity + specificity) / 2. Use the positive-class probability
# when the model exposes predict_proba; otherwise keep the predict() output.
# NOTE(review): column 1 is assumed to be the positive class -- confirm via model_.classes_.
if hasattr(model_, "predict_proba"):
  y_score = model_.predict_proba(X)[:, 1]
else:
  y_score = y_pred
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the predictions, and the cm1[1,1]
# lookups below raise IndexError.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8070629370629371, precision: 0.15278438838648262, recall: 0.2469230769230769, specificity: 0.8630769230769231, f1-score: 0.18876800940899735, mcc: 0.08932250530663252, auroc: 0.555, auprc: 0.10618752974773918 
0.8070629370629371	0.15278438838648262	0.2469230769230769	0.8630769230769231	0.18876800940899735	0.08932250530663252	0.555	0.10618752974773918



### Herpes

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Herpes/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Herpes/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:18:40--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Herpes/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1056464 (1.0M) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:18:41 (159 MB/s) - ‘protein_pair_label.txt’ saved [1056464/1056464]

--2022-12-17 17:18:41--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Herpes/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Le

65626it [00:00, 419609.43it/s]
100%|██████████| 19845/19845 [01:01<00:00, 321.61it/s]
100%|██████████| 65626/65626 [00:00<00:00, 1784695.65it/s]
100%|██████████| 65626/65626 [00:00<00:00, 2100272.36it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Scores the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels with the neural network `model`. `model` and `standard_scaler`
# are not defined in this cell; they must already exist in the notebook session.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


# Split the scaled matrix back into the two per-protein inputs of the network
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET", but the data scored here is whatever
# the preceding "Read dataset" cell loaded -- confirm the wording is intended.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the rounded predictions, and the
# cm1[1,1] lookups below raise IndexError before the single-class handling is reached.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Threshold-free metric computed on the raw network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8273550117331545, precision: 0.17530266343825665, recall: 0.24270868253436137, specificity: 0.8858196446530339, f1-score: 0.2035709264726557, mcc: 0.11139476875022422, auroc: 0.6049634514006007, auprc: 0.13554239202396556 
0.8273550117331545	0.17530266343825665	0.24270868253436137	0.8858196446530339	0.2035709264726557	0.11139476875022422	0.6049634514006007	0.13554239202396556



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Pushes the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels through the network's 'Merged_feature_1' layer and scores the
# resulting features with `model_`. `model`, `model_`, `standard_scaler` and
# `robust_scaler` are not defined in this cell; they must already exist.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Number of extracted feature columns. Derived from the layer output instead of a
# hard-coded 64 so the feature/label split below stays correct if the layer width changes.
n_merged = p_merge.shape[1]

# create dataframe use transformed pairs matrix outputs and labels
# (both frames carry a default RangeIndex, so rows line up positionally)
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,0:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

# only transform() is called here, the scaler is not refitted
X=robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true=y
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. A scikit-learn style
# classifier returns hard labels from predict(), and with hard 0/1 labels AUROC
# collapses to (sensitivity + specificity) / 2. Use the positive-class probability
# when the model exposes predict_proba; otherwise keep the predict() output.
# NOTE(review): column 1 is assumed to be the positive class -- confirm via model_.classes_.
if hasattr(model_, "predict_proba"):
  y_score = model_.predict_proba(X)[:, 1]
else:
  y_score = y_pred
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the predictions, and the cm1[1,1]
# lookups below raise IndexError.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8057324840764332, precision: 0.16668304668304668, recall: 0.2842775729131747, specificity: 0.857877975192759, f1-score: 0.2101480701319621, mcc: 0.11290811657060225, auroc: 0.5710777740529668, auprc: 0.11244992714654128 
0.8057324840764332	0.16668304668304668	0.2842775729131747	0.857877975192759	0.2101480701319621	0.11290811657060225	0.5710777740529668	0.11244992714654128



### Influenza

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Influenza/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Influenza/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:19:58--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Influenza/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 538680 (526K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:20:00 (83.9 MB/s) - ‘protein_pair_label.txt’ saved [538680/538680]

--2022-12-17 17:20:00--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Influenza/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 O

33484it [00:00, 107638.33it/s]
100%|██████████| 16377/16377 [00:50<00:00, 327.10it/s]
100%|██████████| 33484/33484 [00:00<00:00, 1647240.47it/s]
100%|██████████| 33484/33484 [00:00<00:00, 2333622.60it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Scores the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels with the neural network `model`. `model` and `standard_scaler`
# are not defined in this cell; they must already exist in the notebook session.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


# Split the scaled matrix back into the two per-protein inputs of the network
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET", but the data scored here is whatever
# the preceding "Read dataset" cell loaded -- confirm the wording is intended.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the rounded predictions, and the
# cm1[1,1] lookups below raise IndexError before the single-class handling is reached.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Threshold-free metric computed on the raw network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8231991398877075, precision: 0.1973905723905724, recall: 0.30814717477003944, specificity: 0.8747043363994744, f1-score: 0.24063622370446386, mcc: 0.15063366995450742, auroc: 0.6975917435648163, auprc: 0.18065373741448862 
0.8231991398877075	0.1973905723905724	0.30814717477003944	0.8747043363994744	0.24063622370446386	0.15063366995450742	0.6975917435648163	0.18065373741448862



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Pushes the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels through the network's 'Merged_feature_1' layer and scores the
# resulting features with `model_`. `model`, `model_`, `standard_scaler` and
# `robust_scaler` are not defined in this cell; they must already exist.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Number of extracted feature columns. Derived from the layer output instead of a
# hard-coded 64 so the feature/label split below stays correct if the layer width changes.
n_merged = p_merge.shape[1]

# create dataframe use transformed pairs matrix outputs and labels
# (both frames carry a default RangeIndex, so rows line up positionally)
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,0:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

# only transform() is called here, the scaler is not refitted
X=robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true=y
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. A scikit-learn style
# classifier returns hard labels from predict(), and with hard 0/1 labels AUROC
# collapses to (sensitivity + specificity) / 2. Use the positive-class probability
# when the model exposes predict_proba; otherwise keep the predict() output.
# NOTE(review): column 1 is assumed to be the positive class -- confirm via model_.classes_.
if hasattr(model_, "predict_proba"):
  y_score = model_.predict_proba(X)[:, 1]
else:
  y_score = y_pred
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the predictions, and the cm1[1,1]
# lookups below raise IndexError.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8018755226376777, precision: 0.18364469510045822, recall: 0.34231274638633374, specificity: 0.8478318002628121, f1-score: 0.2390456526726313, mcc: 0.14570798832885112, auroc: 0.5950722733245729, auprc: 0.12265367026763385 
0.8018755226376777	0.18364469510045822	0.34231274638633374	0.8478318002628121	0.2390456526726313	0.14570798832885112	0.5950722733245729	0.12265367026763385



### Papilloma

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Papilloma/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Papilloma/pro_seq.txt


# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:20:59--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Papilloma/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 909090 (888K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:20:59 (131 MB/s) - ‘protein_pair_label.txt’ saved [909090/909090]

--2022-12-17 17:20:59--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/Papilloma/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK

56089it [00:00, 405119.86it/s]
100%|██████████| 19087/19087 [00:58<00:00, 326.61it/s]
100%|██████████| 56089/56089 [00:00<00:00, 1765987.94it/s]
100%|██████████| 56089/56089 [00:00<00:00, 2189756.66it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################
# Scores the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels with the neural network `model`. `model` and `standard_scaler`
# are not defined in this cell; they must already exist in the notebook session.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values


# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


# Split the scaled matrix back into the two per-protein inputs of the network
X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# NOTE(review): the banner says "TRAINING SET", but the data scored here is whatever
# the preceding "Read dataset" cell loaded -- confirm the wording is intended.
print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the rounded predictions, and the
# cm1[1,1] lookups below raise IndexError before the single-class handling is reached.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

# Threshold-free metric computed on the raw network outputs
prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")


accuracy: 0.8297348856281981, precision: 0.1771362251559553, recall: 0.23945871739556776, specificity: 0.888762502451461, f1-score: 0.20363575717144763, mcc: 0.11227314300611148, auroc: 0.6091413853513725, auprc: 0.1438046173009216 
0.8297348856281981	0.1771362251559553	0.23945871739556776	0.888762502451461	0.20363575717144763	0.11227314300611148	0.6091413853513725	0.1438046173009216



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data
# Pushes the pairs currently held in seq_tensor / seq_index1 / seq_index2 /
# class_labels through the network's 'Merged_feature_1' layer and scores the
# resulting features with `model_`. `model`, `model_`, `standard_scaler` and
# `robust_scaler` are not defined in this cell; they must already exist.

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels: one row per pair, laid out as
# [features of protein 1 | features of protein 2 | label]
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
# Unseeded row shuffle; features and label share a row, so they stay aligned
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# standard scaler: only transform() is called here, the scaler is not refitted
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Number of extracted feature columns. Derived from the layer output instead of a
# hard-coded 64 so the feature/label split below stays correct if the layer width changes.
n_merged = p_merge.shape[1]

# create dataframe use transformed pairs matrix outputs and labels
# (both frames carry a default RangeIndex, so rows line up positionally)
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,0:n_merged].values
y=Train.iloc[:,n_merged:].values.ravel()

extracted_df=X_train_feat

# only transform() is called here, the scaler is not refitted
X=robust_scaler.transform(X)

# Predict probability from neural network output for new data
y_true=y
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. A scikit-learn style
# classifier returns hard labels from predict(), and with hard 0/1 labels AUROC
# collapses to (sensitivity + specificity) / 2. Use the positive-class probability
# when the model exposes predict_proba; otherwise keep the predict() output.
# NOTE(review): column 1 is assumed to be the positive class -- confirm via model_.classes_.
if hasattr(model_, "predict_proba"):
  y_score = model_.predict_proba(X)[:, 1]
else:
  y_score = y_pred
# labels=[0, 1] pins the matrix to 2x2. Without it, confusion_matrix shrinks to 1x1
# when only one class occurs in both y_true and the predictions, and the cm1[1,1]
# lookups below raise IndexError.
cm1=confusion_matrix(y_true, np.round(y_pred), labels=[0, 1])
# Rows are true classes, columns are predicted classes:
# cm1[0,0]=TN, cm1[0,1]=FP, cm1[1,0]=FN, cm1[1,1]=TP
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_score)

# Each branch prints a labelled line followed by the same values tab-separated.
try:
  auc = metrics.roc_auc_score(y_true, y_score)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  # roc_auc_score rejected the input (e.g. a single class in y_true): report auroc as nan
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.8071101285457042, precision: 0.16775092936802974, recall: 0.28319278289860755, specificity: 0.8595018631104138, f1-score: 0.21069526519296708, mcc: 0.1138103169189368, auroc: 0.5713473230045106, auprc: 0.11267014498532305 
0.8071101285457042	0.16775092936802974	0.28319278289860755	0.8595018631104138	0.21069526519296708	0.1138103169189368	0.5713473230045106	0.11267014498532305



### SARS2

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/SARS2/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/SARS2/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:22:12--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/SARS2/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 157705 (154K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:22:12 (68.3 MB/s) - ‘protein_pair_label.txt’ saved [157705/157705]

--2022-12-17 17:22:12--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/SARS2/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length

6248it [00:00, 317694.83it/s]
100%|██████████| 5360/5360 [00:16<00:00, 317.77it/s]
100%|██████████| 6248/6248 [00:00<00:00, 1773432.46it/s]
100%|██████████| 6248/6248 [00:00<00:00, 2193522.34it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [encoding of protein 1 | encoding of protein 2 | label],
# shuffled row-wise.
df = pd.DataFrame(
    np.hstack([
        seq_tensor[seq_index1],
        seq_tensor[seq_index2],
        class_labels.reshape(-1,1),
    ])
).sample(frac=1)

# Split the frame into the pairs matrix and the label column
X = df.iloc[:, :FEATURE_NUM*2].values
y = df.iloc[:, FEATURE_NUM*2:].values

# Scale with the existing standard_scaler (transform only), then separate the
# two proteins of each pair into the network's two inputs.
X = standard_scaler.transform(X)
X1_train, X2_train = X[:, :FEATURE_NUM], X[:, FEATURE_NUM:]

print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])

# Threshold-based metrics come from the rounded predictions.
y_label = np.round(y_pred)
cm1 = confusion_matrix(y_true, y_label)
tn, fp, fn, tp = cm1[0,0], cm1[0,1], cm1[1,0], cm1[1,1]
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
    # roc_auc_score raised; both report lines show nan for AUROC.
    auc_text, auc_field = 'nan', ' nan'
else:
    auc_text, auc_field = f'{auc}', str(auc)

# Labelled summary, then the same values as one tab-separated row.
print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc_text}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.8165813060179258, precision: 0.14841849148418493, recall: 0.2147887323943662, specificity: 0.8767605633802817, f1-score: 0.17553956834532378, mcc: 0.07786235590333669, auroc: 0.6135826844872049, auprc: 0.13196592046680622 
0.8165813060179258	0.14841849148418493	0.2147887323943662	0.8767605633802817	0.17553956834532378	0.07786235590333669	0.6135826844872049	0.13196592046680622



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels:
# [encoding of protein 1 | encoding of protein 2 | label], shuffled row-wise
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Transform only; standard_scaler is not refit in this cell.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

# Sub-model that ends at the 'Merged_feature_1' layer, so predict() returns
# that layer's output instead of the network's final output.
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used below instead of a hard-coded
# column count to separate features from the label column.
n_feat = p_merge.shape[1]

# Create dataframe from transformed pairs matrix outputs and labels
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Write dataframe of transformed pairs matrix and labels to file
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# Read dataframe of transformed pairs matrix and labels back and shuffle rows
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,:n_feat].values
y=Train.iloc[:,n_feat:].values.ravel()

extracted_df=X_train_feat

# Transform only; robust_scaler is not refit in this cell.
X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted representation
y_true=y
# Decisions used for the confusion-matrix based metrics.
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. When model_ exposes
# predict_proba (scikit-learn style classifier), predict() yields class labels,
# and scoring labels collapses AUROC to (sensitivity + specificity) / 2.
# NOTE(review): column 1 is assumed to be the positive class (labels 0/1) - confirm.
if hasattr(model_, 'predict_proba'):
    y_score = model_.predict_proba(X)[:, 1]
else:
    y_score = y_pred

y_label = np.round(y_pred)
cm1=confusion_matrix(y_true, y_label)
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_score)

try:
    auc = metrics.roc_auc_score(y_true, y_score)
except ValueError:
    # roc_auc_score raised; report the value as nan in both output lines.
    auc_text, auc_field = 'nan', ' nan'
else:
    auc_text, auc_field = f'{auc}', str(auc)

# Human-readable line followed by a tab-separated line with the same values.
print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc_text}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.7924135723431498, precision: 0.1518624641833811, recall: 0.27992957746478875, specificity: 0.8436619718309859, f1-score: 0.19690402476780186, mcc: 0.09513062376667293, auroc: 0.5617957746478873, auprc: 0.10797174293481651 
0.7924135723431498	0.1518624641833811	0.27992957746478875	0.8436619718309859	0.19690402476780186	0.09513062376667293	0.5617957746478873	0.10797174293481651



### ZIKV

#### Read dataset

In [None]:
!rm -rf pro_seq.txt protein_pair_label.txt
id2seq_file = 'pro_seq.txt'
id2index = {}
seqs = []
index = 0
sid1_index = 0
sid2_index = 1
ds_file = 'protein_pair_label.txt'
label_index = 2
use_emb = 'ac5_aph.txt'

if not os.path.isfile(ds_file) or not os.path.isfile(id2seq_file):
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/ZIKV/protein_pair_label.txt
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/ZIKV/pro_seq.txt



# Create line variable as a list of protein sequences with index is the number of protein sequences
# id2index is a dictionary of protein id and incremental index number 
for line in open(id2seq_file):
    line = line.strip().split('\t')
    id2index[line[0]] = index
    seqs.append(line[1])
    index += 1

seq_array = []
id2_aid = {}
sid = 0

seq2t = s2t(use_emb)

max_data = -1
limit_data = max_data > 0
raw_data = []
skip_head = False
x = None
count = 0

# Create sequence array as a list of protein strings
for line in tqdm(open(ds_file)):
    if skip_head:
        skip_head = False
        continue
    line = line.rstrip('\n').rstrip('\r').split('\t')
    if id2index.get(line[sid1_index]) is None or id2index.get(line[sid2_index]) is None:
        continue
    if id2_aid.get(line[sid1_index]) is None:
        id2_aid[line[sid1_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid1_index]]])
    line[sid1_index] = id2_aid[line[sid1_index]]
    if id2_aid.get(line[sid2_index]) is None:
        id2_aid[line[sid2_index]] = sid
        sid += 1
        seq_array.append(seqs[id2index[line[sid2_index]]])
    line[sid2_index] = id2_aid[line[sid2_index]]
    raw_data.append(line)
    if limit_data:
        count += 1
        if count >= max_data:
            break

len_m_seq = np.array([len(line.split()) for line in seq_array])
avg_m_seq = int(np.average(len_m_seq)) + 1
max_m_seq = max(len_m_seq)
dim = seq2t.dim

# seq_tensor is tensor representation of dataset having shape of (number_of_sequences, padding_length, embedding_dim_of_aa)
# Random for distribution of class labels
np.random.seed(42)
np.random.shuffle(raw_data)
seq_tensor = np.array([encode_seq(line) for line in tqdm(seq_array)])

# Extract index of 1st and 2nd sequences in pairs
seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

# Assign labels for pairs of sequences
class_map = {'0':1,'1':0}
class_labels = np.zeros((len(raw_data,)))
for i in range(len(raw_data)):
  class_labels[i] = float(raw_data[i][label_index])


--2022-12-17 17:22:32--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/ZIKV/protein_pair_label.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 257373 (251K) [text/plain]
Saving to: ‘protein_pair_label.txt’


2022-12-17 17:22:32 (76.5 MB/s) - ‘protein_pair_label.txt’ saved [257373/257373]

--2022-12-17 17:22:32--  https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Interspecies-host-pathogen/human-virus/ZIKV/pro_seq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 

7799it [00:00, 339904.37it/s]
100%|██████████| 6480/6480 [00:20<00:00, 321.92it/s]
100%|██████████| 7799/7799 [00:00<00:00, 1810559.41it/s]
100%|██████████| 7799/7799 [00:00<00:00, 2315685.75it/s]


In [None]:
##################################### Load Positive and Negative Dataset ##########################################################

FEATURE_NUM = seq_tensor.shape[1]
# One row per pair: [encoding of protein 1 | encoding of protein 2 | label],
# shuffled row-wise.
df = pd.DataFrame(
    np.hstack([
        seq_tensor[seq_index1],
        seq_tensor[seq_index2],
        class_labels.reshape(-1,1),
    ])
).sample(frac=1)

# Split the frame into the pairs matrix and the label column
X = df.iloc[:, :FEATURE_NUM*2].values
y = df.iloc[:, FEATURE_NUM*2:].values

# Scale with the existing standard_scaler (transform only), then separate the
# two proteins of each pair into the network's two inputs.
X = standard_scaler.transform(X)
X1_train, X2_train = X[:, :FEATURE_NUM], X[:, FEATURE_NUM:]

print("============================= INFER BY TRAINED NEURAL NETWORK ON TRAINING SET ==============================")
y_true = y
y_pred = model.predict([X1_train, X2_train])

# Threshold-based metrics come from the rounded predictions.
y_label = np.round(y_pred)
cm1 = confusion_matrix(y_true, y_label)
tn, fp, fn, tp = cm1[0,0], cm1[0,1], cm1[1,0], cm1[1,1]
acc = (tn + tp) / (tn + fp + fn + tp)
spec = tn / (tn + fp)
sens = tp / (fn + tp)
prec = tp / (tp + fp)
rec = tp / (tp + fn)
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
    auc = metrics.roc_auc_score(y_true, y_pred)
except ValueError:
    # roc_auc_score raised; both report lines show nan for AUROC.
    auc_text, auc_field = 'nan', ' nan'
else:
    auc_text, auc_field = f'{auc}', str(auc)

# Labelled summary, then the same values as one tab-separated row.
print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc_text}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")


accuracy: 0.7997179125528914, precision: 0.15797914995990378, recall: 0.2778561354019746, specificity: 0.8519040902679831, f1-score: 0.2014314928425358, mcc: 0.10178122093588193, auroc: 0.6079505690487605, auprc: 0.13478663232539323 
0.7997179125528914	0.15797914995990378	0.2778561354019746	0.8519040902679831	0.2014314928425358	0.10178122093588193	0.6079505690487605	0.13478663232539323



#### Evaluate with trained FSNN-LGBM prediction

In [None]:
# Read new data

FEATURE_NUM = seq_tensor.shape[1]
# Create data frame of pairs and labels:
# [encoding of protein 1 | encoding of protein 2 | label], shuffled row-wise
df = np.hstack([seq_tensor[seq_index1], seq_tensor[seq_index2]])
df = np.hstack([df, class_labels.reshape(-1,1)])
df = pd.DataFrame(df)
df = df.sample(frac=1)

# Create pairs matrix and label vector
X = df.iloc[:,0:FEATURE_NUM*2].values
y = df.iloc[:,FEATURE_NUM*2:].values

Trainlabels=y

# Transform only; standard_scaler is not refit in this cell.
X = standard_scaler.transform(X)


X1_train = X[:, :FEATURE_NUM]
X2_train = X[:, FEATURE_NUM:]


# Predict representation from trained neural network

# Sub-model that ends at the 'Merged_feature_1' layer, so predict() returns
# that layer's output instead of the network's final output.
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer('Merged_feature_1').output)
# Use intermediate layer to transform pairs matrix
intermediate_output_p1 = intermediate_layer_model.predict([X1_train,X2_train])
p_merge=pd.DataFrame(intermediate_output_p1)
# Width of the extracted representation; used below instead of a hard-coded
# column count to separate features from the label column.
n_feat = p_merge.shape[1]

# Create dataframe from transformed pairs matrix outputs and labels
X_train_feat=pd.concat((p_merge,pd.DataFrame(Trainlabels)),axis=1,ignore_index=True)

# Write dataframe of transformed pairs matrix and labels to file
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# Read dataframe of transformed pairs matrix and labels back and shuffle rows
Train=pd.read_csv("X_train.csv",header=None)
Train=Train.sample(frac=1)

X=Train.iloc[:,:n_feat].values
y=Train.iloc[:,n_feat:].values.ravel()

extracted_df=X_train_feat

# Transform only; robust_scaler is not refit in this cell.
X=robust_scaler.transform(X)

# Predict with the second-stage model on the extracted representation
y_true=y
# Decisions used for the confusion-matrix based metrics.
y_pred = model_.predict(X)
# Ranking metrics (AUROC / AUPRC) need continuous scores. When model_ exposes
# predict_proba (scikit-learn style classifier), predict() yields class labels,
# and scoring labels collapses AUROC to (sensitivity + specificity) / 2.
# NOTE(review): column 1 is assumed to be the positive class (labels 0/1) - confirm.
if hasattr(model_, 'predict_proba'):
    y_score = model_.predict_proba(X)[:, 1]
else:
    y_score = y_pred

y_label = np.round(y_pred)
cm1=confusion_matrix(y_true, y_label)
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, y_label)

prc = metrics.average_precision_score(y_true, y_score)

try:
    auc = metrics.roc_auc_score(y_true, y_score)
except ValueError:
    # roc_auc_score raised; report the value as nan in both output lines.
    auc_text, auc_field = 'nan', ' nan'
else:
    auc_text, auc_field = f'{auc}', str(auc)

# Human-readable line followed by a tab-separated line with the same values.
print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc_text}, auprc: {prc} ')
print("\t".join([str(acc), str(prec), str(rec), str(spec), str(f1), str(mcc), auc_field, str(prc)]) + "\n")

accuracy: 0.7827926657263752, precision: 0.1499644633972992, recall: 0.29760225669957685, specificity: 0.831311706629055, f1-score: 0.1994328922495274, mcc: 0.09637863338435428, auroc: 0.5644569816643159, auprc: 0.10848410303181581 
0.7827926657263752	0.1499644633972992	0.29760225669957685	0.831311706629055	0.1994328922495274	0.09637863338435428	0.5644569816643159	0.10848410303181581

