### Check GPU hardware

In [1]:
!nvidia-smi

Fri Jun  2 17:07:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:A1:00.0 Off |                    0 |
| N/A   35C    P0    63W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Install requirements for ProtBert embedder

In [2]:
!pip install torch transformers sentencepiece h5py

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
!pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


### Read FASTA file function

In [7]:
def read_fasta( fasta_path, split_char="!", id_field=0):
    '''
        Reads in fasta file containing multiple sequences.
        Split_char and id_field allow to control identifier extraction from header.
        E.g.: set split_char="|" and id_field=1 for SwissProt/UniProt Headers.
        Returns dictionary holding multiple sequences or only single 
        sequence, depending on input file.
    '''
    
    seqs = dict()
    with open( fasta_path, 'r' ) as fasta_f:
        for line in fasta_f:
            # get uniprot ID from header and create new entry
            if line.startswith('>'):
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                seqs[ uniprot_id ] = ''
            else:
                # repl. all whie-space chars and join seqs spanning multiple lines, drop gaps and cast to upper-case
                seq= ''.join( line.split() ).upper().replace("-","")
                # repl. all non-standard AAs and map them to unknown/X
                seq = seq.replace('U','X').replace('Z','X').replace('O','X')
                seqs[ uniprot_id ] += seq 
    example_id=next(iter(seqs))
    print("Read {} sequences.".format(len(seqs)))
    print("Example:\n{}\n{}".format(example_id,seqs[example_id]))

    return seqs

### Import ProtBert embedder model 

In [8]:
from transformers import T5EncoderModel, T5Tokenizer
import torch
import numpy as np
import h5py
import time
from tqdm import tqdm
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Using {}".format(device))

def get_T5_model():
    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

    return model, tokenizer

Using cuda:0


### Generate embeddings function

In [9]:
# Generate embeddings via batch-processing
# per_residue indicates that embeddings for each residue in a protein should be returned.
# per_protein indicates that embeddings for a whole protein should be returned (average-pooling)
# max_residues gives the upper limit of residues within one batch
# max_seq_len gives the upper sequences length for applying batch-processing
# max_batch gives the upper number of sequences per batch
def get_embeddings( model, tokenizer, seqs, per_residue, per_protein, sec_struct, 
                   max_residues=4000, max_seq_len=1000, max_batch=100 ):

    if sec_struct:
      sec_struct_model = load_sec_struct_model()

    results = {"residue_embs" : dict(), 
               "protein_embs" : dict(),
               "sec_structs" : dict() 
               }

    # sort sequences according to length (reduces unnecessary padding --> speeds up embedding)
    seq_dict   = sorted( seqs.items(), key=lambda kv: len( seqs[kv[0]] ), reverse=True )
    start = time.time()
    batch = list()
    for seq_idx, (pdb_id, seq) in enumerate(tqdm(seq_dict),1):
        seq = seq
        seq_len = len(seq)
        seq = ' '.join(list(seq))
        batch.append((pdb_id,seq,seq_len))

        # count residues in current batch and add the last sequence length to
        # avoid that batches with (n_res_batch > max_residues) get processed 
        n_res_batch = sum([ s_len for  _, _, s_len in batch ]) + seq_len 
        if len(batch) >= max_batch or n_res_batch>=max_residues or seq_idx==len(seq_dict) or seq_len>max_seq_len:
            pdb_ids, seqs, seq_lens = zip(*batch)
            batch = list()

            # add_special_tokens adds extra token at the end of each sequence
            token_encoding = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
            input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
            attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)
            
            try:
                with torch.no_grad():
                    # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
                    embedding_repr = model(input_ids, attention_mask=attention_mask)
            except RuntimeError:
                print("RuntimeError during embedding for {} (L={})".format(pdb_id, seq_len))
                continue

            if sec_struct: # in case you want to predict secondary structure from embeddings
              d3_Yhat, d8_Yhat, diso_Yhat = sec_struct_model(embedding_repr.last_hidden_state)


            for batch_idx, identifier in enumerate(pdb_ids): # for each protein in the current mini-batch
                s_len = seq_lens[batch_idx]
                # slice off padding --> batch-size x seq_len x embedding_dim  
                emb = embedding_repr.last_hidden_state[batch_idx,:s_len]
                if sec_struct: # get classification results
                    results["sec_structs"][identifier] = torch.max( d3_Yhat[batch_idx,:s_len], dim=1 )[1].detach().cpu().numpy().squeeze()
                if per_residue: # store per-residue embeddings (Lx1024)
                    results["residue_embs"][ identifier ] = emb.detach().cpu().numpy().squeeze()
                if per_protein: # apply average-pooling to derive per-protein embeddings (1024-d)
                    protein_emb = emb.mean(dim=0)
                    results["protein_embs"][identifier] = protein_emb.detach().cpu().numpy().squeeze()


    passed_time=time.time()-start
    avg_time = passed_time/len(results["residue_embs"]) if per_residue else passed_time/len(results["protein_embs"])
    print('\n############# EMBEDDING STATS #############')
    print('Total number of per-residue embeddings: {}'.format(len(results["residue_embs"])))
    print('Total number of per-protein embeddings: {}'.format(len(results["protein_embs"])))
    print("Time for generating embeddings: {:.1f}[m] ({:.3f}[s/protein])".format(
        passed_time/60, avg_time ))
    print('\n############# END #############')
    return results

### Write embeddings to disk function

In [10]:
def save_embeddings(emb_dict,out_path):
    with h5py.File(str(out_path), "w") as hf:
        for sequence_id, embedding in emb_dict.items():
            # noinspection PyUnboundLocalVariable
            hf.create_dataset(sequence_id, data=embedding)
    return None

### Import libraries

In [13]:
!pip install xgboost
!pip install lightgbm
!pip install matplotlib
!apt install wget
!pip install sklearn pandas 
!pip install gdown
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import math
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, ReLU, LeakyReLU, Conv1D, GlobalMaxPooling1D, AveragePooling1D, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import concatenate, multiply, Bidirectional, LSTM, GRU, Flatten, PReLU, add, SpatialDropout1D
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2, l1_l2
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef,accuracy_score, precision_score,recall_score
from sklearn.manifold import TSNE
import os
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import mixed_precision
!pip install tensorflow-addons
import tensorflow_addons as tfa

import time

from tqdm import tqdm

### Setting RAM GPU for training growth 
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

# Disables caching (when set to 1) or enables caching (when set to 0) for just-in-time-compilation. When disabled,
# no binary code is added to or retrieved from the cache.
os.environ['CUDA_CACHE_DISABLE'] = '0' # orig is 0

# When set to 1, forces the device driver to ignore any binary code embedded in an application 
# (see Application Compatibility) and to just-in-time compile embedded PTX code instead.
# If a kernel does not have embedded PTX code, it will fail to load. This environment variable can be used to
# validate that PTX code is embedded in an application and that its just-in-time compilation works as expected to guarantee application 
# forward compatibility with future architectures.
os.environ['CUDA_FORCE_PTX_JIT'] = '1'# no orig


os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT']='1'

os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

os.environ['TF_ADJUST_HUE_FUSED'] = '1'
os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

os.environ['TF_SYNC_ON_FINISH'] = '0'
os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"



# =================================================
mixed_precision.set_global_policy('mixed_float16')

2023-06-02 17:14:39.107792: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Collecting tensorflow-addons
  Downloading tensorflow_addons-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting typeguard<3.0.0,>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.20.0 typeguard-2.13.3
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



1 Physical GPUs, 1 Logical GPUs
INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA A100-SXM4-80GB, compute capability 8.0


2023-06-02 17:14:45.290422: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 70630 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:a1:00.0, compute capability: 8.0


### Create train dataset

In [22]:
per_residue_path = "pan.h5"
seq_path = "pan.fasta"
pair_path = "pan_pairs.tsv"
BATCH_SIZE = 64
if os.path.isfile(seq_path) == False or os.path.isfile(pair_path) == False:
  print("Have no dictionary and pair files, start downloading:")
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Pan-2010/pan_dict.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Pan-2010/pan_pairs.tsv
  !wget https://raw.githubusercontent.com/anhvt00/MCAPS/master/data/Golden-standard-datasets/Pan-2010/pan.fasta
else:
  print("Already have the dictionary and pair files")

# Load the encoder part of ProtT5-XL-U50 in half-precision (recommended)
if os.path.isfile(per_residue_path) == False:
  print("Have no embedding file, start embedding:")
  model, tokenizer = get_T5_model()

  # Load example fasta.
  seqs = read_fasta( seq_path )

  for id, seq in seqs.items():
      if len(seq) > 1200:
          seqs[id] = seq[:1200]

  # Compute embeddings and/or secondary structure predictions
  results = get_embeddings( model, tokenizer, seqs,
                           True, False, False)
  # Store per-residue embeddings
  save_embeddings(results["residue_embs"], per_residue_path)
else:
  print("Already have the embedding file")

# ### Load embeddings
print("Load the embedding file")
embedding_matrix= h5py.File(per_residue_path, 'r')
protein_keys = list(embedding_matrix.keys())
embedding_dict = dict()

for key in protein_keys:
  embedding_dict[key] = np.array(embedding_matrix[key])

### Read protein pairs 
print("Load the pair dataset file")
pair_dataframe = pd.read_csv(pair_path, sep='\t', header=None)
pair_array  = pair_dataframe.to_numpy()
np.random.seed(42)
np.random.shuffle(pair_array)
pair_dataframe = pd.DataFrame(pair_array)
pair_dataframe = pd.DataFrame(pair_array, columns = ['p1', 'p2', 'label'])
pair_dataframe['label'] = pair_dataframe['label'].astype('float16') 
pair_dataframe['p1'] = pair_dataframe['p1'].str.replace(".","_")
pair_dataframe['p2'] = pair_dataframe['p2'].str.replace(".","_")


# Create dataset object  
train_dataset = tf.data.Dataset.from_generator(lambda: range(len(pair_dataframe)), tf.uint64).map(lambda i: tf.py_function(func=func, 
                                              inp=[i], 
                                              Tout=[tf.float16,
                                                    tf.float16, tf.float16]
                                              ), 
                      num_parallel_calls=tf.data.AUTOTUNE).map(_fixup_shape).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)



Already have the dictionary and pair files
Already have the embedding file
Load the embedding file
Load the pair dataset file


### Padding function 

In [23]:
def pad(rst, length=1200, dim=1024):
    if len(rst) > length:
        return rst[:length]
    elif len(rst) < length:
        return np.concatenate((rst, np.zeros((length - len(rst), dim))))
    return rst

### Architecture of MCAPS

In [25]:
from tensorflow.keras.utils import get_custom_objects
def leaky_relu(x, alpha = .2):
   return tf.keras.backend.maximum(alpha*x, x)
!pip install tensorflow-addons
import tensorflow_addons as tfa 
get_custom_objects().update({'leaky_relu': leaky_relu})
get_custom_objects().update({'mish': tfa.activations.mish})
get_custom_objects().update({'lisht': tfa.activations.lisht})
get_custom_objects().update({'rrelu': tfa.activations.rrelu})

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [31]:
seq_size = 1200
dim = 1024
def multi_cnn():
    DEPTH = 5
    WIDTH = 3
    POOLING_SIZE = 4
    FILTERS = 50
    KERNEL_SIZE = 2
    DEPTH_DENSE1 = 3
    DEPTH_DENSE2 = 2
    DROPOUT = DROPOUT1 = DROPOUT2 = 0.05
    DROPOUT_SPATIAL= 0.15
    ACTIVATION = 'swish'
    ACTIVATION_CNN = 'swish'
    INITIALIZER = 'glorot_normal'
    
    def BlockCNN_single(KERNEL_SIZE, POOLING_SIZE, FILTERS, LAYER_IN1, LAYER_IN2):
        c1 = Conv1D(filters=FILTERS, kernel_size=KERNEL_SIZE, activation=ACTIVATION_CNN, padding='same')
        x1 = c1(LAYER_IN1)
        x2 = c1(LAYER_IN2)

        g1 = Dropout(DROPOUT)(concatenate([GlobalMaxPooling1D()(x1),GlobalAveragePooling1D()(x1)]))
        a1 = GlobalAveragePooling1D()(x1)
        g2 = Dropout(DROPOUT)(concatenate([GlobalMaxPooling1D()(x2),GlobalAveragePooling1D()(x2)]))
        a2 = GlobalAveragePooling1D()(x1)

        x1 = SpatialDropout1D(DROPOUT_SPATIAL)(concatenate([MaxPooling1D(POOLING_SIZE)(x1), AveragePooling1D(POOLING_SIZE)(x1)]))
        x2 = SpatialDropout1D(DROPOUT_SPATIAL)(concatenate([MaxPooling1D(POOLING_SIZE)(x2), AveragePooling1D(POOLING_SIZE)(x2)]))

        return x1, x2, g1, g2, a1, a2

    def BlockCNN_multi(POOLING_SIZE, FILTERS, LAYER_IN1, LAYER_IN2, WIDTH):
      X1 = []
      X2 = []
      G1 = []
      G2 = []
      A1 = []
      A2 = []
      for i in range(2, 2+WIDTH):
        x1, x2, g1, g2, a1, a2 = BlockCNN_single(i, POOLING_SIZE, FILTERS, LAYER_IN1, LAYER_IN2)
        X1.append(x1)
        X2.append(x2)
        G1.append(g1)
        G2.append(g2)
        A1.append(a1)
        A2.append(a2)
      x1 = concatenate(X1)
      x2 = concatenate(X2)
      g1 = GlobalMaxPooling1D()(x1)
      g2 = GlobalMaxPooling1D()(x2)
      return x1, x2, g1, g2

    def BlockCNN_single_deep(KERNEL_SIZE, POOLING_SIZE, DEPTH, FILTERS, LAYER_IN1, LAYER_IN2):
      X1 = []
      X2 = []
      G1 = []
      G2 = []
      A1 = []
      A2 = []
      x1 = LAYER_IN1
      x2 = LAYER_IN2
      for i in range(DEPTH):
        x1, x2, g1, g2, a1, a2 = BlockCNN_single(KERNEL_SIZE, POOLING_SIZE, FILTERS, x1, x2)
        X1.append(x1)
        X2.append(x2)
        G1.append(g1)
        G2.append(g2)
        A1.append(a1)
        A2.append(a2)

      return X1, X2, G1, G2, A1, A2

    input1 = Input(shape=(seq_size, dim), name="seq1")
    input2 = Input(shape=(seq_size, dim), name="seq2")
    


    X1 = dict()
    X2 = dict()
    G1 = dict()
    G2 = dict()
    A1 = dict()
    A2 = dict()

    for i in range(KERNEL_SIZE, KERNEL_SIZE+WIDTH):
      X1[f'{i}'], X2[f'{i}'], G1[f'{i}'], G2[f'{i}'], A1[f'{i}'], A2[f'{i}'] = BlockCNN_single_deep(i, POOLING_SIZE, DEPTH, FILTERS, input1, input2)

    s1 = []
    s2 = []
    for i in range(KERNEL_SIZE, KERNEL_SIZE+WIDTH):
      s1.extend(G1[f'{i}'])
      s2.extend(G2[f'{i}'])

    s1 = concatenate(s1)
    s2 = concatenate(s2)
    
    s1 = BatchNormalization(momentum=.9)(s1)
    s2 = BatchNormalization(momentum=.9)(s2)

    s1 = Dropout(DROPOUT1)(s1)
    s2 = Dropout(DROPOUT1)(s2)
    
    s1_shape = s1.shape[-1]
    DENSE1 = 744 
    d1 = []
    for i in range(DEPTH_DENSE1):
        d1.append(Dense(int(DENSE1*(1/2)**i), kernel_initializer=INITIALIZER, activation=ACTIVATION))

    for i in range(DEPTH_DENSE1):
        s1 = d1[i](s1)
        s2 = d1[i](s2)
        s1 = Dropout(DROPOUT1)(s1)
        s2 = Dropout(DROPOUT1)(s2)
        
    s = concatenate([s1, s2])

    
    s_shape = s.shape[-1]
    DENSE2 = 328
        
    d2 = []
    for i in range(DEPTH_DENSE2):
        d2.append(Dense(int(DENSE2*(1/2)**i), kernel_initializer=INITIALIZER, activation=ACTIVATION))

    for i in range(DEPTH_DENSE2):
        s = d2[i](s)
        s = Dropout(DROPOUT2)(s)

    output = Dense(1, activation='sigmoid')(s)
    model = Model(inputs=[input1, input2], outputs=[output])
    
    adabelief = tfa.optimizers.AdaBelief(
    rectify=False,
    epsilon=1e-8)
    adam = Adam(learning_rate=1e-3, amsgrad=True, epsilon=1e-6)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model


model = multi_cnn()
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 seq1 (InputLayer)              [(None, 1200, 1024)  0           []                               
                                ]                                                                 
                                                                                                  
 seq2 (InputLayer)              [(None, 1200, 1024)  0           []                               
                                ]                                                                 
                                                                                                  
 conv1d_30 (Conv1D)             (None, 1200, 50)     102450      ['seq1[0][0]',                   
                                                                  'seq2[0][0]']             

### Train on the training dataset

In [None]:
for i in range(40):
  model.fit(train_dataset, epochs=1)
  model.save(f'model_at_epoch_{i+1}.hdf5')

    768/Unknown - 287s 351ms/step - loss: 0.1643 - accuracy: 0.9434

### Test on the training dataset with MCAPST5

In [28]:
y_pred = model.predict(train_dataset)
y_true = pair_dataframe['label'].values
cm1=confusion_matrix(y_true, np.round(y_pred))
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY NEURAL NETWORK ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.9932300334458968, precision: 0.9898334414882111, recall: 0.9950349726379879, specificity: 0.9917779462359321, f1-score: 0.9924273915165097, mcc: 0.9863173754377397, auroc: 0.9994101251935479, auprc: 0.9989141196285366 
0.9932300334458968	0.9898334414882111	0.9950349726379879	0.9917779462359321	0.9924273915165097	0.9863173754377397	0.9994101251935479	0.9989141196285366



### Fit XGBoost for learned representations from MCAPST5

In [29]:
intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer(model.layers[-2].name).output)

# Use intermediate layer to transform pairs matrix
pred = intermediate_layer_model.predict(train_dataset)
p_merge=pd.DataFrame(pred)    
Trainlabels = pair_dataframe['label']
# create dataframe use transformed pairs matrix outputs and labels
X_train_feat=pd.concat((p_merge,pd.DataFrame(pd.DataFrame(Trainlabels))),axis=1,ignore_index=True)

# write to file dataframe of transformed pairs matrix and labels
X_train_feat.to_csv('X_train.csv',header=False, index=False)

# read dataframe of transformed pairs matrix and labels
Train=pd.read_csv("X_train.csv",header=None)
# Train=Train.sample(frac=1)

shape_x = model.layers[-2].get_output_at(0).get_shape()[1]
X=Train.iloc[:,0:shape_x].values
y=Train.iloc[:,shape_x:].values

extracted_df=X_train_feat


y = y.reshape(-1, )
model_= XGBClassifier(booster='gbtree', reg_lambda=1, alpha=1e-7, subsample=0.8, colsample_bytree=0.2, n_estimators=100, max_depth=5, min_child_weight=2, gamma=1e-7, eta=1e-6)
model_.fit(X, y, verbose=False)



### Test on the training dataset with MCAPST5-X

In [30]:
y_pred = model_.predict(X)
y_true = y
cm1=confusion_matrix(y_true, np.round(y_pred))
acc = (cm1[0,0]+cm1[1,1])/(cm1[0,0]+cm1[0,1]+cm1[1,0]+cm1[1,1])
spec= (cm1[0,0])/(cm1[0,0]+cm1[0,1])
sens = (cm1[1,1])/(cm1[1,0]+cm1[1,1])
prec=cm1[1,1]/(cm1[1,1]+cm1[0,1])
rec=cm1[1,1]/(cm1[1,1]+cm1[1,0])
f1 = 2 * (prec * rec) / (prec + rec)
mcc = matthews_corrcoef(y_true, np.round(y_pred))

prc = metrics.average_precision_score(y_true, y_pred)

print("============= INFERENCE BY HYBRID MODEL ===============")
try:
  auc = metrics.roc_auc_score(y_true, y_pred)
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: {auc}, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t" + str(auc)  + "\t" + str(prc) + "\n")
except ValueError:
  print(f'accuracy: {acc}, precision: {prec}, recall: {rec}, specificity: {spec}, f1-score: {f1}, mcc: {mcc}, auroc: nan, auprc: {prc} ')
  print(str(acc) + "\t" + str(prec) + "\t" + str(rec) + "\t" + str(spec) + "\t" + str(f1) + "\t" + str(mcc)+"\t nan"  + "\t" + str(prc) + "\n")

accuracy: 0.9950073516343249, precision: 0.9921711523197921, recall: 0.9966658210415685, specificity: 0.9936731004723307, f1-score: 0.994413407821229, mcc: 0.9899088515906708, auroc: 0.9951694607569497, auprc: 0.9903495604436137 
0.9950073516343249	0.9921711523197921	0.9966658210415685	0.9936731004723307	0.994413407821229	0.9899088515906708	0.9951694607569497	0.9903495604436137

