<a href="https://colab.research.google.com/github/anhvt00/PIPR/blob/master/Evaluation/Tunning-On-Guo/baseline_PIPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is used for tuning the model, using either pre-computed embedding files or a language-model embedder.

### Check GPU hardware

In [1]:
# Print the GPU attached to this runtime (model, driver/CUDA version, memory usage)
!nvidia-smi

Sun Jan  9 15:08:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Download embedding files

In [2]:
# Download seq2tensor.py, which provides the s2t class imported below to convert protein sequences to tensors
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/seq2tensor.py

# Download the amino-acid embedding table; note the file is saved as ac5_aph_CT.txt
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/ac5_aph_CT.txt

### Download interaction pairs and dictionary files
# Download dictionary file (id: sequence)
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/yeast/preprocessed/protein_dictionary.tsv
# !wget https://raw.githubusercontent.com/anhvt00/PIPR/master/yeast/yeast_DSCRIPT/yeast_dictionary.tsv

# Download the file of protein pairs with their labels
!wget https://raw.githubusercontent.com/anhvt00/PIPR/master/yeast/preprocessed/protein_pairs.tsv
# !wget https://raw.githubusercontent.com/anhvt00/PIPR/master/yeast/yeast_DSCRIPT/yeast_pairs.tsv

--2022-01-09 15:08:21--  https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/seq2tensor.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1104 (1.1K) [text/plain]
Saving to: ‘seq2tensor.py’


2022-01-09 15:08:22 (45.8 MB/s) - ‘seq2tensor.py’ saved [1104/1104]

--2022-01-09 15:08:22--  https://raw.githubusercontent.com/anhvt00/PIPR/master/embeddings/ac5_aph_CT.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2530 (2.5K) [text/plain]
Saving to: ‘ac5_aph_CT.txt’


2022-01-09 15:08:23 (23.0 MB/s) - ‘ac5_aph_CT.txt’ sa

### Import libraries 

In [3]:
# Libraries for system and debug
import sys
import pdb
import os
from datetime import datetime

# Class for converting sequences to tensors
from seq2tensor import s2t

# Libraries for neural network training
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GRU, LSTM, Bidirectional, Input, Conv1D, Conv2D
from tensorflow.keras.layers import Add, Flatten, subtract, multiply, concatenate
from tensorflow.keras.layers import MaxPooling1D, AveragePooling1D, GlobalAveragePooling1D, MaxPooling2D
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.utils import Sequence
from tensorflow.keras import mixed_precision
from tensorflow import keras
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.layers import Activation
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers

# Import accessory modules
import numpy as np
import h5py
import gc
from tqdm import tqdm


# For tensorboard extension
!pip install -U tensorboard_plugin_profile

Collecting tensorboard_plugin_profile
  Downloading tensorboard_plugin_profile-2.5.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.4 MB/s 
[?25hCollecting gviz-api>=1.9.0
  Downloading gviz_api-1.10.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: gviz-api, tensorboard-plugin-profile
Successfully installed gviz-api-1.10.0 tensorboard-plugin-profile-2.5.0


### Set CUDA environment variables

In [4]:
### Let TensorFlow grow GPU memory on demand while training
gpus = tf.config.list_physical_devices('GPU')
try:
  # Memory growth has to be configured identically on every GPU
  for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
  if gpus:
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
  # Memory growth must be set before GPUs have been initialized
  print(e)


# ============================================
# Optimisation Flags - Do not remove
# ============================================
# NOTE(review): these variables are exported after tensorflow has been imported
# and the GPUs queried above; some of them may only be read at start-up -
# confirm that they take effect when set this late.
os.environ.update({
    # Disables caching (when set to 1) or enables caching (when set to 0) for
    # just-in-time-compilation. When disabled, no binary code is added to or
    # retrieved from the cache.
    'CUDA_CACHE_DISABLE': '0',  # orig is 0

    # When set to 1, forces the device driver to ignore any binary code embedded
    # in an application (see Application Compatibility) and to just-in-time
    # compile embedded PTX code instead. If a kernel does not have embedded PTX
    # code, it will fail to load. This environment variable can be used to
    # validate that PTX code is embedded in an application and that its
    # just-in-time compilation works as expected to guarantee application
    # forward compatibility with future architectures.
    'CUDA_FORCE_PTX_JIT': '1',  # no orig

    'HOROVOD_GPU_ALLREDUCE': 'NCCL',

    'TF_CPP_MIN_LOG_LEVEL': '3',

    'TF_GPU_THREAD_MODE': 'gpu_private',
    'TF_GPU_THREAD_COUNT': '1',

    'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT': '1',

    'TF_ADJUST_HUE_FUSED': '1',
    'TF_ADJUST_SATURATION_FUSED': '1',
    'TF_ENABLE_WINOGRAD_NONFUSED': '1',

    'TF_SYNC_ON_FINISH': '0',
    'TF_AUTOTUNE_THRESHOLD': '2',
    'TF_DISABLE_NVTX_RANGES': '1',
    "TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE": "1",
})


# =================================================
# mixed_precision.set_global_policy('mixed_float16')

1 Physical GPUs, 1 Logical GPUs


### Hyperparameter set by default



In [None]:
# Default hyperparameters
# NOTE(review): CONV_HIDDEN_DIM, RNN_HIDDEN, HIDDEN_DIM, KERNEL_SIZE, POOLING_KERNEL,
# MAX_DATASET_SIZE, DATASET_SIZE and `adam` are not read by any code visible in this
# notebook (the model is configured through the HP_* values instead) - confirm before relying on them.
CONV_HIDDEN_DIM = 48
RNN_HIDDEN = 48
# Passed both to Dataset.repeat() and to fit(epochs=...) in train_test_model
N_EPOCHS = 100
HIDDEN_DIM=50
# Batch size of the prediction dataset in train_test_model
BATCH_SIZE = 32
# Element dtype declared for the tf.data generators
DTYPE='float16'
# Used as the learning-rate / epsilon entries of the hparams dict in the model-summary cell
LEARNING_RATE=.001
EPSILON=1e-6
adam = Adam(learning_rate=LEARNING_RATE, amsgrad=True, epsilon=EPSILON)
MAX_DATASET_SIZE = 11187
DATASET_SIZE = MAX_DATASET_SIZE
KERNEL_SIZE = 3
POOLING_KERNEL = 3
# Fixed sequence length every protein is padded/truncated to; also the model input length
seq_size=1500
# Per-residue embedding width; overwritten later by whichever embedding cell runs
dim = 1024
# 1 for language model embedding
flags_embedding = 0
# 1 for loading from drive
available_data = 0



### Load the embeddings from drive

In [None]:
# Load pre-computed arrays from Google Drive instead of rebuilding them from the TSV files.
# Only runs when available_data == 1; requires a Colab runtime (google.colab).
if available_data == 1:
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
    # execute code; only use it on files you created yourself.
    # seq_tensor = np.load('/content/drive/MyDrive/seq_tensor.npy', allow_pickle=True)
    seq_tensor = np.load('/content/drive/MyDrive/prottranst5uniref_2000.npy', allow_pickle=True)

    class_labels = np.load('/content/drive/MyDrive/class_labels.npy', allow_pickle=True)
    seq_index1 = np.load('/content/drive/MyDrive/seq_index1.npy', allow_pickle=True)
    seq_index2 = np.load('/content/drive/MyDrive/seq_index2.npy', allow_pickle=True)
    # Only referenced by the commented-out concatenation in the next cell
    seq_tensor_physicochemical= np.load('/content/drive/MyDrive/physicochemical.npy', allow_pickle=True)

In [None]:
# seq_tensor = np.concatenate((seq_tensor, seq_tensor_physicochemical), axis=2)

### Use universal embedding files

In [None]:
# Build seq_tensor / seq_index1 / seq_index2 / class_labels from the downloaded TSV files.
# Only runs when the pre-computed arrays are not loaded from Drive (available_data == 0).
if available_data == 0:
  id2seq_file = 'protein_dictionary.tsv'
  id2index = {}
  seqs = []
  index = 0
  sid1_index = 0
  sid2_index = 1
  ds_file = 'protein_pairs.tsv'
  label_index = 2
  # Must name the embedding table actually fetched by the download cell
  # (ac5_aph_CT.txt); 'ac5_aph.txt' is never downloaded in this notebook.
  use_emb = 'ac5_aph_CT.txt'


  # seqs holds the protein sequences in file order;
  # id2index maps each protein id to its position in seqs
  with open(id2seq_file) as id2seq_handle:
      for line in id2seq_handle:
          line = line.strip().split('\t')
          if len(line) < 2:
              # Blank or incomplete row: nothing to register
              continue
          id2index[line[0]] = index
          seqs.append(line[1])
          index += 1

  seq_array = []
  id2_aid = {}
  sid = 0

  seq2t = s2t(use_emb)

  max_data = -1
  limit_data = max_data > 0
  raw_data = []
  skip_head = True
  x = None
  count = 0

  # seq_array collects the sequences that occur in at least one pair; id2_aid maps a
  # protein id to its row in seq_array. In every kept pair the two id columns are
  # replaced by those row numbers.
  with open(ds_file) as pairs_handle:
      for line in tqdm(pairs_handle):
          if skip_head:
              skip_head = False
              continue
          line = line.rstrip('\n').rstrip('\r')
          if not line:
              # Tolerate blank lines (e.g. a trailing newline at end of file)
              continue
          line = line.split('\t')
          if line[sid1_index] not in id2index or line[sid2_index] not in id2index:
              # Pair references a protein without a known sequence
              continue
          for column in (sid1_index, sid2_index):
              protein_id = line[column]
              if protein_id not in id2_aid:
                  id2_aid[protein_id] = sid
                  sid += 1
                  seq_array.append(seqs[id2index[protein_id]])
              line[column] = id2_aid[protein_id]
          raw_data.append(line)
          if limit_data:
              count += 1
              if count >= max_data:
                  break

  len_m_seq = np.array([len(line.split()) for line in seq_array])
  avg_m_seq = int(np.average(len_m_seq)) + 1
  max_m_seq = max(len_m_seq)
  dim = seq2t.dim

  # Shuffle the pairs with a fixed seed so the class labels are mixed reproducibly
  np.random.seed(42)
  np.random.shuffle(raw_data)
  # seq_tensor: one embedded sequence per entry of seq_array, each embedded with length seq_size
  seq_tensor = np.array([seq2t.embed_normalized(line, seq_size) for line in tqdm(seq_array)]).astype('float16')

  # Row numbers (into seq_tensor) of the 1st and 2nd sequence of every pair
  seq_index1 = np.array([line[sid1_index] for line in tqdm(raw_data)])
  seq_index2 = np.array([line[sid2_index] for line in tqdm(raw_data)])

  # One-hot labels: label '1' sets column 0, label '0' sets column 1
  class_map = {'0':1,'1':0}
  class_labels = np.zeros((len(raw_data), 2))
  for i in range(len(raw_data)):
      class_labels[i][class_map[raw_data[i][label_index]]] = 1


### Use language model for embedding

In [None]:
# Replace seq_tensor with per-residue embeddings from a protein language model.
# Only runs when flags_embedding == 1.
# NOTE(review): the order of sequences in protein_preprocessed.txt must match the row
# numbers stored in seq_index1 / seq_index2 - confirm against how that file was produced.
if flags_embedding == 1:
    !pip install bio-embeddings[all] # Need to restart runtime for the first run 

    # Choose protein language model for embedder
    from Bio import SeqIO # Biopython's SeqIO module reads and writes sequences in different file formats

    # Choose the language model embedder class from the bio_embeddings.embed module
    # from bio_embeddings.embed import CPCProtEmbedder, ProtTransT5XLU50Embedder, FastTextEmbedder, GloveEmbedder, PLUSRNNEmbedder, ProtTransBertBFDEmbedder, SeqVecEmbedder, UniRepEmbedder, Word2VecEmbedder, ProtTransXLNetUniRef100Embedder
    #   from bio_embeddings.embed import ProtTransBertBFDEmbedder
    from bio_embeddings.embed import ProtTransT5UniRef50Embedder
    embedder = ProtTransT5UniRef50Embedder()

    # Download raw sequences and create a list of sequences (one per line)
    !wget https://raw.githubusercontent.com/anhvt00/PIPR/master/yeast/preprocessed/protein_preprocessed.txt
    with open('protein_preprocessed.txt') as file:
        sequences = file.readlines()
        sequences = [sequence.rstrip() for sequence in sequences]

    # When running on an A100, install a PyTorch build for CUDA 11.1 for compatibility
    A100_status = !nvidia-smi | grep 'A100'
    if A100_status:
        !pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

    # embeddings = []
    # for sequence in sequences:
    #     embeddings.append(embedder.embed(sequence))

    # Start embedding
    # embed_many returns the embeddings lazily, so they have to be iterated
    embeddings = embedder.embed_many(sequences)

    # Materialise all embeddings as a list
    embeddings = list(embeddings)

    # # Average pooling in sequence dimension
    # reduced_embeddings = [ProtTransBertBFDEmbedder.reduce_per_protein(e) for e in embeddings]

    # Pad/truncate at the end of every sequence to seq_size to obtain one fixed-size tensor
    seq_tensor= tf.keras.preprocessing.sequence.pad_sequences(embeddings,  padding='post', dtype='float16', truncating='post', maxlen=seq_size)
    dim = seq_tensor.shape[2]

### Define custom function 

In [None]:
def leaky_relu(x, alpha = .3):
   """Element-wise maximum of `x` and `alpha * x`.

   Args:
      x: input tensor.
      alpha: factor applied to `x` for the second operand of the maximum.

   Returns:
      Tensor produced by `tf.keras.backend.maximum(alpha * x, x)`.
   """
   scaled = alpha*x
   return tf.keras.backend.maximum(scaled, x)

# Register the function so layers can select it with activation='leaky_relu'
get_custom_objects()['leaky_relu'] = leaky_relu

### Search for optimal configurations

In [None]:
# Search domains for the hyperparameter sweep. Every domain currently holds a single
# value; the commented lists are wider alternatives.

# --- Optimiser ---
HP_EPSILON = hp.HParam('epsilon', hp.Discrete([1e-6]))              # alt: [1e-5, 1e-6, 1e-7]
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-3]))  # alt: [1e-2, 1e-3, 1e-4]
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([256]))

# --- Convolution / pooling / recurrent layers ---
HP_KERNEL_SIZE = hp.HParam('kernel_size', hp.Discrete([3]))         # alt: [2, 3, 4]
HP_POOLING_KERNEL = hp.HParam('pooling_kernel', hp.Discrete([3]))   # alt: [2, 3, 4]
HP_CONV_HIDDEN_DIM = hp.HParam('conv_hidden_dim', hp.Discrete([50]))
HP_RNN_HIDDEN_DIM = hp.HParam('rnn_hidden_dim', hp.Discrete([50]))
HP_CONV_PADDING = hp.HParam('conv_padding', hp.Discrete(['valid']))
HP_ACTIVATION_CONV = hp.HParam('activation_conv', hp.Discrete(['linear']))  # alt: ['leaky_relu']

# --- Dense head ---
HP_FIRST_DENSE = hp.HParam('first_dense', hp.Discrete([48]))        # alt: [100]
HP_ACTIVATION = hp.HParam('activation', hp.Discrete(['leaky_relu']))  # alt: ['linear']
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.0]))
HP_REGULARIZER = hp.HParam('regularizer', hp.Discrete([0]))
HP_LEAKY_RELU = hp.HParam('leaky_relu', hp.Discrete([0.3]))

METRIC_ACCURACY = 'accuracy'

# Declare the hyperparameters and the metric for the TensorBoard HParams dashboard
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[
      HP_EPSILON,
      HP_LEARNING_RATE,
      HP_FIRST_DENSE,
      HP_KERNEL_SIZE,
      HP_POOLING_KERNEL,
      HP_CONV_HIDDEN_DIM,
      HP_RNN_HIDDEN_DIM,
      HP_ACTIVATION,
      HP_ACTIVATION_CONV,
      HP_REGULARIZER,
      HP_CONV_PADDING,
      HP_DROPOUT,
      HP_BATCH_SIZE,
      HP_LEAKY_RELU,
    ],
    metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
  )

### Create dataset from generator

In [None]:
def generator_pair(dataset):
  """Yield `(features, label)` for every pair index in `dataset`.

  `features` is a dict with keys "seq1" and "seq2" holding the rows of the
  global `seq_tensor` selected through `seq_index1` / `seq_index2`; `label`
  is the matching row of the global `class_labels`.
  """
  for pair_idx in dataset:
    first_seq = seq_tensor[seq_index1[pair_idx]]
    second_seq = seq_tensor[seq_index2[pair_idx]]
    features = {"seq1": first_seq, "seq2": second_seq}
    yield features, class_labels[pair_idx]

def generator_pair_predict(dataset):
  """Yield only the feature dict (no label) for every pair index in `dataset`.

  The dict has keys "seq1" and "seq2" holding the rows of the global
  `seq_tensor` selected through `seq_index1` / `seq_index2`.
  """
  for pair_idx in dataset:
    first_seq = seq_tensor[seq_index1[pair_idx]]
    second_seq = seq_tensor[seq_index2[pair_idx]]
    yield {"seq1": first_seq, "seq2": second_seq}



### Split the dataset

In [None]:
### k-fold cross-validation: collect up to `tries` (train, test) index splits
from sklearn.model_selection import KFold, ShuffleSplit, train_test_split

kf = KFold(n_splits=5, shuffle=True, random_state=42)
tries = 5
cur = 0
recalls = []
accuracy = []
total = []
total_truth = []
train_test = []
for train, test in kf.split(class_labels):
    # Count of training pairs whose first label column is set
    first_column_count = np.sum(class_labels[train], 0)[0]
    within_bounds = 0.2 * len(train) <= first_column_count <= 0.8 * len(train)
    if within_bounds:
        # Keep only folds where that class makes up 20%-80% of the training part
        train_test.append((train, test))
        cur += 1
        if cur >= tries:
            break


### Define callbacks for monitoring

In [None]:
### TensorBoard callback, used to inspect how the model uses resources
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logs = "logs/" + run_stamp

tboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=logs,
    histogram_freq=1,
    profile_batch='20, 29',
)

### Halve the learning rate when validation accuracy stops improving
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_accuracy",
    mode="auto",
    factor=0.5,
    patience=10,
    min_delta=1e-2,
    min_lr=1e-4,
    verbose=0,
)

# Stop after 30 epochs without a better validation accuracy and restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=30,
    restore_best_weights=True,
    verbose=1,
)


### Define performance metrics

In [None]:
!pip install tensorflow-addons
import tensorflow_addons as tfa
METRICS = [
      # keras.metrics.Accuracy(name='accuracy'),
      # keras.metrics.TruePositives(name='tp'),
      # keras.metrics.FalsePositives(name='fp'),
      # keras.metrics.TrueNegatives(name='tn'),
      # keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      tfa.metrics.MatthewsCorrelationCoefficient(num_classes=2, name='mcc'),
      tfa.metrics.F1Score(num_classes=2, threshold=0.5, name='f1-score'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]



### Original architecture PIPR

In [None]:
def build_model(hparams):
    """Build the siamese RCNN pair classifier ("original PIPR" architecture).

    Both inputs ('seq1' and 'seq2', each of shape (seq_size, dim)) are encoded by
    the same shared layers: five blocks of Conv1D -> MaxPooling1D -> bidirectional
    LSTM whose output is concatenated with the block's pooled input, followed by a
    final Conv1D and global average pooling. The two encodings are multiplied
    element-wise and classified by two Dense+Dropout layers and a 2-way softmax.

    Args:
        hparams: mapping indexed by the HP_* objects. Reads HP_CONV_HIDDEN_DIM,
            HP_KERNEL_SIZE, HP_ACTIVATION_CONV, HP_CONV_PADDING, HP_RNN_HIDDEN_DIM,
            HP_POOLING_KERNEL, HP_FIRST_DENSE, HP_ACTIVATION and HP_DROPOUT; any
            other entry is ignored.

    Returns:
        An uncompiled keras `Model` with inputs [seq1, seq2] and one softmax output.

    Note:
        A later cell defines another function named `build_model`; whichever
        definition was executed last is the one callers get.
    """
    n_blocks = 5

    # Input of sequence tensor representations
    seq_input1 = Input(shape=(seq_size, dim), name='seq1')
    seq_input2 = Input(shape=(seq_size, dim), name='seq2')

    def new_conv():
        return Conv1D(hparams[HP_CONV_HIDDEN_DIM], hparams[HP_KERNEL_SIZE],
                      activation=hparams[HP_ACTIVATION_CONV],
                      padding=hparams[HP_CONV_PADDING])

    # Shared layers, created in conv, rnn, conv, rnn, ... order (same order as the
    # hand-unrolled version, so auto-generated layer names do not change)
    convs, rnns = [], []
    for _ in range(n_blocks):
        convs.append(new_conv())
        rnns.append(Bidirectional(LSTM(hparams[HP_RNN_HIDDEN_DIM], return_sequences=True)))
    last_conv = new_conv()

    def encode(seq_input):
        """Run one sibling of the siamese network through the shared layers."""
        hidden = seq_input
        for conv, rnn in zip(convs, rnns):
            # RCNN block: convolve, pool, then append the Bi-LSTM output to the pooled features
            hidden = MaxPooling1D(hparams[HP_POOLING_KERNEL])(conv(hidden))
            hidden = concatenate([rnn(hidden), hidden])
        # Last convolution, then collapse the sequence axis
        return GlobalAveragePooling1D()(last_conv(hidden))

    # Siamese architecture: both siblings share every weight
    s1 = encode(seq_input1)
    s2 = encode(seq_input2)

    ### Combine two siblings of siamese architecture
    merge_text = multiply([s1, s2])

    #### MLP Part
    # First dense
    x = Dense(hparams[HP_FIRST_DENSE], activation=hparams[HP_ACTIVATION])(merge_text)
    x = Dropout(hparams[HP_DROPOUT])(x)

    # Second dense; its width is derived from the convolution width
    x = Dense(int((hparams[HP_CONV_HIDDEN_DIM]+7)/2), activation=hparams[HP_ACTIVATION])(x)
    x = Dropout(hparams[HP_DROPOUT])(x)

    # Last softmax
    main_output = Dense(2, activation='softmax')(x)

    # Combine to form functional model
    merge_model = Model(inputs=[seq_input1, seq_input2], outputs=[main_output])
    return merge_model

In [None]:
def build_model(hparams):
    """Build the baseline classifier: concatenate, average-pool, softmax.

    The two inputs ('seq1' and 'seq2', each of shape (seq_size, dim)) are
    concatenated, averaged with GlobalAveragePooling1D and fed to a single
    2-way softmax Dense layer.

    Args:
        hparams: accepted for interface compatibility; not read by this function.

    Returns:
        An uncompiled keras `Model` with inputs [seq1, seq2] and one softmax output.

    Note:
        This definition rebinds the name `build_model`, so once this cell has run
        it replaces the RCNN `build_model` defined in the previous cell.
    """
    # Input of sequence tensor representations
    first_input = Input(shape=(seq_size, dim), name='seq1')
    second_input = Input(shape=(seq_size, dim), name='seq2')

    joined = concatenate([first_input, second_input])
    pooled = GlobalAveragePooling1D()(joined)
    class_probs = Dense(2, activation='softmax')(pooled)

    return Model(inputs=[first_input, second_input], outputs=[class_probs])

### Summary of model architecture

In [None]:
# Hyperparameter values used only to build the model that is plotted below.
# NOTE(review): first_dense=100, activation_conv='relu' and dropout=0.3 are not in the
# single-value hp.Discrete domains declared for the sweep (48, 'linear', 0.0), so this
# plot does not correspond to a configuration the sweep runs - confirm this is intended.
# The sweep cell later rebinds the name `hparams`.
hparams = {
  HP_EPSILON: EPSILON,
  HP_LEARNING_RATE: LEARNING_RATE,
  HP_FIRST_DENSE: 100,
  HP_KERNEL_SIZE: 3,
  HP_POOLING_KERNEL: 3,
  HP_CONV_HIDDEN_DIM: 50,
  HP_RNN_HIDDEN_DIM: 50,
  HP_ACTIVATION: 'leaky_relu',
  HP_ACTIVATION_CONV: 'relu',
  HP_REGULARIZER: 0,
  HP_CONV_PADDING: 'valid',
  HP_DROPOUT: 3e-1,
  HP_BATCH_SIZE: 256,
  HP_LEAKY_RELU: 3e-1
}

# Build the model with whichever build_model definition ran last and write its graph to model.png
model = build_model(hparams)
tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True)

### Configure the train-test process

In [None]:
def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0


def train_test_model(hparams):
  """Train and evaluate one model per fold in `train_test`; return the pooled accuracy.

  For every (train, test) split a fresh model is built with `build_model`,
  compiled with Adam (amsgrad) and categorical cross-entropy, trained with the
  early-stopping / checkpoint / LR-reduction callbacks, and used to predict the
  test pairs. Confusion-matrix counts are accumulated across folds, so the
  metrics printed after each fold (and the returned accuracy) cover all folds
  processed so far, not just the latest one. Column 0 of the one-hot labels is
  treated as the positive class.

  Args:
    hparams: mapping indexed by the HP_* objects. HP_LEARNING_RATE, HP_EPSILON
      and HP_BATCH_SIZE are read here; the mapping is also passed to `build_model`.

  Returns:
    Accuracy (float) over the test pairs of all folds.

  Raises:
    ValueError: if `train_test` holds no folds.
  """
  if len(train_test) == 0:
    raise ValueError('train_test is empty: run the dataset split cell before training')

  batch_size = hparams[HP_BATCH_SIZE]
  feature_types = {"seq1": DTYPE, "seq2": DTYPE}
  feature_shapes = {"seq1": (seq_size, dim), "seq2": (seq_size, dim)}

  training_time = 1
  num_hit = 0.
  num_total = 0.
  num_true_pos = 0.
  num_false_pos = 0.
  num_true_neg = 0.
  num_false_neg = 0.
  accuracy = 0.
  for train, test in train_test:
      merge_model = build_model(hparams)

      merge_model.compile(optimizer=Adam(learning_rate=hparams[HP_LEARNING_RATE], amsgrad=True, epsilon=hparams[HP_EPSILON]), loss='categorical_crossentropy', metrics=METRICS)
      # Create train
      train_dataset = tf.data.Dataset.from_generator(generator_pair, args=[train], output_types=(feature_types, DTYPE), output_shapes=(feature_shapes, (2,)))
      train_dataset = train_dataset.shuffle(1024).repeat(N_EPOCHS).batch(batch_size)
      train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

      # Create val
      val_dataset = tf.data.Dataset.from_generator(generator_pair, args=[test], output_types=(feature_types, DTYPE), output_shapes=(feature_shapes, (2,)))
      val_dataset = val_dataset.batch(batch_size)
      val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE)

      # Save the best model based on val_accuracy (the file is overwritten by every fold)
      checkpoint = ModelCheckpoint(filepath='my_best_model.hdf5', monitor='val_accuracy',verbose=1, save_best_only=True, mode='max')
      # Fit model; at least one step per epoch even when the fold is smaller than a batch
      print(f'==================== Training time  {training_time} =====================')
      merge_model.fit(train_dataset, steps_per_epoch=max(1, len(train)//batch_size), epochs=N_EPOCHS, validation_data=val_dataset, callbacks=[early_stopping, checkpoint, reduce_lr])

      print(f'==================End training {training_time}========================')
      # Create pred; batched with the same batch size as training/validation
      pred_dataset = tf.data.Dataset.from_generator(generator_pair_predict, args=[test], output_types=(feature_types), output_shapes=(feature_shapes))
      pred_dataset = pred_dataset.batch(batch_size)
      pred_dataset = pred_dataset.prefetch(tf.data.AUTOTUNE)
      pred = np.asarray(merge_model.predict(pred_dataset))

      # Confusion-matrix counts for this fold, added to the running totals.
      # The labels are gathered once instead of once per test pair.
      y_true = class_labels[test]
      is_pos = y_true[:, 0] > 0.
      pred_pos = pred[:, 0] > pred[:, 1]
      num_total += len(y_true)
      num_hit += int(np.sum(np.argmax(y_true, axis=1) == np.argmax(pred, axis=1)))
      num_true_pos += int(np.sum(is_pos & pred_pos))
      num_false_neg += int(np.sum(is_pos & ~pred_pos))
      num_false_pos += int(np.sum(~is_pos & pred_pos))
      num_true_neg += int(np.sum(~is_pos & ~pred_pos))

      # Performance metrics; every ratio falls back to 0.0 on an empty denominator
      accuracy = _safe_ratio(num_hit, num_total)
      prec = _safe_ratio(num_true_pos, num_true_pos + num_false_pos)
      recall = _safe_ratio(num_true_pos, num_true_pos + num_false_neg)
      # Specificity = TN / (TN + FP)
      spec = _safe_ratio(num_true_neg, num_true_neg + num_false_pos)
      f1 = _safe_ratio(2. * prec * recall, prec + recall)
      # MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
      mcc_denominator = ((num_true_pos + num_false_pos) * (num_true_pos + num_false_neg) * (num_true_neg + num_false_pos) * (num_true_neg + num_false_neg)) ** 0.5
      mcc = _safe_ratio(num_true_pos * num_true_neg - num_false_pos * num_false_neg, mcc_denominator)
      training_time += 1
      print (f'accuracy: {accuracy}, precision: {prec}, recall: {recall}, specificity: {spec}, mcc: {mcc} ,f1-score: {f1}')
  return accuracy

### Log configurations and accuracy

In [None]:
def run(run_dir, hparams):
  """Run one trial and log it for the TensorBoard HParams dashboard.

  Args:
    run_dir: directory the summary writer for this trial writes to.
    hparams: hyperparameter mapping; recorded and passed to `train_test_model`.
  """
  with tf.summary.create_file_writer(run_dir).as_default():
    # Record the configuration first, then the accuracy it reached
    hp.hparams(hparams)
    trial_accuracy = train_test_model(hparams)
    tf.summary.scalar(METRIC_ACCURACY, trial_accuracy, step=1)

### Loop over all configurations

In [None]:
# Remove log directory from last run
%%time
!rm -rf ./logs
session_num = 0
for epsilon in HP_EPSILON.domain.values:
  for learning_rate in HP_LEARNING_RATE.domain.values:
    for first_dense in HP_FIRST_DENSE.domain.values:
      for kernel_size in HP_KERNEL_SIZE.domain.values:
        for pooling_kernel in HP_POOLING_KERNEL.domain.values:
          for conv_hidden_dim in HP_CONV_HIDDEN_DIM.domain.values:
            for rnn_hidden_dim in HP_RNN_HIDDEN_DIM.domain.values:
              for activation in HP_ACTIVATION.domain.values:
                for activation_conv in HP_ACTIVATION_CONV.domain.values:
                  for regularizer in HP_REGULARIZER.domain.values:
                    for conv_padding in HP_CONV_PADDING.domain.values:
                      for dropout in HP_DROPOUT.domain.values:
                        for batch_size in HP_BATCH_SIZE.domain.values:
                          for leaky_relu in HP_LEAKY_RELU.domain.values:
                            hparams = {
                                HP_EPSILON: epsilon,
                                HP_LEARNING_RATE: learning_rate,
                                HP_FIRST_DENSE: first_dense,
                                HP_KERNEL_SIZE: kernel_size,
                                HP_POOLING_KERNEL: pooling_kernel,
                                HP_CONV_HIDDEN_DIM: conv_hidden_dim,
                                HP_RNN_HIDDEN_DIM: rnn_hidden_dim,
                                HP_ACTIVATION: activation,
                                HP_ACTIVATION_CONV: activation_conv,
                                HP_REGULARIZER: regularizer,
                                HP_CONV_PADDING: conv_padding,
                                HP_DROPOUT: dropout,
                                HP_BATCH_SIZE: batch_size,
                                HP_LEAKY_RELU: leaky_relu
                            }
                            run_name = "run-%d" % session_num
                            print('--- Starting trial: %s' % run_name)
                            print({h.name: hparams[h] for h in hparams})
                            run('logs/hparam_tuning/' + run_name, hparams)
                            session_num += 1


### Tensorboard monitor

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=/content/logs