Baseline Seqlab KMER uses Untari et. al. paradigm.

# Libraries

Import Libraries

In [7]:
# Import Lib
import numpy as np
import pandas as pd
import pickle
import time
import keras
import keras.utils
import tensorflow as tf
from keras.layers import Embedding, Dense, Flatten, Dropout, SpatialDropout1D, TimeDistributed, LSTM, GRU, Bidirectional
from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from keras import Input, Model
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import KFold
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import os

# Hyperparameter and Paths

In [8]:
import os
import tensorflow as tf

num_classes = 2
vocab_size = 5
embedding_dim = 4
window_size = 150
units = 256
dropout = 0.2
numlayer = 2
metrics = [
    'accuracy', 
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall()
]

data_dir = os.path.join("workspace", "baseline")
work_dir = os.path.join("run", "baseline")
training_data_file = "gene_index.01_train_validation_ss_all_pos_train.csv"
validation_data_file = "gene_index.01_train_validation_ss_all_pos_validation.csv"
test_data_file = "gene_index.01_test_ss_all_pos.csv"

# Dictionary

In [9]:
def compute_f1_score(precision, recall):
  f1_score = (2 * precision * recall) / (precision + recall)
  return f1_score

metrics = [
  'accuracy', 
  tf.keras.metrics.Precision(name="precision"),
  tf.keras.metrics.Recall(name="recall"),
]

# Nucleotide order: T, C, A, G
nucleotide_dict = {
    "T": 1,
    "C": 2,
    "A": 3,
    "G": 4, 
    "N": 0
}

exon_intron_dict = {
    "i": 0,
    "E": 1,
    "N": -100,
}

embedding_matrix = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
vocab_size = 4 + 1
embedding_dim = embedding_matrix.shape[1]

def convert_label(y, num_classes):
  if y in [0, 1]:
    return tf.keras.utils.to_categorical(y, num_classes)
  else:
    return [0, 0]

def preprocessing(data_path):
  encoded_sequences = []
  encoded_labels = []
  df = pd.read_csv(data_path)
  for i, r in df.iterrows():
    sequence = r["sequence"]
    label = r["label"]

    # padding sequence.
    encoded_sequence = [nucleotide_dict[a] for a in list(sequence)]
    if len(encoded_sequence) < 150:
      delta = 150 - len(encoded_sequence)
      for j in range(delta):
        encoded_sequence.append(0)

    # padding label.
    encoded_label = [exon_intron_dict[a] for a in list(label)]
    if len(encoded_label) < 150:
      delta = 150 - len(encoded_label)
      for j in range(delta):
        encoded_label.append(-100)

    encoded_sequences.append(
        encoded_sequence
    )
    encoded_labels.append(
        encoded_label
    )
  return encoded_sequences, encoded_labels


# Model BiLSTM

In [10]:
def model_bilstm():
  # Architecture:
  input = Input(shape=(window_size,)) # Input layer
  model = Embedding(vocab_size, 
                    embedding_dim, 
                    weights=[embedding_matrix],
                    input_length = window_size, 
                    trainable=False)(input)
  model = Bidirectional(LSTM(units, return_sequences=True))(model)
  if dropout>0:
    model = Dropout(dropout)(model)
  if numlayer==2:
    model = Bidirectional(LSTM(units, return_sequences=True))(model)
    if dropout>0:
      model = Dropout(dropout)(model)
  out = TimeDistributed(Dense(num_classes, activation="softmax"))(model)  # TimeDistributed wrapper layer, return sequences. Fully connected layer. 
  model = Model(input, out)
  opt = tf.keras.optimizers.Adam(learning_rate=0.0001, decay=1e-6)
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=metrics)
  return model


# Model BiGRU

In [11]:
def model_bigru():
  # Architecture:
  input = Input(shape=(window_size,)) # Input layer
  model = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                          input_length = window_size, trainable=False)(input)
  model = Bidirectional(GRU(units, return_sequences=True))(model)
  if dropout>0:
    model = Dropout(dropout)(model)
  if numlayer==2:
    model = Bidirectional(GRU(units, return_sequences=True))(model)
    if dropout>0:
      model = Dropout(dropout)(model)
  out = TimeDistributed(Dense(num_classes, activation="softmax"))(model)  # TimeDistributed wrapper layer, return sequences. Fully connected layer. 
  model = Model(input, out)
  opt = tf.keras.optimizers.Adam(learning_rate=0.0001, decay=1e-6)
  model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=metrics)
  return model


# Training

In [13]:
from tqdm import tqdm

scenarios = ["stride1"] #, "ss1"]

for s in tqdm(scenarios, total=1, desc="Processing"):
    cur_data_dir = os.path.join(data_dir, s)
    training_data_path = os.path.join(cur_data_dir, training_data_file)
    validation_data_path = os.path.join(cur_data_dir, validation_data_file)
    test_data_path = os.path.join(cur_data_dir, test_data_file)

    model_dir = os.path.join(work_dir, s, "model")
    log_dir = os.path.join(work_dir, s, "log")

    for p in [model_dir, log_dir]:
        os.makedirs(p, exist_ok=True)

    X_train, Y_train = preprocessing(training_data_path)
    X_train = np.array(X_train)
    Y_train = np.array([[convert_label(_y, num_classes) for _y in y] for y in Y_train])
    X_val, Y_val = preprocessing(validation_data_path)
    X_val = np.array(X_val)
    Y_val = np.array([[convert_label(_y, num_classes) for _y in y] for y in Y_val])
    X_test, Y_test = preprocessing(test_data_path)
    X_test = np.array(X_test)
    Y_test = np.array([[convert_label(_y, num_classes) for _y in y] for y in Y_test])

    model_bilstm = model_bilstm()
    model_bigru = model_bigru()
    model_collection = [("bilstm", model_bilstm), ("bigru", model_bigru)]

    for model_name, model in model_collection:
        train_history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=20, batch_size=48)
        model.save(
            os.path.join(model_dir, f"model_{model_name}.h5")
        )
        # put history into single data map.
        history_keys = (train_history.history.keys())
        print(f"training and validation history {history_keys}")
        data = {}
        for k in history_keys:
            data[k] = train_history.history[k]
        
        # compute f1 score.
        train_f1_score = []
        val_f1_score = []

        for p, r in zip(data.get("precision"), data.get("recall")):
            train_f1_score.append(
                compute_f1_score(p, r)
            )

        for p, r in zip(data.get("val_precision"), data.get("val_recall")):
            val_f1_score.append(
                compute_f1_score(p, r)
            )
        
        data["f1_score"] = train_f1_score
        data["val_f1_score"] = val_f1_score

        training_validation_result_df = pd.DataFrame(data=data)
        training_validation_result_df.to_csv(
            os.path.join(log_dir, f"training_validation_log.arch_{model_name}.csv"), 
            index=False)

    

Processing:   0%|          | 0/1 [19:33<?, ?it/s]


ValueError: The first argument to `Layer.call` must always be passed.