In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import os
os.chdir("..")
os.chdir("Preprocessing")
from preprocessing import *
import selfies as sf

# Load the kcat table (via the project's `preprocessing` helper) and the
# encoded_amino.csv feature table.
# NOTE(review): this header previously read "Ensemble learning and random forest",
# but the cells below build a Transformer encoder -- confirm the intended title.
#
# The directory is a raw string so the Windows backslashes stay literal. The
# earlier non-raw literals depended on invalid escape sequences (backslash
# followed by G, D, B, K, k, e), which newer Python versions warn about and
# will eventually reject.
# NOTE(review): this is still an absolute, machine-specific path -- consider
# making it configurable.
DATA_DIR = r"C:\Users\Gilbert\Documents\BCB_Research\Kcat_Benchmark_ML_Models\Data"

df = preprocessing(os.path.join(DATA_DIR, "kcat_transferase.csv"))
encoded_df = pd.read_csv(os.path.join(DATA_DIR, "encoded_amino.csv"))

In [17]:
import tensorflow as tf

# Regression target: base-10 logarithm of the "Kcat" column of the loaded table.
y = np.log10(df["Kcat"])

# Model input: the encoded feature table, used as loaded (it is reshaped in a later cell).
x = encoded_df

In [19]:

# Reshape the flat feature table into a 3-D array of
# (samples, 2511 sequence positions, 21 one-hot channels).
# - The sample count comes from the data (len(x)) instead of a hard-coded 4136,
#   so a dataset with a different number of rows needs no code change.
# - np.asarray makes the DataFrame -> ndarray conversion explicit, so x_new is
#   always a plain ndarray.
# - On failure the error is reported and re-raised: swallowing it would leave
#   x_new undefined (or stale from an earlier run) for the cells that follow.
try:
    x_new = np.asarray(x).reshape(len(x), 2511, 21)
    print("Reshape successful, new shape:", x_new.shape)
except ValueError as e:
    print("Reshape failed:", e)
    raise

Reshape successful, new shape: (4136, 2511, 21)


In [22]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y,
    test_size=0.2,
    random_state=42,
)

[[[0 0 0 ... 0 0 0]
  [1 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 ...

 [[0 0 0 ... 0 0 0]
  [1 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [1 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 1]]]


In [None]:
def scaled_dot_product_attention(q, k, v):
    """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

    Args:
        q: Query tensor.
        k: Key tensor; the size of its last axis is used as d_k for scaling.
        v: Value tensor.

    Returns:
        The attention-weighted combination of ``v``. The softmax is taken
        over the last axis of the scaled logits. No mask is applied.
    """
    key_dim = tf.cast(tf.shape(k)[-1], tf.float32)
    # Scale the raw q.k scores by sqrt(d_k) before the softmax.
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_dim)
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v)

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention layer.

    Queries, keys and values are each projected with their own Dense layer,
    split into ``num_heads`` heads of width ``d_model // num_heads``, passed
    through ``scaled_dot_product_attention``, merged back to width
    ``d_model`` and sent through a final Dense projection.

    Args:
        d_model: Width of the projections and of the output; must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        **kwargs: Standard Keras ``Layer`` keyword arguments (``name``,
            ``dtype``, ...). Accepting them lets Keras rebuild the layer
            from its config.
    """

    def __init__(self, d_model, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = d_model // self.num_heads
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update({"d_model": self.d_model, "num_heads": self.num_heads})
        return config

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, seq, heads, depth), then transpose to (batch, heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q):
        """Apply multi-head attention.

        Note the positional order: value, key, query.

        Returns:
            Tensor reshaped to (batch, -1, d_model) and passed through the
            output Dense layer.
        """
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention = scaled_dot_product_attention(q, k, v)
        # Move the head axis back next to depth so the heads can be merged.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output

def get_positional_encoding(sequence_length, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        sequence_length: Number of positions to encode.
        d_model: Number of channels per position.

    Returns:
        A float32 tensor of shape (1, sequence_length, d_model). The sine
        values (taken from the even-indexed channels) come first, followed
        by the cosine values (odd-indexed channels); the two halves are
        concatenated rather than interleaved.
    """
    channel_idx = np.arange(d_model)[np.newaxis, :]
    positions = np.arange(sequence_length)[:, np.newaxis]
    # Channels 2i and 2i+1 share the same exponent 2i / d_model.
    inv_freq = 1 / np.power(10000, (2 * (channel_idx // 2)) / np.float32(d_model))
    angle_rads = positions * inv_freq
    halves = [np.sin(angle_rads[:, 0::2]), np.cos(angle_rads[:, 1::2])]
    # Add a leading axis of size 1 so the table broadcasts over the batch.
    table = np.concatenate(halves, axis=-1)[np.newaxis, ...]
    return tf.cast(table, dtype=tf.float32)


def encoder_layer(units, d_model, num_heads, dropout_rate):
    """Build one Transformer encoder block as a Keras functional model.

    The block is self-attention -> dropout -> residual add + layer norm,
    followed by a two-layer feed-forward network -> dropout -> residual add
    + layer norm.

    Args:
        units: Width of the hidden (ReLU) feed-forward layer.
        d_model: Channel width of the block's input and output.
        num_heads: Number of attention heads.
        dropout_rate: Rate used by both Dropout layers.

    Returns:
        A ``tf.keras.Model`` taking inputs of shape (None, d_model) per sample.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=(None, d_model))

    # Self-attention sub-layer: the same tensor serves as value, key and query.
    attended = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended)
    attn_out = layers.LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Position-wise feed-forward sub-layer.
    hidden = layers.Dense(units, activation='relu')(attn_out)
    projected = layers.Dense(d_model)(hidden)
    projected = layers.Dropout(dropout_rate)(projected)
    outputs = layers.LayerNormalization(epsilon=1e-6)(attn_out + projected)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

def encoder(sequence_length, d_model, num_heads, dropout_rate, num_layers, units):
    """Stack ``num_layers`` encoder blocks on top of positionally-encoded inputs.

    Args:
        sequence_length: Number of positions in each input sequence.
        d_model: Channel width of the inputs and of every block.
        num_heads: Number of attention heads per block.
        dropout_rate: Dropout rate passed to every block.
        num_layers: How many encoder blocks to chain.
        units: Hidden width of each block's feed-forward network.

    Returns:
        A ``tf.keras.Model`` mapping (sequence_length, d_model) inputs to
        the output of the last encoder block.
    """
    inputs = tf.keras.Input(shape=(sequence_length, d_model))
    pos_table = get_positional_encoding(sequence_length, d_model)

    # Add the positional table, sliced to the input's sequence axis.
    seq_len = tf.shape(inputs)[1]
    hidden = inputs + pos_table[:, :seq_len, :]

    for _ in range(num_layers):
        block = encoder_layer(units, d_model, num_heads, dropout_rate)
        hidden = block(hidden)

    return tf.keras.Model(inputs=inputs, outputs=hidden)

def build_model(sequence_length, one_hot_dim, num_layers, units, d_model, num_heads, dropout_rate):
    """Assemble the full regression network.

    Pipeline: Dense projection to ``d_model`` -> Transformer encoder stack
    -> global average pooling over the sequence axis -> Dense(128, relu)
    -> Dropout(0.2) -> Dense(64, relu) -> Dense(1).

    Args:
        sequence_length: Number of positions per input sequence.
        one_hot_dim: Channel width of each input position.
        num_layers: Number of encoder blocks.
        units: Hidden width of each block's feed-forward network.
        d_model: Channel width used inside the encoder.
        num_heads: Number of attention heads per block.
        dropout_rate: Dropout rate used inside the encoder blocks.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single scalar output per sample.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=(sequence_length, one_hot_dim))

    # Project each input position to the encoder width.
    embedded = layers.Dense(d_model)(inputs)
    encoder_stack = encoder(sequence_length, d_model, num_heads, dropout_rate, num_layers, units)
    encoded = encoder_stack(embedded)

    # Collapse the sequence axis, then regress with a small dense head.
    pooled = layers.GlobalAveragePooling1D()(encoded)
    head = layers.Dense(128, activation='relu')(pooled)
    head = layers.Dropout(0.2)(head)
    head = layers.Dense(64, activation='relu')(head)
    outputs = layers.Dense(1)(head)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Hyperparameters
SEQUENCE_LENGTH = 2511  # Changed from 100 to 2511
ONE_HOT_DIM = 21        # channel width of each sequence position
NUM_LAYERS = 4          # encoder blocks
UNITS = 512             # feed-forward hidden width inside each block
D_MODEL = 128           # encoder channel width
NUM_HEADS = 8           # attention heads per block (D_MODEL must be divisible by this)
DROPOUT_RATE = 0.1      # dropout inside the encoder blocks

# Build the network from the constants above and print its layer table.
model = build_model(
    sequence_length=SEQUENCE_LENGTH,
    one_hot_dim=ONE_HOT_DIM,
    num_layers=NUM_LAYERS,
    units=UNITS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dropout_rate=DROPOUT_RATE,
)
model.summary()

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 100, 20)]         0         
                                                                 
 dense_28 (Dense)            (None, 100, 128)          2688      
                                                                 
 model_10 (Functional)       (None, 100, 128)          793088    
                                                                 
 global_average_pooling1d_1  (None, 128)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_53 (Dense)            (None, 128)               16512     
                                                                 
 dropout_17 (Dropout)        (None, 128)               0         
                                                          

Model: "model_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 2511, 21)]        0         
                                                                 
 dense_56 (Dense)            (None, 2511, 128)         2816      
                                                                 
 model_16 (Functional)       (None, 2511, 128)         793088    
                                                                 
 global_average_pooling1d_2  (None, 128)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_81 (Dense)            (None, 128)               16512     
                                                                 
 dropout_26 (Dropout)        (None, 128)               0         
                                                          

In [25]:
# Train with Adam on mean-squared error, tracking mean absolute error.
# 10% of the training split is held out for validation during fitting.
# NOTE(review): the traceback recorded under this cell names "model_11" with
# expected shape (None, 100, 20), i.e. a model built before the hyperparameters
# were changed -- re-run the model-building cell before this one.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/100


ValueError: in user code:

    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\engine\training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Gilbert\anaconda3\envs\bcb_2\Lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_11" is incompatible with the layer: expected shape=(None, 100, 20), found shape=(None, 2511, 21)


In [None]:
# Evaluate on the held-out split. The split cell names the test features
# `x_test` (lower-case); the previous `X_test` was never defined and raised
# a NameError.
# model.evaluate returns the loss followed by the compiled metrics (here MAE).
test_loss, test_mae = model.evaluate(x_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test MAE: {test_mae}")