In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Bidirectional, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding
from sklearn.metrics import classification_report
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from itertools import product
import pandas as pd
from keras.models import Model
import numpy as np
import seqdata

tf.random.set_seed(7)

2023-04-03 07:26:49.555780: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Plot model
#
# tf.keras.utils.plot_model(
#     model,
#     to_file='model.png',
#     show_shapes=False,
#     show_dtype=False,
#     show_layer_names=True,
#     rankdir='TB',
#     expand_nested=False,
#     dpi=96,
#     layer_range=None,
#     show_layer_activations=False
# )

## One-hot encoding

In [18]:
train_data = seqdata.Seq('train/', 'ohe')
test_data = seqdata.Seq('test/', 'ohe')

max_len = seqdata.pad_data(train_data, test_data)

train_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], True)
test_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], False)

### CNN

In [34]:
# Input layer
input_layer = Input(shape=(max_len, 4))

# CNN layers
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
flatten1 = Flatten()(max_pool1)

# Dense layers
dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(flatten1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 3339, 4)]         0         
                                                                 
 conv1d_21 (Conv1D)          (None, 3337, 128)         1664      
                                                                 
 conv1d_22 (Conv1D)          (None, 3335, 128)         49280     
                                                                 
 dropout_25 (Dropout)        (None, 3335, 128)         0         
                                                                 
 max_pooling1d_14 (MaxPoolin  (None, 1667, 128)        0         
 g1D)                                                            
                                                                 
 flatten_10 (Flatten)        (None, 213376)            0         
                                                          

In [35]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/10
Epoch 7/10
Epoch 7: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 8/10
Epoch 8: early stopping


<keras.callbacks.History at 0x7fae60102110>

In [36]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.617647,0.6,0.608696,35.0
rRNA,0.895833,0.934783,0.914894,138.0
snRNA,0.8,0.363636,0.5,22.0
mRNA,0.87,0.935484,0.901554,186.0
pre_miRNA,0.6,0.66,0.628571,50.0
miRNA,0.45614,0.5,0.477064,52.0
tRNA,0.977273,0.966292,0.971751,89.0
tmRNA,1.0,0.771429,0.870968,70.0
accuracy,0.827103,0.827103,0.827103,0.827103
macro avg,0.777112,0.716453,0.734187,642.0


### CNN + LSTM

In [6]:
# Input layer
input_layer = Input(shape=(max_len, 4))

# CNN layers
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)

lstm1 = LSTM(128)(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3339, 4)]         0         
                                                                 
 conv1d_1 (Conv1D)           (None, 3337, 128)         1664      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1668, 128)        0         
 1D)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 128)               16512     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                           

In [7]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 4/10
Epoch 5/10
Epoch 5: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 6/10
Epoch 6: early stopping


<keras.callbacks.History at 0x7fee38cbeaa0>

In [8]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.0,0.0,0.0,35.0
rRNA,0.214953,1.0,0.353846,138.0
snRNA,0.0,0.0,0.0,22.0
mRNA,0.0,0.0,0.0,186.0
pre_miRNA,0.0,0.0,0.0,50.0
miRNA,0.0,0.0,0.0,52.0
tRNA,0.0,0.0,0.0,89.0
tmRNA,0.0,0.0,0.0,70.0
accuracy,0.214953,0.214953,0.214953,0.214953
macro avg,0.026869,0.125,0.044231,642.0


### CNN + BiLSTM

In [3]:
# Input layer
input_layer = Input(shape=(max_len, 4))

# CNN layers
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)

lstm1 = Bidirectional(LSTM(128))(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

2023-04-02 08:21:04.705592: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-02 08:21:04.707393: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 3339, 4)]         0         
                                                                 
 conv1d (Conv1D)             (None, 3337, 128)         1664      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1668, 128)        0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout (Dropout)           (None, 128)               0     

In [4]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 7/10
Epoch 8/10
Epoch 8: ReduceLROnPlateau reducing learning rate to 8.000000525498762e-06.
Epoch 9/10
Epoch 9: early stopping


<keras.callbacks.History at 0x7fee492d7c70>

In [5]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.357143,0.428571,0.38961,35.0
rRNA,0.621891,0.905797,0.737463,138.0
snRNA,0.25,0.045455,0.076923,22.0
mRNA,0.829016,0.860215,0.844327,186.0
pre_miRNA,0.532258,0.66,0.589286,50.0
miRNA,0.413793,0.230769,0.296296,52.0
tRNA,0.768519,0.932584,0.84264,89.0
tmRNA,1.0,0.042857,0.082192,70.0
accuracy,0.672897,0.672897,0.672897,0.672897
macro avg,0.596577,0.513281,0.482342,642.0


### CNN + Features

In [52]:
input_layer = Input(shape=(max_len, 4))

conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
cnn_out = Flatten()(max_pool1)

# feature extraction input

feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Flatten()(feat_extraction_input)

concat = Concatenate()([cnn_out, feat_extraction_out])

dense1 = Dense(128, activation='relu')(concat)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=[input_layer, feat_extraction_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_20"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_28 (InputLayer)          [(None, 3339, 4)]    0           []                               
                                                                                                  
 conv1d_38 (Conv1D)             (None, 3337, 128)    1664        ['input_28[0][0]']               
                                                                                                  
 conv1d_39 (Conv1D)             (None, 3335, 128)    49280       ['conv1d_38[0][0]']              
                                                                                                  
 dropout_42 (Dropout)           (None, 3335, 128)    0           ['conv1d_39[0][0]']              
                                                                                           

In [53]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit([train_data.seqs, train_data.features], train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/10
Epoch 8/10
Epoch 8: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 9/10
Epoch 9: early stopping


<keras.callbacks.History at 0x7faee446cca0>

In [55]:
# Model prediction
model_pred = model.predict([test_data.seqs, test_data.features])
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.575,0.657143,0.613333,35.0
rRNA,0.894737,0.985507,0.937931,138.0
snRNA,0.666667,0.454545,0.540541,22.0
mRNA,0.983333,0.951613,0.967213,186.0
pre_miRNA,0.666667,0.72,0.692308,50.0
miRNA,0.54902,0.538462,0.543689,52.0
tRNA,1.0,0.988764,0.99435,89.0
tmRNA,1.0,0.885714,0.939394,70.0
accuracy,0.872274,0.872274,0.872274,0.872274
macro avg,0.791928,0.772719,0.778595,642.0


### CNN + BiLSTM + Features

In [19]:
input_layer = Input(shape=(max_len, 4))

conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(input_layer)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
lstm1 = Bidirectional(LSTM(128))(max_pool1)
cnn_out = Flatten()(lstm1)

# feature extraction input

feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Flatten()(feat_extraction_input)

concat = Concatenate()([cnn_out, feat_extraction_out])

dense1 = Dense(128, activation='relu')(concat)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=[input_layer, feat_extraction_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 3339, 4)]    0           []                               
                                                                                                  
 conv1d_8 (Conv1D)              (None, 3337, 128)    1664        ['input_11[0][0]']               
                                                                                                  
 dropout_6 (Dropout)            (None, 3337, 128)    0           ['conv1d_8[0][0]']               
                                                                                                  
 max_pooling1d_6 (MaxPooling1D)  (None, 1668, 128)   0           ['dropout_6[0][0]']              
                                                                                            

In [20]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit([train_data.seqs, train_data.features], train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fee547892d0>

In [21]:
# Model prediction
model_pred = model.predict([test_data.seqs, test_data.features])
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.527778,0.542857,0.535211,35.0
rRNA,0.964539,0.985507,0.97491,138.0
snRNA,0.478261,0.5,0.488889,22.0
mRNA,0.989011,0.967742,0.978261,186.0
pre_miRNA,0.622951,0.76,0.684685,50.0
miRNA,0.531915,0.480769,0.505051,52.0
tRNA,0.966667,0.977528,0.972067,89.0
tmRNA,1.0,0.885714,0.939394,70.0
accuracy,0.869159,0.869159,0.869159,0.869159
macro avg,0.76014,0.762515,0.759808,642.0


## Label Encoding (Embedding Layer)

In [9]:
train_data = seqdata.Seq('train/', 'label')
test_data = seqdata.Seq('test/', 'label')

max_len = seqdata.pad_data(train_data, test_data)

train_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], True)
test_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], False)

### CNN

In [3]:
# Functional Model Keras

# cnn input

input_layer = Input(shape=(max_len,))

embedding1 = Embedding(5, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
flatten1 = Flatten()(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(flatten1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 3339)]            0         
                                                                 
 embedding (Embedding)       (None, 3339, 32)          160       
                                                                 
 conv1d (Conv1D)             (None, 3337, 128)         12416     
                                                                 
 conv1d_1 (Conv1D)           (None, 3335, 128)         49280     
                                                                 
 dropout (Dropout)           (None, 3335, 128)         0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1667, 128)        0         
 )                                                               
                                                             

2023-04-01 21:17:24.654617: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-01 21:17:24.656675: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [4]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 8/10
Epoch 9/10
Epoch 9: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 10/10
Epoch 10: early stopping


<keras.callbacks.History at 0x7fd206af0250>

In [5]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.527778,0.542857,0.535211,35.0
rRNA,0.882759,0.927536,0.904594,138.0
snRNA,0.692308,0.409091,0.514286,22.0
mRNA,0.901554,0.935484,0.918206,186.0
pre_miRNA,0.661017,0.78,0.715596,50.0
miRNA,0.509804,0.5,0.504854,52.0
tRNA,1.0,0.966292,0.982857,89.0
tmRNA,1.0,0.842857,0.914729,70.0
accuracy,0.841121,0.841121,0.841121,0.841121
macro avg,0.771902,0.738015,0.748792,642.0


### CNN + BiLSTM

In [20]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(5, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding1)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)

lstm1 = Bidirectional(LSTM(128))(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 3339)]            0         
                                                                 
 embedding_7 (Embedding)     (None, 3339, 128)         640       
                                                                 
 conv1d_6 (Conv1D)           (None, 3337, 128)         49280     
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 1668, 128)        0         
 1D)                                                             
                                                                 
 bidirectional_2 (Bidirectio  (None, 256)              263168    
 nal)                                                            
                                                                 
 dense_14 (Dense)            (None, 128)               3289

In [21]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/10
Epoch 7/10
Epoch 7: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd104532920>

In [22]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.333333,0.4,0.363636,35.0
rRNA,0.563063,0.905797,0.694444,138.0
snRNA,0.5,0.045455,0.083333,22.0
mRNA,0.819277,0.731183,0.772727,186.0
pre_miRNA,0.555556,0.8,0.655738,50.0
miRNA,0.333333,0.192308,0.243902,52.0
tRNA,0.865979,0.94382,0.903226,89.0
tmRNA,1.0,0.157143,0.271605,70.0
accuracy,0.655763,0.655763,0.655763,0.655763
macro avg,0.621318,0.521963,0.498577,642.0


### CNN + Features

In [13]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(5, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding1)
conv2 = Conv1D(filters=64, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
cnn_out = Flatten()(max_pool1)

# feature extraction input

feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Flatten()(feat_extraction_input)

concat = Concatenate()([cnn_out, feat_extraction_out])

dense = Dense(128, activation='relu')(concat)

output_layer = Dense(8, activation='softmax')(dense)

model = Model(inputs=[input_layer, feat_extraction_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 3339)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 3339, 32)     160         ['input_5[0][0]']                
                                                                                                  
 conv1d_4 (Conv1D)              (None, 3337, 64)     6208        ['embedding_1[0][0]']            
                                                                                                  
 dropout_3 (Dropout)            (None, 3337, 64)     0           ['conv1d_4[0][0]']               
                                                                                            

In [11]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit([train_data.seqs, train_data.features], train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/10
Epoch 7/10
Epoch 7: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 8/10
Epoch 8: early stopping


<keras.callbacks.History at 0x7fee3849a8f0>

In [12]:
# Model prediction
model_pred = model.predict([test_data.seqs, test_data.features])
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.641026,0.714286,0.675676,35.0
rRNA,0.92517,0.985507,0.954386,138.0
snRNA,0.529412,0.409091,0.461538,22.0
mRNA,0.967391,0.956989,0.962162,186.0
pre_miRNA,0.645833,0.62,0.632653,50.0
miRNA,0.517241,0.576923,0.545455,52.0
tRNA,1.0,0.988764,0.99435,89.0
tmRNA,1.0,0.871429,0.931298,70.0
accuracy,0.869159,0.869159,0.869159,0.869159
macro avg,0.778259,0.765374,0.76969,642.0


### CNN + BiLSTM + Features

In [15]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(5, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding1)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
lstm1 = Bidirectional(LSTM(128))(max_pool1)
cnn_out = Flatten()(lstm1)

# feature extraction input

feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Flatten()(feat_extraction_input)

concat = Concatenate()([cnn_out, feat_extraction_out])

dense = Dense(128, activation='relu')(concat)

output_layer = Dense(8, activation='softmax')(dense)

model = Model(inputs=[input_layer, feat_extraction_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 3339)]       0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 3339, 32)     160         ['input_9[0][0]']                
                                                                                                  
 conv1d_7 (Conv1D)              (None, 3337, 64)     6208        ['embedding_3[0][0]']            
                                                                                                  
 dropout_5 (Dropout)            (None, 3337, 64)     0           ['conv1d_7[0][0]']               
                                                                                            

In [16]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit([train_data.seqs, train_data.features], train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 10: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.


<keras.callbacks.History at 0x7fee51bd1210>

In [17]:
# Model prediction
model_pred = model.predict([test_data.seqs, test_data.features])
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.542857,0.542857,0.542857,35.0
rRNA,0.950704,0.978261,0.964286,138.0
snRNA,0.52381,0.5,0.511628,22.0
mRNA,0.988889,0.956989,0.972678,186.0
pre_miRNA,0.685185,0.74,0.711538,50.0
miRNA,0.509091,0.538462,0.523364,52.0
tRNA,0.956522,0.988764,0.972376,89.0
tmRNA,1.0,0.9,0.947368,70.0
accuracy,0.870717,0.870717,0.870717,0.870717
macro avg,0.769632,0.768167,0.768262,642.0


## k-mer Encoding (Embedding layer)

In [2]:
train_data = seqdata.Seq('train/', 'k-mer')
test_data = seqdata.Seq('test/', 'k-mer')

max_len = seqdata.pad_data(train_data, test_data)

train_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], True)
test_data.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], False)

In [3]:
num_words = len([comb for comb in product(['A', 'C', 'G', 'T'], repeat= 6)])

### CNN

In [13]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(num_words, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
flatten1 = Flatten()(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(flatten1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3334)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 3334, 32)          131072    
                                                                 
 conv1d_2 (Conv1D)           (None, 3332, 128)         12416     
                                                                 
 conv1d_3 (Conv1D)           (None, 3330, 128)         49280     
                                                                 
 dropout_2 (Dropout)         (None, 3330, 128)         0         
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1665, 128)        0         
 1D)                                                             
                                                           

In [14]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 6/10
Epoch 7/10
Epoch 7: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 8/10
Epoch 8: early stopping


<keras.callbacks.History at 0x7f9a2421e050>

In [15]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.392857,0.314286,0.349206,35.0
rRNA,0.928571,0.942029,0.935252,138.0
snRNA,0.714286,0.227273,0.344828,22.0
mRNA,0.932642,0.967742,0.949868,186.0
pre_miRNA,0.540984,0.66,0.594595,50.0
miRNA,0.45614,0.5,0.477064,52.0
tRNA,0.917526,1.0,0.956989,89.0
tmRNA,0.983051,0.828571,0.899225,70.0
accuracy,0.82866,0.82866,0.82866,0.82866
macro avg,0.733257,0.679988,0.688378,642.0


### CNN + BiLSTM

In [19]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(num_words, 32, input_length=max_len)(input_layer)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding1)
dropout1 = Dropout(0.5)(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)

lstm1 = Bidirectional(LSTM(128))(max_pool1)

dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm1)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 3334)]            0         
                                                                 
 embedding_3 (Embedding)     (None, 3334, 32)          131072    
                                                                 
 conv1d_5 (Conv1D)           (None, 3332, 128)         12416     
                                                                 
 dropout_6 (Dropout)         (None, 3332, 128)         0         
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 1666, 128)        0         
 1D)                                                             
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              263168    
 nal)                                                      

In [20]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 10/10


<keras.callbacks.History at 0x7f99e07705e0>

In [21]:
# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.361111,0.371429,0.366197,35.0
rRNA,0.909091,0.869565,0.888889,138.0
snRNA,0.109589,0.363636,0.168421,22.0
mRNA,0.830601,0.817204,0.823848,186.0
pre_miRNA,0.681818,0.6,0.638298,50.0
miRNA,0.636364,0.403846,0.494118,52.0
tRNA,0.976471,0.932584,0.954023,89.0
tmRNA,0.785714,0.628571,0.698413,70.0
accuracy,0.733645,0.733645,0.733645,0.733645
macro avg,0.661345,0.623355,0.629026,642.0


### CNN + Features

In [26]:
input_layer = Input(shape=(max_len,))

embedding1 = Embedding(num_words, 32, input_length=max_len)(input_layer)
dropout1 = Dropout(0.2)(embedding1)
conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(dropout1)
conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(conv1)
max_pool1 = MaxPooling1D(pool_size=2)(conv2)
dropout2 = Dropout(0.2)(max_pool1)
cnn_out = Flatten()(dropout2)

# feature extraction input

feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Flatten()(feat_extraction_input)

concat = Concatenate()([cnn_out, feat_extraction_out])

dense1 = Dense(128, activation='relu')(concat)
dropout3 = Dropout(0.2)(dense1)
output_layer = Dense(8, activation='softmax')(dropout3)

model = Model(inputs=[input_layer, feat_extraction_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics= [tf.keras.metrics.Precision(name="Precision")])

model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_17 (InputLayer)          [(None, 3334)]       0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 3334, 32)     131072      ['input_17[0][0]']               
                                                                                                  
 dropout_21 (Dropout)           (None, 3334, 32)     0           ['embedding_8[0][0]']            
                                                                                                  
 conv1d_16 (Conv1D)             (None, 3332, 128)    12416       ['dropout_21[0][0]']             
                                                                                            

In [27]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1)
]

model.fit([train_data.seqs, train_data.features], train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 5/10
Epoch 6/10
Epoch 6: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 7/10
Epoch 7: early stopping


<keras.callbacks.History at 0x7f10c0792770>

In [28]:
# Model prediction
model_pred = model.predict([test_data.seqs, test_data.features])
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.657895,0.714286,0.684932,35.0
rRNA,0.910959,0.963768,0.93662,138.0
snRNA,0.8125,0.590909,0.684211,22.0
mRNA,0.936842,0.956989,0.946809,186.0
pre_miRNA,0.673077,0.7,0.686275,50.0
miRNA,0.591837,0.557692,0.574257,52.0
tRNA,0.967033,0.988764,0.977778,89.0
tmRNA,1.0,0.857143,0.923077,70.0
accuracy,0.873832,0.873832,0.873832,0.873832
macro avg,0.818768,0.791194,0.801745,642.0
