In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Embedding, GlobalMaxPooling1D
from sklearn.metrics import classification_report
import tensorflow.keras.backend as K
import pandas as pd
from keras.models import Model
import numpy as np
import seqdata

# Fix the global seeds so the run is as repeatable as possible under
# "Restart Kernel -> Run All". tf.random.set_seed covers TensorFlow's global RNG;
# np.random.seed covers any code that draws from NumPy's global RNG.
SEED = 7
np.random.seed(SEED)
tf.random.set_seed(SEED)

2023-03-28 18:33:13.384808: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
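# Plot model (kept commented out: `model` is not defined until one of the
# model-building cells further down has been run)
#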
# tf.keras.utils.plot_model(
#     model,
#     to_file='model.png',
#     show_shapes=False,
#     show_dtype=False,
#     show_layer_names=True,
#     rankdir='TB',
#     expand_nested=False,
#     dpi=96,
#     layer_range=None,
#     show_layer_activations=False
# )

## One-hot encoding

In [3]:
# Load the train and test splits using the 'ohe' encoding (one-hot, per the
# section header), train first and test second.
train, test = (seqdata.Seq(split_dir, 'ohe') for split_dir in ('train/', 'test/'))

# pad_data receives both splits and returns the length that is later used as the
# sequence dimension of the model inputs.
max_len = seqdata.pad_data(train, test)

# Run feature extraction for feature ids 1..8 on each split.
# NOTE(review): the boolean differs between train (True) and test (False);
# presumably it marks the training split -- confirm in seqdata.
train.feature_extraction(list(range(1, 9)), True)
test.feature_extraction(list(range(1, 9)), False)

### CNN

In [4]:
# Baseline CNN on one-hot encoded sequences: each sample is a (max_len, 4) array.
cnn_input = Input(shape=(max_len, 4))

# Convolutional block, applied in the listed order: two 3-wide convolutions with
# 64 filters each, dropout at rate 0.5, then max-pooling with pool size 2.
conv_block = (
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Dropout, {'rate': 0.5}),
    (MaxPooling1D, {'pool_size': 2}),
)
x = cnn_input
for layer_cls, layer_kwargs in conv_block:
    # Build each layer right before applying it so layers are created in the
    # same order as they appear in the network.
    x = layer_cls(**layer_kwargs)(x)
cnn_out = Flatten()(x)

# Classification head: one hidden layer, then an 8-way softmax (one unit per class).
dense = Dense(units=128, activation='relu')(cnn_out)
main_output = Dense(units=8, activation='softmax')(dense)

model = Model(inputs=cnn_input, outputs=main_output)

# Compile only -- training happens in the following cell.
# NOTE(review): the tracked metric is Precision at its default threshold, not
# accuracy; confirm that is the intended training-time metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.Precision(name="Precision")],
)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 3339, 4)]         0         
                                                                 
 conv1d (Conv1D)             (None, 3337, 64)          832       
                                                                 
 conv1d_1 (Conv1D)           (None, 3335, 64)          12352     
                                                                 
 dropout (Dropout)           (None, 3335, 64)          0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1667, 64)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 106688)            0         
                                                             

2023-03-28 18:35:35.127018: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-28 18:35:35.129209: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [5]:
# Train on the sequence input only: 10 epochs, mini-batches of 32.
# The returned History object is the cell's displayed value.
model.fit(
    x=train.seqs,
    y=train.labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0f226e6fb0>

In [6]:
# Predicted class scores for the test sequences: one row per sample, one column
# per class (the model ends in an 8-unit softmax).
model_pred = model.predict(test.seqs)

# Turn each row into a one-hot vector marking its highest-scoring class.
# Indexing an identity matrix with the arg-max indices does this in one
# vectorised step; the class count is taken from the predictions rather than
# hard-coded. Ties resolve to the first maximum, as np.argmax always does.
n_classes = model_pred.shape[1]
y_pred = np.eye(n_classes, dtype=int)[np.argmax(model_pred, axis=1)]

# Per-class precision / recall / F1 / support, transposed so each class is a row.
# NOTE(review): assumes test.labels is a one-hot indicator matrix with the same
# column order as test.names -- confirm in seqdata.
pd.DataFrame(
    classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)
).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.472222,0.485714,0.478873,35.0
rRNA,0.880282,0.905797,0.892857,138.0
snRNA,0.75,0.272727,0.4,22.0
mRNA,0.904762,0.919355,0.912,186.0
pre_miRNA,0.596491,0.68,0.635514,50.0
miRNA,0.517241,0.576923,0.545455,52.0
tRNA,0.977273,0.966292,0.971751,89.0
tmRNA,0.984375,0.9,0.940299,70.0
micro avg,0.82866,0.82866,0.82866,642.0
macro avg,0.760331,0.713351,0.722094,642.0


### CNN + Features

In [7]:
# Two-input functional model: a CNN branch over the one-hot sequence plus a
# branch that feeds the extracted feature vector straight into the dense head.

# --- CNN branch: each sample is a (max_len, 4) array ---
cnn_input = Input(shape=(max_len, 4))

# Applied in the listed order: two 3-wide convolutions with 64 filters each,
# dropout at rate 0.5, then max-pooling with pool size 2.
conv_block = (
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Dropout, {'rate': 0.5}),
    (MaxPooling1D, {'pool_size': 2}),
)
x = cnn_input
for layer_cls, layer_kwargs in conv_block:
    # Build each layer right before applying it so layers are created in the
    # same order as they appear in the network.
    x = layer_cls(**layer_kwargs)(x)
cnn_out = Flatten()(x)

# --- Feature branch: a flat vector of 426 values per sample ---
# NOTE(review): 426 is hard-coded; presumably it equals the width of
# train.features -- confirm against seqdata.
feat_extraction_input = Input(shape=(426,))
# The input is already one-dimensional per sample, so Flatten leaves it as is.
feat_extraction_out = Flatten()(feat_extraction_input)

# Join both branches and classify: one hidden layer, then an 8-way softmax.
concat = Concatenate()([cnn_out, feat_extraction_out])
dense = Dense(units=128, activation='relu')(concat)
main_output = Dense(units=8, activation='softmax')(dense)

model = Model(inputs=[cnn_input, feat_extraction_input], outputs=main_output)

# Compile only -- training happens in the following cell.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.Precision(name="Precision")],
)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 3339, 4)]    0           []                               
                                                                                                  
 conv1d_2 (Conv1D)              (None, 3337, 64)     832         ['input_2[0][0]']                
                                                                                                  
 conv1d_3 (Conv1D)              (None, 3335, 64)     12352       ['conv1d_2[0][0]']               
                                                                                                  
 dropout_1 (Dropout)            (None, 3335, 64)     0           ['conv1d_3[0][0]']               
                                                                                            

In [8]:
# Train the two-input model: inputs are passed in the same order as the model's
# `inputs` list (sequences first, extracted features second).
# 10 epochs, mini-batches of 32; the returned History object is the cell's value.
model.fit(
    x=[train.seqs, train.features],
    y=train.labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0f2166a830>

In [9]:
# Predicted class scores for the test set from the two-input model (sequences
# first, extracted features second): one row per sample, one column per class.
model_pred = model.predict([test.seqs, test.features])

# Turn each row into a one-hot vector marking its highest-scoring class.
# Indexing an identity matrix with the arg-max indices does this in one
# vectorised step; the class count is taken from the predictions rather than
# hard-coded. Ties resolve to the first maximum, as np.argmax always does.
n_classes = model_pred.shape[1]
y_pred = np.eye(n_classes, dtype=int)[np.argmax(model_pred, axis=1)]

# Per-class precision / recall / F1 / support, transposed so each class is a row.
# NOTE(review): assumes test.labels is a one-hot indicator matrix with the same
# column order as test.names -- confirm in seqdata.
pd.DataFrame(
    classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)
).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.521739,0.685714,0.592593,35.0
rRNA,0.918367,0.978261,0.947368,138.0
snRNA,0.6,0.409091,0.486486,22.0
mRNA,0.98324,0.946237,0.964384,186.0
pre_miRNA,0.647059,0.66,0.653465,50.0
miRNA,0.520833,0.480769,0.5,52.0
tRNA,1.0,0.988764,0.99435,89.0
tmRNA,0.985294,0.957143,0.971014,70.0
micro avg,0.867601,0.867601,0.867601,642.0
macro avg,0.772067,0.763247,0.763708,642.0


## Label Encoding (Embedding Layer)

In [10]:
# Reload the train and test splits, this time with the 'label' encoding (the
# sequences are fed to an Embedding layer in the models below).
# This rebinds `train`, `test` and `max_len` from the one-hot section.
train, test = (seqdata.Seq(split_dir, 'label') for split_dir in ('train/', 'test/'))

# pad_data receives both splits and returns the length that is later used as the
# sequence dimension of the model inputs.
max_len = seqdata.pad_data(train, test)

# Run feature extraction for feature ids 1..8 on each split.
# NOTE(review): the boolean differs between train (True) and test (False);
# presumably it marks the training split -- confirm in seqdata.
train.feature_extraction(list(range(1, 9)), True)
test.feature_extraction(list(range(1, 9)), False)

### CNN

In [11]:
# CNN on label-encoded sequences: each sample is a vector of max_len integer
# codes, mapped to learned 32-dimensional vectors by an Embedding layer.
cnn_input = Input(shape=(max_len,))

# Applied in the listed order: embedding, two 3-wide convolutions with 64
# filters each, dropout at rate 0.5, then max-pooling with pool size 2.
# NOTE(review): input_dim=5 means codes 0..4 are expected; presumably four
# bases plus a padding code -- confirm against seqdata's label encoding.
conv_block = (
    (Embedding, {'input_dim': 5, 'output_dim': 32, 'input_length': max_len}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Dropout, {'rate': 0.5}),
    (MaxPooling1D, {'pool_size': 2}),
)
x = cnn_input
for layer_cls, layer_kwargs in conv_block:
    # Build each layer right before applying it so layers are created in the
    # same order as they appear in the network.
    x = layer_cls(**layer_kwargs)(x)
cnn_out = Flatten()(x)

# Classification head: one hidden layer, then an 8-way softmax (one unit per class).
dense = Dense(units=128, activation='relu')(cnn_out)
main_output = Dense(units=8, activation='softmax')(dense)

model = Model(inputs=cnn_input, outputs=main_output)

# Compile only -- training happens in the following cell.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.Precision(name="Precision")],
)

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 3339)]            0         
                                                                 
 embedding (Embedding)       (None, 3339, 32)          160       
                                                                 
 conv1d_4 (Conv1D)           (None, 3337, 64)          6208      
                                                                 
 conv1d_5 (Conv1D)           (None, 3335, 64)          12352     
                                                                 
 dropout_2 (Dropout)         (None, 3335, 64)          0         
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 1667, 64)         0         
 1D)                                                             
                                                           

In [12]:
# Train on the label-encoded sequences only: 10 epochs, mini-batches of 32.
# The returned History object is the cell's displayed value.
model.fit(
    x=train.seqs,
    y=train.labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0f20adbd00>

In [13]:
# Predicted class scores for the label-encoded test sequences: one row per
# sample, one column per class (the model ends in an 8-unit softmax).
model_pred = model.predict(test.seqs)

# Turn each row into a one-hot vector marking its highest-scoring class.
# Indexing an identity matrix with the arg-max indices does this in one
# vectorised step; the class count is taken from the predictions rather than
# hard-coded. Ties resolve to the first maximum, as np.argmax always does.
n_classes = model_pred.shape[1]
y_pred = np.eye(n_classes, dtype=int)[np.argmax(model_pred, axis=1)]

# Per-class precision / recall / F1 / support, transposed so each class is a row.
# NOTE(review): assumes test.labels is a one-hot indicator matrix with the same
# column order as test.names -- confirm in seqdata.
pd.DataFrame(
    classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)
).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.586207,0.485714,0.53125,35.0
rRNA,0.828947,0.913043,0.868966,138.0
snRNA,0.769231,0.454545,0.571429,22.0
mRNA,0.9,0.870968,0.885246,186.0
pre_miRNA,0.645161,0.8,0.714286,50.0
miRNA,0.481481,0.5,0.490566,52.0
tRNA,0.966292,0.966292,0.966292,89.0
tmRNA,0.968254,0.871429,0.917293,70.0
micro avg,0.82243,0.82243,0.82243,642.0
macro avg,0.768197,0.732749,0.743166,642.0


### CNN + Features

In [15]:
# Two-input functional model: an Embedding + CNN branch over the label-encoded
# sequence plus a branch that feeds the extracted feature vector straight into
# the dense head.

# --- CNN branch: each sample is a vector of max_len integer codes ---
cnn_input = Input(shape=(max_len,))

# Applied in the listed order: embedding, two 3-wide convolutions with 64
# filters each, dropout at rate 0.5, then max-pooling with pool size 2.
# NOTE(review): input_dim=5 means codes 0..4 are expected; presumably four
# bases plus a padding code -- confirm against seqdata's label encoding.
conv_block = (
    (Embedding, {'input_dim': 5, 'output_dim': 32, 'input_length': max_len}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Conv1D, {'filters': 64, 'kernel_size': 3, 'activation': 'relu'}),
    (Dropout, {'rate': 0.5}),
    (MaxPooling1D, {'pool_size': 2}),
)
x = cnn_input
for layer_cls, layer_kwargs in conv_block:
    # Build each layer right before applying it so layers are created in the
    # same order as they appear in the network.
    x = layer_cls(**layer_kwargs)(x)
cnn_out = Flatten()(x)

# --- Feature branch: a flat vector of 426 values per sample ---
# NOTE(review): 426 is hard-coded; presumably it equals the width of
# train.features -- confirm against seqdata.
feat_extraction_input = Input(shape=(426,))
# The input is already one-dimensional per sample, so Flatten leaves it as is.
feat_extraction_out = Flatten()(feat_extraction_input)

# Join both branches and classify: one hidden layer, then an 8-way softmax.
concat = Concatenate()([cnn_out, feat_extraction_out])
dense = Dense(units=128, activation='relu')(concat)
main_output = Dense(units=8, activation='softmax')(dense)

model = Model(inputs=[cnn_input, feat_extraction_input], outputs=main_output)

# Compile only -- training happens in the following cell.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.Precision(name="Precision")],
)

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 3339)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 3339, 32)     160         ['input_5[0][0]']                
                                                                                                  
 conv1d_6 (Conv1D)              (None, 3337, 64)     6208        ['embedding_1[0][0]']            
                                                                                                  
 conv1d_7 (Conv1D)              (None, 3335, 64)     12352       ['conv1d_6[0][0]']               
                                                                                            

In [16]:
# Train the two-input model: inputs are passed in the same order as the model's
# `inputs` list (label-encoded sequences first, extracted features second).
# 10 epochs, mini-batches of 32; the returned History object is the cell's value.
model.fit(
    x=[train.seqs, train.features],
    y=train.labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0f219ffa90>

In [17]:
# Predicted class scores for the test set from the two-input model (sequences
# first, extracted features second): one row per sample, one column per class.
model_pred = model.predict([test.seqs, test.features])

# Turn each row into a one-hot vector marking its highest-scoring class.
# Indexing an identity matrix with the arg-max indices does this in one
# vectorised step; the class count is taken from the predictions rather than
# hard-coded. Ties resolve to the first maximum, as np.argmax always does.
n_classes = model_pred.shape[1]
y_pred = np.eye(n_classes, dtype=int)[np.argmax(model_pred, axis=1)]

# Per-class precision / recall / F1 / support, transposed so each class is a row.
# NOTE(review): assumes test.labels is a one-hot indicator matrix with the same
# column order as test.names -- confirm in seqdata.
pd.DataFrame(
    classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)
).T



Unnamed: 0,precision,recall,f1-score,support
snoRNA,0.487179,0.542857,0.513514,35.0
rRNA,0.9375,0.978261,0.957447,138.0
snRNA,0.458333,0.5,0.478261,22.0
mRNA,0.994048,0.897849,0.943503,186.0
pre_miRNA,0.738095,0.62,0.673913,50.0
miRNA,0.537313,0.692308,0.605042,52.0
tRNA,0.967033,0.988764,0.977778,89.0
tmRNA,0.970149,0.928571,0.948905,70.0
micro avg,0.859813,0.859813,0.859813,642.0
macro avg,0.761206,0.768576,0.762295,642.0
