In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.layers import (Concatenate, Conv1D, Dense, Dropout, Flatten,
                                    Input, LSTM, MaxPooling1D, TimeDistributed)
from tensorflow.keras.models import Model
import seqdata
from transformers import AutoTokenizer, BertTokenizer, TFAutoModel


train = seqdata.Seq('train/')
test = seqdata.Seq('test/')

max_len = seqdata.pad_data(train, test)
max_len

train.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], True)
test.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], False)



Error: feat_extraction/train - Acesso negado.
Creating Directory...


  dataframes = pd.concat([pd.read_csv(f) for f in datasets], axis=1)


Error: feat_extraction/test - Acesso negado.
Creating Directory...


# LTSM + feature extraction

In [5]:
# Functional Model Keras

# CNN input
cnn_input = Input(shape=(max_len, 4))

# CNN layers
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(cnn_input)
conv2 = Conv1D(filters=64, kernel_size=3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(pool_size=2)(dropout1)
cnn_output = TimeDistributed(Flatten())(max_pool1)

# LSTM layer
lstm_output = LSTM(128)(cnn_output)

# Feature extraction input
feature_input = Input(shape=(426,))
feature_output = Flatten()(feature_input)

# Concatenate LSTM and feature extraction outputs
concat_output = Concatenate()([lstm_output, feature_output])

# Dense layer
dense_layer = Dense(128, activation='relu')(concat_output)

# Output layer
output_layer = Dense(8, activation='softmax')(dense_layer)

# Model definition
model = Model(inputs=[cnn_input, feature_input], outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=[tf.keras.metrics.Precision(name="Precision")])

model.summary()

# Plot model
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=False
)

# Model training
model.fit([train.seqs, train.features], train.labels, batch_size=64, epochs=10)

# Model prediction
model_predictions = model.predict([test.seqs, test.features])

# Convert predictions to one-hot encoded format
y_pred = []
for row in model_predictions:
    pred = [0 for _ in range(8)]
    pred[np.argmax(row)] = 1
    y_pred.append(pred)

y_pred = np.array(y_pred)

# Classification report
report = classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)
pd.DataFrame(report).T


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 3339, 4)]    0           []                               
                                                                                                  
 conv1d_6 (Conv1D)              (None, 3337, 64)     832         ['input_5[0][0]']                
                                                                                                  
 conv1d_7 (Conv1D)              (None, 3335, 64)     12352       ['conv1d_6[0][0]']               
                                                                                                  
 dropout_5 (Dropout)            (None, 3335, 64)     0           ['conv1d_7[0][0]']               
                                                                                            

Unnamed: 0,precision,recall,f1-score,support
miRNA,0.575758,0.365385,0.447059,52.0
mRNA,0.99435,0.946237,0.969697,186.0
pre_miRNA,0.649123,0.74,0.691589,50.0
rRNA,0.964286,0.978261,0.971223,138.0
snoRNA,0.509434,0.771429,0.613636,35.0
snRNA,0.55,0.5,0.52381,22.0
tmRNA,0.985294,0.957143,0.971014,70.0
tRNA,0.93617,0.988764,0.961749,89.0
micro avg,0.872274,0.872274,0.872274,642.0
macro avg,0.770552,0.780902,0.768722,642.0


 # Only LSTM 

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten, Input, Bidirectional, LSTM, MaxPooling1D, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import seqdata

train_data = seqdata.Seq('train/')
test_data = seqdata.Seq('test/')

seqdata.pad_data(train_data, test_data)

# CNN layers
input_layer = Input(shape=(train_data.seqs.shape[1], 4))
conv1 = Conv1D(128, 3, activation='relu')(input_layer)
conv2 = Conv1D(128, 3, activation='relu')(conv1)
dropout1 = Dropout(0.5)(conv2)
max_pool1 = MaxPooling1D(2)(dropout1)
cnn_output = TimeDistributed(Flatten())(max_pool1)

# Bidirectional LSTM layers
lstm1 = Bidirectional(LSTM(128, return_sequences=True))(cnn_output)
lstm2 = Bidirectional(LSTM(128))(lstm1)

# Dense layers
dense1 = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm2)
dropout2 = Dropout(0.5)(dense1)
output_layer = Dense(8, activation='softmax')(dropout2)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Precision(name="Precision")])
model.summary()

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1)
]

model.fit(train_data.seqs, train_data.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks)

# Model prediction
model_pred = model.predict(test_data.seqs)
y_pred = np.argmax(model_pred, axis=1)
y_true = np.argmax(test_data.labels, axis=1)

# Classification report
report = classification_report(y_true, y_pred, target_names=test_data.names, output_dict=True)
pd.DataFrame(report).T


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3339, 4)]         0         
                                                                 
 conv1d_2 (Conv1D)           (None, 3337, 128)         1664      
                                                                 
 conv1d_3 (Conv1D)           (None, 3335, 128)         49280     
                                                                 
 dropout_2 (Dropout)         (None, 3335, 128)         0         
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 1667, 128)        0         
 1D)                                                             
                                                                 
 time_distributed_1 (TimeDis  (None, 1667, 128)        0         
 tributed)                                                 

Unnamed: 0,precision,recall,f1-score,support
miRNA,0.408163,0.769231,0.533333,52.0
mRNA,0.924855,0.860215,0.891365,186.0
pre_miRNA,0.75,0.48,0.585366,50.0
rRNA,0.791667,0.963768,0.869281,138.0
snoRNA,0.529412,0.257143,0.346154,35.0
snRNA,0.571429,0.181818,0.275862,22.0
tmRNA,0.948276,0.785714,0.859375,70.0
tRNA,0.910112,0.910112,0.910112,89.0
accuracy,0.788162,0.788162,0.788162,0.788162
macro avg,0.729239,0.651,0.658856,642.0


# LTSM OTIMIZADO

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.layers import (BatchNormalization, Concatenate, Conv1D, Dense, Dropout, Flatten,
                                    Input, LSTM, MaxPooling1D)
from tensorflow.keras.models import Model
import seqdata
from transformers import AutoTokenizer, BertTokenizer, TFAutoModel

# Functional Model Keras

# cnn input
cnn_input = Input(shape=(max_len, 4))

x = Conv1D(filters=128, kernel_size=3, activation='relu')(cnn_input)
x = BatchNormalization()(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(pool_size=2)(x)
cnn_out = Flatten()(x)

# lstm layer
lstm_input = Input(shape=(max_len, 4))
lstm_out = LSTM(128)(lstm_input)

# feature extraction input
feat_extraction_input = Input(shape=(426,))
feat_extraction_out = Dense(128, activation='relu')(feat_extraction_input)

concat = Concatenate()([cnn_out, lstm_out, feat_extraction_out])

dense = Dense(256, activation='relu')(concat)
dense = Dropout(0.5)(dense)

main_output = Dense(8, activation='softmax')(dense)

model = Model(inputs=[cnn_input, lstm_input, feat_extraction_input], outputs=main_output)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=[tf.keras.metrics.Precision(name="Precision")])

model.summary()

tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=False
)

model.fit([train.seqs, train.seqs, train.features], train.labels, batch_size=32, epochs=10, validation_split=0.1)

model_pred = model.predict([test.seqs, test.seqs, test.features])

y_pred = []

for row in model_pred:
    pred = [0 for i in range(8)]
    pred[np.argmax(row)] = 1
    y_pred.append(pred)

y_pred = np.array(y_pred)

pd.DataFrame(classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)).T


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 3339, 4)]    0           []                               
                                                                                                  
 conv1d_10 (Conv1D)             (None, 3337, 128)    1664        ['input_9[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 3337, 128)   512         ['conv1d_10[0][0]']              
 alization)                                                                                       
                                                                                                  
 conv1d_11 (Conv1D)             (None, 3335, 128)    49280       ['batch_normalization[0][0]

# CNNs 1D e BiLSTM

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from tensorflow.keras.layers import (Concatenate, Conv1D, Dense, Dropout, Flatten,
                                    Input, Bidirectional, LSTM, MaxPooling1D, TimeDistributed)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import seqdata

train = seqdata.Seq('train/')
test = seqdata.Seq('test/')

max_len = seqdata.pad_data(train, test)

#train.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], True)
#test.feature_extraction([1, 2, 3, 4, 5, 6, 7, 8], False)
# cnn input
cnn_input = Input(shape=(max_len, 4))

x = Conv1D(filters=128, kernel_size=3, activation='relu')(cnn_input)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(pool_size=2)(x)
cnn_out = TimeDistributed(Flatten())(x)

# bidirectional lstm layers
lstm_out1 = Bidirectional(LSTM(128, return_sequences=True))(cnn_out)
lstm_out2 = Bidirectional(LSTM(128))(lstm_out1)

dense = Dense(128, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4))(lstm_out2)
dense = Dropout(0.5)(dense)

main_output = Dense(8, activation='softmax')(dense)

model = Model(inputs=cnn_input, outputs=main_output)

# Compile and train the model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=[tf.keras.metrics.Precision(name="Precision")])

model.summary()

early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1)

model.fit(train.seqs, train.labels, batch_size=32, epochs=10, validation_split=0.1, callbacks=[early_stopping, reduce_lr])

model_pred = model.predict(test.seqs)

y_pred = []

for row in model_pred:
    pred = [0 for i in range(8)]
    pred[np.argmax(row)] = 1
    y_pred.append(pred)

y_pred = np.array(y_pred)

pd.DataFrame(classification_report(test.labels, y_pred, target_names=test.names, output_dict=True)).T



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 3339, 4)]         0         
                                                                 
 conv1d (Conv1D)             (None, 3337, 128)         1664      
                                                                 
 conv1d_1 (Conv1D)           (None, 3335, 128)         49280     
                                                                 
 dropout (Dropout)           (None, 3335, 128)         0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 1667, 128)        0         
 )                                                               
                                                                 
 time_distributed (TimeDistr  (None, 1667, 128)        0         
 ibuted)                                                     

Unnamed: 0,precision,recall,f1-score,support
miRNA,0.333333,0.365385,0.348624,52.0
mRNA,0.854369,0.946237,0.897959,186.0
pre_miRNA,0.632353,0.86,0.728814,50.0
rRNA,0.972727,0.775362,0.862903,138.0
snoRNA,0.456522,0.6,0.518519,35.0
snRNA,0.454545,0.227273,0.30303,22.0
tmRNA,0.924242,0.871429,0.897059,70.0
tRNA,0.935897,0.820225,0.874251,89.0
micro avg,0.786604,0.786604,0.786604,642.0
macro avg,0.695499,0.683239,0.678895,642.0
