In [1]:
from bioinformatics import na_read as nr
#from bioinformatics import NCBIDataset as nds
from bioinformatics import FASTADataset as fads
from bioinformatics import KmerVectors as kvec

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.naive_bayes import GaussianNB
#from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Data locations, relative to the notebook: the NGDC download root and the
# iDog subdirectory underneath it.
NGDC_PATH = "../data/bioinformatics/ngdc/"
IDOG_PATH = f"{NGDC_PATH}idog/"
#!ls -al $IDOG_PATH

In [3]:
# CDS FASTA files for the two classes, both located under IDOG_PATH.
dhole_cds_dataset_file = f"{IDOG_PATH}dhole.cds.fa"
wolfe_cds_dataset_file = f"{IDOG_PATH}wolf.cds.fa"
#!ls $dhole_cds_dataset_file
#!ls $wolfe_cds_dataset_file

In [4]:
# One FASTADataset per species; the first argument is the name that later
# shows up as the class label ('dhole' / 'wolfe') in kv_fasta.labels.
dhole_cds_fads = fads.FASTADataset(
    'dhole',
    dhole_cds_dataset_file,
)
wolfe_cds_fads = fads.FASTADataset(
    'wolfe',
    wolfe_cds_dataset_file,
)

In [5]:
#dhole_cds_fads.fasta_dataset

In [6]:
#wolfe_cds_fads.fasta_dataset

In [7]:
# Build the k-mer vectoriser over the DNA alphabet with k = 6 and attach both
# FASTA datasets. The recorded output reports 4096 dictionary entries and the
# label mapping {'dhole': 1, 'wolfe': 2}.
kv_fasta = kvec.KmerVectors(
    ['A', 'G', 'C', 'T'],
    6,
    fastadatasets=[dhole_cds_fads, wolfe_cds_fads],
    verbose=True,
)
print(f'dictionary size: [{len(kv_fasta.dict)}]')
print(kv_fasta.labels)

KmerVectors Object -
alphabet [['A', 'G', 'C', 'T']]
dict: [['AAAAAA', 'AAAAAG', 'AAAAAC', 'AAAAAT']]...[['TTTTTA', 'TTTTTG', 'TTTTTC', 'TTTTTT']]
Labels: [{'dhole': 1, 'wolfe': 2}]
[dhole]
[../data/bioinformatics/ngdc/idog/dhole.cds.fa]
[wolfe]
[../data/bioinformatics/ngdc/idog/wolf.cds.fa]
dictionary size: [4096]
{'dhole': 1, 'wolfe': 2}


In [8]:
def get_metrics(y_test, y_predicted):
    """Score predictions against ground-truth labels.

    Args:
        y_test: true class labels.
        y_predicted: predicted class labels, aligned with ``y_test``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
        and F1 are computed with ``average='weighted'``.
    """
    weighted = {'average': 'weighted'}
    return (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **weighted),
        recall_score(y_test, y_predicted, **weighted),
        f1_score(y_test, y_predicted, **weighted),
    )

In [None]:
# Convert the registered FASTA sequences into k-mer "sentences".
# NOTE(review): the exact meaning of seq_type / base_count_max / length_min /
# dataset_limit is defined by KmerVectors.seq2KmerSentences -- confirm there.
df_fasta = kv_fasta.seq2KmerSentences(
    seq_type='fna2',
    base_count_max=4,
    length_min=1000,
    dataset_limit=10000,
    verbose=True,
)

FASTA Dataset
seq2KmerSentencesFASTA
fasta dataset: [dhole]
10002000300040005000600070008000900010000110001200013000140001500016000

In [None]:
# Wrap the k-mer sentence records in a DataFrame and display it.
df_fasta = pd.DataFrame(df_fasta)
df_fasta

In [None]:
# Pull out the 'v2' column (used below as the model input X) for inspection.
texts = df_fasta.loc[:, 'v2']
texts

In [None]:
# Translate the values in column "v1" through kv_fasta.labels (recorded
# earlier as {'dhole': 1, 'wolfe': 2}). replace() returns a new frame, so
# df_fasta itself is left untouched. The former `df = pd.DataFrame(...)` line
# was a dead store, overwritten immediately by this assignment.
df = df_fasta.replace({"v1": kv_fasta.labels})

df

In [None]:
# Bar chart of how many samples fall into each class label.
# The series is passed as `x=` explicitly: in seaborn >= 0.12 the first
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the x variable.
sns.countplot(x=df.v1)
plt.xlabel('Class Label')
plt.title('Class distribution of Dhole(1) and Wolf(2)')
plt.show()

In [None]:
# Features are the k-mer sentences ('v2'); targets are the class ids ('v1').
X = df.v2
# LabelEncoder re-indexes the class ids to consecutive integers starting at 0,
# and the reshape turns the result into a single-column 2-D array.
le = LabelEncoder()
Y = le.fit_transform(df.v1).reshape(-1, 1)

Split into training and test data.

In [None]:
# Hold out 33% of the samples for testing. A fixed random_state makes the
# split identical on every Restart & Run All, so results are comparable
# between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
# Sanity check: feature and label counts must match within each partition.
print(
    f'X_train len: [{len(X_train)}]',
    f'y_train len: [{len(Y_train)}]',
    f'X_test len: [{len(X_test)}]',
    f'y_test len: [{len(Y_test)}]',
    sep='\n',
)

In [None]:
# Display the held-out label array produced by train_test_split.
Y_test

### Process the data
* Tokenize the data and convert the text to sequences.
* Add padding to ensure that all the sequences have the same shape.
* There are many ways of choosing `max_len`; here an arbitrary length of 1000 is used.

In [None]:
# Vocabulary cap handed to the Tokenizer, and the fixed sequence length that
# every sample is padded or truncated to.
# NOTE(review): the k-mer dictionary above has 4096 entries, so a cap of 100
# keeps only a small part of the vocabulary -- confirm this is intended
# (1000 was tried previously).
max_words = 100
max_len = 1000

# Fit the vocabulary on the training texts only, then turn each text into a
# list of integer token ids.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)

# Bring every sequence to exactly max_len entries.
sequences_matrix = sequence.pad_sequences(
    sequences,
    maxlen=max_len,
)

### RNN
Define the RNN structure.

In [None]:
#tf.keras.layers.Embedding(
#    input_dim,
#    output_dim,
#    embeddings_initializer="uniform",
#    embeddings_regularizer=None,
#    activity_regularizer=None,
#    embeddings_constraint=None,
#    mask_zero=False,
#    input_length=None,
#    **kwargs
#)
def RNN():
    """Assemble the uncompiled Embedding -> LSTM -> Dense binary classifier.

    Reads the notebook-level ``max_len`` (input sequence length) and
    ``max_words`` (embedding vocabulary size), so the tokenizer cell must
    have been run first.

    Returns:
        A Keras ``Model`` taking ``max_len`` token ids per sample and
        producing a single sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    hidden = Embedding(max_words, 5000, input_length=max_len)(token_ids)
    hidden = LSTM(64)(hidden)
    # Fully connected head: 256 ReLU units with 50% dropout.
    hidden = Dense(256, name='FC1')(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    # Single output unit squashed by a sigmoid.
    output = Dense(1, name='out_layer')(hidden)
    output = Activation('sigmoid')(output)
    return Model(inputs=token_ids, outputs=output)

Call the function and compile the model.

In [None]:
# Build the network, print its layer summary, then compile it for binary
# classification: sigmoid output paired with binary cross-entropy.
model = RNN()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer="Adam",
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, with 20% of the training matrix held out by Keras as a
# validation set. Earlier runs used EarlyStopping(monitor='val_loss'),
# optionally with min_delta=0.0001, and also tried validation_split=0.4;
# callbacks are currently disabled so that all epochs run.
history = model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[],
)

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()