In [None]:
import pandas as pd
import numpy as np
import os

import joblib

from keras import utils as np_utils
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, MaxPooling2D, Conv2D, LSTM, GRU, Bidirectional
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras

In [None]:
# Read the tab-separated variant sequence table into a DataFrame.
ALIGNMENT_PATH = '/content/variants.aln4.fas.txt'
seq_data = pd.read_csv(ALIGNMENT_PATH, sep='\t')

In [None]:
# Preview the first five rows to confirm how the columns were parsed.
seq_data.head(n=5)

Unnamed: 0,Seq_Id,sequence,Variant_Id
0,hCoV19/gamma/Italy/CAMUniSa10/2021|EPI_ISL_101...,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,gamma
1,hCoV19/gamma/Italy/CAMUniSa23/2021|EPI_ISL_101...,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,gamma
2,hCoV19/gamma/Italy/CAMUniSa111/2021|EPI_ISL_10...,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,gamma
3,hCoV19/gamma/South Korea/NMCnCoV09/2021|EPI_IS...,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,gamma
4,hCoV19/gamma/Brazil/PRBT74803FI/2021|EPI_ISL_9...,atgtttgtttttcttgttttattgccactagtctctagtcagtgtg...,gamma


In [None]:
# Show the variant label of every row before it is converted to an integer code.
seq_data["Variant_Id"].to_numpy()

array(['gamma', 'gamma', 'gamma', 'gamma', 'gamma', 'gamma', 'gamma',
       'gamma', 'gamma', 'gamma', 'gamma', 'gamma', 'gamma', 'gamma',
       'gamma', 'gamma', 'gamma', 'gamma', 'gamma', 'beta', 'beta',
       'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta',
       'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'beta',
       'beta', 'beta', 'beta', 'beta', 'beta', 'beta', 'delta', 'delta',
       'delta', 'delta', 'delta', 'delta', 'delta', 'delta', 'delta',
       'delta', 'delta', 'delta', 'delta', 'delta', 'delta', 'delta',
       'delta', 'beta', 'Alpha', 'Alpha', 'Alpha', 'Alpha', 'Alpha',
       'Alpha', 'Alpha', 'Alpha', 'Alpha', 'Alpha', 'omicron', 'omicron',
       'omicron', 'omicron', 'omicron', 'omicron', 'omicron', 'omicron',
       'omicron', 'omicron', 'omicron', 'omicron', 'omicron', 'omicron',
       'omicron'], dtype=object)

In [None]:
# Integer code assigned to each variant label.
variant = {"gamma" : 1, "delta" : 2, "beta":3, "Alpha":4, "omicron":5}

# In one chained expression: discard the Seq_Id column, swap the variant
# labels for their integer codes and upper-case the nucleotide strings.
seq_data = (
    seq_data
    .drop(columns=["Seq_Id"])
    .assign(
        Variant_Id=lambda frame: frame["Variant_Id"].map(variant),
        sequence=lambda frame: frame["sequence"].str.upper(),
    )
)

In [None]:
# Preview the frame again now that labels are numeric and Seq_Id is gone.
seq_data.head(n=5)

Unnamed: 0,sequence,Variant_Id
0,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,1
1,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,1
2,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,1
3,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,1
4,ATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTG...,1


In [None]:
# Check characters in the sequence column: gather every symbol of every
# sequence, then reduce to the distinct ones.
ntds = [symbol.strip() for sequence in seq_data['sequence'] for symbol in sequence]

list(set(ntds))

['A', 'T', '-', 'N', 'G', 'C']

In [None]:
# Pull the sequence column out of the frame as a plain Python list.
sequence_data = seq_data['sequence'].tolist()

In [None]:
encoded_list =[]

# Symbol -> 6-element indicator row. Built once at cell level instead of on
# every encode_seq() call.
# NOTE(review): '-' and 'N' both map to the all-zero row, so the last two of
# the six channels are never set and gaps cannot be told apart from unknown
# bases. Confirm this is intended rather than a missing one-hot slot each.
_NUCLEOTIDE_ROWS = {
    'A': [1, 0, 0, 0, 0, 0],
    'T': [0, 1, 0, 0, 0, 0],
    'C': [0, 0, 1, 0, 0, 0],
    'G': [0, 0, 0, 1, 0, 0],
    '-': [0, 0, 0, 0, 0, 0],
    'N': [0, 0, 0, 0, 0, 0],
}

def encode_seq(s):
    """Encode a sequence string as a list of 6-element indicator rows.

    Parameters
    ----------
    s : iterable of str
        Sequence symbols; each must be one of 'A', 'T', 'C', 'G', '-', 'N'
        (upper case).

    Returns
    -------
    list of list of int
        One row per symbol, in input order. Rows are shared with the lookup
        table, so treat them as read-only.

    Raises
    ------
    KeyError
        If ``s`` contains a symbol that is not in the lookup table.
    """
    return [_NUCLEOTIDE_ROWS[x] for x in s]

# Encode each sequence and collect the per-sequence matrices.
encoded_list.extend(encode_seq(sequence) for sequence in sequence_data)

# Stack into one array of shape (n_sequences, sequence_length, 6).
# NOTE(review): this yields a regular 3-D array only if every sequence has
# the same length - presumably guaranteed by the alignment; confirm.
X_mine = np.asarray(encoded_list)
X_mine.shape

(86, 3831, 6)

In [None]:
# Spot-check the encoding on the first ten positions of one sequence; showing
# the whole sequence would print thousands of rows into the notebook output.
encoded_list[1][:10]

[[1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 0],
 [0, 1, 0, 0,

In [None]:
# List the distinct integer codes present in the label column.
pd.unique(seq_data["Variant_Id"])

array([1, 3, 2, 4, 5])

In [None]:
# Target vector: the integer-coded variant label of each row.
y_mine = seq_data.loc[:, 'Variant_Id']
y_mine.shape

(86,)

In [None]:
# Stratified hold-out split so every variant keeps its share in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X_mine, y_mine, random_state=42, stratify=y_mine
)

# to_categorical() treats labels as zero-based class indices. The variant
# codes start at 1, so passing them unchanged produces one extra, always-zero
# column and a target width that no longer equals the number of classes.
# Shift by the smallest code and pin the width explicitly so the train and
# test targets always have the same number of columns.
label_offset = int(y_mine.min())
n_classes = int(y_mine.max()) - label_offset + 1
y_train = keras.utils.to_categorical(y_train.to_numpy() - label_offset, num_classes=n_classes)
y_test = keras.utils.to_categorical(y_test.to_numpy() - label_offset, num_classes=n_classes)

In [None]:
# define model
# Baseline classifier: flatten each encoded sequence and pass it through one
# hidden layer. The input shape and the number of output units are taken from
# the training arrays so the network matches the data it is fitted on
# (a 2-feature input with a single sigmoid unit cannot accept them).
model = Sequential()
model.add(Flatten(input_shape=X_train.shape[1:]))
model.add(Dense(100, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))

In [None]:
# compile model
# The targets are one-hot rows over mutually exclusive variant classes, so the
# matching loss is categorical cross-entropy rather than the binary form.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# fit model
# Train the model defined and compiled in the preceding cells; gru_model does
# not exist yet at this point in the notebook.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=300, verbose=0, batch_size=128)

In [None]:
#evaluate the model
# Use the split produced by train_test_split (X_train/y_train, X_test/y_test).
# evaluate() returns [loss, accuracy] here because the model was compiled
# with metrics=['accuracy'].
train_acc = model.evaluate(X_train, y_train, verbose=0)
test_acc = model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Candidate hyper-parameter values. Nothing in this cell reads them.
params = dict(
    first_node=[128, 64],
    second_node=[32, 64],
    alpha=[0.001, 0.01],
    first_filter=[9, 16, 32],
    dropout=[0.1, 0.2, 0.5],
)

# NOTE(review): the following cell assigns `params` and `gru_model` again with
# the same leading layers plus a dense head, which makes this cell redundant.
gru_model = Sequential([
    # Layer 1 - feature extraction; convolution plus pooling shorten the sequence axis
    Conv1D(filters=27, kernel_size=4, activation='relu', input_shape=(3831, 6)),  # ensure you change the shape
    MaxPooling1D(pool_size=3),
    Dropout(0.2),
    Conv1D(filters=14, kernel_size=2, activation='relu', padding='same'),
    # Recurrent summary of the convolved sequence, read in both directions
    Bidirectional(GRU(128, activation='relu')),
    Dropout(0.2),
])

In [None]:
# Candidate hyper-parameter values. Nothing in this cell reads them.
params= {
    'first_node': [128, 64],
    'second_node': [32, 64],
    'alpha': [0.001, 0.01],
    'first_filter': [9, 16, 32], 
    'dropout': [0.1, 0.2, 0.5]
}

# Take the per-sample input shape and the number of classes from the training
# arrays instead of hard-coding them, so the network always matches the data.
input_shape = X_train.shape[1:]
n_outputs = y_train.shape[1]

gru_model = Sequential()

#Layer 1 - feature extraction
#Performs dimension reduction

gru_model.add(Conv1D(filters = 27, kernel_size = (4), activation = 'relu', input_shape = input_shape))
gru_model.add(MaxPooling1D(pool_size= (3)))
gru_model.add(Dropout(0.2))
gru_model.add(Conv1D(filters = 14, kernel_size = (2), activation = 'relu', padding = 'same'))


# Bidirectional GRU reduces the convolved sequence to a single vector.
gru_model.add(Bidirectional(GRU(128, activation = 'relu')))
gru_model.add(Dropout(0.2))

# Dense classification head ending in one softmax unit per class.
gru_model.add(Dense(128, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(16, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
gru_model.add(Dense(n_outputs, activation = 'softmax'))

# A softmax output trained against one-hot targets needs categorical
# cross-entropy; the binary form scores every output unit as an independent
# yes/no problem.
gru_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
import tensorflow as tf

# Stop once validation accuracy has gone 8 epochs without improving by at
# least 0.0005, and restore the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0005,
    patience=8,
    restore_best_weights=True,
)

In [None]:
# Pass the early-stopping callback to fit(); without it the callback has no
# effect and training always runs the full 300 epochs.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same samples later used for evaluation.
history = gru_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    verbose=0,
    callbacks=[early_stop],
)


In [None]:
# Plot the validation-loss curve over the training epochs

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(history.history['val_loss'], color='b', label="validation loss")
ax.set_title("Test Loss")
ax.set_xlabel("Number of Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
#evaluate the model
# Statements must start at column 0: a stray leading space makes the cell fail
# with "IndentationError: unexpected indent".
# evaluate() returns [loss, accuracy] because the model is compiled with
# metrics=['accuracy'].
train_acc = gru_model.evaluate(X_train, y_train, verbose=0)
test_acc = gru_model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Two stacked panels: loss on top, accuracy below, each with a train and a test curve.
fig, (loss_ax, acc_ax) = plt.subplots(nrows=2, ncols=1)

# plot loss during training
loss_ax.set_title('Loss')
loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='test')
loss_ax.legend()

# plot accuracy during training
acc_ax.set_title('Accuracy')
acc_ax.plot(history.history['accuracy'], label='train')
acc_ax.plot(history.history['val_accuracy'], label='test')
acc_ax.legend()

plt.show()

In [None]:
#generate and prepare the dataset
def get_data():
	"""Generate the two-class "circles" toy dataset and split it in half.

	Returns
	-------
	tuple
		``(X_train, y_train, X_test, y_test)``: the first 500 of the 1000
		generated samples form the training half, the rest the test half.
	"""
	# make_circles is not imported anywhere else in the notebook, so bring it
	# into scope here to keep the function self-contained.
	from sklearn.datasets import make_circles

	# generate dataset
	X, y = make_circles(n_samples=1000, noise=0.1, random_state=1)
	# split into train and test at a fixed row index
	n_train = 500
	X_train, X_test = X[:n_train, :], X[n_train:, :]
	y_train, y_test = y[:n_train], y[n_train:]
	return (X_train, y_train, X_test, y_test)

In [None]:
# define and fit the model
def get_model(X_train, y_train):
	"""Build, compile and fit a small binary classifier.

	Parameters
	----------
	X_train : array-like
		Training inputs with two features per sample (the first layer is
		declared with ``input_dim=2``).
	y_train : array-like
		Binary training targets, one per sample.

	Returns
	-------
	Sequential
		The fitted model.
	"""
	# define model
	# All layers go on the local model: adding them to the global gru_model
	# would alter that network and hand back an empty, untrained model.
	model = Sequential()
	model.add(Dense(100, input_dim=2, activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	# compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	# fit model
	model.fit(X_train, y_train, epochs=300, verbose=0)
	return model

In [None]:
#generate data
# NOTE(review): this rebinds X_train, y_train, X_test and y_test, replacing
# the sequence split created earlier in the notebook.
X_train, y_train, X_test, y_test = get_data()
# fit model
# get_model is the function that builds and fits a model; gru_model is a
# model object, not a factory that takes training data.
model = get_model(X_train, y_train)

In [None]:
#predict probabilities for test set
# Use the model fitted on the data returned by get_data(); gru_model was
# built for a different input shape.
yhat_probs = model.predict(X_test, verbose=0)
# predict crisp classes for test set
# Sequential.predict_classes() is no longer available in current Keras; for a
# single sigmoid output the equivalent is a 0.5 threshold on the probability.
yhat_classes = (yhat_probs > 0.5).astype("int32")

In [None]:
# Keep only the first (and, for a single-output model, only) column so both
# prediction arrays become one-dimensional.
yhat_probs, yhat_classes = yhat_probs[:, 0], yhat_classes[:, 0]

In [None]:
# The metric functions are not imported anywhere else in the notebook.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)