In [35]:
# RandomForestClassifier(n_estimators=1000, criterion='gini', max_depth=None,  bootstrap=True, n_jobs=-1, random_state=0)
# SVC(C=1.0, kernel='linear', tol=0.001, cache_size=2000, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False)
# NearestNeighbors(n_neighbors=10, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
# MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [1]:
import os
import re
import json
import exrex
import numpy
import Bio.SeqIO
import pandas as pd
from sklearn.svm import SVC
from joblib import Parallel, delayed
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [2]:
# Function to convert a degenerate k-mer into the list of concrete k-mers it stands for
def convert_degenerative_k_mers(k_mer):
	"""Expand a k-mer containing IUPAC ambiguity codes into plain A/C/G/T k-mers.

	Parameters
	----------
	k_mer : str
		Motif string made of upper-case IUPAC nucleotide codes.

	Returns
	-------
	list of str
		Every concrete k-mer matched by `k_mer`, one per combination of the
		bases allowed at each position. An input without any recognised
		symbol yields `[""]`.

	Notes
	-----
	Symbols that are not in the table below (lower-case letters, gaps, ...)
	contribute nothing to the output, so the generated k-mers are then
	shorter than the input.
	"""
	# IUPAC nucleotide codes mapped to the bases they stand for.
	# V, H, D and B each mean "any base except one" (not T, not G, not C,
	# not A); they are listed explicitly rather than as negated sets so that
	# only nucleotides are ever generated.
	iupac_bases = {
		"A": "A",
		"C": "C",
		"G": "G",
		"T": "T",
		"N": "ACGT",
		"X": "ACGT",
		"V": "ACG",
		"H": "ACT",
		"D": "AGT",
		"B": "CGT",
		"M": "AC",
		"W": "AT",
		"R": "AG",
		"K": "GT",
		"S": "CG",
		"Y": "CT",
	}

	# Grow the list of prefixes one position at a time (cartesian product)
	k_mers = [""]
	for symbol in k_mer:
		bases = iupac_bases.get(symbol)
		# Unrecognised symbols are skipped
		if bases is None:
			continue
		k_mers = [prefix + base for prefix in k_mers for base in bases]
	return k_mers

In [3]:
# Function to load training data from a fasta file
def loadDataTrain(file_path):
	"""Read a fasta file and return one entry per record.

	Parameters
	----------
	file_path : str
		Path of the fasta file handed to `Bio.SeqIO.parse`.

	Returns
	-------
	list of list
		`[description, sequence, label]` for records whose description
		contains a "|", where `label` is the text after the last "|" and
		`sequence` is the upper-cased sequence with every "N" removed.
		`[description, sequence]` (upper-cased, "N" kept) for records whose
		description has no "|".
	"""
	# Initialize the data matrix
	D = []
	# Iterate through the fasta file
	for record in Bio.SeqIO.parse(file_path, "fasta"):
		description = record.description
		sequence = str(record.seq.upper())
		# Position of the last separator, -1 when the description has none
		separator = description.rfind("|")
		if separator == -1:
			# If there is no class label, save id and sequence in the data list
			D.append([description, sequence])
		else:
			# If there is a class label, save id, sequence and class label in the data list
			D.append([description, sequence.replace('N', ''), description[separator + 1:]])
	# Return the data matrix
	return D

In [4]:
# Function to load test data from a fasta file
def loadDataTest(file_path, D_train):
	"""Read a fasta file and return the records that are not in the training data.

	Parameters
	----------
	file_path : str
		Path of the fasta file handed to `Bio.SeqIO.parse`.
	D_train : list of list
		Training entries; element 0 of each entry is the record description.
		WARNING: this list is emptied in place before the file is read.
		NOTE(review): presumably done to release memory - confirm that no
		caller needs the training data afterwards.

	Returns
	-------
	list of list
		`[description, sequence, label]` for records whose description
		contains a "|" (`label` is the text after the last "|"), and
		`[description, sequence]` for records without one. `sequence` is
		upper-cased. Records whose description appears in `D_train` are
		left out in both cases.
	"""
	# Descriptions of the training records, kept in a set for fast lookups
	train_ids = {d[0] for d in D_train}
	D_train.clear()
	# Initialize the data matrix
	D = []
	# Iterate through the fasta file
	for record in Bio.SeqIO.parse(file_path, "fasta"):
		description = record.description
		# Skip the records already used for training
		if description in train_ids:
			continue
		sequence = str(record.seq.upper())
		# Position of the last separator, -1 when the description has none
		separator = description.rfind("|")
		if separator == -1:
			# If there is no class label, save id and sequence in the data list
			D.append([description, sequence])
		else:
			# If there is a class label, save id, sequence and class label in the data list
			D.append([description, sequence, description[separator + 1:]])
	# Return the data matrix
	return D

In [5]:
# Function to compute the occurrence vector of a sequence
def computeSequenceVector(d, K, k):
	"""Count how often each k-mer of `K` occurs in one sequence.

	Parameters
	----------
	d : list
		Data entry `[id, sequence, label]`; the label may be missing.
	K : dict
		Its keys are the k-mers to count. `K` itself is not modified.
	k : int
		Length of the sliding window moved over the sequence.

	Returns
	-------
	list
		`[counts, label]` where `counts` lists the number of occurrences of
		each key of `K`, in the iteration order of `K`, and `label` is
		`d[2]`, or `None` when the entry has no label.
	"""
	# Initialize the dictionary with targets as keys and 0 as value
	x = dict.fromkeys(K, 0)
	sequence = d[1]
	# Go through the sequence, one window of length k at a time
	for i in range(len(sequence) - k + 1):
		k_mer = sequence[i:i + k]
		# Only the k-mers of K are counted, the others are ignored
		if k_mer in x:
			x[k_mer] += 1
	# Entries without a class label only hold id and sequence
	label = d[2] if len(d) > 2 else None
	# Return the vector and associated target
	return [list(x.values()), label]

In [6]:
# Function to build the samples matrix (X) and the target values (y)
def generateSamplesTargets(D, K, k):
	"""Vectorise every entry of `D` and split the results into samples and targets.

	Parameters
	----------
	D : list
		Data entries, each passed as first argument to `computeSequenceVector`.
	K : dict
		K-mers to count, forwarded to `computeSequenceVector`.
	k : int
		K-mer length, forwarded to `computeSequenceVector`.

	Returns
	-------
	tuple
		`(X, y)` as numpy arrays: `X` gathers element 0 and `y` element 1 of
		each `computeSequenceVector` result, in the order of `D`.
	"""
	# One job per entry, spread over all the available cores
	jobs = (delayed(computeSequenceVector)(entry, K, k) for entry in D)
	results = Parallel(n_jobs = -1)(jobs)
	# Split the [vector, target] pairs and convert to numpy array format
	samples = numpy.asarray([result[0] for result in results])
	targets = numpy.asarray([result[1] for result in results])
	return samples, targets

In [7]:
# Compute and save performance metrics for the motifs stored under MEME_ZOOPS/
# NOTE(review): the original header said "STREME" but every path read here is
# MEME_ZOOPS/<run>/<variant>/meme.txt - confirm which tool produced the motifs.
# NOTE(review): `data` is not used in this cell; it is kept in case another
# cell relies on it.
data = {}

# Length of the motifs used as features
K_MER_LENGTH = 9
# Text identifying a motif of that length on a "MOTIF" line of meme.txt;
# must be changed together with K_MER_LENGTH
MOTIF_WIDTH_TAG = "width =   9"
# Offset of the motif string on a stripped "MOTIF" line (skips "MOTIF ")
MOTIF_NAME_START = 6

# Define the list of variants
variants = ["Alpha", "Beta", "Delta", "Epsilon", "Eta", "Gamma", "Iota", "Kappa", "Lambda", "Omicron"]
# Iterate through the runs
for n in range(1, 101):
    # Collect the distinct k-mers found for every variant of this run
    k_mers = set()
    for variant in variants:
        with open("MEME_ZOOPS/" + str(n) + "/" + variant +  "/meme.txt") as file:
            for line in file:
                line = line.strip()
                # Only the motif header lines of the expected width are used
                if "MOTIF" not in line or MOTIF_WIDTH_TAG not in line:
                    continue
                k_mer = line[MOTIF_NAME_START:MOTIF_NAME_START + K_MER_LENGTH]
                if re.match('^[ACGT]+$', k_mer):
                    k_mers.add(k_mer)
                else:
                    # Degenerate motif: add every concrete k-mer it stands for
                    k_mers.update(convert_degenerative_k_mers(k_mer))

    # Sorted so that the feature columns have the same order on every execution
    # (the iteration order of a set of strings changes between Python processes)
    K = dict.fromkeys(sorted(k_mers), 0)
    D_train = loadDataTrain("data/SARS-CoV-2_train_" + str(n) + ".fasta")
    X_train, y_train = generateSamplesTargets(D_train, K, K_MER_LENGTH)

    # loadDataTest empties D_train, it must not be used after this call
    D_test = loadDataTest("data/SARS-CoV-2.fasta", D_train)
    X_test, y_test = generateSamplesTargets(D_test, K, K_MER_LENGTH)

    model = SVC(kernel = 'linear', C = 1, cache_size = 1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute the performance metrics
    f1 = f1_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    print(str(n) +") precision", precision, "recall", recall, "f1_score", f1)

    # Save the predictions; the context manager closes the file even if a write fails
    with open("MEME_ZOOPS/" + str(n) + "/prediction.csv", "w") as file:
        file.write("id,y_pred\n")
        for d, y in zip(D_test, y_pred):
            file.write(d[0] + "," + y + "\n")


1) precision 0.8138297493164192 recall 0.8117761905213605 f1_score 0.7237909561051737
2) precision 0.8205162903855114 recall 0.835295503237537 f1_score 0.8268446081517457
3) precision 0.9665179405367246 recall 0.9949846180101426 f1_score 0.9800767963958883
4) precision 0.9471666018970915 recall 0.9928995322566697 f1_score 0.9633432367467659
5) precision 0.8202493989539434 recall 0.9140218833285093 f1_score 0.8416836656385358




6) precision 0.9664725361116966 recall 0.992531959522928 f1_score 0.9787590297777988




7) precision 0.8974702682056422 recall 0.9804613234634825 f1_score 0.9072820835317662




8) precision 0.864810258427465 recall 0.9066798321185152 f1_score 0.860311262670878




9) precision 0.7815656711601977 recall 0.9047166363811515 f1_score 0.8048736013117352




10) precision 0.7686986072339083 recall 0.9112864187526422 f1_score 0.7926289697650754




11) precision 0.8187561689973759 recall 0.8730105529244993 f1_score 0.8355701451069718




12) precision 0.8724109347782534 recall 0.8384556506605142 f1_score 0.7742281356233081




13) precision 0.7157339570959704 recall 0.7528842386103134 f1_score 0.7231104797593607




14) precision 0.8184424302438311 recall 0.9942105865761415 f1_score 0.8565054002082633




15) precision 0.8529450550766657 recall 0.9957507595380172 f1_score 0.8832419242839078




16) precision 0.9005723845857647 recall 0.9041260017176821 f1_score 0.9012233945501027




17) precision 0.660755482087626 recall 0.7357537654921356 f1_score 0.6548496015689892




18) precision 0.8209549324359321 recall 0.8494338258883867 f1_score 0.8041755311071406




19) precision 0.780365094400317 recall 0.822170650340845 f1_score 0.7962611905327969




20) precision 0.8117217972122859 recall 0.8093246880717151 f1_score 0.8078512784778352




21) precision 0.7258545830970127 recall 0.8202992326064147 f1_score 0.6689595603712389




22) precision 0.8366845071790456 recall 0.8042251410283239 f1_score 0.7731314859099212
23) precision 0.8867939017496062 recall 0.9968675267022444 f1_score 0.9229708739553347
24) precision 0.830300764536822 recall 0.9022355280698922 f1_score 0.8367643671930237
25) precision 0.7348942962036085 recall 0.8957981212152575 f1_score 0.7601827809627869




26) precision 0.7981994919381695 recall 0.8140651287158747 f1_score 0.7198592084193198
27) precision 0.827362017466615 recall 0.9602782890368624 f1_score 0.827770128110668
28) precision 0.8967000216608698 recall 0.8960464837575255 f1_score 0.8961824614334525
29) precision 0.7455123976339857 recall 0.824384219750732 f1_score 0.7070761768188889
30) precision 0.8072602107167764 recall 0.7958946240325286 f1_score 0.7553403626205406




31) precision 0.8311081902023156 recall 0.8511710236094437 f1_score 0.7687814986189478
32) precision 0.8801772861096653 recall 0.9949905446064637 f1_score 0.9195338238680562
33) precision 0.7816347172187494 recall 0.8315552635648391 f1_score 0.7258473833291446
34) precision 0.8819177740325557 recall 0.9028729361439833 f1_score 0.8915465666880291
35) precision 0.8636278933002206 recall 0.9669913795619347 f1_score 0.8773345663138012
36) precision 0.809787872062475 recall 0.8773867490599926 f1_score 0.8022534703259374




37) precision 0.7848877944069951 recall 0.9087553242570021 f1_score 0.8165974049834246




38) precision 0.8027997446464429 recall 0.8053973348183892 f1_score 0.7338717657318143




39) precision 0.8743729350001965 recall 0.8999789778968829 f1_score 0.7960180678706454




40) precision 0.8176789736355372 recall 0.89265886026712 f1_score 0.7821426182143321




41) precision 0.8004785555223538 recall 0.800442366736615 f1_score 0.7996486218120294




42) precision 0.9173454210098386 recall 0.9825310415644355 f1_score 0.9238282493678887




43) precision 0.7043381504891044 recall 0.7053428623372473 f1_score 0.6059837621559703




44) precision 0.7830585540217043 recall 0.8230797344753149 f1_score 0.7311506130157004




45) precision 0.8593619515549464 recall 0.9039304382996219 f1_score 0.8772116860110616




46) precision 0.8719728103942925 recall 0.913891370015163 f1_score 0.8293971009227246




47) precision 0.8769377586987084 recall 0.9191965446713641 f1_score 0.8914210526471141




48) precision 0.8471554151249656 recall 0.9069441306429435 f1_score 0.8645225874862754




49) precision 0.8794567992846982 recall 0.9594438101585488 f1_score 0.8709551245952788




50) precision 0.8601834971540541 recall 0.9297742911422224 f1_score 0.883292731517941




51) precision 0.871718901204769 recall 0.9930445844826167 f1_score 0.8987839124651144




52) precision 0.8239954740155699 recall 0.8566608686314743 f1_score 0.7772856297827758




53) precision 0.9076058291671119 recall 0.9906961732843452 f1_score 0.9245987661070917




54) precision 0.8162437311865144 recall 0.9096584508330382 f1_score 0.8395665264002485




55) precision 0.8720669262169259 recall 0.9078749694182532 f1_score 0.840873345272931




56) precision 0.7596215284946616 recall 0.7865652229302666 f1_score 0.768798329832207




57) precision 0.9014217517766007 recall 0.972336593832526 f1_score 0.9277479252382255




58) precision 0.903492767043649 recall 0.9946139904848812 f1_score 0.9298360453751243




59) precision 0.4356303799715981 recall 0.5163742221328003 f1_score 0.4335391821354303




60) precision 0.83948983347682 recall 0.8861218931602888 f1_score 0.8008102681340492




61) precision 0.8775490071372166 recall 0.9123811223330056 f1_score 0.8185893058320017




62) precision 0.9403699815504714 recall 0.9958584354307728 f1_score 0.9574900965573377




63) precision 0.907018759078354 recall 0.9157301701151134 f1_score 0.867174070072329




64) precision 0.8359162450275344 recall 0.9089751991331282 f1_score 0.8522908694142002




65) precision 0.8947927587663248 recall 0.9143951759846172 f1_score 0.9036345226006761




66) precision 0.7883350983535905 recall 0.8481007778892943 f1_score 0.8088131132551858




67) precision 0.7980762853186409 recall 0.85021141561409 f1_score 0.8004060930340623




68) precision 0.8547871331655322 recall 0.8418602458858155 f1_score 0.8208646455111823




69) precision 0.7503052862932582 recall 0.9090425639730361 f1_score 0.7708927501184073




70) precision 0.7599052931684984 recall 0.8759182226753783 f1_score 0.7136420453133742




71) precision 0.7789125512749004 recall 0.8175241984114431 f1_score 0.7927747047033502




72) precision 0.810909771590578 recall 0.8981948693217076 f1_score 0.8055765101943099




73) precision 0.8258627131854412 recall 0.9089180032201896 f1_score 0.8335095405512514




74) precision 0.8355484015449536 recall 0.8226608529467697 f1_score 0.7389445620587605




75) precision 0.9430561394236314 recall 0.9950635097446355 f1_score 0.9633103455766931




76) precision 0.629321785109189 recall 0.7425008046967769 f1_score 0.6191042584326862




77) precision 0.8891828569509828 recall 0.9331683948049605 f1_score 0.8547797579474448




78) precision 0.83597599369058 recall 0.9053424415245951 f1_score 0.858430089451067




79) precision 0.80612900926159 recall 0.8284429822227167 f1_score 0.7297428906132231




80) precision 0.8095009158500142 recall 0.8098445735747946 f1_score 0.8073029786765338




81) precision 0.9519394671149582 recall 0.9247804487872339 f1_score 0.9250328809046915




82) precision 0.8678462466896535 recall 0.8071982089937197 f1_score 0.7226516708064251




83) precision 0.819396209581624 recall 0.8237891440625434 f1_score 0.8146691447260052




84) precision 0.8306642478200239 recall 0.8572321935517067 f1_score 0.782448462628907




85) precision 0.8897786354901133 recall 0.9019006131726861 f1_score 0.8944499918017893




86) precision 0.7911035927660799 recall 0.8252562990435395 f1_score 0.7206436984794562




87) precision 0.9090116373260665 recall 0.9207286441382422 f1_score 0.9142250816995368




88) precision 0.7414236796066979 recall 0.8380330371133207 f1_score 0.7088344706639023




89) precision 0.8912395863710636 recall 0.8958104886228172 f1_score 0.8921201381581767




90) precision 0.8013950346702561 recall 0.8255821535970289 f1_score 0.7240314766004718




91) precision 0.8912868431879278 recall 0.9895380513760909 f1_score 0.912880076977768




92) precision 0.7900103243122355 recall 0.7651934109841629 f1_score 0.7639408759595144




93) precision 0.8567374761702684 recall 0.9367820617900586 f1_score 0.871818854419162




94) precision 0.7111309481274761 recall 0.7371970521095793 f1_score 0.6963575630267812




95) precision 0.7989481196464352 recall 0.9003381654545966 f1_score 0.8130305890269829




96) precision 0.938439441153488 recall 0.9962693838301521 f1_score 0.9539072385659665




97) precision 0.8833583953613549 recall 0.9932175964420006 f1_score 0.912642855488303




98) precision 0.7771852837256192 recall 0.9075195701938604 f1_score 0.8010711254197013
99) precision 0.7250626219653056 recall 0.8203432869672989 f1_score 0.7427327844675851




100) precision 0.9015387258591085 recall 0.9768536047937811 f1_score 0.8954995283769
