# Import all required libraries


In [None]:
from pandas import read_excel
import numpy as np
import random
from scipy.signal import stft
from scipy.stats import ttest_ind
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


# Read the sequences from Excel and convert them to their EIIP numeric representation


In [None]:
#Read data from excel
def EIIP(data_, length=None):
    """Convert nucleotide sequences to their EIIP numeric representation.

    Parameters
    ----------
    data_ : sequence of rows
        ``data_[i][0]`` must be the i-th nucleotide sequence (a string).
    length : int, optional
        Number of positions to encode per sequence.  When omitted, the
        module-level ``n`` is used, which keeps the original one-argument
        call ``EIIP(seq)`` working unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data_), length)``.  'A', 'C', 'G' and 'T' are
        mapped to their EIIP values; any other character, and any position
        beyond the end of a shorter sequence, is left at 0.
    """
    eiip_values = {'A': 0.1260, 'C': 0.1340, 'G': 0.0806, 'T': 0.1335}
    if length is None:
        length = n  # fall back to the module-level sequence length
    row = len(data_)
    data = np.zeros([row, length])
    for i in range(row):
        # Slicing caps long sequences at `length`; short ones stay zero-padded.
        for j, char in enumerate(data_[i][0][:length]):
            data[i, j] = eiip_values.get(char, 0.0)
    return data

# Load the sequences from the Excel file (no header row; EIIP reads column 0 of each row)
seq = np.array(
    read_excel('D:\\OneDrive\\collage...D\\G.P\\Data\\Databsae\\filtered_data.xlsx', header=None)
)
# dataset dimensions: m sequences, n encoded positions per sequence
m, n = 1156, 26592
# labels as an (m, 1) column: first half 1, second half 0
# ('Other Human Coronavirus Types'= 0 , 'COVID-19'= 1)
target = np.array([np.full(int(m / 2), 1), np.full(int(m / 2), 0)]).reshape([m, 1])
# original position of every sample in the full dataset
IDX = np.arange(m)
# numeric (EIIP) representation of every sequence
all_data = EIIP(seq)


# Apply STFT feature extraction


In [None]:
# STFT feature extraction
# Pre-allocated feature matrix: m rows, 32 columns for each of the int(n/256) slices
STFT_= np.zeros( (m, 32*int(n/256)) ) # 32 features extracted from each slice
def shortTFT(all_data, STFT_, m, n):
    """Fill ``STFT_`` with STFT-magnitude features for the first ``m`` samples.

    Each row of ``all_data`` is cut into ``int(n/256)`` consecutive,
    non-overlapping slices of 256 points (a trailing remainder is ignored).
    Every slice is mean-centred and passed to ``scipy.signal.stft``; the
    magnitudes are averaged over axis 0 of the returned matrix, and the
    per-slice vectors are concatenated into row ``i`` of ``STFT_``.

    Parameters
    ----------
    all_data : numpy.ndarray
        2-D array holding at least ``m`` rows and ``n`` columns.
    STFT_ : numpy.ndarray
        Pre-allocated output; row ``i`` must be able to hold the concatenated
        features of sample ``i``.  It is modified in place.
    m : int
        Number of samples (rows) to process.
    n : int
        Number of points per sample.

    Returns
    -------
    numpy.ndarray
        ``STFT_`` itself, after being filled.
    """
    n_slices = int(n / 256)
    if n_slices == 0:
        # Not even one full slice: there is nothing to extract.
        return STFT_
    for i in range(m):
        features = []
        for j in range(n_slices):
            rig = all_data[i, j*256: (j+1)*256]
            conc = rig - np.mean(rig)  # remove the slice mean before the transform
            stft_ = np.abs(stft(conc, nperseg=7, nfft=8, window='hamming', return_onesided=False, noverlap=-1, padded=False)[2])
            features.append(np.mean(stft_, axis=0))
        # Store the row once all slices are processed, whatever n is
        # (the row used to be written only when the slice index was 102).
        STFT_[i] = np.concatenate(features)
    return STFT_
# Compute the STFT features of every encoded sequence (shortTFT fills and returns STFT_)
STFT= shortTFT(all_data,STFT_ , m,n)


# Select 15% random samples for testing

In [None]:
# fix the RNG seed so the same test samples are drawn on every run
seed = 5
random.seed(seed)
# draw 176 distinct sample indices for the test set, in ascending order
x = np.sort(random.sample(range(m), 176), axis=0)
# raw features, STFT features and labels of the test samples
testing, testing_STFT, testing_label = all_data[x], STFT[x], target[x]


# Remove obtained testing data from (training & validation) data


In [None]:
# drop the rows chosen for testing; what remains is the training/validation pool
# (x holds distinct indices, so deleting them in one call removes exactly those rows)
data = np.delete(all_data, x, axis=0)
data_STFT = np.delete(STFT, x, axis=0)
data_label = np.delete(target, x, axis=0)
# IDX keeps the original dataset position of every remaining sample
IDX = np.delete(IDX, x)


# Collect the training indices of each cross-validation fold


In [None]:
# 5-fold stratified splitter (shuffled, fixed seed); keep the training indices of every fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
indices = [train_idx for train_idx, _ in kf.split(data, data_label)]


# Score every classifier with 5-fold cross-validation and keep the training data of the best-scoring fold


In [None]:
# classifiers to evaluate, keyed by display name (row order of the score tables)
classifiers = {
    'KNN-3': KNeighborsClassifier(n_neighbors=3, weights='distance'),
    'KNN-5': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'KNN-7': KNeighborsClassifier(n_neighbors=7, weights='distance'),
    'LSVM': SVC(kernel='linear'),
    'GSVM': SVC(kernel='rbf'),
    'Decision tree': DecisionTreeClassifier(),
    'Random forest-11': RandomForestClassifier(n_estimators=11),
    'Random forest-51': RandomForestClassifier(n_estimators=51),
    'Random forest-101': RandomForestClassifier(n_estimators=101),
}

# one row per classifier, one column per fold
numeric_scores = np.zeros((9, 5))
STFT_scores = np.zeros((9, 5))
for count, (i, k) in enumerate(classifiers.items()):
    # per-fold scores on the numeric data, then on the STFT data
    numeric_scores[count] = cross_val_score(k, data, data_label, cv=kf)
    STFT_scores[count] = cross_val_score(k, data_STFT, data_label, cv=kf)

# average every fold over all classifiers and pick the fold with the highest mean
best_numeric = np.argmax(numeric_scores.mean(axis=0))
best_STFT = np.argmax(STFT_scores.mean(axis=0))
# training portion of the winning fold for each representation
numeric_data = data[indices[best_numeric]]
STFT_data = data_STFT[indices[best_STFT]]


# Apply the T-test feature selection (with p-value < 0.01)


In [None]:
def selection(data_, IDX, n_samples=None):
    """Keep the features whose two-sample Welch t-test p-value is below 0.01.

    The rows of ``data_`` are split into two groups by their original dataset
    index: indices below ``n_samples / 2`` form the first group, all others
    the second.  The index equal to ``n_samples / 2`` now belongs to the
    second group; it used to be left out of both.

    Parameters
    ----------
    data_ : numpy.ndarray
        2-D array, one row per sample and one column per feature.
    IDX : array-like of int
        Original dataset index of every row of ``data_``.
    n_samples : int, optional
        Size of the full dataset.  When omitted, the module-level ``m`` is
        used, so existing two-argument calls behave as before.

    Returns
    -------
    tuple
        ``(data_[:, cols], cols)`` where ``cols`` are the indices of the
        selected features, ordered by increasing p-value.
    """
    if n_samples is None:
        n_samples = m  # fall back to the module-level dataset size
    half = n_samples / 2
    IDX = np.asarray(IDX)
    first_group = data_[IDX < half]
    second_group = data_[IDX >= half]
    _, p_values = ttest_ind(first_group, second_group, equal_var=False)
    p_values_idx = np.argsort(p_values)
    # NaN p-values compare False here and are sorted last, so they are never kept.
    alpha = np.count_nonzero(p_values < 0.01)
    chosen = p_values_idx[:alpha]
    return data_[:, chosen], chosen

# Run the feature selection on the best-fold training data of each representation,
# then keep the same feature columns (indx / indx_STFT) in the matching test set
numeric_data_selected, indx= selection(numeric_data, IDX[indices[best_numeric]] )
testing_selected= testing[:,indx]
STFT_data_selected, indx_STFT= selection(STFT_data, IDX[indices[best_STFT] ]  )
testing_STFT_selected= testing_STFT[:, indx_STFT]


# Performance Metrics evaluation function


In [None]:
def metrics(y_true, y_predict, name, count, Print= False):
    """Compute binary-classification metrics with class 1 as the positive class.

    The confusion counts are taken directly from the labels, so they do not
    depend on the cell order of a confusion matrix.  (The previous unpacking
    of ``confusion_matrix(...).ravel()`` swapped true positives and true
    negatives.)

    Parameters
    ----------
    y_true, y_predict : array-like
        True and predicted labels (0 or 1); both are flattened, so a column
        vector is accepted.
    name : str
        Classifier name, used only in the printed report.
    count : int
        Index into the list of test-set names, used only in the printed report.
    Print : bool, optional
        When true, print the report.

    Returns
    -------
    tuple
        ``(acc, sp, rec, pre, f_sc, t_p, t_n, f_p, f_n)``.  A ratio whose
        denominator is zero is reported as 0.0.
    """
    tests = ['testing', 'testing_selected', 'testing_STFT', 'testing_STFT_selected']

    def ratio(num, den):
        # Guard against empty classes or empty predictions.
        return num / den if den else 0.0

    # Flatten both so an (n, 1) label column does not broadcast against (n,).
    y_true = np.asarray(y_true).ravel()
    y_predict = np.asarray(y_predict).ravel()

    t_p = int(np.count_nonzero((y_true == 1) & (y_predict == 1)))
    t_n = int(np.count_nonzero((y_true == 0) & (y_predict == 0)))
    f_p = int(np.count_nonzero((y_true == 0) & (y_predict == 1)))
    f_n = int(np.count_nonzero((y_true == 1) & (y_predict == 0)))

    acc = ratio(int(np.count_nonzero(y_true == y_predict)), y_true.size)  # accuracy
    pre = ratio(t_p, t_p + f_p)                                           # precision
    rec = ratio(t_p, t_p + f_n)                                           # recall (sensitivity)
    f_sc = ratio(2 * pre * rec, pre + rec)                                # F-1 score
    sp = ratio(t_n, t_n + f_p)                                            # specificity
    if Print:
        print('\n%s\n%s: \nAccuracy= %f\nSpecificity= %f\nSensitivity: %f' %(name, tests[count], acc, sp, rec))
        print('Precision= %f\nF_scores= %f\nT_p= %d\nT_n= %d\nF_p= %d\nF_n= %d' %(pre, f_sc,t_p, t_n, f_p, f_n))
    return acc, sp, rec, pre, f_sc, t_p, t_n, f_p, f_n


# Call the algorithms, fit, predict, and evaluate the performance


In [None]:
# test sets and, for each, the fold whose training indices supply the labels
TESTS = [testing, testing_selected, testing_STFT, testing_STFT_selected]
IND = [best_numeric, best_numeric, best_STFT, best_STFT]
# training sets, in the same order as their matching test sets in TESTS:
#   numeric_data          --> testing
#   numeric_data_selected --> testing_selected
#   STFT_data             --> testing_STFT
#   STFT_data_selected    --> testing_STFT_selected
train_sets = [numeric_data, numeric_data_selected, STFT_data, STFT_data_selected]

# classifiers to evaluate, keyed by the name printed in the report
models = {
    'KNN-3': KNeighborsClassifier(n_neighbors=3, weights='distance'),
    'KNN-5': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    'KNN-7': KNeighborsClassifier(n_neighbors=7, weights='distance'),
    'LSVM': SVC(kernel='linear'),
    'GSVM': SVC(kernel='rbf'),
    'Decision tree': DecisionTreeClassifier(),
    'Random forest-11': RandomForestClassifier(n_estimators=11),
    'Random forest-51': RandomForestClassifier(n_estimators=51),
    'Random forest-101': RandomForestClassifier(n_estimators=101),
}

for i, k in models.items():
    for count, j in enumerate(train_sets):
        # labels of the training rows, looked up through their original dataset index
        fold_labels = target[IDX[indices[IND[count]]]]
        prediction = k.fit(j, fold_labels).predict(TESTS[count])
        metrics(testing_label, prediction, i, count, True)
