In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import KFold
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import datetime

In [2]:
def seq_encoding(path, outfile):
    """Load labelled records from every file in ``path`` and save them shuffled.

    Each non-blank line is stripped and split on commas; empty fields are
    dropped, the first remaining field is kept as the sample and the second
    as its integer label.

    Args:
        path: Directory whose regular files are read. Sub-directories are
            skipped (they are not descended into).
        outfile: File that receives the shuffled matrix via ``np.savetxt``
            with ``fmt='%s'``.

    Returns:
        A 2-column NumPy array of ``[sample, label]`` rows, shuffled in place
        with ``np.random.shuffle``. Both columns share one array dtype because
        they are combined with ``np.column_stack``.

    Raises:
        IndexError: If a non-blank line has fewer than two non-empty fields.
        ValueError: If a label cannot be converted with ``int``.
    """
    dataList = []
    labelList = []

    for file in os.listdir(path):
        full_path = os.path.join(path, file)
        # Test the joined path: a bare file name would be resolved against the
        # current working directory instead of ``path``.
        if os.path.isdir(full_path):
            continue
        # The context manager closes every file, not only the last one opened.
        with open(full_path, 'r') as f:
            for line in f:
                # separate the sequence and the label
                fields = [field for field in line.strip().split(',') if field != '']
                if not fields:
                    continue  # blank line: nothing to record
                dataList.append(fields[0])
                labelList.append(int(fields[1]))

    dataMat = np.array(dataList)      # X column
    labelMat = np.asarray(labelList)  # y column
    # Combine the feature column and the label column into one matrix
    window_Mat = np.column_stack((dataMat, labelMat))
    np.random.shuffle(window_Mat)

    np.savetxt(outfile, window_Mat, fmt='%s')
    return window_Mat

In [3]:
def splitDataSetbyKFold(window_Mat,split_size,outdir):
    """Split ``window_Mat`` into K folds and dump every fold to text files.

    For fold number ``k`` (counted from 1) the training rows are written to
    ``<outdir>/train_<k>.txt`` and the held-out rows to
    ``<outdir>/test_<k>.txt``, tab-separated with ``fmt="%s"``.

    Args:
        window_Mat: Row container that is handed to ``KFold.split`` and then
            indexed with the integer positions it yields.
        split_size: Number of folds, passed as ``KFold(n_splits=...)``.
        outdir: Output directory; created when it does not exist yet.

    Returns:
        ``(train_all, test_all)``: two lists with one entry per fold, each
        entry being a list of rows and each row a plain ``list``.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    def _dump_fold(row_indices, prefix, fold_no):
        # Materialise the selected rows, write them out, and hand them back.
        rows = [list(window_Mat[idx]) for idx in row_indices]
        np.savetxt(outdir + prefix + str(fold_no) + '.txt', np.array(rows), fmt="%s", delimiter='\t')
        return rows

    train_all = []
    test_all = []
    folds = KFold(n_splits=split_size).split(window_Mat)
    for fold_no, (train_index, test_index) in enumerate(folds, start=1):
        # Train file is written before the test file of the same fold.
        train_all.append(_dump_fold(train_index, "/train_", fold_no))
        test_all.append(_dump_fold(test_index, "/test_", fold_no))

    return train_all, test_all

In [4]:
def performance(labelArr, predictArr):
    """Compute binary sensitivity and specificity from labels and predictions.

    Only the values 1 (positive) and 0 (negative) are counted; any pair that
    contains another value contributes to none of the four counters.

    Args:
        labelArr: Iterable of actual labels.
        predictArr: Iterable of predicted labels, paired with ``labelArr``
            position by position.

    Returns:
        ``(SN, SP)`` where ``SN = TP / (TP + FN)`` and ``SP = TN / (TN + FP)``.
        Each metric is computed independently; a metric whose denominator is
        zero (no samples of that class) is reported as ``0.0`` instead of
        raising or dragging the other metric to zero.

    Side effects:
        Prints TP, FN, TN and FP (in that order), one per line.
    """
    TP = 0.0; TN = 0.0; FP = 0.0; FN = 0.0
    for actual, predicted in zip(labelArr, predictArr):
        if actual == 1 and predicted == 1:
            TP += 1.0
        elif actual == 1 and predicted == 0:
            FN += 1.0
        elif actual == 0 and predicted == 1:
            FP += 1.0
        elif actual == 0 and predicted == 0:
            TN += 1.0
    print(TP)
    print(FN)
    print(TN)
    print(FP)

    positives = TP + FN  # P: number of actual positives
    negatives = TN + FP  # N: number of actual negatives
    # Guard each denominator on its own: a perfect prediction (FP + FN == 0)
    # must yield 1.0, not 0, and a missing class must not raise.
    SN = TP / positives if positives > 0 else 0.0  # Sensitivity = TP / P
    SP = TN / negatives if negatives > 0 else 0.0  # Specificity = TN / N

    return SN,SP

In [5]:
def classifier(clf,clfname,train_X, train_y, test_X, test_y,i,
               model_dir="/home/fyp1920/Desktop/coding/ML_Model"):
    """Fit ``clf`` on one fold, evaluate it on the held-out part and save it.

    Args:
        clf: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        clfname: Short name used in the saved model's file name.
        train_X, train_y: Training feature matrix and label vector.
        test_X, test_y: Held-out feature matrix and label vector.
        i: Fold index, appended to the saved model's file name.
        model_dir: Directory that receives the dumped model. Defaults to the
            location that used to be hard-coded; it is created when missing.

    Returns:
        ``(ACC, SN, SP, MCC, AUC)``. ``AUC`` is always ``0`` because the
        ``roc_auc_score`` computation is disabled.

    Side effects:
        * Prints progress messages and a ``classification_report``.
        * Writes ``proba.data``, ``test_y.data`` and ``predict.data`` into the
          current working directory.
        * Changes the working directory to ``model_dir`` and leaves it there;
          the model is dumped as ``train_<clfname><i>.model`` in that directory.
    """
    # train with train set
    print(" training begin...")
    clf = clf.fit(train_X,train_y)
    print(" training end.")
    #==========================================================================
    # test with validation set
    print(" test begin.")
    predict_ = clf.predict(test_X)
    # NOTE(review): only column 1 of the probability matrix is kept; with more
    # than two classes this is the probability of a single class only.
    proba = clf.predict_proba(test_X)[:,1]

    # Report: reuse the predictions computed above instead of predicting again.
    sk_report = classification_report(
        digits=6,
        y_true=test_y,
        y_pred=predict_)
    print(sk_report)

    print(" test end.")

    #==========================================================================
    # Model evaluation
    ACC = accuracy_score(test_y, predict_)
    SN, SP = performance(test_y, predict_)
    MCC = matthews_corrcoef(test_y, predict_)
    #AUC = roc_auc_score(test_y, proba)
    AUC = 0

    #==========================================================================
    # save output (into the current working directory)
    np.savetxt("proba.data",proba,fmt="%f",delimiter="\t")
    np.savetxt("test_y.data",test_y,fmt="%f",delimiter="\t")
    np.savetxt("predict.data",predict_,fmt="%f",delimiter="\t")
    print("Wrote results to output.data...EOF...")
    # ==========================================================================
    # save Model
    os.makedirs(model_dir, exist_ok=True)
    # The working directory is deliberately left at model_dir, as before.
    os.chdir(model_dir)

    joblib.dump(clf,'train_'+clfname+str(i)+'.model')
    return ACC,SN,SP,MCC,AUC

In [6]:
def mean_fun(onelist):
    """Return the arithmetic mean of the values in ``onelist`` as a float.

    Used to average per-fold metrics (mainly ACC, SP and SN) when evaluating
    the model.
    """
    return float(sum(onelist) / len(onelist))

In [7]:
def _split_features_labels(rows):
    """Split rows into ``(X, y)``: all but the last column as floats, the last column as int."""
    features = [[float(value) for value in row[:-1]] for row in rows]
    labels = [int(row[-1]) for row in rows]
    return features, labels


def crossValidation(clf, clfname, curdir, train_all, test_all):
    """Run ``classifier`` on every fold and log the averaged metrics.

    Args:
        clf: Estimator forwarded to ``classifier`` for every fold.
        clfname: Classifier name; used for the result folder names
            ``<clfname>/<clfname>_00<i>`` created below ``curdir``.
        curdir: Base working directory. It is resolved to an absolute path
            once, so re-entering it on later rounds works even when a
            relative path was given.
        train_all: One list of rows per fold; in each row the last element is
            the label and the preceding elements are the features.
        test_all: Held-out rows per fold, same layout as ``train_all``.

    Returns:
        ``(ACC_mean, SN_mean, SP_mean, MCC_mean, AUC_mean)`` averaged over
        all folds.

    Side effects:
        Changes the process working directory (it ends at ``curdir``), creates
        the per-fold result folders, and appends a timestamped summary to
        ``log.out`` inside ``curdir``.
    """
    cur_path = os.path.abspath(curdir)
    os.chdir(cur_path)
    ACCs = [];SNs = [];SPs = [];MCCs = [];AUCs = []

    for i in range(len(train_all)):
        print('----- Round ', i, ' -----' )
        print('Start Time: ', datetime.datetime.now())

        # classifier() changes the working directory, so return to the base
        # directory at the start of every round.
        os.chdir(cur_path)
        train_X, train_y = _split_features_labels(train_all[i])
        test_X, test_y = _split_features_labels(test_all[i])
        # ======================================================================
        # classifier start here
        out_path = clfname + "/" + clfname + "_00" + str(i)  # the folder that save result of each fold
        os.makedirs(out_path, exist_ok=True)
        os.chdir(out_path)
        ACC, SN, SP, MCC, AUC = classifier(clf, clfname, train_X, train_y, test_X, test_y,i)
        ACCs.append(ACC)
        SNs.append(SN)
        SPs.append(SP)
        MCCs.append(MCC)
        AUCs.append(AUC)

        print('End Time: ', datetime.datetime.now())
        print('---------------')
        print('')

    # ======================================================================
    ACC_mean = mean_fun(ACCs)
    SN_mean = mean_fun(SNs)
    SP_mean = mean_fun(SPs)
    MCC_mean = mean_fun(MCCs)
    AUC_mean = mean_fun(AUCs)
    # ==========================================================================
    # output experiment result
    # Go back to the base directory before logging, so log.out does not end up
    # in whichever directory classifier() switched to last.
    os.chdir(cur_path)
    # Append with plain file I/O rather than shell `echo`, so the estimator
    # repr is never interpreted by a shell.
    with open("log.out", "a") as log_file:
        log_file.write(datetime.datetime.now().ctime() + " " + str(clf) + "\n")
        log_file.write("ACC_mean=" + str(ACC_mean) + "\n")
        log_file.write("SN_mean=" + str(SN_mean) + "\n")
        log_file.write("SP_mean=" + str(SP_mean) + "\n")
        log_file.write("MCC_mean=" + str(MCC_mean) + "\n")
        log_file.write("AUC_mean=" + str(AUC_mean) + "\n")

    return ACC_mean, SN_mean, SP_mean, MCC_mean, AUC_mean

In [8]:
if __name__ == '__main__':
    # --- input / output locations -------------------------------------------
    path = 'PTB_LABELED_DATASET'
    outfile = 'seq_encoded.txt'
    outdir = 'KFold'
    a = []

    # --- classifier configuration -------------------------------------------
    clfname = 'SVM'
    curdir = '/home/fyp1920/Desktop/coding'
    clf = svm.SVC(C=1, kernel='rbf', gamma=0.5, probability=True)

    # Load the labelled records, shuffle them and save the combined matrix.
    window_Mat = seq_encoding(path=path, outfile=outfile)

    # Split the matrix into 100 folds and dump each fold to `outdir`.
    train_all, test_all = splitDataSetbyKFold(
        window_Mat=window_Mat,
        split_size=100,
        outdir=outdir,
    )

    print("Generate dataset end and cross validation start")

    # Train and evaluate the classifier on every fold.
    crossValidation(
        clf=clf,
        clfname=clfname,
        curdir=curdir,
        train_all=train_all,
        test_all=test_all,
    )



Generate dataset end and cross validation start
----- Round  0  -----
Start Time:  2020-05-27 01:49:31.258968
 training begin...
 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.535714  0.625000  0.576923        48
           2   0.671875  0.843137  0.747826        51
           3   0.609756  0.500000  0.549451        50
           4   0.547945  0.800000  0.650407        50
           5   0.769231  0.192308  0.307692        52
           6   0.645833  0.632653  0.639175        49
           7   0.836735  0.719298  0.773585        57
           8   0.853659  0.700000  0.769231        50
           9   0.461538  0.697674  0.555556        43

    accuracy                       0.633333       450
   macro avg   0.659143  0.634452  0.618872       450
weighted avg   0.666075  0.633333  0.621308       450

 test end.
30.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 01:50:36.919614
---------------

----- Round  1  -

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.766667  0.779661  0.773109        59
           2   0.839286  0.839286  0.839286        56
           3   0.531250  0.693878  0.601770        49
           4   0.620000  0.738095  0.673913        42
           5   0.923077  0.235294  0.375000        51
           6   0.708333  0.809524  0.755556        42
           7   0.901961  0.836364  0.867925        55
           8   0.764706  0.787879  0.776119        33
           9   0.608108  0.714286  0.656934        63

    accuracy                       0.713333       450
   macro avg   0.740376  0.714918  0.702179       450
weighted avg   0.742857  0.713333  0.702217       450

 test end.
46.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:01:01.138812
---------------

----- Round  10  -----
Start Time:  2020-05-27 02:01:01.139093
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.595745  0.682927  0.636364        41
           2   0.719298  0.891304  0.796117        46
           3   0.553191  0.541667  0.547368        48
           4   0.636364  0.792453  0.705882        53
           5   0.857143  0.235294  0.369231        51
           6   0.762712  0.750000  0.756303        60
           7   0.850000  0.739130  0.790698        46
           8   0.895833  0.843137  0.868687        51
           9   0.583333  0.777778  0.666667        54

    accuracy                       0.695556       450
   macro avg   0.717069  0.694854  0.681924       450
weighted avg   0.719018  0.695556  0.682848       450

 test end.
28.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:10:44.561760
---------------

----- Round  19  -----
Start Time:  2020-05-27 02:10:44.561869
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.633333  0.730769  0.678571        52
           2   0.750000  0.913043  0.823529        46
           3   0.630435  0.557692  0.591837        52
           4   0.738462  0.872727  0.800000        55
           5   0.625000  0.222222  0.327869        45
           6   0.804348  0.685185  0.740000        54
           7   0.853659  0.729167  0.786517        48
           8   0.760870  0.777778  0.769231        45
           9   0.567568  0.792453  0.661417        53

    accuracy                       0.702222       450
   macro avg   0.707075  0.697893  0.686552       450
weighted avg   0.705971  0.702222  0.689069       450

 test end.
38.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:20:20.202813
---------------

----- Round  28  -----
Start Time:  2020-05-27 02:20:20.202950
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.698413  0.733333  0.715447        60
           2   0.745098  0.844444  0.791667        45
           3   0.545455  0.666667  0.600000        45
           4   0.641509  0.723404  0.680000        47
           5   0.933333  0.280000  0.430769        50
           6   0.700000  0.686275  0.693069        51
           7   0.963636  0.828125  0.890756        64
           8   0.826087  0.826087  0.826087        46
           9   0.483871  0.714286  0.576923        42

    accuracy                       0.702222       450
   macro avg   0.726378  0.700291  0.689413       450
weighted avg   0.738872  0.702222  0.696969       450

 test end.
44.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:29:57.377047
---------------

----- Round  37  -----
Start Time:  2020-05-27 02:29:57.377348
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.706897  0.732143  0.719298        56
           2   0.811321  0.843137  0.826923        51
           3   0.552239  0.587302  0.569231        63
           4   0.593220  0.744681  0.660377        47
           5   0.571429  0.190476  0.285714        42
           6   0.804878  0.647059  0.717391        51
           7   0.787234  0.804348  0.795699        46
           8   0.880000  0.862745  0.871287        51
           9   0.491803  0.697674  0.576923        43

    accuracy                       0.684444       450
   macro avg   0.688780  0.678841  0.669205       450
weighted avg   0.690945  0.684444  0.675079       450

 test end.
41.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:39:30.478934
---------------

----- Round  46  -----
Start Time:  2020-05-27 02:39:30.479049
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.644444  0.805556  0.716049        36
           2   0.784615  0.910714  0.842975        56
           3   0.568627  0.659091  0.610526        44
           4   0.770492  0.758065  0.764228        62
           5   0.950000  0.333333  0.493506        57
           6   0.809524  0.790698  0.800000        43
           7   0.972973  0.818182  0.888889        44
           8   0.781818  0.843137  0.811321        51
           9   0.621622  0.807018  0.702290        57

    accuracy                       0.742222       450
   macro avg   0.767124  0.747310  0.736643       450
weighted avg   0.771120  0.742222  0.733952       450

 test end.
29.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:49:04.423948
---------------

----- Round  55  -----
Start Time:  2020-05-27 02:49:04.424101
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.660000  0.702128  0.680412        47
           2   0.666667  0.883721  0.760000        43
           3   0.627119  0.755102  0.685185        49
           4   0.611111  0.897959  0.727273        49
           5   0.785714  0.211538  0.333333        52
           6   0.756757  0.700000  0.727273        40
           7   0.869565  0.666667  0.754717        60
           8   0.882353  0.789474  0.833333        57
           9   0.609375  0.735849  0.666667        53

    accuracy                       0.700000       450
   macro avg   0.718740  0.704715  0.685355       450
weighted avg   0.725005  0.700000  0.685356       450

 test end.
33.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 02:58:40.852313
---------------

----- Round  64  -----
Start Time:  2020-05-27 02:58:40.852417
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.653846  0.693878  0.673267        49
           2   0.696429  0.829787  0.757282        47
           3   0.526316  0.545455  0.535714        55
           4   0.595238  0.657895  0.625000        38
           5   0.727273  0.170213  0.275862        47
           6   0.720000  0.720000  0.720000        50
           7   0.818182  0.818182  0.818182        44
           8   0.841270  0.791045  0.815385        67
           9   0.546667  0.773585  0.640625        53

    accuracy                       0.671111       450
   macro avg   0.680580  0.666671  0.651257       450
weighted avg   0.684127  0.671111  0.656325       450

 test end.
34.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 03:08:12.120241
---------------

----- Round  73  -----
Start Time:  2020-05-27 03:08:12.120370
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.735849  0.750000  0.742857        52
           2   0.746032  0.810345  0.776860        58
           3   0.636364  0.573770  0.603448        61
           4   0.691176  0.810345  0.746032        58
           5   0.833333  0.300000  0.441176        50
           6   0.650000  0.742857  0.693333        35
           7   0.914894  0.826923  0.868687        52
           8   0.717391  0.767442  0.741573        43
           9   0.466667  0.682927  0.554455        41

    accuracy                       0.695556       450
   macro avg   0.710190  0.696068  0.685380       450
weighted avg   0.716473  0.695556  0.688631       450

 test end.
39.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 03:17:41.854052
---------------

----- Round  82  -----
Start Time:  2020-05-27 03:17:41.854159
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.704918  0.811321  0.754386        53
           2   0.714286  0.853659  0.777778        41
           3   0.547619  0.547619  0.547619        42
           4   0.736842  0.792453  0.763636        53
           5   0.857143  0.244898  0.380952        49
           6   0.796296  0.754386  0.774775        57
           7   0.900000  0.837209  0.867470        43
           8   0.854839  0.913793  0.883333        58
           9   0.633803  0.833333  0.720000        54

    accuracy                       0.737778       450
   macro avg   0.749527  0.732075  0.718883       450
weighted avg   0.752431  0.737778  0.723528       450

 test end.
43.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 03:27:10.715658
---------------

----- Round  91  -----
Start Time:  2020-05-27 03:27:10.716124
 training begin...
 training end.
 test begin.
              precision    recall  f

 training end.
 test begin.
              precision    recall  f1-score   support

           1   0.647059  0.511628  0.571429        43
           2   0.701754  0.869565  0.776699        46
           3   0.666667  0.622951  0.644068        61
           4   0.725806  0.789474  0.756303        57
           5   0.791667  0.372549  0.506667        51
           6   0.568182  0.641026  0.602410        39
           7   0.868421  0.804878  0.835443        41
           8   0.729167  0.700000  0.714286        50
           9   0.569767  0.790323  0.662162        62

    accuracy                       0.680000       450
   macro avg   0.696499  0.678044  0.674385       450
weighted avg   0.693478  0.680000  0.673450       450

 test end.
22.0
0.0
0.0
0.0
Wrote results to output.data...EOF...
End Time:  2020-05-27 03:36:35.136075
---------------

