In [3]:
import pandas
import numpy as np
import tensorflow as tf
from keras.utils import np_utils
# CNN related
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.recurrent import GRU
#from keras.layers.recurrent import LSTM, GRU
from keras.layers import Conv1D, MaxPooling1D, AtrousConvolution1D, RepeatVector
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.layers.wrappers import Bidirectional
from keras import regularizers
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import *
from keras.optimizers import RMSprop, Adam, SGD, Nadam
from keras.initializers import *
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import RandomOverSampler # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import BorderlineSMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.combine import SMOTETomek # doctest: +NORMALIZE_WHITESPACE

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras import backend as K


##################
# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the ROC area under the curve for a binary classifier.

    The true-alert rate (``binary_PTA``) and false-alert rate (``binary_PFA``)
    are evaluated at 1000 evenly spaced thresholds in [0, 1]. Each true-alert
    rate is weighted by the drop in false-alert rate relative to the previous
    threshold (a leading 1 is prepended so the first bin has a predecessor),
    and the weighted values are summed.

    Args:
        y_true: backend tensor of ground-truth labels.
        y_pred: backend tensor of predicted scores.

    Returns:
        Backend tensor holding the summed, bin-weighted true-alert rates.
    """
    thresholds = np.linspace(0, 1, 1000)

    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, level) for level in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, level) for level in thresholds], axis=0)

    # Prepend 1 so that every threshold has a "previous" false-alert rate.
    padded_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    bin_widths = padded_rates[:-1] - padded_rates[1:]

    return K.sum(true_alert_rates * bin_widths, axis=0)
    
##################
# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the false-alert probability FP / N at the given threshold.

    Args:
        y_true: backend tensor of ground-truth labels (1 = positive).
        y_pred: backend tensor of predicted scores.
        threshold: scores greater than or equal to this value count as
            alerts. The default is a backend variable holding 0.5 that is
            created once, when this function is defined.

    Returns:
        Backend tensor: alerts raised on negative labels divided by the
        number of negative labels.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Every label contributes (1 - label) to the count of negatives.
    negative_total = K.sum(1 - y_true)
    # Alerts minus the alerts that hit a positive label leaves the false ones.
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / negative_total

##################
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return the true-alert probability TP / P at the given threshold.

    Args:
        y_true: backend tensor of ground-truth labels (1 = positive).
        y_pred: backend tensor of predicted scores.
        threshold: scores greater than or equal to this value count as
            alerts. The default is a backend variable holding 0.5 that is
            created once, when this function is defined.

    Returns:
        Backend tensor: alerts raised on positive labels divided by the
        number of positive labels.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    positive_total = K.sum(y_true)
    # Only alerts that coincide with a positive label are counted.
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / positive_total

class ecgcnn:
    """Train and score a small 1-D CNN that classifies CSV records by ``sid``.

    The CSV at ``base_path + stkid + '.csv'`` is read once at construction.
    Its first ``infosize`` columns become the features and column
    ``SID_COLUMN`` supplies the label as a string of the form ``sid<N>``.
    ``do_backtest`` performs one split / (optional) oversample / train /
    evaluate cycle and returns the number of correct hold-out predictions.

    Attributes:
        stkid: file stem of the CSV; also used in the checkpoint file name.
        INFOSIZE: number of leading CSV columns used as features.
        ONEHOTSIZE: extra input width added to ``INFOSIZE``.
        EMB_SIZE: ``INFOSIZE + ONEHOTSIZE``; the model's input feature width.
        info: list of ``INFOSIZE`` feature columns, each a list of values.
        sid: one-element list holding the label column.
    """

    # Kept from the original: print full arrays. This changes NumPy's
    # print options globally as soon as the class is defined.
    np.set_printoptions(threshold=np.inf)

    # Number of output classes for each dataset index ``num``.
    NUM_CLASSES = {0: 40, 1: 37, 2: 16, 3: 83}

    # Zero-based CSV column holding the ``sid<N>`` label string.
    SID_COLUMN = 14

    def __init__(self, base_path, stkid, infosize, onehotsize):
        """Store the configuration and load ``base_path + stkid + '.csv'``."""
        self.stkid = stkid
        self.INFOSIZE = infosize
        self.ONEHOTSIZE = onehotsize
        self.load_data(base_path + stkid + '.csv')
        self.EMB_SIZE = self.INFOSIZE + self.ONEHOTSIZE

    ######################################################
    def load_data(self, fname):
        """Read the CSV and populate ``self.info`` and ``self.sid``.

        Malformed rows are skipped rather than raising.

        Args:
            fname: path of a comma-separated file with one header row.
        """
        try:
            frame = pandas.read_csv(fname, sep=",", header=0, on_bad_lines="skip")
        except TypeError:
            # Older pandas releases only know the ``error_bad_lines`` spelling.
            frame = pandas.read_csv(fname, sep=",", header=0, error_bad_lines=False)

        # ``.values`` replaces ``as_matrix()``, which newer pandas removed.
        mat = frame.values

        self.info = [self.matrix_col(mat, i) for i in range(self.INFOSIZE)]
        self.sid = [self.matrix_col(mat, self.SID_COLUMN)]

    ######################################################
    def find_category(self, sid):
        """Convert ``sid<N>`` label strings into integers.

        Args:
            sid: sequence whose first element is the iterable of label strings
                (the layout of ``self.sid``).

        Returns:
            List of ints obtained by removing ``'sid'`` from each string.
        """
        return [int(sidstr.replace('sid', '')) for sidstr in sid[0]]

    ######################################################
    def matrix_col(self, matrix, i):
        """Return column ``i`` of a row-iterable ``matrix`` as a list."""
        return [row[i] for row in matrix]

    ######################################################
    def normalize(self, data):
        """Standardize ``data`` to zero mean and unit standard deviation.

        Returns:
            NumPy array of z-scores, or an all-zero array of the same length
            when the standard deviation is 0 (avoids division by zero).
        """
        std = np.std(data)
        if std == 0:
            return np.zeros(len(data))
        return (np.array(data) - np.mean(data)) / std

    ######################################################
    def _num_classes(self, num):
        """Return the class count for dataset index ``num`` or raise ValueError."""
        try:
            return self.NUM_CLASSES[num]
        except KeyError:
            raise ValueError(
                "unknown dataset index %r; expected one of %s"
                % (num, sorted(self.NUM_CLASSES))) from None

    def _one_hot(self, labels, n_classes):
        """One-hot encode 1-based integer labels into lists of length ``n_classes``."""
        encoded = []
        for label in labels:
            row = [0] * n_classes
            # Labels are 1-based. NOTE(review): a label of 0 would wrap to the
            # last class via negative indexing, exactly as before - confirm
            # the data never contains it.
            row[int(label) - 1] = 1
            encoded.append(row)
        return encoded

    ######################################################
    def backtest_training_data(self, num, s):
        """Build the train/test arrays for one backtest run.

        Every feature column is z-scored over the full data set, the rows are
        split 80/20 at random, and the training part is optionally oversampled.

        Args:
            num: dataset index; selects the one-hot width via ``NUM_CLASSES``.
            s: resampling strategy. 0 = none, 1 = SMOTE, 2 = SVMSMOTE,
                3 = RandomOverSampler, 4 = BorderlineSMOTE, 5 = SMOTETomek.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test, y_test_nested)``:
            ``x_train`` / ``x_test`` are arrays of shape (rows, 1, features),
            ``y_train`` / ``y_test`` are lists of one-hot lists, and
            ``y_test_nested`` wraps each ``y_test`` row in a one-element list.

        Raises:
            ValueError: if ``num`` or ``s`` is not a supported value.
        """
        n = self._num_classes(num)

        # NOTE(review): the statistics used for normalization include the rows
        # that later end up in the test split.
        features = np.column_stack([self.normalize(column) for column in self.info])
        labels = np.array(self.find_category(self.sid))

        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            features, labels, test_size=0.2, random_state=None)

        if s == 0:
            x_res, y_res = x_train1, y_train1
        else:
            # KMeansSMOTE is intentionally not offered: per the original note
            # it needs at least 25 samples.
            sampler_classes = {
                1: SMOTE,
                2: SVMSMOTE,
                3: RandomOverSampler,
                4: BorderlineSMOTE,
                5: SMOTETomek,
            }
            if s not in sampler_classes:
                raise ValueError(
                    "unknown resampling strategy %r; expected 0-5" % (s,))
            sampler = sampler_classes[s](random_state=0)
            x_res, y_res = sampler.fit_resample(x_train1, y_train1)

        # Insert a length-1 axis so each record has shape (1, features), which
        # is the input shape the Conv1D model is built with.
        x_train_new = np.expand_dims(np.asarray(x_res), axis=1)
        x_test_new = np.expand_dims(np.asarray(x_test1), axis=1)

        y_train_onehot_new = self._one_hot(y_res, n)
        y_test_new = self._one_hot(y_test1, n)
        y_test_new1 = [[row] for row in y_test_new]

        return x_train_new, y_train_onehot_new, x_test_new, y_test_new, y_test_new1

    ######################################################
    def build_model(self, num):
        """Return the compiled model for dataset index ``num``."""
        return self.simple_model(num)

    ######################################################
    def simple_model(self, num):
        """Build and compile the CNN classifier.

        Two Conv1D blocks (128 filters / kernel 16, then 64 filters / kernel 8,
        both ``'same'`` padded), each followed by batch normalization,
        LeakyReLU and 50% dropout; then Flatten, Dense(128) with batch
        normalization and LeakyReLU, and a softmax layer with one unit per
        class. Compiled with Nadam (lr=0.002), categorical cross-entropy and
        the accuracy metric.

        Raises:
            ValueError: if ``num`` is not a supported dataset index.
        """
        n_classes = self._num_classes(num)

        model = Sequential()
        model.add(Conv1D(input_shape=(1, self.EMB_SIZE), filters=128, kernel_size=16, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dropout(0.5))

        model.add(Conv1D(filters=64, kernel_size=8, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dropout(0.5))

        model.add(Flatten())
        model.add(Dense(128))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(n_classes))
        model.add(Activation('softmax'))

        opt = Nadam(lr=0.002)
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    ######################################################
    def train_model(self, model, x_train, y_train_onehot, x_test1, y_test1):
        """Fit ``model`` for 200 epochs with batch size 100.

        Args:
            model: compiled Keras model.
            x_train: training inputs.
            y_train_onehot: one-hot training targets.
            x_test1: accepted for interface compatibility; not used.
            y_test1: accepted for interface compatibility; not used.

        Returns:
            The history object returned by ``model.fit``.
        """
        model_name = 'cnn.' + self.stkid + '.model'

        # No validation data is passed to fit(), so the only values logged per
        # epoch are the training loss and metrics. Both callbacks therefore
        # watch 'loss'; any metric that is not logged makes them skip silently.
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.8, patience=30,
                                      min_lr=0.000001, verbose=2)
        checkpointer = ModelCheckpoint(filepath=model_name, monitor='loss',
                                       verbose=0, save_best_only=True)

        train_history = model.fit(np.array(x_train), np.array(y_train_onehot),
                                  epochs=200, batch_size=100, verbose=0,
                                  callbacks=[reduce_lr, checkpointer], shuffle=True)
        return train_history

    ######################################################
    def do_backtest(self, num, s):
        """Run one split / train / evaluate cycle.

        Args:
            num: dataset index (see ``NUM_CLASSES``).
            s: resampling strategy (see ``backtest_training_data``).

        Returns:
            Tuple ``(count, total)``: correctly classified hold-out records
            and the number of hold-out records. ``(0, 0)`` if the hold-out
            set is empty.
        """
        x_train, y_train_onehot, x_test1, y_test1, _ = self.backtest_training_data(num, s)

        model = self.build_model(num)
        self.train_model(model, x_train, y_train_onehot, x_test1, y_test1)

        total = len(x_test1)
        if total == 0:
            return 0, 0

        # One batched forward pass; argmax over the softmax output gives the
        # predicted (0-based) class for every hold-out record.
        probabilities = model.predict(np.asarray(x_test1), verbose=0)
        predicted = np.argmax(probabilities, axis=-1)

        # A prediction is a hit when the one-hot target is 1 at that class.
        targets = np.asarray(y_test1)
        count = int(np.sum(targets[np.arange(total), predicted] == 1))
        return count, total
######################################################
"""
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    #plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
"""
def main():
    """Run the full experiment sweep and write the accuracy report.

    For each of the four datasets and each of the six resampling strategies,
    the backtest is repeated five times. Every run with a non-empty hold-out
    set is printed and appended as one CSV row (method, dataset,
    correct/total, accuracy) to ``pathological_accuracy.csv``.
    """
    print("start")
    stkid = 'ALL_pathological_2'
    DA = ["AUSAIC Data","MIT-BIH Arrhythmia Data","MIT-BIH Normal Sinus Rhythm Data","QT Data"]
    # Index in this list is the strategy code passed to do_backtest().
    SMO = ["Original","SMOTE","SVMSMOTE","ROS","BorderlineSMOTE","SMOTETomek"]
    ADD = ["/home/keg/桌面/ecg_data/AUSAIC Data/pathological/",
          "/home/keg/桌面/ecg_data/MIT-BIH Arrhythmia Data/pathological/",
          "/home/keg/桌面/ecg_data/MIT-BIH Normal Sinus Rhythm Data/pathological/",
          "/home/keg/桌面/ecg_data/QT Data/pathological/"]
    info_size = 13
    onehot_size = 0
    repetitions = 5

    # The context manager closes the report even if a training run raises.
    with open('/home/keg/桌面/ecg_data/pathological_accuracy.csv', 'w', encoding='utf-8') as report:
        # NOTE(review): 'Ture' looks like a typo for 'True'; the header is left
        # unchanged in case downstream readers key on the column name.
        report.write('Method,DATA,Ture/Total,accuracy'+'\n')
        for num, base_path in enumerate(ADD):
            # The loaded data is not modified by a backtest, so the CSV is
            # read once per dataset instead of once per repetition.
            ecg = ecgcnn(base_path, stkid, info_size, onehot_size)
            for S, method in enumerate(SMO):
                for _ in range(repetitions):
                    count, total = ecg.do_backtest(num, S)
                    if total <= 0:
                        continue
                    hitrate = float(count)/total
                    plan = method+","+DA[num]+",("+str(count)+"/"+str(total)+"),"+str(hitrate)
                    print(plan)
                    report.write(plan+'\n')
                    # Flush per row so finished runs survive a later crash.
                    report.flush()
    print("finish")
                      
# Run the experiment sweep only when this file is executed as a script.
if __name__ == "__main__":
    main()


start




Original,AUSAIC Data,(804/867),0.9273356401384083




Original,AUSAIC Data,(779/867),0.8985005767012687




Original,AUSAIC Data,(779/867),0.8985005767012687




Original,AUSAIC Data,(784/867),0.9042675893886967




Original,AUSAIC Data,(779/867),0.8985005767012687




SMOTE,AUSAIC Data,(801/867),0.9238754325259516




SMOTE,AUSAIC Data,(802/867),0.9250288350634371




SMOTE,AUSAIC Data,(779/867),0.8985005767012687




SMOTE,AUSAIC Data,(802/867),0.9250288350634371




SMOTE,AUSAIC Data,(789/867),0.9100346020761245




SVMSMOTE,AUSAIC Data,(769/867),0.8869665513264129




SVMSMOTE,AUSAIC Data,(778/867),0.8973471741637832




SVMSMOTE,AUSAIC Data,(791/867),0.9123414071510957




SVMSMOTE,AUSAIC Data,(781/867),0.9008073817762399




SVMSMOTE,AUSAIC Data,(793/867),0.9146482122260668




ROS,AUSAIC Data,(805/867),0.9284890426758939




ROS,AUSAIC Data,(794/867),0.9158016147635525




ROS,AUSAIC Data,(786/867),0.9065743944636678




ROS,AUSAIC Data,(792/867),0.9134948096885813




ROS,AUSAIC Data,(780/867),0.8996539792387543




BorderlineSMOTE,AUSAIC Data,(767/867),0.8846597462514417




BorderlineSMOTE,AUSAIC Data,(776/867),0.895040369088812




BorderlineSMOTE,AUSAIC Data,(782/867),0.9019607843137255




BorderlineSMOTE,AUSAIC Data,(785/867),0.9054209919261822




BorderlineSMOTE,AUSAIC Data,(777/867),0.8961937716262975




SMOTETomek,AUSAIC Data,(786/867),0.9065743944636678




SMOTETomek,AUSAIC Data,(795/867),0.916955017301038




SMOTETomek,AUSAIC Data,(793/867),0.9146482122260668




SMOTETomek,AUSAIC Data,(798/867),0.9204152249134948




SMOTETomek,AUSAIC Data,(791/867),0.9123414071510957




Original,MIT-BIH Arrhythmia Data,(708/713),0.9929873772791024




Original,MIT-BIH Arrhythmia Data,(700/713),0.9817671809256662




Original,MIT-BIH Arrhythmia Data,(702/713),0.9845722300140253




Original,MIT-BIH Arrhythmia Data,(703/713),0.9859747545582047




Original,MIT-BIH Arrhythmia Data,(705/713),0.9887798036465638




SMOTE,MIT-BIH Arrhythmia Data,(705/713),0.9887798036465638




SMOTE,MIT-BIH Arrhythmia Data,(712/713),0.9985974754558204




SMOTE,MIT-BIH Arrhythmia Data,(701/713),0.9831697054698457




SMOTE,MIT-BIH Arrhythmia Data,(702/713),0.9845722300140253




SMOTE,MIT-BIH Arrhythmia Data,(707/713),0.9915848527349228




SVMSMOTE,MIT-BIH Arrhythmia Data,(705/713),0.9887798036465638




SVMSMOTE,MIT-BIH Arrhythmia Data,(706/713),0.9901823281907434




SVMSMOTE,MIT-BIH Arrhythmia Data,(707/713),0.9915848527349228




SVMSMOTE,MIT-BIH Arrhythmia Data,(705/713),0.9887798036465638




SVMSMOTE,MIT-BIH Arrhythmia Data,(703/713),0.9859747545582047




ROS,MIT-BIH Arrhythmia Data,(705/713),0.9887798036465638




ROS,MIT-BIH Arrhythmia Data,(709/713),0.9943899018232819




ROS,MIT-BIH Arrhythmia Data,(708/713),0.9929873772791024




ROS,MIT-BIH Arrhythmia Data,(712/713),0.9985974754558204




ROS,MIT-BIH Arrhythmia Data,(709/713),0.9943899018232819




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(708/713),0.9929873772791024




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(707/713),0.9915848527349228




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(701/713),0.9831697054698457




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(707/713),0.9915848527349228




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(708/713),0.9929873772791024




SMOTETomek,MIT-BIH Arrhythmia Data,(706/713),0.9901823281907434




SMOTETomek,MIT-BIH Arrhythmia Data,(712/713),0.9985974754558204




SMOTETomek,MIT-BIH Arrhythmia Data,(704/713),0.9873772791023843




SMOTETomek,MIT-BIH Arrhythmia Data,(708/713),0.9929873772791024




SMOTETomek,MIT-BIH Arrhythmia Data,(706/713),0.9901823281907434




Original,MIT-BIH Normal Sinus Rhythm Data,(255/262),0.9732824427480916




Original,MIT-BIH Normal Sinus Rhythm Data,(257/262),0.9809160305343512




Original,MIT-BIH Normal Sinus Rhythm Data,(261/262),0.9961832061068703




Original,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




Original,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/262),0.9961832061068703




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(260/262),0.9923664122137404




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(257/262),0.9809160305343512




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




ROS,MIT-BIH Normal Sinus Rhythm Data,(257/262),0.9809160305343512




ROS,MIT-BIH Normal Sinus Rhythm Data,(253/262),0.9656488549618321




ROS,MIT-BIH Normal Sinus Rhythm Data,(254/262),0.9694656488549618




ROS,MIT-BIH Normal Sinus Rhythm Data,(261/262),0.9961832061068703




ROS,MIT-BIH Normal Sinus Rhythm Data,(261/262),0.9961832061068703




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/262),0.9885496183206107




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(255/262),0.9732824427480916




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(257/262),0.9809160305343512




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(256/262),0.9770992366412213




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(258/262),0.9847328244274809




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(256/262),0.9770992366412213




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(257/262),0.9809160305343512




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(256/262),0.9770992366412213




Original,QT Data,(1588/1605),0.9894080996884735




Original,QT Data,(1588/1605),0.9894080996884735




Original,QT Data,(1580/1605),0.9844236760124611




Original,QT Data,(1590/1605),0.9906542056074766




Original,QT Data,(1580/1605),0.9844236760124611




SMOTE,QT Data,(1591/1605),0.9912772585669782




SMOTE,QT Data,(1589/1605),0.990031152647975




SMOTE,QT Data,(1588/1605),0.9894080996884735




SMOTE,QT Data,(1584/1605),0.9869158878504672




SMOTE,QT Data,(1591/1605),0.9912772585669782




SVMSMOTE,QT Data,(1594/1605),0.9931464174454828




SVMSMOTE,QT Data,(1595/1605),0.9937694704049844




SVMSMOTE,QT Data,(1590/1605),0.9906542056074766




SVMSMOTE,QT Data,(1592/1605),0.9919003115264797




SVMSMOTE,QT Data,(1587/1605),0.9887850467289719




ROS,QT Data,(1594/1605),0.9931464174454828




ROS,QT Data,(1584/1605),0.9869158878504672




ROS,QT Data,(1596/1605),0.994392523364486




ROS,QT Data,(1590/1605),0.9906542056074766




ROS,QT Data,(1593/1605),0.9925233644859813




BorderlineSMOTE,QT Data,(1593/1605),0.9925233644859813




BorderlineSMOTE,QT Data,(1594/1605),0.9931464174454828




BorderlineSMOTE,QT Data,(1596/1605),0.994392523364486




BorderlineSMOTE,QT Data,(1593/1605),0.9925233644859813




BorderlineSMOTE,QT Data,(1597/1605),0.9950155763239875




SMOTETomek,QT Data,(1591/1605),0.9912772585669782




SMOTETomek,QT Data,(1595/1605),0.9937694704049844




SMOTETomek,QT Data,(1591/1605),0.9912772585669782




SMOTETomek,QT Data,(1583/1605),0.9862928348909658




SMOTETomek,QT Data,(1598/1605),0.9956386292834891
finish
