In [3]:
import pandas
import numpy as np
import tensorflow as tf
from keras.utils import np_utils
# CNN related
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.recurrent import GRU
#from keras.layers.recurrent import LSTM, GRU
from keras.layers import Conv1D, MaxPooling1D, AtrousConvolution1D, RepeatVector
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.layers.wrappers import Bidirectional
from keras import regularizers
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import *
from keras.optimizers import RMSprop, Adam, SGD, Nadam
from keras.initializers import *
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import RandomOverSampler # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import BorderlineSMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.combine import SMOTETomek # doctest: +NORMALIZE_WHITESPACE

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras import backend as K


##################
# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the ROC AUC of a binary classifier as a Keras metric.

    The true-alert rate (``binary_PTA``) and false-alert rate (``binary_PFA``)
    are evaluated at 1000 evenly spaced thresholds in [0, 1]; the area is the
    sum of each true-alert rate times the width of its false-alert-rate bin.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores, compared against each threshold.

    Returns:
        A scalar tensor holding the approximated area under the ROC curve.
    """
    thresholds = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, t) for t in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, t) for t in thresholds], axis=0)
    # Prepend a rate of 1 so that every threshold has a left-hand bin edge.
    bin_edges = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    # Width of each bin: previous false-alert rate minus the current one.
    bin_widths = bin_edges[:-1] - bin_edges[1:]
    return K.sum(true_alert_rates * bin_widths, axis=0)
    
##################
# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=0.5):
    """Probability of false alert (false-positive rate) for a binary classifier.

    Args:
        y_true: tensor of ground-truth labels (1 = positive, 0 = negative).
        y_pred: tensor of predicted scores.
        threshold: scores ``>= threshold`` count as alerts. A plain float is
            used as the default so that no backend variable is created (and
            shared between calls) when the module is imported.

    Returns:
        A scalar tensor ``FP / N``. The denominator is padded with the backend
        epsilon so a batch without negative labels yields 0 instead of NaN.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    negatives = K.sum(1 - y_true)
    # FP = alerts raised on negative-class samples
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / (negatives + K.epsilon())

##################
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=0.5):
    """Probability of true alert (true-positive rate) for a binary classifier.

    Args:
        y_true: tensor of ground-truth labels (1 = positive, 0 = negative).
        y_pred: tensor of predicted scores.
        threshold: scores ``>= threshold`` count as alerts. A plain float is
            used as the default so that no backend variable is created (and
            shared between calls) when the module is imported.

    Returns:
        A scalar tensor ``TP / P``. The denominator is padded with the backend
        epsilon so a batch without positive labels yields 0 instead of NaN.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    positives = K.sum(y_true)
    # TP = alerts raised on positive-class samples
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / (positives + K.epsilon())

class ecgcnn:
    """Train and evaluate a small 1-D CNN on feature rows read from a CSV file.

    The file ``<base_path><stkid>.csv`` is read with a header row. Its first
    ``infosize`` columns are used as numeric features and column
    ``SID_COLUMN`` holds the class label as a string of the form ``sid<N>``.
    """

    # Print arrays in full when debugging (process-wide NumPy setting).
    np.set_printoptions(threshold=np.inf)

    # Number of output classes for each dataset index ``num``.
    NUM_CLASSES = {0: 40, 1: 38, 2: 16, 3: 81}
    # Zero-based CSV column that holds the 'sid<N>' label.
    SID_COLUMN = 16

    def __init__(self, base_path, stkid, infosize, onehotsize):
        """Load ``base_path + stkid + '.csv'`` and record the feature sizes.

        Args:
            base_path: directory prefix (including trailing separator).
            stkid: file stem; also used to name the checkpoint file.
            infosize: number of leading CSV columns used as features.
            onehotsize: extra width added to ``infosize`` to form ``EMB_SIZE``.
        """
        self.stkid = stkid
        fname = base_path + stkid + '.csv'
        self.INFOSIZE = infosize
        self.ONEHOTSIZE = onehotsize
        self.load_data(fname)
        self.EMB_SIZE = self.INFOSIZE + self.ONEHOTSIZE

    ##################
    def load_data(self, fname):
        """Read the CSV and store feature columns and the label column.

        Sets ``self.info`` (list of ``INFOSIZE`` column lists) and
        ``self.sid`` (one-element list wrapping the label column).
        Malformed lines are skipped with a warning.
        """
        try:
            # pandas >= 1.3 spelling of "skip malformed lines, but warn".
            frame = pandas.read_csv(fname, sep=",", header=0,
                                    on_bad_lines='warn')
        except TypeError:
            # Older pandas does not know ``on_bad_lines``.
            frame = pandas.read_csv(fname, sep=",", header=0,
                                    error_bad_lines=False)
        # ``DataFrame.as_matrix()`` was removed; ``.values`` is equivalent.
        mat = frame.values

        self.info = [self.matrix_col(mat, i) for i in range(self.INFOSIZE)]
        self.sid = [self.matrix_col(mat, self.SID_COLUMN)]

    ##################
    def find_category(self, sid):
        """Convert the labels in ``sid[0]`` ('sid<N>' strings) to ints ``N``."""
        return [int(label.replace('sid', '')) for label in sid[0]]

    ######################################################
    def matrix_col(self, matrix, i):
        """Return column ``i`` of a row-indexable ``matrix`` as a list."""
        return [row[i] for row in matrix]

    ######################################################
    def normalize(self, data):
        """Return ``data`` standardised to zero mean and unit variance.

        A constant column (standard deviation 0) yields all zeros instead of
        dividing by zero.
        """
        values = np.asarray(data, dtype=float)
        std = values.std()
        if std == 0:
            return np.zeros(len(values))
        return (values - values.mean()) / std

    ######################################################
    def _num_classes(self, num):
        """Return the class count for dataset ``num``; ValueError if unknown."""
        try:
            return self.NUM_CLASSES[num]
        except KeyError:
            raise ValueError(
                "unknown dataset index %r; expected one of %s"
                % (num, sorted(self.NUM_CLASSES))) from None

    @staticmethod
    def _one_hot(labels, n):
        """Encode 1-based integer ``labels`` as one-hot lists of length ``n``."""
        rows = []
        for label in labels:
            row = [0] * n
            # Labels are 1-based. NOTE(review): a label of 0 wraps around to
            # the last class and a label > n raises IndexError - confirm the
            # data never contains such values.
            row[int(label) - 1] = 1
            rows.append(row)
        return rows

    def _make_sampler(self, s):
        """Return the resampler selected by ``s``, or None for ``s == 0``."""
        if s == 0:
            return None
        samplers = {
            1: SMOTE,
            2: SVMSMOTE,
            3: RandomOverSampler,
            4: BorderlineSMOTE,
            5: SMOTETomek,
        }
        if s not in samplers:
            raise ValueError(
                "unknown sampler index %r; expected 0..%d"
                % (s, max(samplers)))
        return samplers[s](random_state=0)

    ######################################################
    def backtest_training_data(self, num, s):
        """Build a random 80/20 train/test split, optionally resampled.

        Args:
            num: dataset index selecting the number of classes.
            s: resampling method (0 = none, 1 = SMOTE, 2 = SVMSMOTE,
                3 = RandomOverSampler, 4 = BorderlineSMOTE, 5 = SMOTETomek).

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test, y_test_wrapped)``:
            the ``x_*`` values are arrays of shape (rows, 1, features), the
            ``y_*`` values are lists of one-hot lists, and ``y_test_wrapped``
            holds each test one-hot row wrapped in a one-element list.

        Raises:
            ValueError: if ``num`` or ``s`` is not a known index.
        """
        n = self._num_classes(num)
        sampler = self._make_sampler(s)

        # NOTE(review): statistics are computed over the full dataset before
        # the split, so test rows influence the scaling of training rows.
        features = np.column_stack([self.normalize(col) for col in self.info])
        labels = np.array(self.find_category(self.sid))

        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            features, labels, test_size=0.2, random_state=None)

        # Only the training split is resampled; the test split stays as-is.
        if sampler is None:
            x_res, y_res = x_train1, y_train1
        else:
            x_res, y_res = sampler.fit_resample(x_train1, y_train1)

        # Insert a length-1 "time" axis so rows match the Conv1D input shape.
        x_train_new = np.expand_dims(np.asarray(x_res), axis=1)
        x_test_new = np.expand_dims(np.asarray(x_test1), axis=1)

        y_train_onehot_new = self._one_hot(y_res, n)
        y_test_new = self._one_hot(y_test1, n)
        y_test_new1 = [[row] for row in y_test_new]

        return x_train_new, y_train_onehot_new, x_test_new, y_test_new, y_test_new1

    ######################################################
    def build_model(self, num):
        """Return a compiled model for dataset ``num`` (see ``simple_model``)."""
        return self.simple_model(num)

    ######################################################
    def simple_model(self, num):
        """Build and compile the two-block Conv1D softmax classifier.

        Raises:
            ValueError: if ``num`` is not a known dataset index.
        """
        n_classes = self._num_classes(num)

        model = Sequential()
        model.add(Conv1D(input_shape=(1, self.EMB_SIZE), filters=128, kernel_size=16, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dropout(0.5))

        model.add(Conv1D(filters=64, kernel_size=8, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dropout(0.5))

        model.add(Flatten())
        model.add(Dense(128))
        model.add(BatchNormalization())
        model.add(LeakyReLU())
        model.add(Dense(n_classes))
        model.add(Activation('softmax'))

        opt = Nadam(lr=0.002)
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    ######################################################
    def train_model(self, model, x_train, y_train_onehot, x_test1, y_test1):
        """Fit ``model`` for 200 epochs and return the Keras history object.

        ``x_test1`` and ``y_test1`` are accepted for interface compatibility
        but are not used: training runs without validation data.
        """
        model_name = 'cnn.' + self.stkid + '.model'
        # No validation data is supplied, so both callbacks watch the training
        # loss. The previous monitors ('roc_accuracy' and the default
        # 'val_loss') are never produced, which silently disabled both.
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.8, patience=30,
                                      min_lr=0.000001, verbose=2)
        checkpointer = ModelCheckpoint(filepath=model_name, monitor='loss',
                                       verbose=0, save_best_only=True)
        train_history = model.fit(np.array(x_train), np.array(y_train_onehot),
                                  epochs=200, batch_size=100, verbose=0,
                                  callbacks=[reduce_lr, checkpointer], shuffle=True)
        return train_history

    ######################################################
    def do_backtest(self, num, s):
        """Train on a fresh split and count correct test predictions.

        Args:
            num: dataset index selecting the number of classes.
            s: resampling method index (see ``backtest_training_data``).

        Returns:
            Tuple ``(count, total)``: correctly classified test rows and the
            number of test rows; ``(0, 0)`` when the test split is empty.
        """
        x_train, y_train_onehot, x_test1, y_test1, _ = self.backtest_training_data(num, s)
        model = self.build_model(num)
        self.train_model(model, x_train, y_train_onehot, x_test1, y_test1)

        total = len(x_test1)
        if total == 0:
            return 0, 0

        # One batched forward pass; argmax over the softmax output replaces
        # the removed ``Sequential.predict_classes``.
        probabilities = model.predict(np.array(x_test1), verbose=0)
        predicted = np.argmax(probabilities, axis=-1)

        # A prediction is a hit when the true one-hot row has a 1 at the
        # predicted class position.
        expected = np.array(y_test1)
        hits = expected[np.arange(total), predicted] == 1
        return int(np.count_nonzero(hits)), total
        
######################################################
"""
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    #plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
"""
def main():
    """Run every dataset / resampler combination five times and log accuracy.

    For each dataset directory the CSV is loaded once, then each resampling
    method is trained and evaluated five times on fresh random splits. Every
    run with a non-empty test split is printed and appended to
    ``BMI_accuracy.csv`` as ``method,dataset,(hits/total),accuracy``.
    """
    print("start")
    stkid = 'ALL_BMI_2'
    dataset_names = ["AUSAIC Data", "MIT-BIH Arrhythmia Data",
                     "MIT-BIH Normal Sinus Rhythm Data", "QT Data"]
    # Index into this list is the ``s`` argument of ``do_backtest``:
    # 0 = no resampling, 1 = SMOTE, 2 = SVMSMOTE, 3 = RandomOverSampler,
    # 4 = BorderlineSMOTE, 5 = SMOTETomek.
    sampler_names = ["Original", "SMOTE", "SVMSMOTE", "ROS",
                     "BorderlineSMOTE", "SMOTETomek"]
    dataset_dirs = ["/home/keg/桌面/ecg_data/AUSAIC Data/BMI/",
                    "/home/keg/桌面/ecg_data/MIT-BIH Arrhythmia Data/BMI/",
                    "/home/keg/桌面/ecg_data/MIT-BIH Normal Sinus Rhythm Data/BMI/",
                    "/home/keg/桌面/ecg_data/QT Data/BMI/"]
    info_size = 15
    onehot_size = 0
    repeats = 5

    # The context manager closes the report even if a run raises.
    with open('/home/keg/桌面/ecg_data/BMI_accuracy.csv', 'w', encoding='utf-8') as report:
        # NOTE: 'Ture' is a typo for 'True'; kept as-is so existing consumers
        # of the column name keep working.
        report.write('Method,DATA,Ture/Total,accuracy' + '\n')

        for num, base_path in enumerate(dataset_dirs):
            # The CSV does not change between runs, so load it once per
            # dataset instead of once per repetition.
            ecg = ecgcnn(base_path, stkid, info_size, onehot_size)
            for s, sampler_name in enumerate(sampler_names):
                for _ in range(repeats):
                    count, total = ecg.do_backtest(num, s)
                    if total <= 0:
                        continue
                    hitrate = float(count) / total
                    row = ",".join([
                        sampler_name,
                        dataset_names[num],
                        "(" + str(count) + "/" + str(total) + ")",
                        str(hitrate),
                    ])
                    print(row)
                    report.write(row + '\n')
                    # Runs are long; keep partial results on disk.
                    report.flush()

    print("finish")


if __name__ == "__main__":
    main()


start




Original,AUSAIC Data,(662/869),0.761795166858458




Original,AUSAIC Data,(657/869),0.7560414269275029




Original,AUSAIC Data,(653/869),0.7514384349827388




Original,AUSAIC Data,(677/869),0.7790563866513234




Original,AUSAIC Data,(652/869),0.7502876869965478




SMOTE,AUSAIC Data,(628/869),0.7226697353279632




SMOTE,AUSAIC Data,(665/869),0.7652474108170311




SMOTE,AUSAIC Data,(651/869),0.7491369390103567




SMOTE,AUSAIC Data,(671/869),0.7721518987341772




SMOTE,AUSAIC Data,(661/869),0.760644418872267




SVMSMOTE,AUSAIC Data,(648/869),0.7456846950517837




SVMSMOTE,AUSAIC Data,(652/869),0.7502876869965478




SVMSMOTE,AUSAIC Data,(644/869),0.7410817031070196




SVMSMOTE,AUSAIC Data,(652/869),0.7502876869965478




SVMSMOTE,AUSAIC Data,(665/869),0.7652474108170311




ROS,AUSAIC Data,(661/869),0.760644418872267




ROS,AUSAIC Data,(674/869),0.7756041426927502




ROS,AUSAIC Data,(660/869),0.759493670886076




ROS,AUSAIC Data,(644/869),0.7410817031070196




ROS,AUSAIC Data,(653/869),0.7514384349827388




BorderlineSMOTE,AUSAIC Data,(631/869),0.7261219792865362




BorderlineSMOTE,AUSAIC Data,(676/869),0.7779056386651323




BorderlineSMOTE,AUSAIC Data,(663/869),0.762945914844649




BorderlineSMOTE,AUSAIC Data,(665/869),0.7652474108170311




BorderlineSMOTE,AUSAIC Data,(647/869),0.7445339470655926




SMOTETomek,AUSAIC Data,(678/869),0.7802071346375143




SMOTETomek,AUSAIC Data,(669/869),0.7698504027617952




SMOTETomek,AUSAIC Data,(655/869),0.7537399309551208




SMOTETomek,AUSAIC Data,(663/869),0.762945914844649




SMOTETomek,AUSAIC Data,(661/869),0.760644418872267




Original,MIT-BIH Arrhythmia Data,(715/725),0.9862068965517241




Original,MIT-BIH Arrhythmia Data,(703/725),0.9696551724137931




Original,MIT-BIH Arrhythmia Data,(703/725),0.9696551724137931




Original,MIT-BIH Arrhythmia Data,(714/725),0.9848275862068966




Original,MIT-BIH Arrhythmia Data,(710/725),0.9793103448275862




SMOTE,MIT-BIH Arrhythmia Data,(711/725),0.9806896551724138




SMOTE,MIT-BIH Arrhythmia Data,(714/725),0.9848275862068966




SMOTE,MIT-BIH Arrhythmia Data,(712/725),0.9820689655172414




SMOTE,MIT-BIH Arrhythmia Data,(713/725),0.983448275862069




SMOTE,MIT-BIH Arrhythmia Data,(712/725),0.9820689655172414




SVMSMOTE,MIT-BIH Arrhythmia Data,(704/725),0.9710344827586207




SVMSMOTE,MIT-BIH Arrhythmia Data,(715/725),0.9862068965517241




SVMSMOTE,MIT-BIH Arrhythmia Data,(711/725),0.9806896551724138




SVMSMOTE,MIT-BIH Arrhythmia Data,(717/725),0.9889655172413793




SVMSMOTE,MIT-BIH Arrhythmia Data,(710/725),0.9793103448275862




ROS,MIT-BIH Arrhythmia Data,(716/725),0.9875862068965517




ROS,MIT-BIH Arrhythmia Data,(720/725),0.993103448275862




ROS,MIT-BIH Arrhythmia Data,(716/725),0.9875862068965517




ROS,MIT-BIH Arrhythmia Data,(712/725),0.9820689655172414




ROS,MIT-BIH Arrhythmia Data,(712/725),0.9820689655172414




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(716/725),0.9875862068965517




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(711/725),0.9806896551724138




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(707/725),0.9751724137931035




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(708/725),0.976551724137931




BorderlineSMOTE,MIT-BIH Arrhythmia Data,(706/725),0.9737931034482759




SMOTETomek,MIT-BIH Arrhythmia Data,(710/725),0.9793103448275862




SMOTETomek,MIT-BIH Arrhythmia Data,(707/725),0.9751724137931035




SMOTETomek,MIT-BIH Arrhythmia Data,(711/725),0.9806896551724138




SMOTETomek,MIT-BIH Arrhythmia Data,(698/725),0.9627586206896551




SMOTETomek,MIT-BIH Arrhythmia Data,(702/725),0.9682758620689655




Original,MIT-BIH Normal Sinus Rhythm Data,(260/261),0.9961685823754789




Original,MIT-BIH Normal Sinus Rhythm Data,(256/261),0.9808429118773946




Original,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




Original,MIT-BIH Normal Sinus Rhythm Data,(256/261),0.9808429118773946




Original,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(258/261),0.9885057471264368




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SVMSMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




ROS,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




ROS,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




ROS,MIT-BIH Normal Sinus Rhythm Data,(257/261),0.9846743295019157




ROS,MIT-BIH Normal Sinus Rhythm Data,(255/261),0.9770114942528736




ROS,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(255/261),0.9770114942528736




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(260/261),0.9961685823754789




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




BorderlineSMOTE,MIT-BIH Normal Sinus Rhythm Data,(260/261),0.9961685823754789




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(261/261),1.0




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




SMOTETomek,MIT-BIH Normal Sinus Rhythm Data,(259/261),0.9923371647509579




Original,QT Data,(1556/1584),0.9823232323232324




Original,QT Data,(1563/1584),0.9867424242424242




Original,QT Data,(1562/1584),0.9861111111111112




Original,QT Data,(1572/1584),0.9924242424242424




Original,QT Data,(1573/1584),0.9930555555555556




SMOTE,QT Data,(1560/1584),0.9848484848484849




SMOTE,QT Data,(1561/1584),0.985479797979798




SMOTE,QT Data,(1566/1584),0.9886363636363636




SMOTE,QT Data,(1572/1584),0.9924242424242424




SMOTE,QT Data,(1565/1584),0.9880050505050505




SVMSMOTE,QT Data,(1562/1584),0.9861111111111112




SVMSMOTE,QT Data,(1576/1584),0.9949494949494949




SVMSMOTE,QT Data,(1560/1584),0.9848484848484849




SVMSMOTE,QT Data,(1568/1584),0.98989898989899




SVMSMOTE,QT Data,(1566/1584),0.9886363636363636




ROS,QT Data,(1571/1584),0.9917929292929293




ROS,QT Data,(1570/1584),0.9911616161616161




ROS,QT Data,(1571/1584),0.9917929292929293




ROS,QT Data,(1565/1584),0.9880050505050505




ROS,QT Data,(1568/1584),0.98989898989899




BorderlineSMOTE,QT Data,(1574/1584),0.9936868686868687




BorderlineSMOTE,QT Data,(1571/1584),0.9917929292929293




BorderlineSMOTE,QT Data,(1569/1584),0.990530303030303




BorderlineSMOTE,QT Data,(1562/1584),0.9861111111111112




BorderlineSMOTE,QT Data,(1568/1584),0.98989898989899




SMOTETomek,QT Data,(1571/1584),0.9917929292929293




SMOTETomek,QT Data,(1560/1584),0.9848484848484849




SMOTETomek,QT Data,(1565/1584),0.9880050505050505




SMOTETomek,QT Data,(1565/1584),0.9880050505050505




SMOTETomek,QT Data,(1566/1584),0.9886363636363636
finish
