In [59]:
import pandas as pd
import neurokit2 as nk
import csv
import numpy as np
#this is probably a bad idea to suppress all warnings but it gets rid of the neurokit warnings
import warnings
warnings.filterwarnings('ignore')
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split

In [19]:
def df_from_txt(txt_path):
        '''
        Args:
            txt_path (str): path to the annotations.txt file
        Returns:
            Pandas df: dataframe with the first three columns of the text file ("Time", "Sample#", "Result")
        '''
        data = []
        with open(txt_path,'r') as data_file:
            for line in data_file:
                data.append(line.split()[:3])
        df = pd.DataFrame(data[1:], columns = ['Time', 'Sample#', 'Result'])
        #convert the sample to integers
        for i in range(len(df["Sample#"])):
            df["Sample#"][i] = int(df["Sample#"][i])
        return df
    
def NeurokitExtraction(FileNumber, signal = "\'MLII\'"):
    """
    Added signal preference:
    """
    try:
        
        print("extracting File",FileNumber)

        #load data and annotations into dataframes
        File = pd.read_csv("mitbih_database/"+str(FileNumber)+".csv")
        annotations = df_from_txt("mitbih_database/"+str(FileNumber)+"annotations.txt")

        #select the signal from the file
        if signal in File.columns:
            #preferred signal
            ecg_signal = File[signal]

        elif "\'V5\'" in File.columns:
            #otherwise default to V5 signal
            ecg_signal = File["\'V5\'"]

        else:
            #otherwise default to whatever signal is there
            ecg_signal = File[File.columns[1]].to_numpy()


        rpeak = []
        for i in range(len(annotations['Sample#'])):
            x = int(annotations['Sample#'][i])
            rpeak.append(x)
        rpeak = np.array(rpeak)

        ecg_signal = File[File.columns[1]].to_numpy()
        cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=360, method='neurokit')#used

        events = rpeak[1:]
        epochs = nk.epochs_create(ecg_signal, events, sampling_rate=360, epochs_start=-0.4, epochs_end=0.4)


        features = nk.ecg_analyze(epochs, sampling_rate=360, method='auto', subepoch_rate=[None, None])
        features.columns = ['beat_number', 'ECG_R_Peaks']



        dataframe, QRS = nk.ecg_delineate(cleaned_ecg, rpeaks=rpeak[1:], sampling_rate=360, method='dwt', show=False, show_type='all', check=False)

        features['ECG_Q_Peaks'] = QRS['ECG_Q_Peaks']
        features['ECG_S_Peaks'] = QRS['ECG_S_Peaks']
        features['ECG_P_Peaks'] = QRS['ECG_P_Peaks']
        features['ECG_P_Onsets'] = QRS['ECG_P_Onsets']
        features['ECG_P_Offsets'] = QRS['ECG_P_Offsets']
        features['ECG_T_Peaks'] = QRS['ECG_T_Peaks']
        features['ECG_T_Onsets'] = QRS['ECG_T_Onsets']
        features['ECG_T_Offsets'] = QRS['ECG_T_Offsets']
        features['ECG_R_Onsets'] = QRS['ECG_R_Onsets']
        features['ECG_R_Offsets'] = QRS['ECG_R_Offsets']

        print("{} Extracted".format(FileNumber))
    except:
        print("Not able to process file:", FileNumber)
        
    return features

In [16]:
files = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 114, 115, 116, 118,
         119, 121, 122, 123, 124, 200, 201, 202, 203, 205, 207, 208, 209, 210, 212, 213, 214,
         215, 217, 219, 220, 221, 222, 223, 228, 230, 231, 232, 233, 234]



In [20]:
testFeatures=pd.DataFrame()

for file in files:
    
    DATA = pd.DataFrame(np.nan, index=range(1), columns = testFeatures.columns)
    testFeatures = pd.concat([testFeatures,DATA])
    testFeatures = pd.concat([testFeatures,NeurokitExtraction(file)])

extracting File 100
100 Extracted
extracting File 101
101 Extracted
extracting File 102
102 Extracted
extracting File 103
103 Extracted
extracting File 104
104 Extracted
extracting File 105
105 Extracted
extracting File 106
106 Extracted
extracting File 107
107 Extracted
extracting File 108
108 Extracted
extracting File 109
109 Extracted
extracting File 111
111 Extracted
extracting File 112
112 Extracted
extracting File 114
114 Extracted
extracting File 115
115 Extracted
extracting File 116
116 Extracted
extracting File 118
118 Extracted
extracting File 119
119 Extracted
extracting File 121
121 Extracted
extracting File 122
122 Extracted
extracting File 123
123 Extracted
extracting File 124
124 Extracted
extracting File 200
200 Extracted
extracting File 201
201 Extracted
extracting File 202
202 Extracted
extracting File 203
203 Extracted
extracting File 205
205 Extracted
extracting File 207
207 Extracted
extracting File 208
208 Extracted
extracting File 209
209 Extracted
extracting Fil

In [21]:
testFeatures.to_csv("testFeatures.csv")

In [23]:
Training = pd.DataFrame(columns = ['File','Result', 'Sample#'] )

for i in files:
    temp= pd.DataFrame(columns = Training.columns)

    FileName ="mitbih_database/" + str(i)+'annotations.txt'
    annotations = df_from_txt(FileName)
    temp['Sample#']=annotations['Sample#']
    temp['Result']=annotations['Result']
    temp['File']= i
    
    Training = pd.concat([Training, temp], axis=0)

In [24]:
#NewTraining=pd.concat([NewTraining,NewFeatures],axis=1)
Training = Training.reset_index(drop=True)
testFeatures=testFeatures.reset_index(drop=True)
Training=pd.concat([Training,testFeatures],axis=1)
Training = Training.reset_index()
Training.head()

Unnamed: 0,index,File,Result,Sample#,beat_number,ECG_R_Peaks,ECG_Q_Peaks,ECG_S_Peaks,ECG_P_Peaks,ECG_P_Onsets,ECG_P_Offsets,ECG_T_Peaks,ECG_T_Onsets,ECG_T_Offsets,ECG_R_Onsets,ECG_R_Offsets
0,0,100,+,18,,,,,,,,,,,,
1,1,100,N,77,1.0,77.0,66.0,88.0,,,,144.0,,147.0,,91.0
2,2,100,N,370,2.0,370.0,359.0,472.0,309.0,294.0,338.0,432.0,418.0,468.0,350.0,387.0
3,3,100,N,662,3.0,662.0,652.0,757.0,603.0,586.0,610.0,783.0,761.0,791.0,644.0,676.0
4,4,100,N,946,4.0,946.0,936.0,958.0,884.0,863.0,890.0,1064.0,1045.0,1070.0,925.0,961.0


In [25]:
Training = Training.drop(["ECG_T_Peaks","ECG_T_Onsets","ECG_T_Offsets","ECG_R_Offsets"],axis='columns')
print(len(Training))
Training.dropna(inplace=True)
print(len(Training))

109312
91465


In [26]:
Interval = pd.DataFrame(columns=["File","Abnormal","Result","R-R Interval","R Height","R_Onset-Rpeak","Q-Q Interval","Q Height","P_Onset-P_Offset","P Height","P-P Interval","P_Onset-Ppeak"])
RRLIST=np.array([])
QQLIST=np.array([])
R_Height=np.array([])
Q_Height=np.array([]) 
P_Height=np.array([])
Ron_RP=np.array([])
Pon_PP=np.array([])
Pon_Poff=np.array([])
PPLIST=np.array([])
Abnormal=np.array([])
Result=np.array([])
FILES=np.array([])
y=2
FileNumber=Training["File"][y]
RAWFile= pd.read_csv("mitbih_database/"+str(FileNumber)+".csv")


for i in Training["index"][:-1]:
    if (i-y>1):
        y=i
        continue
    if Training["Result"][i]=='N':
        Abnormal=np.append(Abnormal,0)
    else:
        Abnormal=np.append(Abnormal,1)
    if (Training["File"][i] != FileNumber):
        print("Reading File",FileNumber)
        FileNumber=Training["File"][i]
        RAWFile= pd.read_csv("mitbih_database/"+str(FileNumber)+".csv")
        
    RRLIST=np.append(RRLIST,Training["ECG_R_Peaks"][i]-Training["ECG_R_Peaks"][y])
    QQLIST=np.append(QQLIST,Training["ECG_Q_Peaks"][i]-Training["ECG_Q_Peaks"][y])
    PPLIST=np.append(PPLIST,Training["ECG_P_Peaks"][i]-Training["ECG_P_Peaks"][y])
    R_Height=np.append(R_Height,RAWFile[RAWFile.columns[1]][Training["ECG_R_Peaks"][i]])#
    Q_Height=np.append(Q_Height,RAWFile[RAWFile.columns[1]][Training["ECG_Q_Peaks"][i]])#
    P_Height=np.append(P_Height,RAWFile[RAWFile.columns[1]][Training["ECG_P_Peaks"][i]])#
    Ron_RP=np.append(Ron_RP,Training["ECG_R_Peaks"][i]-Training["ECG_R_Onsets"][i])
    Pon_PP=np.append(Pon_PP,Training["ECG_P_Peaks"][i]-Training["ECG_P_Onsets"][i])
    Pon_Poff=np.append(Pon_Poff,Training["ECG_P_Offsets"][i]-Training["ECG_P_Onsets"][i])
    Result=np.append(Result,Training["Result"][i])
    FILES=np.append(FILES,FileNumber)
    
    y=i

Interval["R-R Interval"]=RRLIST
Interval["Q-Q Interval"]=QQLIST
Interval["P-P Interval"]=PPLIST
Interval["R_Onset-Rpeak"]=Ron_RP
Interval["P_Onset-Ppeak"]=Pon_PP
Interval["P_Onset-P_Offset"]=Pon_Poff
Interval["R Height"]=R_Height#
Interval["Q Height"]=Q_Height#
Interval["P Height"]=P_Height#
Interval["File"]=FILES
Interval["Abnormal"]=Abnormal
Interval["Result"]=Result

print("Length of DataFrame",len(Interval))
Interval.head(10)

Reading File 100
Reading File 101
Reading File 102
Reading File 103
Reading File 104
Reading File 105
Reading File 106
Reading File 107
Reading File 108
Reading File 109
Reading File 111
Reading File 112
Reading File 114
Reading File 115
Reading File 116
Reading File 118
Reading File 119
Reading File 121
Reading File 122
Reading File 123
Reading File 124
Reading File 200
Reading File 201
Reading File 202
Reading File 203
Reading File 205
Reading File 207
Reading File 208
Reading File 209
Reading File 210
Reading File 212
Reading File 213
Reading File 214
Reading File 215
Reading File 217
Reading File 219
Reading File 220
Reading File 221
Reading File 222
Reading File 223
Reading File 228
Reading File 230
Reading File 231
Reading File 232
Reading File 233
Length of DataFrame 82529


Unnamed: 0,File,Abnormal,Result,R-R Interval,R Height,R_Onset-Rpeak,Q-Q Interval,Q Height,P_Onset-P_Offset,P Height,P-P Interval,P_Onset-Ppeak
0,100.0,0.0,N,0.0,1212.0,20.0,0.0,922.0,44.0,983.0,0.0,15.0
1,100.0,0.0,N,292.0,1201.0,18.0,293.0,920.0,24.0,981.0,294.0,17.0
2,100.0,0.0,N,284.0,1186.0,21.0,284.0,895.0,27.0,975.0,281.0,21.0
3,100.0,0.0,N,285.0,1188.0,18.0,285.0,921.0,61.0,973.0,284.0,27.0
4,100.0,0.0,N,284.0,1201.0,20.0,284.0,915.0,18.0,974.0,294.0,4.0
5,100.0,0.0,N,294.0,1213.0,19.0,294.0,920.0,27.0,984.0,284.0,20.0
6,100.0,1.0,A,235.0,1193.0,19.0,235.0,914.0,42.0,978.0,238.0,23.0
7,100.0,0.0,N,358.0,1193.0,20.0,357.0,920.0,30.0,972.0,354.0,19.0
8,100.0,0.0,N,304.0,1202.0,20.0,304.0,918.0,25.0,975.0,304.0,18.0
9,100.0,0.0,N,292.0,1209.0,19.0,293.0,919.0,23.0,981.0,295.0,17.0


In [64]:
X_train, X_test, y_train, y_test = train_test_split(Interval.drop(['Result', 'Abnormal', 'File'], axis = 'columns'), Interval['Abnormal'], test_size = 0.3) 

In [65]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [66]:
model.score(X_test, y_test)

0.9436164627004322

In [67]:
#trying a classic decision tree:
model1 = tree.DecisionTreeClassifier()

model1.fit(X_train, y_train)
model1.score(X_test, y_test)

0.9050850195888364