In [1]:
import numpy as np
import pandas as pd
import csv
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn import tree
%run Functions.ipynb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

First the data is imported from the Interval infor csv file saved on the branch

In [2]:
DATA=pd.read_csv("IntervalInfo.csv")
DATA=DATA[DATA.columns[2:]]

DATA.dropna(inplace=True)
DATA.head()


Unnamed: 0,Abnormal,Result,R-R Interval,R Height,R_Onset-Rpeak,Q-Q Interval,Q Height,P_Onset-P_Offset,P Height,P-P Interval,P_Onset-Ppeak
0,0.0,N,0.0,1212.0,56.0,0.0,953.0,61.0,978.0,0.0,28.0
1,0.0,N,292.0,1201.0,50.0,294.0,947.0,60.0,976.0,295.0,28.0
2,0.0,N,284.0,1186.0,54.0,285.0,946.0,65.0,971.0,280.0,28.0
3,0.0,N,285.0,1188.0,59.0,284.0,948.0,66.0,973.0,283.0,29.0
4,0.0,N,284.0,1201.0,56.0,283.0,952.0,64.0,977.0,285.0,28.0


Then four decision tree classifiers are called which will all be trained on different subsets of the data to determine the optimal information to put into the model

In [3]:
model1 = tree.DecisionTreeClassifier()
model2 = tree.DecisionTreeClassifier()
model3 = tree.DecisionTreeClassifier()
model4 = tree.DecisionTreeClassifier()

the four subsets of data are extracted from the features data frame. the subsets include all of the data, data with an equal ratio of abnormal to normal information, a dataset consisting of only abnormal values, and a dataset where some of the features are omitted due to uncertainty in the reliability of the information involved due to issues in the preprocessing stage

In [4]:
inputs = DATA.drop(["Result","Abnormal"],axis='columns')
Target = DATA[["Abnormal"]]

NumAbnormal=0
for i in DATA["Abnormal"]:
    if (i==1):
        NumAbnormal+=1
print(NumAbnormal) #23% abnormal

rows_to_remove=DATA.index[DATA['Abnormal'] == 0].tolist()
inputs2 = inputs.drop(rows_to_remove[NumAbnormal:],axis='rows')
Target2 = Target.drop(rows_to_remove[NumAbnormal:],axis='rows')

inputs3 = inputs.drop(rows_to_remove,axis='rows')
Target3 = Target.drop(rows_to_remove,axis='rows')

inputs4 = DATA.drop(["Result","Abnormal","R Height","Q Height", "P Height"],axis='columns')
Target4 = DATA[["Abnormal"]]

print(len(inputs2))

print(Target)

17067
34134
       Abnormal
0           0.0
1           0.0
2           0.0
3           0.0
4           0.0
...         ...
71472       0.0
71473       0.0
71474       0.0
71475       0.0
71476       0.0

[71477 rows x 1 columns]


Training and testing data is then created for each of the datasets so that the four models can be trained accordingly. the set is randomized in the train_test_split function and this is important due to the distribution in the actual dataset

In [5]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(inputs, Target,test_size=0.3)
x_train2, x_test2, y_train2, y_test2 = train_test_split(inputs2, Target2,test_size=0.3)
x_train3, x_test3, y_train3, y_test3 = train_test_split(inputs3, Target3,test_size=0.3)
x_train4, x_test4, y_train4, y_test4 = train_test_split(inputs4, Target4,test_size=0.3)

Here we train and test the first model. this model is trained on the entire dataset then tested on all of them,
doing this shows a greater picture for the actual accuracy overall.

In [6]:
model1.fit(X=x_train1,y=y_train1)
print(model1.score(x_test1,y_test1),"Regular")
print(model1.score(x_test2,y_test2), "50/50",)
print(model1.score(x_test3,y_test3),"Abnormal")
print(roc_auc_score(y_test1, model1.predict(x_test1)),"AUC1")
print(roc_auc_score(y_test2, model1.predict(x_test2)),"AUC2")




0.9312628240999814 Regular
0.9713895127428962 50/50
0.9586018355789885 Abnormal
0.9072564883967363 AUC1
0.9713630126766972 AUC2


The second model is trained with a dataset consisting of an even distribution of normal and abnormal heart beats

In [7]:
#for inputs with even normal and abnormal

model2.fit(X=x_train2,y=y_train2)
print(model2.score(x_test1,y_test1),"Regular")
print(model2.score(x_test2,y_test2), "50/50",)
print(model2.score(x_test3,y_test3),"Abnormal")
print(roc_auc_score(y_test1, model2.predict(x_test1)),"AUC1")
print(roc_auc_score(y_test2, model2.predict(x_test2)),"AUC2")


0.6616769259466517 Regular
0.9474660677668196 50/50
0.9869166178480765 Abnormal
0.7720916603271063 AUC1
0.9474678681259804 AUC2


The third model is trained on a dataset consisting of only abnormal data, this was done mostly as a test to see what would happen by experimenting with the code. 

In [8]:
model3.fit(X=x_train3,y=y_train3)
print(model3.score(x_test1,y_test1),"Regular")
print(model3.score(x_test2,y_test2), "50/50",)
print(model3.score(x_test3,y_test3),"Abnormal")
print(roc_auc_score(y_test1, model3.predict(x_test1)),"AUC1")
print(roc_auc_score(y_test2, model3.predict(x_test2)),"AUC2")
#print(roc_auc_score(y_test3, model1.predict(x_test3)),"AUC3")

0.2389945905614624 Regular
0.49917000292940145 50/50
1.0 Abnormal
0.5 AUC1
0.5 AUC2


The final model is trained on the entire dataset with the omission of peak height features. this is due to having less confidence in these values. accuracy regarding these features would likely improve with normalizing on the waves.

In [9]:
model4.fit(X=x_train4,y=y_train4)
print(model4.score(x_test4,y_test4),"Regular")
print(roc_auc_score(y_test4, model4.predict(x_test4)),"AUC1")

0.8816452154448797 Regular
0.8392776790770925 AUC1


It can be seen that model1, taking all of the provided data has the highest accuracy when tested against all of the subsets of data. This was the expected conclusion but it is still useful to show proof of this fact

In [10]:
CLASSNAMES=["0","1"]
fig = plt.figure(figsize=(100,100))
#_ = tree.plot_tree(model,feature_names=inputs.columns,class_names=CLASSNAMES)
_ = tree.plot_tree(model,feature_names=inputs.columns)

NameError: name 'model' is not defined

<Figure size 7200x7200 with 0 Axes>