In [1]:
import numpy as np
import pandas as pd
import sklearn as s
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# LABELS: HEALTHY, SYSTOLIC, DIASTOLIC, & BOTH

In [2]:
# Classification with Old Features

In [3]:
# Loading the Data
heart = pd.read_csv("heartsound_features_og.csv")

In [4]:
# heart.info()

In [5]:
# heart.describe()

In [6]:
sample_incomplete_rows = heart[heart.isnull().any(axis=1)].head()
sample_incomplete_rows

Unnamed: 0,fileName,Label,S1@(ms),S2@(ms),systole,diastole,S1(ABV),syst_mur(0/1),smur_begin,syst_dur,syst_amp,syst_freq,S2(ABV),diast_mur(0/1),dmur_begin,diast_dur,diast_amp,diast_freq


In [7]:
heart_labels = heart["Label"].copy()
heart = heart.drop("Label", axis=1)
heart = heart.drop("fileName", axis=1)

In [0]:
# Testing for Overfitting/Underfitting

In [9]:
# Cross-Validation with an 80-20 split
num = 100
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.1)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

print(knn_acc / num * 100)
print(svm_acc / num * 100)

90.33333333333329
91.40476190476187


In [10]:
# Cross-Validation with an 90-10 split
num = 100
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.2)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

print(knn_acc / num * 100)
print(svm_acc / num * 100)

90.44047619047619
91.53571428571432


In [0]:
# Result for Old Features: we are not overfitting or underfitting

In [11]:
# Classification with New, Added Features

In [0]:
# ONLY 4 LABELS

In [54]:
# Loading the Data
heart = pd.read_csv("heartsound_features.csv")

In [55]:
# heart.info()

In [56]:
# heart.describe()

In [57]:
sample_incomplete_rows = heart[heart.isnull().any(axis=1)].head()
sample_incomplete_rows

Unnamed: 0,fileName,Label,S1@(ms),S2@(ms),systole,diastole,S1(ABV),syst_mur(0/1),smur_begin,syst_dur,...,Npeaks systole,Npeaks diastole,var_pks_systole,var_pks_diastole,L_pks_syst,M_pks_syst,H_pks_syst,L_pks_diast,M_pks_diast,H_pks_diast


In [58]:
heart_labels = heart["Label"].copy()
heart = heart.drop("Label", axis=1)
heart = heart.drop("fileName", axis=1)

In [59]:
# Testing for Overfitting/Underfitting and Confusion Matrix Analysis

In [60]:
# Cross-Validation with an 80-20 split

In [86]:
num = 100
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.2)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

train_y_pred_knn = cross_val_predict(knn, train_X, train_y, cv=100)
train_y_pred_svm = cross_val_predict(clf, train_X, train_y, cv=100)
print(knn_acc / num * 100)
print(svm_acc / num * 100)





93.28571428571422
97.39285714285712


In [87]:
# Confusion Matrix
    # Row – actual class
    # Column – predicted class

In [88]:
# Confusion Matrix for KNN
confusion_matrix(train_y, train_y_pred_knn)

array([[ 76,   6,   0,   1],
       [  5, 209,   1,   1],
       [  4,   3,   7,   0],
       [  2,   0,   0,  18]])

In [89]:
# Confusion Matrix for SVM
confusion_matrix(train_y, train_y_pred_svm)

array([[ 83,   0,   0,   0],
       [  0, 213,   2,   1],
       [  0,   2,  12,   0],
       [  0,   3,   1,  16]])

In [90]:
# Precision and Recall for KNN
print(classification_report(train_y, train_y_pred_knn, digits=3))

              precision    recall  f1-score   support

           0      0.874     0.916     0.894        83
           1      0.959     0.968     0.963       216
           2      0.875     0.500     0.636        14
           3      0.900     0.900     0.900        20

    accuracy                          0.931       333
   macro avg      0.902     0.821     0.848       333
weighted avg      0.930     0.931     0.928       333



In [91]:
# Precision and Recall for SVM
print(classification_report(train_y, train_y_pred_svm, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        83
           1      0.977     0.986     0.982       216
           2      0.800     0.857     0.828        14
           3      0.941     0.800     0.865        20

    accuracy                          0.973       333
   macro avg      0.930     0.911     0.919       333
weighted avg      0.973     0.973     0.973       333



In [78]:
# Cross-Validation with an 90-10 split

In [81]:
num = 100
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.2)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)


train_y_pred_knn = cross_val_predict(knn, train_X, train_y, cv=100)
train_y_pred_svm = cross_val_predict(clf, train_X, train_y, cv=100)
print(knn_acc / num * 100)
print(svm_acc / num * 100)





93.02380952380948
96.96428571428572


In [82]:
# Confusion Matrix for KNN
confusion_matrix(train_y, train_y_pred_knn)

array([[ 66,   6,   0,   2],
       [  0, 222,   1,   2],
       [  3,   1,   5,   1],
       [  3,   3,   0,  18]])

In [83]:
# Confusion Matrix for SVM
confusion_matrix(train_y, train_y_pred_svm)

array([[ 74,   0,   0,   0],
       [  0, 220,   1,   4],
       [  0,   1,   9,   0],
       [  0,   3,   1,  20]])

In [84]:
# Precision and Recall for KNN
print(classification_report(train_y, train_y_pred_knn, digits=3))

              precision    recall  f1-score   support

           0      0.917     0.892     0.904        74
           1      0.957     0.987     0.972       225
           2      0.833     0.500     0.625        10
           3      0.783     0.750     0.766        24

    accuracy                          0.934       333
   macro avg      0.872     0.782     0.817       333
weighted avg      0.932     0.934     0.931       333



In [85]:
# Precision and Recall for SVM
print(classification_report(train_y, train_y_pred_svm, digits=3))

              precision    recall  f1-score   support

           0      1.000     1.000     1.000        74
           1      0.982     0.978     0.980       225
           2      0.818     0.900     0.857        10
           3      0.833     0.833     0.833        24

    accuracy                          0.970       333
   macro avg      0.908     0.928     0.918       333
weighted avg      0.970     0.970     0.970       333



In [0]:
# Result with New, Added Features: we are not overfitting or underfitting

In [0]:
# MORE LABELS: HEALTHY (0), EARLY SYSTOLIC (1), MID SYSTOLIC (2) LATE SYSTOLIC (3), HOLOSYSTOLIC (4), DIASTOLIC (5), BOTH (6)

In [11]:
# Classification with New, Added Features

In [2]:
# Loading the Data
heart = pd.read_csv("heartsound_features_final.csv")

In [3]:
# heart.info()

In [4]:
# heart.describe()

In [5]:
sample_incomplete_rows = heart[heart.isnull().any(axis=1)].head()
sample_incomplete_rows

Unnamed: 0,fileName,Label,S1@(ms),S2@(ms),systole,diastole,S1(ABV),syst_mur(0/1),smur_begin,syst_dur,...,Npeaks systole,Npeaks diastole,var_pks_systole,var_pks_diastole,L_pks_syst,M_pks_syst,H_pks_syst,L_pks_diast,M_pks_diast,H_pks_diast


In [6]:
heart_labels = heart["Label"].copy()
heart = heart.drop("Label", axis=1)
heart = heart.drop("fileName", axis=1)

In [7]:
# Testing for Overfitting/Underfitting and Confusion Matrix Analysis

In [8]:
# Cross-Validation with an 80-20 split

In [11]:
num = 10
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.2)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

train_y_pred_knn = cross_val_predict(knn, train_X, train_y, cv=10)
train_y_pred_svm = cross_val_predict(clf, train_X, train_y, cv=10)
print(knn_acc / num * 100)
print(svm_acc / num * 100)

90.65934065934067
93.4065934065934


In [10]:
# Confusion Matrix
    # Row – actual class
    # Column – predicted class

In [13]:
# Confusion Matrix for KNN
confusion_matrix(train_y, train_y_pred_knn)

array([[74,  0,  2,  2,  0,  0,  1],
       [ 0, 19,  0,  0,  2,  0,  0],
       [ 3,  0, 77,  0,  4,  0,  2],
       [ 0,  2,  0, 44,  0,  0,  0],
       [ 0,  6,  4,  0, 45,  0,  2],
       [ 6,  1,  1,  0,  0, 39,  0],
       [ 2,  0,  1,  1,  2,  0, 20]])

In [14]:
# Confusion Matrix for SVM
confusion_matrix(train_y, train_y_pred_svm)

array([[79,  0,  0,  0,  0,  0,  0],
       [ 0, 21,  0,  0,  0,  0,  0],
       [ 2,  0, 80,  1,  2,  0,  1],
       [ 1,  1,  1, 43,  0,  0,  0],
       [ 0,  4,  3,  1, 46,  2,  1],
       [ 0,  0,  0,  0,  3, 44,  0],
       [ 1,  0,  0,  0,  2,  1, 22]])

In [18]:
# Precision and Recall for KNN
print(classification_report(train_y, train_y_pred_knn, digits=3))

              precision    recall  f1-score   support

           0      0.883     0.907     0.895        75
           1      0.857     0.900     0.878        20
           2      0.877     0.887     0.882        80
           3      0.860     0.980     0.916        50
           4      0.878     0.796     0.835        54
           5      1.000     0.911     0.953        56
           6      0.885     0.852     0.868        27

    accuracy                          0.892       362
   macro avg      0.891     0.890     0.890       362
weighted avg      0.894     0.892     0.892       362



In [19]:
# Precision and Recall for SVM
print(classification_report(train_y, train_y_pred_svm, digits=3))

              precision    recall  f1-score   support

           0      0.987     1.000     0.993        75
           1      0.833     1.000     0.909        20
           2      0.949     0.925     0.937        80
           3      0.907     0.980     0.942        50
           4      0.878     0.796     0.835        54
           5      0.982     0.964     0.973        56
           6      0.923     0.889     0.906        27

    accuracy                          0.936       362
   macro avg      0.923     0.936     0.928       362
weighted avg      0.937     0.936     0.936       362



In [20]:
# Cross-Validation with an 90-10 split

In [23]:
num = 10
knn_acc = 0
svm_acc = 0
for i in range(0, num):

    train_X, test_X, train_y, test_y = train_test_split(heart, heart_labels, test_size=.2)

    # K-Nearest Neighbors to classify test set
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_X, train_y)
    predictions = knn.predict(test_X)
    knn_acc = knn_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)

    # Support Vector Classifier to classify test set
    clf = svm.SVC(kernel='linear')
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X)
    svm_acc = svm_acc + accuracy_score(test_y, predictions, normalize=True, sample_weight=None)


train_y_pred_knn = cross_val_predict(knn, train_X, train_y, cv=10)
train_y_pred_svm = cross_val_predict(clf, train_X, train_y, cv=10)
print(knn_acc / num * 100)
print(svm_acc / num * 100)

89.23076923076924
92.96703296703296


In [24]:
# Confusion Matrix for KNN
confusion_matrix(train_y, train_y_pred_knn)

array([[67,  0,  1,  1,  0,  0,  2],
       [ 0, 19,  0,  0,  2,  0,  0],
       [ 3,  0, 73,  0,  5,  0,  2],
       [ 0,  0,  0, 51,  1,  0,  0],
       [ 1,  0,  3,  0, 55,  1,  0],
       [ 4,  2,  1,  0,  0, 44,  0],
       [ 3,  0,  1,  1,  1,  0, 18]])

In [25]:
# Confusion Matrix for SVM
confusion_matrix(train_y, train_y_pred_svm)

array([[69,  0,  0,  2,  0,  0,  0],
       [ 0, 21,  0,  0,  0,  0,  0],
       [ 0,  0, 78,  1,  4,  0,  0],
       [ 0,  1,  1, 50,  0,  0,  0],
       [ 0,  2,  5,  0, 50,  2,  1],
       [ 0,  0,  1,  0,  4, 46,  0],
       [ 0,  0,  0,  0,  1,  0, 23]])

In [26]:
# Precision and Recall for KNN
print(classification_report(train_y, train_y_pred_knn, digits=3))

              precision    recall  f1-score   support

           0      0.859     0.944     0.899        71
           1      0.905     0.905     0.905        21
           2      0.924     0.880     0.901        83
           3      0.962     0.981     0.971        52
           4      0.859     0.917     0.887        60
           5      0.978     0.863     0.917        51
           6      0.818     0.750     0.783        24

    accuracy                          0.903       362
   macro avg      0.901     0.891     0.895       362
weighted avg      0.905     0.903     0.903       362



In [27]:
# Precision and Recall for SVM
print(classification_report(train_y, train_y_pred_svm, digits=3))

              precision    recall  f1-score   support

           0      1.000     0.972     0.986        71
           1      0.875     1.000     0.933        21
           2      0.918     0.940     0.929        83
           3      0.943     0.962     0.952        52
           4      0.847     0.833     0.840        60
           5      0.958     0.902     0.929        51
           6      0.958     0.958     0.958        24

    accuracy                          0.931       362
   macro avg      0.929     0.938     0.933       362
weighted avg      0.932     0.931     0.931       362



In [0]:
# Result with New, Added Features: accuracy drops by about 1% for both KNN and SVM with change to 90–10 spit. Is this relevant?