In [1]:
import warnings

import numpy as np
import pandas as pd

# Installed before the scikit-learn imports, as in the original cell ordering.
warnings.filterwarnings('ignore')

from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the dataset CSV into a DataFrame.
data = pd.read_csv('murmor_dataset.csv')

# Class balance: number of non-null Patient_ID entries for each MURMUR label.
data.groupby('MURMUR')['Patient_ID'].count()

MURMUR
Absent     457
Present    203
Name: Patient_ID, dtype: int64

In [3]:
# Binary target: 'Present' -> 1, 'Absent' -> 0.
# An explicit map + integer cast is used instead of Series.replace: turning strings
# into ints through replace() relies on pandas' implicit object->int downcasting,
# which is deprecated (pandas >= 2.2 raises a FutureWarning, hidden in this notebook
# by the global warnings filter) and would eventually leave `y` as object dtype.
y = data.MURMUR.map({'Present': 1, 'Absent': 0})
if y.isna().any():
    # map() turns any label other than the two above (or a missing value) into NaN;
    # fail here with a clear message rather than deep inside an estimator's fit().
    raise ValueError("MURMUR contains values other than 'Present'/'Absent'")
y = y.astype('int64')

# Remove the identifier, the target itself and the AV/MV/PV/TV columns so that only
# the remaining columns are used as model inputs.
# NOTE(review): the frame displayed further down appears to start with an 'Unnamed: 0'
# column; if that is a row index saved into the CSV (rather than a display artifact),
# it should be dropped here as well - confirm.
data = data.drop(columns=['Patient_ID', 'AV', 'MV', 'PV', 'TV','MURMUR'])

In [4]:
# Replace missing values and +/- infinity with 0 so every feature is finite.
data = data.fillna(0).replace([np.inf, -np.inf], 0)
data

Unnamed: 0,mean_ae_AV,mean_ae_MV,mean_ae_PV,mean_ae_TV,median_ae_AV,median_ae_MV,median_ae_PV,median_ae_TV,std_ae_AV,std_ae_MV,...,TV_mfcc_4,TV_mfcc_5,TV_mfcc_6,TV_mfcc_7,TV_mfcc_8,TV_mfcc_9,TV_mfcc_10,TV_mfcc_11,TV_mfcc_12,TV_mfcc_13
0,0.093476,0.083762,0.164984,0.107563,0.047040,0.033079,0.087165,0.042905,0.102132,0.131644,...,-33.460999,-6.598893,34.000908,21.154072,-7.830566,-1.645628,21.914124,20.209282,-0.958162,-6.293470
1,0.208012,0.207428,0.220741,0.521196,0.186779,0.200002,0.179823,0.517704,0.142052,0.147645,...,17.596321,19.525085,37.584980,30.011152,8.105770,1.824675,11.158860,14.041341,4.578974,-2.173921
2,0.091702,0.099159,0.121979,0.129162,0.059809,0.067497,0.076158,0.042184,0.085225,0.115260,...,-29.298609,-5.337496,30.698105,20.061325,-6.931499,-4.781078,14.501362,14.630821,-2.321874,-7.382983
3,0.102736,0.137654,0.125572,0.118599,0.070183,0.050025,0.089777,0.034491,0.152237,0.243194,...,-16.032585,-7.885186,16.791134,13.384658,-3.903045,-4.349921,8.308956,10.734879,0.940423,-3.516605
4,0.125086,0.189740,0.169284,0.204370,0.089281,0.112374,0.132720,0.140140,0.105365,0.181673,...,-42.962135,-18.818758,24.844090,18.756638,-8.074683,-5.306619,16.791115,18.958763,1.252272,-5.608494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,0.444231,0.149096,0.121362,0.216245,0.452134,0.100187,0.120617,0.223760,0.309193,0.138438,...,-11.825885,-5.011165,26.163977,20.614601,-3.552867,-5.199866,10.845299,13.142046,-0.844687,-6.795125
656,0.098496,0.047454,0.078426,0.064200,0.073418,0.042560,0.084883,0.065354,0.120828,0.028967,...,-56.324604,-8.860107,44.910240,22.425835,-17.387327,-7.809362,21.201967,15.155473,-11.155810,-12.096591
657,0.110223,0.145489,0.204346,0.155980,0.084383,0.116087,0.147397,0.081949,0.153631,0.133531,...,-16.779829,-1.881451,30.266893,24.487698,-0.503826,-4.205161,10.446774,13.586130,1.071647,-4.660987
658,0.115837,0.133750,0.278490,0.355638,0.132048,0.140689,0.240603,0.347520,0.076844,0.121044,...,-13.073344,-8.276624,20.480719,16.134476,-4.487090,-3.632254,12.708085,14.148383,-0.949542,-7.966821


# Transformation - Data splitting and feature scaling

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted scaler
# to both splits.
scaler = StandardScaler()
s = scaler.fit(X_train)  # fit() returns the scaler itself, so `s` refers to the same object as `scaler`
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [6]:
def print_metrics(y_test,y_pred):
    """Print precision, recall, F1 and accuracy for a set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Each score is obtained by calling the matching scikit-learn metric as
    ``metric(y_test, y_pred)`` and is printed on its own line. Returns None.
    """
    scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('f1_score', f1_score),
        ('Accuracy', accuracy_score),
    )
    for label, scorer in scorers:
        print(f'{label}: {scorer(y_test,y_pred)}')

# Logistic Regression Test

In [7]:
# Logistic regression with a raised iteration cap (max_iter=1000) and a fixed seed.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.6
Recall: 0.5581395348837209
f1_score: 0.5783132530120482
Accuracy: 0.7348484848484849


# SVM test

In [8]:
# Support-vector classifier with scikit-learn's default hyperparameters.
clf1 = svm.SVC().fit(X_train, y_train)

# Score the held-out split.
y_pred = clf1.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.7777777777777778
Recall: 0.4883720930232558
f1_score: 0.6000000000000001
Accuracy: 0.7878787878787878


# Naive Bayes

In [9]:
# Gaussian naive Bayes with default settings.
clf2 = GaussianNB().fit(X_train, y_train)

# Score the held-out split.
y_pred = clf2.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.3474576271186441
Recall: 0.9534883720930233
f1_score: 0.5093167701863354
Accuracy: 0.4015151515151515


# KNN

In [10]:
# k-nearest-neighbours classifier using the 3 closest training points.
clf3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Score the held-out split.
y_pred = clf3.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.5714285714285714
Recall: 0.46511627906976744
f1_score: 0.5128205128205128
Accuracy: 0.7121212121212122


# Decision Tree

In [11]:
# Decision tree with a fixed seed. DecisionTreeClassifier randomly permutes the
# candidate features at each split, so without random_state the fitted tree - and
# the metrics printed below - can differ between runs. The seed value 0 matches
# the one used for the other seeded estimators in this notebook.
clf4 = tree.DecisionTreeClassifier(random_state=0)
clf4.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf4.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.5208333333333334
Recall: 0.5813953488372093
f1_score: 0.5494505494505495
Accuracy: 0.6893939393939394


# LDA

In [12]:
# Linear discriminant analysis with default settings.
clf5 = LinearDiscriminantAnalysis().fit(X_train, y_train)

# Score the held-out split.
y_pred = clf5.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.6046511627906976
Recall: 0.6046511627906976
f1_score: 0.6046511627906976
Accuracy: 0.7424242424242424


# QDA

In [13]:
# Quadratic discriminant analysis with default settings.
clf6 = QuadraticDiscriminantAnalysis().fit(X_train, y_train)

# Score the held-out split.
y_pred = clf6.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.5238095238095238
Recall: 0.5116279069767442
f1_score: 0.5176470588235295
Accuracy: 0.6893939393939394


# AdaBoost

In [14]:
# AdaBoost ensemble of 100 estimators with a fixed seed.
# AdaBoostClassifier is imported in the notebook's top import cell with the other
# scikit-learn estimators, so this cell no longer carries its own import.
clf7 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf7.fit(X_train, y_train)

# Score the held-out split.
y_pred = clf7.predict(X_test)
print_metrics(y_test, y_pred)

Precision: 0.5526315789473685
Recall: 0.4883720930232558
f1_score: 0.5185185185185185
Accuracy: 0.7045454545454546
