In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

1) Load the data (final features).
2) Move the augmented samples into a separate dataframe.
3) Extract the label column into its own pandas Series.
4) Drop the 'raw' audio columns.
5) There are very few NaN and +/- inf values; replace them with 0.

In [2]:
# Load the feature table and report how many rows carry each MURMUR label.
data = pd.read_csv('murmor_dataset.csv')
print("Classes' distribution:")
print(data.groupby('MURMUR')['Patient_ID'].count())

# Every row whose Patient_ID occurs more than once is moved to `duplicates`
# (sorted by patient); `data` keeps only the rows with a unique Patient_ID.
is_repeated_patient = data.duplicated(['Patient_ID'], keep=False)
duplicates = data[is_repeated_patient].sort_values(by=['Patient_ID'])
data = data[~is_repeated_patient]
print(f'dataframe without duplicate samples shape: {data.shape}')
print(f'augmented positive samples dataframe shape: {duplicates.shape}')

# Binary target: 'Present' -> 1, 'Absent' -> 0.
y = data.MURMUR.replace({'Present':1,'Absent':0})

# Keep only the feature columns, then zero out NaN and +/- inf entries.
data = (
    data
    .drop(columns=['Patient_ID', 'AV', 'MV', 'PV', 'TV','MURMUR'])
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

Classes' distribution:
MURMUR
Absent     457
Present    203
Name: Patient_ID, dtype: int64
dataframe without duplicate samples shape: (488, 134)
augmented positive samples dataframe shape: (172, 134)


In [3]:
# Preview the first three rows of the cleaned feature frame.
data.head(3)

Unnamed: 0,mean_ae_AV,mean_ae_MV,mean_ae_PV,mean_ae_TV,median_ae_AV,median_ae_MV,median_ae_PV,median_ae_TV,std_ae_AV,std_ae_MV,...,TV_mfcc_4,TV_mfcc_5,TV_mfcc_6,TV_mfcc_7,TV_mfcc_8,TV_mfcc_9,TV_mfcc_10,TV_mfcc_11,TV_mfcc_12,TV_mfcc_13
0,0.093476,0.083762,0.164984,0.107563,0.04704,0.033079,0.087165,0.042905,0.102132,0.131644,...,-33.460999,-6.598893,34.000908,21.154072,-7.830566,-1.645628,21.914124,20.209282,-0.958162,-6.29347
2,0.091702,0.099159,0.121979,0.129162,0.059809,0.067497,0.076158,0.042184,0.085225,0.11526,...,-29.298609,-5.337496,30.698105,20.061325,-6.931499,-4.781078,14.501362,14.630821,-2.321874,-7.382983
4,0.125086,0.18974,0.169284,0.20437,0.089281,0.112374,0.13272,0.14014,0.105365,0.181673,...,-42.962135,-18.818758,24.84409,18.756638,-8.074683,-5.306619,16.791115,18.958763,1.252272,-5.608494


# Transformation - Data splitting

Split the non-augmented data into three datasets: training, validation and test sets.
We will use the training set in order to fit our classifiers.
We will use the validation set for hyperparameter tuning.
The test set will be used for the final evaluation of our hypothesis.

In [4]:
# Step 1: hold out 20% of the non-augmented rows as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    data, y, test_size=0.2, random_state=1
)
# Step 2: carve the validation set out of the remaining 80%
# (0.25 x 0.8 = 0.2 of the full data), leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)

In [5]:
# Report each split's shape and its share of the non-augmented data.
# The share is formatted as a true percentage (e.g. "60%"); previously the raw
# fraction (0.6) was printed next to a '%' sign, which read as 0.6 percent.
print(f'Training dataset size = {X_train.shape} , {X_train.shape[0] / data.shape[0]:.0%}')
print(f'Validation dataset size = {X_val.shape}  , {X_val.shape[0] / data.shape[0]:.0%}')
print(f'Test dataset size = {X_test.shape} , {X_test.shape[0] / data.shape[0]:.0%}')

Training dataset size = (292, 128) , 0.6 %
Validation dataset size = (98, 128)  , 0.2 %
Test dataset size = (98, 128) , 0.2 %


Now we need to distribute the augmented samples across the three datasets. We do this to make sure that samples coming from the same patient are assigned to the same dataset. A full explanation can be found in the report. The split_augmented_samples() function also lets us define the percentage of positive samples we would like to add to each dataset.

In [6]:
# Shape of the frame holding the rows whose Patient_ID occurs more than once.
duplicates.shape

(172, 134)

In [7]:
def _features_and_labels(frame):
    """Turn a slice of the augmented frame into a cleaned ``(X, y)`` pair."""
    labels = frame.MURMUR.replace({'Present':1,'Absent':0})
    features = frame.drop(columns=['Patient_ID', 'AV', 'MV', 'PV', 'TV','MURMUR'])
    # NaN and +/- inf are zeroed, matching the clean-up applied to the main data.
    features = features.fillna(0).replace([np.inf, -np.inf], 0)
    return features, labels


def split_augmented_samples(data, train_percentage = .8, validation_percentage = .1, test_percentage = .1 , duplicate_factor = 2):
    """Split augmented rows into train/validation/test parts, keeping patients together.

    The rows are sorted by ``Patient_ID`` and cut into three consecutive slices
    whose lengths are multiples of ``duplicate_factor``. The slicing alone only
    keeps a patient inside one slice when every patient contributes exactly
    ``duplicate_factor`` rows, so the result is verified afterwards and a
    ``ValueError`` is raised if any patient ends up in more than one slice.

    Parameters
    ----------
    data : pandas.DataFrame
        Augmented rows; must contain ``Patient_ID``, ``AV``, ``MV``, ``PV``,
        ``TV`` and ``MURMUR`` columns (``MURMUR`` holding 'Present'/'Absent').
    train_percentage, validation_percentage, test_percentage : float
        Fractions of the unique samples assigned to each part. Rounding
        leftovers are given to the training part, and the test part receives
        every row not taken by the training and validation parts.
    duplicate_factor : int
        Number of rows each unique sample is expected to occupy.

    Returns
    -------
    tuple
        ``(X_to_train, y_to_train, X_to_val, y_to_val, X_to_test, y_to_test)``
        with identifier/raw columns dropped, NaN/inf replaced by 0 and labels
        mapped to 1 ('Present') / 0 ('Absent').

    Raises
    ------
    ValueError
        If ``duplicate_factor`` is smaller than 1, if a part's row count is not
        a multiple of ``duplicate_factor``, or if a ``Patient_ID`` appears in
        more than one part.
    """
    if duplicate_factor < 1:
        raise ValueError('duplicate_factor must be a positive integer')

    duplicates = data.sort_values(by=['Patient_ID'])
    num_of_diff_samples = duplicates.shape[0] / duplicate_factor

    print(f'{num_of_diff_samples} unique samples have been duplicated, by a factor of {duplicate_factor}')

    trains_added = int(train_percentage * num_of_diff_samples)
    vals_added = int(validation_percentage * num_of_diff_samples)
    tests_added = int(test_percentage * num_of_diff_samples)
    # Samples lost to int() truncation are handed to the training part.
    trains_added += int(num_of_diff_samples - trains_added - vals_added - tests_added)

    print(f'{trains_added} samples to add to training dataset')
    print(f'{vals_added} samples to add to validation dataset')
    print(f'{tests_added} samples to add to testing dataset')

    # The validation count is rounded down to a multiple of duplicate_factor and
    # the remainder is moved to training. (The original decrement loops had
    # exactly this net effect: the training count itself ended up unchanged.)
    spill = vals_added % duplicate_factor
    vals_added -= spill
    trains_added += spill

    train_end = trains_added * duplicate_factor
    val_end = train_end + vals_added * duplicate_factor
    to_train = duplicates.iloc[:train_end, :]
    to_val = duplicates.iloc[train_end:val_end, :]
    to_test = duplicates.iloc[val_end:, :]

    if any(part.shape[0] % duplicate_factor != 0 for part in (to_train, to_val, to_test)):
        raise ValueError('every part must contain a multiple of duplicate_factor rows')

    train_IDs = set(to_train.Patient_ID.unique())
    val_IDs = set(to_val.Patient_ID.unique())
    test_IDs = set(to_test.Patient_ID.unique())

    # Leakage check: the parts must be pairwise disjoint. Testing only the
    # three-way intersection would miss a patient shared by exactly two parts.
    if (train_IDs & val_IDs) or (train_IDs & test_IDs) or (val_IDs & test_IDs):
        raise ValueError('a Patient_ID appears in more than one part')

    X_to_train, y_to_train = _features_and_labels(to_train)
    X_to_val, y_to_val = _features_and_labels(to_val)
    X_to_test, y_to_test = _features_and_labels(to_test)

    return X_to_train,y_to_train, X_to_val, y_to_val, X_to_test, y_to_test

In [8]:
# Split the augmented rows using the function's defaults
# (80% / 10% / 10% of the unique samples, duplicate factor 2).
X_to_train,y_to_train, X_to_val, y_to_val, X_to_test, y_to_test = split_augmented_samples(duplicates)

86.0 unique samples have been duplicated, by a factor of 2
70 samples to add to training dataset
8 samples to add to validation dataset
8 samples to add to testing dataset


In [9]:
#X_train,y_train,  X_val,y_val ,   X_test, y_test #original
X_train = X_train.append(X_to_train)
y_train = y_train.append(y_to_train)

X_val = X_val.append(X_to_val)
y_val = y_val.append(y_to_val)

X_test = X_test.append(X_to_test)
y_test = y_test.append(y_to_test)

In [28]:
# Row counts of the three final splits and their share of the combined total.
n_train, n_val, n_test = X_train.shape[0], X_val.shape[0], X_test.shape[0]
total = n_train + n_val + n_test
print(f'total samples : {total}')
print(f'final training set size : {n_train} samples, {round(n_train/total,4) * 100} %')
print(f'final validation set size : {n_val} samples, {round(n_val/total,4) * 100} %')
print(f'final testing set size : {n_test} samples, {round(n_test/total,4) * 100} %')

total samples : 660
final training set size : 432 samples, 65.45 %
final validation set size : 114 samples, 17.27 %
final testing set size : 114 samples, 17.27 %


In [32]:
def _positive_rate(labels):
    """Fraction of label entries equal to 1 (sum over row count), rounded to 2 decimals."""
    return round(labels.sum()/labels.shape[0], 2)


# Class balance of each split after the augmented rows were added.
print(f'positive sample rate in training set : {_positive_rate(y_train)}')
print(f'positive sample rate in validation set : {_positive_rate(y_val)}')
print(f'positive sample rate in testing set : {_positive_rate(y_test)}')

positive sample rate in training set : 0.36
positive sample rate in validation set : 0.18
positive sample rate in testing set : 0.23


# Testing against validation set only

In [18]:
# Fit the standardiser on the training features only, then apply that same
# transform to both the training and validation features.
# `s` refers to the same fitted scaler object (fit() returns the estimator).
s = scaler = StandardScaler().fit(X_train)
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [19]:
def print_metrics(y_val,y_pred):
    """Print precision, recall, F1 and accuracy of ``y_pred`` against ``y_val``.

    Each score is computed and printed in turn, one per line, in that order.
    """
    scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('f1_score', f1_score),
        ('Accuracy', accuracy_score),
    )
    for label, scorer in scorers:
        print(f'{label}: {scorer(y_val,y_pred)}')

# Logistic Regression Test

In [20]:
# Logistic regression: fit on the training split, score on the validation split.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 0.4444444444444444
Recall: 0.5714285714285714
f1_score: 0.5
Accuracy: 0.7894736842105263


# SVM test

In [21]:
# Support vector classifier with default settings, scored on the validation split.
clf1 = svm.SVC().fit(X_train, y_train)
y_pred = clf1.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 0.7142857142857143
Recall: 0.47619047619047616
f1_score: 0.5714285714285714
Accuracy: 0.868421052631579


# Naive Bayes

In [22]:
# Gaussian naive Bayes with default settings, scored on the validation split.
clf2 = GaussianNB().fit(X_train, y_train)
y_pred = clf2.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 0.42105263157894735
Recall: 0.38095238095238093
f1_score: 0.4
Accuracy: 0.7894736842105263


# KNN

In [23]:
# k-nearest neighbours (k = 3), scored on the validation split.
clf3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = clf3.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 0.2222222222222222
Recall: 0.2857142857142857
f1_score: 0.25
Accuracy: 0.6842105263157895


# Decision Tree

In [24]:
# Decision tree, scored on the validation split.
# random_state is fixed so the fitted tree (and therefore the metrics) are
# reproducible across runs, matching the seeded LogisticRegression and
# AdaBoost cells; an unseeded tree can give different scores on every run.
clf4 = tree.DecisionTreeClassifier(random_state=0)
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_val)
print_metrics(y_val,y_pred)

Precision: 0.21212121212121213
Recall: 0.3333333333333333
f1_score: 0.25925925925925924
Accuracy: 0.6491228070175439


# LDA

In [25]:
# Linear discriminant analysis with default settings, scored on the validation split.
clf5 = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf5.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 0.52
Recall: 0.6190476190476191
f1_score: 0.5652173913043478
Accuracy: 0.8245614035087719


# QDA

In [26]:
# Quadratic discriminant analysis with default settings, scored on the validation split.
clf6 = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf6.predict(X_val)
print_metrics(y_val, y_pred)

Precision: 1.0
Recall: 0.09523809523809523
f1_score: 0.17391304347826084
Accuracy: 0.8333333333333334


# ADABOOST

In [27]:
# AdaBoost with 100 estimators and a fixed seed, scored on the validation split.
# AdaBoostClassifier is imported in the notebook's top import cell together
# with the other scikit-learn estimators, rather than inside this cell.
clf7 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf7.fit(X_train, y_train)
y_pred = clf7.predict(X_val)
print_metrics(y_val,y_pred)

Precision: 0.35714285714285715
Recall: 0.47619047619047616
f1_score: 0.40816326530612246
Accuracy: 0.7456140350877193
