In [1]:
# Import all required Files
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import pandas as pd
import wave
import sys
import os
import librosa
import librosa.display
import xgboost as xgb
from  sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import sklearn.naive_bayes as nb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier
from sklearn.neighbors import kd_tree
import seaborn as sn
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

  import pandas.util.testing as tm


In [0]:
# Load per cycle data frame
import pickle
folder = "/content/drive/My Drive/Respiratory_Sound_Database/Respiratory_Sound_Database/" 
filename = folder + 'w_c_dataset.pickle'
infile = open(filename,'rb')
[sound,sr,lengths,times,labels] = pickle.load(infile)
infile.close()

In [0]:

# This fucntion splits sounds into cycles
def split_sounds(sounds,times,labels):
    s=[]
    l=[]
    for i,sound in enumerate(sounds):
        for t,label in zip(times[i],labels[i]):
            s.append(sound[int(t[0]):int(t[1])])
            if label==0:
                a=np.array([1,0,0,0])
            if label==1:
                a=np.array([0,1,0,0])
            if label==2:
                a=np.array([0,0,1,0])    
            if label==3:
                a=np.array([0,0,0,1])
            l.append(a)
    return s,l


In [0]:
# Extract MFCC Features
# Then take their mean and standard deviation and stack them up
import librosa
def extract_feature(cycle_number,X,sample_rate):
    n_fft=int(sample_rate*0.025)
    hop_length=int(sample_rate*0.01)
    mfcc = librosa.feature.mfcc(y=X, sr=sample_rate,n_fft=n_fft,hop_length=hop_length,
                                         n_mfcc=50)
    mean_mfcc = np.mean(mfcc.T,axis=0)
    std_mfcc = np.std(mfcc.T,axis=0)
    return np.vstack((mean_mfcc,std_mfcc))

In [0]:
[data,label]=split_sounds(sound,times,labels)

In [21]:
# Form the train data for SVM
dataset = []
for cycle_number,d in enumerate(data):
    a = extract_feature(cycle_number+1,d,sr[0])
    dataset.append(a)
data=np.asarray(dataset)
print(data.shape)

(6898, 2, 50)


In [22]:
# Form the labels
data = data.reshape([6898,data.shape[1]*data.shape[2],])
print(data.shape)

label=np.asarray(label)
a=np.zeros(label.shape[0])
for i in range(label.shape[0]):
    for j in range(label.shape[1]):
        if label[i][j]==1:
            a[i]=j

# Form the labels for binary classification
#bin_labels = []
#for label in a:
#    if label == 0:
#        bin_labels.append(0)
#    else:
#        bin_labels.append(1)

(6898, 100)


In [0]:
# Standardize the data
scaler=StandardScaler()
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(data,a,test_size=0.3, random_state=42,stratify=a)
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [11]:

# Grid-Search to find the best fit SVM Model
Cs = [2**(-2),2**(-1), 1,2**(1),2**(2),2**(3),2**(4),2**(5),2**(6),2**(7),2**(8),2**(9),2**(10)]
gamma = [2**(-7),2**(-6),2**(-5),2**(-4),2**(-3),2**(-2),2**(-1),2**(0),2**(1),2**(2),2**(3)]


param_grid = {'C': Cs,  
                'gamma': gamma, 
                'kernel': ['rbf'],
                'decision_function_shape':['ov'],
                'class_weight': ['balanced']}  

grid1 = GridSearchCV(SVC(), param_grid,cv=3,n_jobs=-1, verbose = 3) 

# fitting the model for grid search 
grid1.fit(x_train, y_train)

# print best parameter after tuning 
print("Best Parameters after tuning:")
print(grid1.best_params_) 
# print how our model looks after hyper-parameter tuning
print("Best selected Model: ") 
print(grid1.best_estimator_)

grid_predictions = grid1.predict(x_test) 

# print classification report
print("Classification Report: ") 
print(classification_report(y_test, grid_predictions))
print("Classification Accuracy: ")
print(accuracy_score(y_test,grid_predictions))

print("Confusion Matrix is as Follows: ")
print(sklearn.metrics.confusion_matrix(y_test,grid_predictions))

Fitting 3 folds for each of 143 candidates, totalling 429 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 429 out of 429 | elapsed: 18.7min finished


Best Parameters after tuning:
{'C': 8, 'class_weight': 'balanced', 'decision_function_shape': 'ov', 'gamma': 0.03125, 'kernel': 'rbf'}
Best selected Model: 
SVC(C=8, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ov', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Classification Report: 
              precision    recall  f1-score   support

         0.0       0.80      0.84      0.82      1093
         1.0       0.70      0.74      0.72       559
         2.0       0.75      0.56      0.65       266
         3.0       0.59      0.51      0.55       152

    accuracy                           0.76      2070
   macro avg       0.71      0.66      0.68      2070
weighted avg       0.75      0.76      0.75      2070

Classification Accuracy: 
0.755072463768116
Confusion Matrix is as Follows: 
[[922 132  31   8]
 [120 414   2  23]
 [ 78  16 150  2

In [25]:
# Running XgBoost
import xgboost as xgb

print('Training XGB Classifier from new features:')
xgb_model = xgb.XGBClassifier(max_depth=40,num_class=2, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='multi:softmax', eta=0.3, silent=0, subsample=0.8).fit(x_train, y_train)

xgb_pred = xgb_model.predict(x_test)
print(accuracy_score(y_test,xgb_pred))

Training XGB Classifier from new features:
0.721256038647343
