In [1]:
from sklearn.utils import shuffle
import sklearn 
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings raised by
# pandas and scikit-learn in the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import os
import pandas as pd
import numpy as np
import random

In [4]:
# Seed both generators: the train/test sampling in this notebook draws from
# NumPy's global RNG (np.random.choice), which random.seed() does not touch.
random.seed(0)
np.random.seed(0)

In [5]:
import matplotlib.pyplot as plt

In [6]:
import pickle


In [35]:
# Load the wheeze feature table. Later cells treat the last column ('73') as
# the recording name and the second-to-last column as the class label.
# NOTE(review): presumably per-recording MFCC statistics, judging by the
# directory name — confirm against the script that wrote this CSV.
Wheeze_dataframe = pd.read_csv("./MFCCs_stasts_Csv/Wheeze_CMN.csv")

In [111]:
def train_test_set(Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male, before_after,
                   n_test_per_group=4, rng=None):
    """Split recording names into a train set and a group-balanced test set.

    ``n_test_per_group`` names are drawn without replacement from each of the
    four groups to form the test set. Every group name that was not drawn,
    followed by all of ``before_after``, forms the train set.

    Parameters
    ----------
    Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male : sequence of str
        Recording names belonging to each group.
    before_after : sequence of str
        Names that are never sampled for testing; they are always appended to
        the train set.
    n_test_per_group : int, default 4
        How many names to hold out from each group.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global RNG is used, so
        reproducibility is controlled with ``np.random.seed`` by the caller.

    Returns
    -------
    train_set, test_set : numpy.ndarray
        Arrays of recording names. ``test_set`` is ordered Asthmatic_Male,
        Asthmatic_Female, Healthy_Male, Healthy_Female; the sampled part of
        ``train_set`` is sorted and de-duplicated (``np.setdiff1d``).

    Raises
    ------
    ValueError
        If any group holds fewer than ``n_test_per_group`` names.

    Notes
    -----
    An earlier version called ``random.seed(0)`` here. That seeds the stdlib
    ``random`` module, which ``np.random.choice`` never consults, so it did
    not make the split reproducible; it has been dropped.
    """
    sampler = np.random if rng is None else rng

    # Sampling order (male before female within each class) is kept stable so
    # that a seeded RNG produces the same draw sequence on every run.
    labelled_groups = (
        ("Asthmatic_Male", Asthmatic_Male),
        ("Asthmatic_Female", Asthmatic_Female),
        ("Healthy_Male", Healthy_Male),
        ("Healthy_Female", Healthy_Female),
    )

    group_arrays = []
    for label, members in labelled_groups:
        members = np.array(members)
        if members.size < n_test_per_group:
            raise ValueError(
                "Group %s has %d recording(s); need at least %d to build the test set"
                % (label, members.size, n_test_per_group)
            )
        group_arrays.append(members)

    test_set = np.concatenate(
        [sampler.choice(members, size=n_test_per_group, replace=False) for members in group_arrays]
    )

    # Everything that was not held out is available for training.
    train_set = np.setdiff1d(np.concatenate(group_arrays), test_set)
    train_set = np.concatenate((train_set, np.array(before_after)))
    return train_set, test_set

In [112]:
# train_test Csv
def train_test_csv(sound_dataframe):
    """Split a feature table into train and test frames by recording name.

    The last column of ``sound_dataframe`` is read as the recording name.
    The first four characters of each name are parsed as an integer id
    (presumably a subject id — confirm against the naming scheme). Names whose
    id occurs exactly once are sorted into four groups by substring:

    * ``'sthma'`` with ``'_M_'`` / ``'_F_'`` -> asthmatic male / female
    * ``'_C_'``   with ``'_M_'`` / ``'_F_'`` -> healthy male / female

    Names whose id occurs more than once are passed to ``train_test_set`` as
    ``before_after`` and therefore always end up in the train split. Names
    with a unique id that match none of the group patterns are left out of
    both splits.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Feature table whose last column holds recording names.

    Returns
    -------
    train_dataframe, test_dataframe : pandas.DataFrame
        Rows of ``sound_dataframe`` (original index kept), grouped in the
        order of the names returned by ``train_test_set``.

    Raises
    ------
    ValueError
        If a name's first four characters are not an integer, or if
        ``train_test_set`` cannot sample enough names from a group.
    """
    # Recording names live in the last column; address it by position so the
    # function does not depend on how the columns happen to be labelled.
    file_column = sound_dataframe.iloc[:, -1]
    Total = np.unique(file_column)

    # Count how many distinct recordings share each numeric id prefix.
    prefix_ids = [int(file_name[:4]) for file_name in Total]
    recordings_per_id = {}
    for prefix_id in prefix_ids:
        recordings_per_id[prefix_id] = recordings_per_id.get(prefix_id, 0) + 1

    unique = []
    before_after = []
    for file_name, prefix_id in zip(Total, prefix_ids):
        if recordings_per_id[prefix_id] == 1:
            unique.append(file_name)
        else:
            before_after.append(file_name)

    Asthmatic_Female = []
    Asthmatic_Male = []
    Healthy_Male = []
    Healthy_Female = []
    for file_name in unique:
        # The checks are deliberately independent (no elif): a name is filed
        # under every pattern it matches.
        if 'sthma' in file_name:
            if "_M_" in file_name:
                Asthmatic_Male.append(file_name)
            if "_F_" in file_name:
                Asthmatic_Female.append(file_name)
        if "_C_" in file_name:
            if "_M_" in file_name:
                Healthy_Male.append(file_name)
            if "_F_" in file_name:
                Healthy_Female.append(file_name)

    Train, Test = train_test_set(Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male, before_after)

    def _rows_for(file_names):
        # Collect the per-recording slices first and concatenate once;
        # DataFrame.append no longer exists in pandas >= 2.0 and growing a
        # frame inside a loop is quadratic.
        parts = [sound_dataframe[file_column == file_name] for file_name in file_names]
        return pd.concat(parts) if parts else pd.DataFrame()

    test_dataframe = _rows_for(Test)
    train_dataframe = _rows_for(Train)

    return train_dataframe, test_dataframe

In [18]:
#Machine Learning Algorithm (MLA) Selection and Initialization
def MLA_selection(sound_dataframe, sound, n_splits=10):
    """Benchmark a fixed panel of classifiers on pre-saved train/test splits.

    For every split index ``i`` in ``range(n_splits)`` the files
    ``./Set_CSV/train<sound><i>.csv`` and ``./Set_CSV/test<sound><i>.csv`` are
    loaded and standardized (scaler fitted on the train part only). Each
    classifier is then fitted on every split and its train and test accuracy
    recorded.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Only its column count is used: the column at position
        ``shape[1] - 2`` of the split CSVs is taken as the label and all
        columns before it as features.
    sound : str
        Tag embedded in the split file names.
    n_splits : int, default 10
        Number of saved splits to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its name, stringified parameters,
        per-split train/test accuracies, their means and the test standard
        deviation, sorted by mean test accuracy (best first). The index keeps
        each classifier's position in the panel.
    """
    MLA = [
        #Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        #Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        #GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        #Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        #Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        #SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        #Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        XGBClassifier(max_depth = 8,
                     subsample = 0.8,
                     learning_rate = 0.01,
                     n_estimators = 450,
                     min_child_weight = 1)
        ]

    # The splits do not depend on the classifier, so read and scale them once
    # instead of once per classifier.
    label_col = sound_dataframe.shape[1] - 2
    splits = [_load_scaled_split(sound, i, label_col) for i in range(n_splits)]

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy','MLA Train Accuracy Mean', 'MLA Test Accuracy', 'MLA Test Accuracy Mean','MLA Test Accuracy Std' ]
    records = []
    for alg in MLA:
        # Capture the parameters before fitting, as configured.
        params_repr = str(alg.get_params())

        train = []
        test = []
        for X_train, y_train, X_test, y_test in splits:
            alg.fit(X_train, y_train)
            test.append(metrics.accuracy_score(y_test, alg.predict(X_test)))
            train.append(metrics.accuracy_score(y_train, alg.predict(X_train)))

        records.append({
            'MLA Name': alg.__class__.__name__,
            'MLA Parameters': params_repr,
            'MLA Train Accuracy': train,
            'MLA Train Accuracy Mean': np.mean(train),
            'MLA Test Accuracy': test,
            'MLA Test Accuracy Mean': np.mean(test),
            'MLA Test Accuracy Std': np.std(test),
        })

    # Build the table in one go: assigning a list to a single cell through
    # .loc is version-sensitive, and row-by-row growth is slow.
    MLA_compare = pd.DataFrame(records, columns=MLA_columns)
    return MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False)


def _load_scaled_split(sound, split_index, label_col):
    """Load one saved train/test CSV pair and standardize its feature columns.

    Returns ``(X_train, y_train, X_test, y_test)``; the scaler is fitted on
    the train features only and then applied to the test features.
    """
    train_csv = pd.read_csv('./Set_CSV/train'+sound+str(split_index)+'.csv')
    test_csv = pd.read_csv('./Set_CSV/test'+sound+str(split_index)+'.csv')

    y_train = train_csv.iloc[:, label_col]
    y_test = test_csv.iloc[:, label_col]

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(train_csv.iloc[:, :label_col]))
    X_test = pd.DataFrame(scaler.transform(test_csv.iloc[:, :label_col]))
    return X_train, y_train, X_test, y_test

In [19]:
# Benchmark the classifier panel on the saved 'Wheeze_CMN' splits.
# NOTE(review): MLA_selection reads ./Set_CSV/trainWheeze_CMN<i>.csv and
# ./Set_CSV/testWheeze_CMN<i>.csv; no cell visible here writes those files,
# so they must already exist on disk.
MLA_Wheeze = MLA_selection(Wheeze_dataframe, 'Wheeze_CMN') 



### Original (full wheeze dataset, before the before/after split)

In [20]:
# Show the comparison table for the full wheeze dataset (sorted by mean test accuracy).
MLA_Wheeze

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8931623931623932, 0.7619047619047619, 0.922...",0.870088,0.0492989
21,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[0.9992748368382887, 1.0, 0.9992684711046086, ...",0.999414,"[0.9017094017094017, 0.753968253968254, 0.9024...",0.868901,0.054507
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9949238578680203, 0.9955914768552535, 0.992...",0.993416,"[0.8675213675213675, 0.7222222222222222, 0.898...",0.861209,0.0567945
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...","[0.9361856417693981, 0.9537105069801617, 0.934...",0.945755,"[0.8333333333333334, 0.7341269841269841, 0.861...",0.833952,0.0456194
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8076923076923077, 0.753968253968254, 0.9024...",0.833407,0.0498345
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9963741841914431, 0.9977957384276267, 0.994...",0.996201,"[0.8931623931623932, 0.6984126984126984, 0.882...",0.831952,0.0653338
16,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.896301667875272, 0.9191770756796473, 0.8961...",0.90144,"[0.8205128205128205, 0.7301587301587301, 0.837...",0.824632,0.042931
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.8912255257432923, 0.9155033063923586, 0.892...",0.891077,"[0.811965811965812, 0.7341269841269841, 0.8577...",0.823709,0.043721
19,LinearDiscriminantAnalysis,"{'covariance_estimator': None, 'n_components':...","[0.8868745467730239, 0.9162380602498164, 0.887...",0.893972,"[0.7863247863247863, 0.7579365079365079, 0.833...",0.818333,0.0497408
8,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...","[0.8875997099347354, 0.9162380602498164, 0.888...",0.893971,"[0.7777777777777778, 0.75, 0.8373983739837398,...",0.818133,0.0485332


In [36]:
# Alias (same object, not a copy) so the exploratory cell below can use the
# same variable name as the body of train_test_csv.
sound_dataframe = Wheeze_dataframe

In [37]:
# Notebook-scope rerun of the grouping logic used inside train_test_csv, so
# that the intermediate lists can be inspected.

# Distinct recording names, taken from the last column.
Total = np.unique(sound_dataframe.iloc[:, sound_dataframe.shape[1] - 1])

# First four characters of every name (parsed as an integer id below).
name = [recording[:4] for recording in Total]

# Convert the prefixes once; rebuilding this array inside the loop made the
# pass quadratic in the number of recordings.
name_ids = np.array(name, dtype=int)

# Names whose id appears once go to `unique`; names sharing an id with
# another recording go to `before_after`.
unique = []
before_after = []
for i in Total:
    if np.sum(name_ids == int(i[:4])) == 1:
        unique.append(i)
    else:
        before_after.append(i)

Asthmatic_Female = []
Asthmatic_Male = []
Healthy_Male = []
Healthy_Female = []
for file in unique:
    # Independent checks (no elif): a name is filed under every pattern it matches.
    if 'sthma' in file:
        if "_M_" in file:
            Asthmatic_Male.append(file)
        if "_F_" in file:
            Asthmatic_Female.append(file)
    if "_C_" in file:
        if "_M_" in file:
            Healthy_Male.append(file)
        if "_F_" in file:
            Healthy_Female.append(file)

In [107]:
# Build two overlapping subsets of the wheeze table, selected by recording
# name (column '73'):
#   * "before": names containing '_before_' plus every '_C_' recording
#   * "after":  names containing '_after_'  plus every '_C_' recording
# The per-recording slices are collected in lists and concatenated once;
# DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
# inside a loop is quadratic.
before_parts = []
after_parts = []
for i in np.unique(Wheeze_dataframe['73']):
    is_control = i.find('_C_') != -1
    rows = Wheeze_dataframe[Wheeze_dataframe['73'] == i]
    if is_control or i.find('_before_') != -1:
        before_parts.append(rows)
    if is_control or i.find('_after_') != -1:
        after_parts.append(rows)

# Fall back to an empty frame when nothing matched (pd.concat rejects an empty list).
before_wheeze_dataframe = pd.concat(before_parts) if before_parts else pd.DataFrame()
after_wheeze_dataframe = pd.concat(after_parts) if after_parts else pd.DataFrame()

In [120]:
# Number of distinct recording names (column '73') in each subset.
print(np.unique(before_wheeze_dataframe['73']).shape)
print(np.unique(after_wheeze_dataframe['73']).shape)

(133,)
(131,)


In [121]:
def make_and_save_sets(sound_dataframe, sound, n_sets=10):
    """Draw ``n_sets`` train/test splits and write each one to ``./Set_CSV``.

    Every iteration calls ``train_test_csv`` for a fresh split and saves it as
    ``./Set_CSV/train<sound><i>.csv`` and ``./Set_CSV/test<sound><i>.csv``
    (no index column). Existing files with the same names are overwritten.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Feature table handed to ``train_test_csv``.
    sound : str
        Tag embedded in the output file names.
    n_sets : int, default 10
        Number of splits to generate.
    """
    # DataFrame.to_csv does not create missing directories.
    os.makedirs('./Set_CSV', exist_ok=True)

    for i in range(n_sets):
        train_csv, test_csv = train_test_csv(sound_dataframe)
        train_csv.to_csv('./Set_CSV/train'+sound+str(i)+'.csv', index=False)
        test_csv.to_csv('./Set_CSV/test'+sound+str(i)+'.csv', index=False)

In [122]:
# Write the train/test split CSVs for the "before" subset under ./Set_CSV/.
make_and_save_sets(before_wheeze_dataframe, 'before_wheeze')

In [123]:
# Write the train/test split CSVs for the "after" subset under ./Set_CSV/.
make_and_save_sets(after_wheeze_dataframe, 'after_wheeze')

In [124]:
# Benchmark the classifier panel on the saved 'before_wheeze' splits.
MLA_Wheeze_before = MLA_selection(before_wheeze_dataframe, 'before_wheeze') 



In [126]:
# Show the comparison table for the "before" subset (sorted by mean test accuracy).
MLA_Wheeze_before

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8135593220338984, 0.7107438016528925, 0.861...",0.808997,0.0832143
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8135593220338984, 0.8760330578512396, 0.788...",0.798247,0.0986386
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...","[0.9859578736208626, 0.9678068410462777, 0.975...",0.975089,"[0.8135593220338984, 0.8264462809917356, 0.854...",0.797912,0.077087
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8135593220338984, 0.768595041322314, 0.8613...",0.7917,0.105391
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.8916750250752257, 0.9104627766599598, 0.941...",0.913744,"[0.6949152542372882, 0.7933884297520661, 0.788...",0.780798,0.0752533
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6779661016949152, 0.7933884297520661, 0.861...",0.755687,0.0582586
12,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}","[0.8665997993981945, 0.8470824949698189, 0.850...",0.858446,"[0.7203389830508474, 0.8264462809917356, 0.744...",0.750785,0.0757055
11,BernoulliNB,"{'alpha': 1.0, 'binarize': 0.0, 'class_prior':...","[0.8796389167502507, 0.8651911468812877, 0.847...",0.857235,"[0.7457627118644068, 0.628099173553719, 0.8175...",0.746141,0.0915189
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8813559322033898, 0.7933884297520661, 0.788...",0.745598,0.10998
10,Perceptron,"{'alpha': 0.0001, 'class_weight': None, 'early...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9959595959595...",0.999596,"[0.7033898305084746, 0.8512396694214877, 0.744...",0.731016,0.100849


In [125]:
# Benchmark the classifier panel on the saved 'after_wheeze' splits.
MLA_Wheeze_after = MLA_selection(after_wheeze_dataframe, 'after_wheeze')



In [127]:
# Show the comparison table for the "after" subset (sorted by mean test accuracy).
MLA_Wheeze_after

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8222222222222222, 0.7612903225806451, 0.946...",0.805953,0.0750491
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6148148148148148, 0.9354838709677419, 0.806...",0.795489,0.0865326
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...","[0.9698492462311558, 0.9774358974358974, 0.974...",0.978802,"[0.8222222222222222, 0.7806451612903226, 0.946...",0.778615,0.0916939
21,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6148148148148148, 0.8129032258064516, 0.853...",0.773664,0.0827355
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6888888888888889, 0.9032258064516129, 0.853...",0.772423,0.0745146
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6148148148148148, 0.8709677419354839, 0.7, ...",0.760343,0.110972
5,GaussianProcessClassifier,"{'copy_X_train': True, 'kernel': None, 'max_it...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.8296296296296296, 0.8258064516129032, 0.74,...",0.760276,0.137775
13,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...","[0.9979899497487437, 0.9969230769230769, 0.996...",0.997576,"[0.8296296296296296, 0.7483870967741936, 0.74,...",0.75443,0.118999
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6222222222222222, 0.8387096774193549, 0.78,...",0.752497,0.0811122
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.9105527638190954, 1.0, 0.9734693877551021, ...",0.935248,"[0.6888888888888889, 0.9032258064516129, 0.726...",0.751493,0.0741519
