In [1]:
from sklearn.utils import shuffle
import sklearn 
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import pandas as pd
import numpy as np
import random

In [4]:
# Seed both generators: the train/test sampling in this notebook draws from NumPy's
# global RNG (np.random.choice), which random.seed() alone does not affect.
random.seed(0)
np.random.seed(0)

In [5]:
import matplotlib.pyplot as plt

In [6]:
import pickle


In [7]:
def train_test_set(Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male, before_after,
                   test_per_group=7, rng=None):
    """Split recording identifiers into a train set and a group-balanced test set.

    ``test_per_group`` identifiers are drawn without replacement from each of the
    four groups to form the test set. Every remaining identifier from the four
    groups goes to the train set, followed by all of ``before_after``.

    Parameters
    ----------
    Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male : sequence
        Identifiers of the four groups; each needs at least ``test_per_group`` items.
    before_after : sequence
        Identifiers that are always appended to the train set (never sampled).
    test_per_group : int, default 7
        Number of identifiers held out from each group.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global RNG is used, as before.

    Returns
    -------
    (train_set, test_set) : tuple of numpy.ndarray
        ``test_set`` holds ``4 * test_per_group`` identifiers ordered Asthmatic_Male,
        Asthmatic_Female, Healthy_Male, Healthy_Female. The group part of
        ``train_set`` is sorted and de-duplicated (``np.setdiff1d``).

    Raises
    ------
    ValueError
        If any group has fewer than ``test_per_group`` identifiers.
    """
    # NOTE: this reseeds only Python's `random` module and has no effect on the NumPy
    # sampling below. It is kept so the function's side effects stay unchanged; pass
    # `rng` (or seed NumPy globally) for reproducible splits.
    random.seed(0)
    sampler = np.random if rng is None else rng

    # Sampling order matters for RNG consumption, so keep the original group order.
    groups = (
        ("Asthmatic_Male", Asthmatic_Male),
        ("Asthmatic_Female", Asthmatic_Female),
        ("Healthy_Male", Healthy_Male),
        ("Healthy_Female", Healthy_Female),
    )
    for label, members in groups:
        if len(members) < test_per_group:
            raise ValueError(
                "group %s has %d item(s) but %d are required for the test set"
                % (label, len(members), test_per_group)
            )

    held_out = [
        sampler.choice(np.array(members), replace=False, size=test_per_group)
        for _, members in groups
    ]
    test_set = np.concatenate(held_out)

    candidates = np.concatenate([np.array(members) for _, members in groups])
    train_set = np.setdiff1d(candidates, test_set)
    train_set = np.concatenate((train_set, np.array(before_after)))
    return train_set, test_set

In [8]:
# Read the three breath CSV files, in order, into one DataFrame each.
Breath_1_dataframe, Breath_2_dataframe, Breath_3_dataframe = map(
    pd.read_csv,
    (
        "./breath_csv/Breath_1.csv",
        "./breath_csv/Breath_2.csv",
        "./breath_csv/Breath_3.csv",
    ),
)

In [9]:
# Build the train and test DataFrames for one random split of the recordings
def train_test_csv(sound_dataframe):
    """Draw one random train/test split of ``sound_dataframe`` by recording identifier.

    The last column of ``sound_dataframe`` is treated as the identifier of the
    recording each row belongs to. Identifiers whose first four characters
    (parsed as an integer) occur only once are split into four groups by the
    markers in their name ('sthma' / '_C_' combined with '_M_' / '_F_') and passed
    to ``train_test_set`` for sampling. Identifiers that share their four-character
    prefix with another identifier are passed as ``before_after``.
    # NOTE(review): the 4-character prefix is presumably a subject id — confirm.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Table whose last column holds string identifiers.

    Returns
    -------
    (train_dataframe, test_dataframe) : tuple of pandas.DataFrame
        Rows of ``sound_dataframe`` for the train and test identifiers, grouped in
        the order the identifiers were returned by ``train_test_set``.
    """
    # Always address the identifier column by position, so the function does not
    # depend on the column happening to be named str(n_columns - 1).
    recording_ids = sound_dataframe.iloc[:, -1]
    Total = np.unique(recording_ids)

    # Count how many identifiers share each 4-character numeric prefix (single pass).
    prefix_counts = {}
    for rec in Total:
        key = int(rec[:4])
        prefix_counts[key] = prefix_counts.get(key, 0) + 1

    unique = [rec for rec in Total if prefix_counts[int(rec[:4])] == 1]
    before_after = [rec for rec in Total if prefix_counts[int(rec[:4])] != 1]

    Asthmatic_Female = []
    Asthmatic_Male = []
    Healthy_Male = []
    Healthy_Female = []
    for file in unique:
        is_male = "_M_" in file
        is_female = "_F_" in file
        # The two condition checks are intentionally independent (not elif), matching
        # the original grouping for names that contain both markers.
        if 'sthma' in file:
            if is_male:
                Asthmatic_Male.append(file)
            if is_female:
                Asthmatic_Female.append(file)
        if "_C_" in file:
            if is_male:
                Healthy_Male.append(file)
            if is_female:
                Healthy_Female.append(file)

    Train, Test = train_test_set(Asthmatic_Female, Asthmatic_Male, Healthy_Female, Healthy_Male, before_after)

    def _rows_for(ids):
        # Collect the per-recording slices and concatenate once; DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        parts = [sound_dataframe[recording_ids == rec] for rec in ids]
        return pd.concat(parts) if parts else pd.DataFrame()

    test_dataframe = _rows_for(Test)
    train_dataframe = _rows_for(Train)

    return train_dataframe, test_dataframe

In [15]:
#Machine Learning Algorithm (MLA) Selection and Initialization
def MLA_selection(sound_dataframe, sound, n_splits=10, set_dir='./breathe_set_csv'):
    """Fit a fixed list of classifiers on saved train/test splits and compare accuracy.

    For every split ``i`` in ``range(n_splits)`` the files
    ``<set_dir>/train<sound><i>.csv`` and ``<set_dir>/test<sound><i>.csv`` are read.
    Features are standardised with a ``StandardScaler`` fitted on the training part
    only. Each classifier is then fitted on every split and its train and test
    accuracy recorded.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Only its column count is used: the first ``shape[1] - 2`` columns of the
        split files are the features and the next column is the label.
    sound : str
        Name fragment used in the split file names (e.g. ``'breath_1'``).
    n_splits : int, default 10
        Number of saved splits to evaluate.
    set_dir : str, default './breathe_set_csv'
        Directory that contains the split files.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its name, parameters, per-split train/test
        accuracies, their means and the test standard deviation, sorted by mean
        test accuracy (descending).
    """
    MLA = [
        #Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        #Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        #GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        #Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        #Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        #SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        #Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        XGBClassifier(max_depth = 8,
                     subsample = 0.8,
                     learning_rate = 0.01,
                     n_estimators = 450,
                     min_child_weight = 1)
        ]

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy','MLA Train Accuracy Mean', 'MLA Test Accuracy', 'MLA Test Accuracy Mean','MLA Test Accuracy Std' ]

    # Feature columns come first; the label is the column right after them.
    n_features = sound_dataframe.shape[1] - 2

    # Read and standardise every split once. This work does not depend on the
    # classifier, so doing it inside the classifier loop repeated it for each model.
    splits = []
    for i in range(n_splits):
        train_csv = pd.read_csv(os.path.join(set_dir, 'train' + sound + str(i) + '.csv'))
        test_csv = pd.read_csv(os.path.join(set_dir, 'test' + sound + str(i) + '.csv'))

        # Fit the scaler on the training features only, then apply it to the test features.
        scaler = StandardScaler()
        X_train = pd.DataFrame(scaler.fit_transform(train_csv.iloc[:, :n_features]))
        X_test = pd.DataFrame(scaler.transform(test_csv.iloc[:, :n_features]))
        y_train = train_csv.iloc[:, n_features]
        y_test = test_csv.iloc[:, n_features]
        splits.append((X_train, y_train, X_test, y_test))

    records = []
    for alg in MLA:
        # Parameters are captured before fitting, as configured above.
        MLA_name = alg.__class__.__name__
        MLA_params = str(alg.get_params())

        test = []
        train = []
        for X_train, y_train, X_test, y_test in splits:
            alg.fit(X_train, y_train)
            test.append(metrics.accuracy_score(y_test, alg.predict(X_test)))
            train.append(metrics.accuracy_score(y_train, alg.predict(X_train)))

        records.append({
            'MLA Name': MLA_name,
            'MLA Parameters': MLA_params,
            'MLA Train Accuracy': train,
            'MLA Train Accuracy Mean': np.mean(train),
            'MLA Test Accuracy': test,
            'MLA Test Accuracy Mean': np.mean(test),
            'MLA Test Accuracy Std': np.std(test),
        })

    # Build the table in one go (object dtype, as the row-by-row version produced)
    # instead of assigning list values cell by cell through .loc.
    MLA_compare = pd.DataFrame(records, columns=MLA_columns, dtype=object)

    return MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)

In [10]:
# Making Sets
def make_and_save_sets(sound_dataframe, sound, n_sets=10, set_dir='./breathe_set_csv'):
    """Generate ``n_sets`` train/test splits and write each one to CSV.

    Each iteration calls ``train_test_csv(sound_dataframe)`` and writes the result to
    ``<set_dir>/train<sound><i>.csv`` and ``<set_dir>/test<sound><i>.csv`` without
    the index column. Existing files with the same names are overwritten.

    Parameters
    ----------
    sound_dataframe : pandas.DataFrame
        Table passed unchanged to ``train_test_csv``.
    sound : str
        Name fragment used in the output file names (e.g. ``'breath_1'``).
    n_sets : int, default 10
        Number of splits to generate.
    set_dir : str, default './breathe_set_csv'
        Output directory; it is created if it does not exist.
    """
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(set_dir, exist_ok=True)
    for i in range(n_sets):
        train_csv, test_csv = train_test_csv(sound_dataframe)
        train_csv.to_csv(os.path.join(set_dir, 'train' + sound + str(i) + '.csv'), index=False)
        test_csv.to_csv(os.path.join(set_dir, 'test' + sound + str(i) + '.csv'), index=False)

### Breath_1

In [11]:
# Generate the train/test split CSV files for Breath_1 under ./breathe_set_csv/.
make_and_save_sets(Breath_1_dataframe, 'breath_1')

In [16]:
# Evaluate every candidate classifier on the saved Breath_1 splits.
MLA_breathe_1 = MLA_selection(Breath_1_dataframe, 'breath_1')



In [17]:
# Comparison table for Breath_1, sorted by mean test accuracy (best first).
MLA_breathe_1

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6771653543307087, 0.7073170731707317, 0.858...",0.776043,0.0655424
21,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[0.9995126705653021, 1.0, 1.0, 1.0, 0.99951171...",0.999709,"[0.6889763779527559, 0.7398373983739838, 0.804...",0.773864,0.0634345
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9668615984405458, 0.9626213592233009, 0.957...",0.96367,"[0.6338582677165354, 0.7479674796747967, 0.829...",0.767432,0.0801102
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6181102362204725, 0.7113821138211383, 0.846...",0.751227,0.0692842
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.9600389863547758, 0.9572815533980582, 0.958...",0.957693,"[0.610236220472441, 0.7520325203252033, 0.8215...",0.747849,0.0603294
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...","[0.9200779727095516, 0.9242718446601942, 0.924...",0.923638,"[0.6220472440944882, 0.7195121951219512, 0.796...",0.738557,0.0567822
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.996588693957115, 0.9970873786407767, 0.9961...",0.996701,"[0.6653543307086615, 0.6991869918699187, 0.792...",0.736552,0.0679334
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.830896686159844, 0.8092233009708738, 0.8096...",0.812056,"[0.6023622047244095, 0.7967479674796748, 0.767...",0.730586,0.0576243
16,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.8469785575048733, 0.8262135922330097, 0.828...",0.83236,"[0.6062992125984252, 0.7926829268292683, 0.771...",0.729927,0.0636378
8,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...","[0.8421052631578947, 0.8194174757281554, 0.826...",0.828318,"[0.5984251968503937, 0.7845528455284553, 0.751...",0.728279,0.0597514


### Breath_2

In [18]:
# Generate the train/test split CSV files for Breath_2 under ./breathe_set_csv/.
make_and_save_sets(Breath_2_dataframe, 'breath_2')

In [19]:
# Evaluate every candidate classifier on the saved Breath_2 splits.
MLA_breathe_2 = MLA_selection(Breath_2_dataframe, 'breath_2')



In [20]:
# Comparison table for Breath_2, sorted by mean test accuracy (best first).
MLA_breathe_2

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
21,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 0.9995129079396006, 1.0, 1.0, 1.0, 1.0, ...",0.999951,"[0.717741935483871, 0.7391304347826086, 0.7735...",0.768987,0.0391215
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.7217741935483871, 0.782608695652174, 0.7991...",0.76552,0.0277316
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9630709426627794, 0.9605455431076474, 0.959...",0.962029,"[0.717741935483871, 0.758893280632411, 0.76495...",0.762377,0.0359218
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9951409135082604, 0.9951290793960058, 0.996...",0.995429,"[0.7137096774193549, 0.8102766798418972, 0.747...",0.749704,0.041521
20,QuadraticDiscriminantAnalysis,"{'priors': None, 'reg_param': 0.0, 'store_cova...","[0.9135082604470359, 0.9074525085241111, 0.908...",0.904176,"[0.7540322580645161, 0.7549407114624506, 0.811...",0.748395,0.0545893
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.6814516129032258, 0.7351778656126482, 0.803...",0.74809,0.0508561
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.9533527696793003, 0.9527520701412567, 0.956...",0.952099,"[0.6774193548387096, 0.7272727272727273, 0.816...",0.740358,0.0540052
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...","[0.9198250728862973, 0.913297613248904, 0.9179...",0.917682,"[0.6895161290322581, 0.7312252964426877, 0.816...",0.738616,0.0499615
16,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.8483965014577259, 0.8329274232830005, 0.837...",0.840521,"[0.7056451612903226, 0.6877470355731226, 0.790...",0.727351,0.0459259
12,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}","[0.7745383867832848, 0.7652216268874817, 0.762...",0.769853,"[0.6774193548387096, 0.7786561264822134, 0.730...",0.726124,0.0524201


### Breath_3

In [21]:
# Generate the train/test split CSV files for Breath_3 under ./breathe_set_csv/.
make_and_save_sets(Breath_3_dataframe, 'breath_3')

In [22]:
# Evaluate every candidate classifier on the saved Breath_3 splits.
MLA_breathe_3 = MLA_selection(Breath_3_dataframe, 'breath_3')



In [23]:
# Comparison table for Breath_3, sorted by mean test accuracy (best first).
MLA_breathe_3

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
8,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...","[0.8118619348565873, 0.7979401667484061, 0.814...",0.812491,"[0.7751004016064257, 0.8876404494382022, 0.772...",0.762482,0.0632437
19,LinearDiscriminantAnalysis,"{'covariance_estimator': None, 'n_components':...","[0.8128342245989305, 0.7994114762138304, 0.812...",0.813177,"[0.7751004016064257, 0.8801498127340824, 0.768...",0.760579,0.0590045
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.8186679630529898, 0.7969592937714566, 0.813...",0.814296,"[0.7751004016064257, 0.8764044943820225, 0.764...",0.760021,0.0574561
16,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.8118619348565873, 0.7979401667484061, 0.814...",0.814342,"[0.7791164658634538, 0.8764044943820225, 0.764...",0.758935,0.0575076
21,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 0.9995095635115253, 0.9990248659190639, ...",0.999707,"[0.7710843373493976, 0.8239700374531835, 0.768...",0.747377,0.050693
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.7871485943775101, 0.8052434456928839, 0.729...",0.744572,0.0489298
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",1.0,"[0.7951807228915663, 0.8052434456928839, 0.776...",0.743723,0.0598756
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.9173553719008265, 0.9107405590975969, 0.921...",0.920624,"[0.7710843373493976, 0.8239700374531835, 0.729...",0.740749,0.0478887
15,NuSVC,"{'break_ties': False, 'cache_size': 200, 'clas...","[0.9037433155080213, 0.900931829328102, 0.9039...",0.902835,"[0.7791164658634538, 0.8277153558052435, 0.741...",0.740014,0.0501678
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9236752552260573, 0.9195684158901423, 0.928...",0.925252,"[0.7871485943775101, 0.8352059925093633, 0.737...",0.739883,0.0559333
