In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import ExcelWriter
import tensorflow.keras as kr
from sklearn.model_selection import StratifiedKFold,ParameterGrid
from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.metrics import recall_score, make_scorer,confusion_matrix, f1_score, balanced_accuracy_score, accuracy_score
from sklearn.metrics import classification_report, plot_roc_curve, roc_auc_score

from keras.utils import np_utils

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the 50 ms feature table; the first CSV column is used as the row index.
path='/content/drive/MyDrive/Respiratory Sounds Final/Resultados/Caracteristicas_50ms.csv'
df_data=pd.read_csv(path,index_col=0)
data=df_data.values
x = data[:,:-2]  # every column except the last two is a feature
y = data[:,-2:]  # last two columns: multilabel target (crackles, wheezes)
# Rescale every feature to [-1, 1]. The arguments are passed by keyword because
# recent scikit-learn releases accept `copy` only as a keyword argument.
# NOTE(review): the scaler is fitted on the whole dataset before the
# cross-validation split, so test folds influence the scaling; consider
# fitting it inside each fold.
scaler = MinMaxScaler(feature_range=(-1,1),copy=True)
x=scaler.fit_transform(x)

In [None]:
# Load the label-exploration CSV into `df`.
df=pd.read_csv('/content/drive/MyDrive/Respiratory Sounds Final/Resultados/ExploracionEtiquetas.csv')

In [None]:
# Number of rows whose first label is 0 and whose second label is 1.
np.count_nonzero((y[:,0]==0) & (y[:,1]==1))

858

In [None]:
# Column-wise totals of the label table; string columns end up concatenated.
df.sum(axis=0)

Unnamed: 0                                               422740
File          185_1b1_Pl_sc_Litt3200185_1b1_Lr_sc_Litt320018...
Patient                                                  151211
Equipment     Litt3200Litt3200Litt3200AKGC417LAKGC417LAKGC41...
mc/sc         scscscmcmcmcmcmcmcmcmcmcmcmcmcmcmcmcmcmcmcmcmc...
Diagnosis     COPDCOPDCOPDCOPDCOPDCOPDCOPDCOPDCOPDCOPDCOPDCO...
c                                                          2370
w                                                          1392
c/w                                                         506
n                                                          3642
c2                                                         2332
w2                                                         1361
c/w2                                                        503
n2                                                         3406
dtype: object

# Funciones


In [6]:
def multilabel_to_4classes(y):
  '''
  Map a two-column multilabel target onto a single multi-class label.

  y: 2-D array whose column 0 is the crackle flag and column 1 the wheeze flag.
  Returns a float vector with one entry per row:
    0: none of the patterns below
    1: crackles only
    2: wheezes only
    3: crackles and wheezes
  '''
  crackle, wheeze = y[:,0], y[:,1]
  patterns = [(crackle==1) & (wheeze==0),
              (crackle==0) & (wheeze==1),
              (crackle==1) & (wheeze==1)]
  # The three patterns are mutually exclusive, so their order is irrelevant.
  return np.select(patterns, [1.0, 2.0, 3.0], default=0.0)
def multilabel_to_2classes(y):
  '''
  Collapse a two-column multilabel target into a binary abnormality flag.

  Returns a float vector holding 0 for rows where both columns equal 0 and
  1 for every other row.
  '''
  both_absent = (y[:,0]==0) & (y[:,1]==0)
  return np.where(both_absent, 0.0, 1.0)

def from_4_to_2classes(y):
  '''
  Collapse 4-class labels into a binary abnormality flag.

  Returns a float vector with 0 where the label equals 0 and 1 elsewhere.
  '''
  flags = np.full(y.shape[0], 1.0)
  is_normal = np.equal(y, 0)
  flags[is_normal] = 0.0
  return flags

In [7]:
def kfold_model_selection(x, y, clasificador, parameters,multilabel,n_split=5,random_state=12,scorer=balanced_accuracy_score):
  '''
  Grid-search a classifier with stratified k-fold cross-validation.

  Every hyper-parameter combination is evaluated on up to three formulations
  of the same problem: multilabel (optional), 4 classes and 2 classes.

  inputs:
    x: feature matrix, one row per sample
    y: target in multilabel format (two columns)
    clasificador: estimator exposing set_params / fit / predict; it is
      reconfigured and refitted in place
    parameters: (dict) hyper-parameter grid to explore
    multilabel: when truthy, the estimator is also fitted directly on the
      two-column target; otherwise the acc_m_* columns stay at 0
    n_split: number of cross-validation folds
    random_state: seed of the shuffled fold assignment
    scorer: callable(y_true, y_pred) -> float used for every score except
      acc_m_m2; defaults to balanced accuracy
  return:
    DataFrame with one row per hyper-parameter combination holding the mean
    and the standard deviation (std_ prefix) over the folds of:
      acc_2_2: trained on 2 classes, scored on 2 classes
      acc_4_2: trained on 4 classes, scored after collapsing to 2 classes
      acc_4_4: trained on 4 classes, scored on 4 classes
      acc_m_2: trained multilabel, scored after converting to 2 classes
      acc_m_4: trained multilabel, scored after converting to 4 classes
      acc_m_m1: mean of the two per-label scores
      acc_m_m2: fraction of individual label entries predicted correctly
  '''
  metric_names=['acc_2_2','acc_4_2','acc_4_4','acc_m_2','acc_m_4','acc_m_m1','acc_m_m2']
  columns=metric_names+['std_'+name for name in metric_names]

  param_grid = ParameterGrid(parameters)
  index_values=list(param_grid)

  y_m_4=multilabel_to_4classes(y)
  y_m_2=multilabel_to_2classes(y)

  rskf=StratifiedKFold(n_split,random_state=random_state,shuffle=True)
  resultados=np.zeros((len(param_grid),len(columns)))

  for c,params in enumerate(param_grid):
    clasificador.set_params(**params)
    test_score=np.zeros((n_split,len(metric_names)))
    # Folds are stratified with respect to the 4-class labels.
    for i,(train_index, test_index) in enumerate(rskf.split(x,y_m_4)):
      x_train,x_test=x[train_index],x[test_index]

      # Multilabel formulation (optional)
      if multilabel:
        y_train,y_test=y[train_index],y[test_index]
        clasificador.fit(x_train, y_train)
        y_pred=clasificador.predict(x_test)

        test_score[i,3] = scorer(multilabel_to_2classes(y_test),multilabel_to_2classes(y_pred))
        test_score[i,4] = scorer(multilabel_to_4classes(y_test),multilabel_to_4classes(y_pred))
        test_score[i,5] = (scorer(y_test[:,0],y_pred[:,0])+scorer(y_test[:,1],y_pred[:,1]))/2
        test_score[i,6] = np.sum(y_test==y_pred)/y_test.size

      # 4-class formulation
      y_train,y_test=y_m_4[train_index],y_m_4[test_index]
      clasificador.fit(x_train, y_train)
      y_pred=clasificador.predict(x_test)

      test_score[i,1] = scorer(from_4_to_2classes(y_test),from_4_to_2classes(y_pred))
      test_score[i,2] = scorer(y_test,y_pred)

      # 2-class formulation
      y_train,y_test=y_m_2[train_index],y_m_2[test_index]
      clasificador.fit(x_train, y_train)
      y_pred=clasificador.predict(x_test)

      test_score[i,0] = scorer(y_test,y_pred)

    # One result row per combination: fold means followed by fold standard deviations.
    resultados[c] = np.concatenate((test_score.mean(axis=0),test_score.std(axis=0)),axis=0)
  return pd.DataFrame(data=resultados,index=index_values,columns=columns)

# Exploración de modelos

# SVM

In [None]:
# RBF-kernel SVM with balanced class weights: coarse sweep over C.
clasificador = SVC(class_weight='balanced', random_state=12)
parameters = dict(C=[0.1, 1, 100], kernel=['rbf'], gamma=['scale'])

df_svm = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=False, n_split=5, random_state=12,
)

In [None]:
# Display the cross-validation results of the coarse SVM sweep.
df_svm

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.650669,0.636639,0.455871,0.0,0.0,0.0,0.0,0.007346,0.013516,0.025644,0.0,0.0,0.0,0.0
"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.698577,0.671327,0.531807,0.0,0.0,0.0,0.0,0.011507,0.012839,0.019532,0.0,0.0,0.0,0.0
"{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}",0.780392,0.76746,0.643242,0.0,0.0,0.0,0.0,0.014247,0.01393,0.011278,0.0,0.0,0.0,0.0


In [None]:
# RBF-kernel SVM with balanced class weights: finer sweep over C around 100.
clasificador = SVC(class_weight='balanced', random_state=12)
parameters = dict(C=[80, 100, 120, 150], kernel=['rbf'], gamma=['scale'])

df_svm = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=False, n_split=5, random_state=12,
)

In [None]:
# Display the cross-validation results of the finer SVM sweep.
df_svm

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'C': 80, 'gamma': 'scale', 'kernel': 'rbf'}",0.780129,0.762881,0.642267,0.0,0.0,0.0,0.0,0.014382,0.012721,0.016497,0.0,0.0,0.0,0.0
"{'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}",0.780392,0.76746,0.643242,0.0,0.0,0.0,0.0,0.014247,0.01393,0.011278,0.0,0.0,0.0,0.0
"{'C': 120, 'gamma': 'scale', 'kernel': 'rbf'}",0.780312,0.769435,0.647497,0.0,0.0,0.0,0.0,0.012249,0.012195,0.011874,0.0,0.0,0.0,0.0
"{'C': 150, 'gamma': 'scale', 'kernel': 'rbf'}",0.781917,0.771704,0.644849,0.0,0.0,0.0,0.0,0.012113,0.009593,0.010612,0.0,0.0,0.0,0.0


# MLP

In [None]:
def KerasMLPClassifier(input_dim,output_dim,hidden_layer=10,activation='tanh',batch_size=32,learning_rate=0.001,epochs=500, class_weight={1:0.23,0:0.77},loss='categorical_crossentropy'):
  '''
  Build and compile a multilayer perceptron with a single hidden layer.

  input_dim: number of input features
  output_dim: number of units of the output layer
  hidden_layer: number of units of the hidden layer (its activation is always ReLU)
  activation: activation of the output layer
  learning_rate: learning rate of the Adam optimizer
  loss: loss handed to compile
  batch_size, epochs, class_weight: accepted in the signature but not used
    while building the model
  Returns the compiled Sequential model.
  '''
  hidden = kr.layers.Dense(hidden_layer, input_dim=input_dim, activation='relu')
  output = kr.layers.Dense(output_dim, activation=activation)
  model = kr.Sequential([hidden, output])

  optimizer = kr.optimizers.Adam(learning_rate=learning_rate)
  model.compile(loss=loss, optimizer=optimizer, metrics=['binary_accuracy'])
  return model

In [None]:
# Wrap the Keras builder as a scikit-learn style estimator and declare the
# hyper-parameter grid to explore (36 input features).
clasificador = kr.wrappers.scikit_learn.KerasClassifier(
    build_fn=KerasMLPClassifier,
    input_dim=36,
    output_dim=1,
    verbose=0,
    hidden_layer=10,
    activation='softmax',
    batch_size=32,
    class_weight={1:0.65,0:2.157},
    learning_rate=0.001,
    epochs=500,
    callbacks=kr.callbacks.EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True),
    validation_split=0.1,
    loss='categorical_crossentropy',
)
parameters = dict(
    hidden_layer=[10, 20, 100, 150, 200],
    learning_rate=[0.001, 0.0001, 0.01],
    batch_size=[32, 64, 128],
)

In [None]:
# Same wrapper restricted to one configuration
# (183 hidden units, batch size 60, learning rate 0.001).
clasificador = kr.wrappers.scikit_learn.KerasClassifier(
    build_fn=KerasMLPClassifier,
    input_dim=36,
    output_dim=1,
    verbose=0,
    hidden_layer=183,
    activation='softmax',
    batch_size=60,
    class_weight={1:0.65,0:2.157},
    learning_rate=0.001,
    epochs=500,
    callbacks=kr.callbacks.EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True),
    validation_split=0.1,
    loss='categorical_crossentropy',
)
parameters = dict(hidden_layer=[183], learning_rate=[0.001], batch_size=[60])

In [None]:
# Cross-validate the Keras MLP on the multilabel, 4-class and 2-class
# formulations, scoring with balanced accuracy. This mirrors
# kfold_model_selection but reconfigures the network (output size, class
# weights, output activation, loss) before each fit.
n_split=5
random_state=12
multilabel=True
param_grid = ParameterGrid(parameters)
index_values=list(param_grid)
# Seven fold-mean columns followed by the matching fold-standard-deviation columns.
columns=['acc_2_2','acc_4_2','acc_4_4','acc_m_2','acc_m_4','acc_m_m1','acc_m_m2','std_acc_2_2','std_acc_4_2','std_acc_4_4','std_acc_m_2','std_acc_m_4','std_acc_m_m1','std_acc_m_m2']

y_m_4=multilabel_to_4classes(y)
y_m_2=multilabel_to_2classes(y)

rskf=StratifiedKFold(n_split,random_state=random_state,shuffle=True)
resultados=np.zeros((len(param_grid),len(columns)))

for c,params in enumerate(param_grid):
  clasificador.set_params(**params)
  test_score=np.zeros((n_split,7))
  i=0  # fold counter, used as the row index of test_score
  for train_index, test_index in rskf.split(x,y_m_4):# stratified with respect to the 4 classes
    x_train,x_test=x[train_index],x[test_index]
    
    # Multilabel: two sigmoid outputs trained with binary cross-entropy and no class weights
    if multilabel==True:
      y_train,y_test=y[train_index],y[test_index]
      clasificador.set_params(**{'output_dim':2,'class_weight':None,'activation':'sigmoid','loss':'binary_crossentropy'})
      clasificador.fit(x_train, y_train)
      # Threshold the predicted probabilities at 0.5 to get boolean labels.
      y_pred=clasificador.predict_proba(x_test)>0.5

      y_pred_m_4 = multilabel_to_4classes(y_pred)
      y_pred_m_2 = multilabel_to_2classes(y_pred)
      y_test_m_4 = multilabel_to_4classes(y_test)
      y_test_m_2 = multilabel_to_2classes(y_test)

      test_score[i,3] = balanced_accuracy_score(y_test_m_2,y_pred_m_2)
      test_score[i,4] = balanced_accuracy_score(y_test_m_4,y_pred_m_4)
      # Mean of the two per-label balanced accuracies.
      test_score[i,5] = (balanced_accuracy_score(y_test[:,0],y_pred[:,0])+balanced_accuracy_score(y_test[:,1],y_pred[:,1]))/2
      # Fraction of individual label entries predicted correctly.
      test_score[i,6] = np.sum(y_test==y_pred)/y_test.size
    # 4 classes: one-hot training targets, four softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_4[train_index]),y_m_4[test_index]
    clasificador.set_params(**{'output_dim':4,'class_weight':{3:0.484,2:0.9,1:1.92,0:3.28},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    y_pred_4_2 = from_4_to_2classes(y_pred)
    y_test_4_2 = from_4_to_2classes(y_test)

    test_score[i,1] = balanced_accuracy_score(y_test_4_2,y_pred_4_2)
    test_score[i,2] = balanced_accuracy_score(y_test,y_pred)  
    # 2 classes: one-hot training targets, two softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_2[train_index]),y_m_2[test_index]
    clasificador.set_params(**{'output_dim':2,'class_weight':{1:0.97,0:1.03},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    test_score[i,0] = balanced_accuracy_score(y_test,y_pred)    

    i+=1
  # One result row per combination: fold means followed by fold standard deviations.
  resultados[c] = np.concatenate((test_score.mean(axis=0),test_score.std(axis=0)),axis=0)
df_mlp = pd.DataFrame(data=resultados,index=index_values,columns=columns)

In [None]:
# Display the MLP cross-validation results (balanced accuracy).
df_mlp

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'batch_size': 60, 'hidden_layer': 183, 'learning_rate': 0.001}",0.693361,0.663483,0.388874,0.669911,0.413895,0.641097,0.770468,0.017613,0.046743,0.041723,0.030579,0.033338,0.028468,0.014771


In [None]:
# Same MLP cross-validation as the balanced-accuracy cell in this notebook,
# but scored with plain accuracy (accuracy_score).
n_split=5
random_state=12
multilabel=True
param_grid = ParameterGrid(parameters)
index_values=list(param_grid)
# Seven fold-mean columns followed by the matching fold-standard-deviation columns.
columns=['acc_2_2','acc_4_2','acc_4_4','acc_m_2','acc_m_4','acc_m_m1','acc_m_m2','std_acc_2_2','std_acc_4_2','std_acc_4_4','std_acc_m_2','std_acc_m_4','std_acc_m_m1','std_acc_m_m2']

y_m_4=multilabel_to_4classes(y)
y_m_2=multilabel_to_2classes(y)

rskf=StratifiedKFold(n_split,random_state=random_state,shuffle=True)
resultados=np.zeros((len(param_grid),len(columns)))

for c,params in enumerate(param_grid):
  clasificador.set_params(**params)
  test_score=np.zeros((n_split,7))
  i=0  # fold counter, used as the row index of test_score
  for train_index, test_index in rskf.split(x,y_m_4):# stratified with respect to the 4 classes
    x_train,x_test=x[train_index],x[test_index]
    
    # Multilabel: two sigmoid outputs trained with binary cross-entropy and no class weights
    if multilabel==True:
      y_train,y_test=y[train_index],y[test_index]
      clasificador.set_params(**{'output_dim':2,'class_weight':None,'activation':'sigmoid','loss':'binary_crossentropy'})
      clasificador.fit(x_train, y_train)
      # Threshold the predicted probabilities at 0.5 to get boolean labels.
      y_pred=clasificador.predict_proba(x_test)>0.5

      y_pred_m_4 = multilabel_to_4classes(y_pred)
      y_pred_m_2 = multilabel_to_2classes(y_pred)
      y_test_m_4 = multilabel_to_4classes(y_test)
      y_test_m_2 = multilabel_to_2classes(y_test)

      test_score[i,3] = accuracy_score(y_test_m_2,y_pred_m_2)
      test_score[i,4] = accuracy_score(y_test_m_4,y_pred_m_4)
      # Mean of the two per-label accuracies.
      test_score[i,5] = (accuracy_score(y_test[:,0],y_pred[:,0])+accuracy_score(y_test[:,1],y_pred[:,1]))/2
      # Fraction of individual label entries predicted correctly.
      test_score[i,6] = np.sum(y_test==y_pred)/y_test.size
    # 4 classes: one-hot training targets, four softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_4[train_index]),y_m_4[test_index]
    clasificador.set_params(**{'output_dim':4,'class_weight':{3:0.484,2:0.9,1:1.92,0:3.28},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    y_pred_4_2 = from_4_to_2classes(y_pred)
    y_test_4_2 = from_4_to_2classes(y_test)

    test_score[i,1] = accuracy_score(y_test_4_2,y_pred_4_2)
    test_score[i,2] = accuracy_score(y_test,y_pred)  
    # 2 classes: one-hot training targets, two softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_2[train_index]),y_m_2[test_index]
    clasificador.set_params(**{'output_dim':2,'class_weight':{1:0.97,0:1.03},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    test_score[i,0] = accuracy_score(y_test,y_pred)    

    i+=1
  # One result row per combination: fold means followed by fold standard deviations.
  resultados[c] = np.concatenate((test_score.mean(axis=0),test_score.std(axis=0)),axis=0)
df_mlp = pd.DataFrame(data=resultados,index=index_values,columns=columns)

In [None]:
# Display the MLP cross-validation results (plain accuracy).
df_mlp

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'batch_size': 60, 'hidden_layer': 183, 'learning_rate': 0.001}",0.671163,0.672375,0.614461,0.679659,0.597638,0.768573,0.768573,0.026889,0.041945,0.028814,0.024487,0.01602,0.011721,0.011721


# RF

In [None]:
# Random forest with balanced class weights: coarse sweep over the number of
# trees and the minimum samples required to split a node.
clasificador = RandomForestClassifier(class_weight='balanced', random_state=12)
parameters = dict(
    n_estimators=[80, 100, 500],
    criterion=['gini'],
    min_samples_split=[10, 20, 30, 50],
    max_features=['auto'],
)
df_rf = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)

In [None]:
# Display the cross-validation results of the coarse random-forest sweep.
df_rf

In [None]:
# Random forest with balanced class weights: finer sweep over the number of
# trees and the minimum samples required to split a node.
clasificador = RandomForestClassifier(class_weight='balanced', random_state=12)
parameters = dict(
    n_estimators=[80, 100, 120, 150],
    criterion=['gini'],
    min_samples_split=[17, 20, 23, 35],
    max_features=['auto'],
)
df_rf = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)

In [None]:
# Display the cross-validation results of the finer random-forest sweep.
df_rf

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 17, 'n_estimators': 80}",0.765523,0.750925,0.570782,0.741852,0.539519,0.735622,0.817161,0.011884,0.007253,0.003991,0.015685,0.013174,0.009919,0.00996
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 17, 'n_estimators': 100}",0.770059,0.754762,0.574514,0.742576,0.544005,0.738171,0.818223,0.013717,0.00882,0.006445,0.014213,0.009795,0.009346,0.009594
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 17, 'n_estimators': 120}",0.767834,0.752677,0.575265,0.74239,0.54536,0.73977,0.819587,0.010132,0.008474,0.009021,0.012736,0.004968,0.006158,0.007655
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 17, 'n_estimators': 150}",0.770312,0.75441,0.575501,0.74471,0.546249,0.740653,0.820497,0.010445,0.013745,0.012154,0.014999,0.008271,0.006683,0.007368
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 20, 'n_estimators': 80}",0.766499,0.745254,0.574596,0.739185,0.542604,0.736747,0.813219,0.010432,0.012158,0.010199,0.013237,0.008338,0.006858,0.007811
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 20, 'n_estimators': 100}",0.767643,0.746418,0.577345,0.740577,0.551065,0.740858,0.815873,0.012092,0.009939,0.007319,0.013885,0.009117,0.006349,0.006935
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 20, 'n_estimators': 120}",0.768584,0.747171,0.577326,0.744023,0.5512,0.74264,0.817313,0.009149,0.010349,0.007815,0.01658,0.007327,0.007351,0.007358
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 20, 'n_estimators': 150}",0.766156,0.745744,0.576038,0.743308,0.551028,0.743447,0.817464,0.008145,0.009334,0.009467,0.016006,0.007247,0.00606,0.006559
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 23, 'n_estimators': 80}",0.76109,0.737941,0.574682,0.736082,0.55635,0.745097,0.812841,0.01133,0.014638,0.006348,0.018609,0.013978,0.009115,0.008379
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 23, 'n_estimators': 100}",0.762414,0.742963,0.580197,0.737109,0.559091,0.746859,0.814887,0.01122,0.015313,0.010007,0.016309,0.011451,0.007125,0.007783


# KNN

In [None]:
# k-nearest neighbours: sweep over neighbourhood size, vote weighting,
# search structure and leaf size.
clasificador = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[2, 5, 10, 20, 50],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree'],
    leaf_size=[30, 90],
)
df_knn = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)
df_knn

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 2, 'weights': 'uniform'}",0.805414,0.805414,0.611526,0.796475,0.609091,0.779195,0.860445,0.007814,0.007814,0.015524,0.008339,0.01679,0.011604,0.006224
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 2, 'weights': 'distance'}",0.820584,0.820584,0.683137,0.820584,0.683137,0.824826,0.860673,0.008507,0.008507,0.016549,0.008507,0.016549,0.010887,0.006661
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 5, 'weights': 'uniform'}",0.818386,0.815972,0.647842,0.815471,0.659291,0.810366,0.864842,0.006149,0.004504,0.009059,0.00339,0.009946,0.008634,0.003808
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 5, 'weights': 'distance'}",0.82099,0.824072,0.674604,0.819339,0.665362,0.813833,0.86704,0.006713,0.005296,0.007409,0.00455,0.011294,0.008668,0.004234
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 10, 'weights': 'uniform'}",0.789647,0.773732,0.57749,0.771001,0.554568,0.745675,0.840358,0.007492,0.010364,0.012676,0.004558,0.00602,0.005212,0.003206
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 10, 'weights': 'distance'}",0.811102,0.801872,0.631366,0.801233,0.621385,0.786624,0.856504,0.009943,0.008726,0.011234,0.009317,0.013143,0.009368,0.005293
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 20, 'weights': 'uniform'}",0.752012,0.730384,0.489794,0.727247,0.475175,0.692151,0.81049,0.007188,0.0112,0.014629,0.009312,0.012255,0.008224,0.007257
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 20, 'weights': 'distance'}",0.774888,0.760076,0.544878,0.756786,0.529137,0.730853,0.828911,0.006781,0.005435,0.009745,0.009969,0.020887,0.010288,0.008188
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 50, 'weights': 'uniform'}",0.709961,0.67088,0.399722,0.669358,0.395352,0.627872,0.774104,0.012486,0.009642,0.001751,0.007916,0.00847,0.005891,0.005619
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 50, 'weights': 'distance'}",0.735671,0.694376,0.43052,0.698088,0.425724,0.653745,0.789796,0.011388,0.010368,0.006598,0.011667,0.008645,0.007834,0.007317


In [None]:
def kfold_model_selection(x, y, clasificador, parameters,multilabel,n_split=5,random_state=12,scorer=accuracy_score):
  '''
  Grid-search a classifier with stratified k-fold cross-validation, scoring
  with plain accuracy by default.

  NOTE: this redefinition shadows the kfold_model_selection defined earlier in
  the notebook (which scores with balanced accuracy); every cell executed
  after this one uses this version.

  inputs:
    x: feature matrix, one row per sample
    y: target in multilabel format (two columns)
    clasificador: estimator exposing set_params / fit / predict; it is
      reconfigured and refitted in place
    parameters: (dict) hyper-parameter grid to explore
    multilabel: when truthy, the estimator is also fitted directly on the
      two-column target; otherwise the acc_m_* columns stay at 0
    n_split: number of cross-validation folds
    random_state: seed of the shuffled fold assignment
    scorer: callable(y_true, y_pred) -> float used for every score except
      acc_m_m2; defaults to accuracy_score
  return:
    DataFrame with one row per hyper-parameter combination holding the mean
    and the standard deviation (std_ prefix) over the folds of:
      acc_2_2: trained on 2 classes, scored on 2 classes
      acc_4_2: trained on 4 classes, scored after collapsing to 2 classes
      acc_4_4: trained on 4 classes, scored on 4 classes
      acc_m_2: trained multilabel, scored after converting to 2 classes
      acc_m_4: trained multilabel, scored after converting to 4 classes
      acc_m_m1: mean of the two per-label scores
      acc_m_m2: fraction of individual label entries predicted correctly
  '''
  metric_names=['acc_2_2','acc_4_2','acc_4_4','acc_m_2','acc_m_4','acc_m_m1','acc_m_m2']
  columns=metric_names+['std_'+name for name in metric_names]

  param_grid = ParameterGrid(parameters)
  index_values=list(param_grid)

  y_m_4=multilabel_to_4classes(y)
  y_m_2=multilabel_to_2classes(y)

  rskf=StratifiedKFold(n_split,random_state=random_state,shuffle=True)
  resultados=np.zeros((len(param_grid),len(columns)))

  for c,params in enumerate(param_grid):
    clasificador.set_params(**params)
    test_score=np.zeros((n_split,len(metric_names)))
    # Folds are stratified with respect to the 4-class labels.
    for i,(train_index, test_index) in enumerate(rskf.split(x,y_m_4)):
      x_train,x_test=x[train_index],x[test_index]

      # Multilabel formulation (optional)
      if multilabel:
        y_train,y_test=y[train_index],y[test_index]
        clasificador.fit(x_train, y_train)
        y_pred=clasificador.predict(x_test)

        test_score[i,3] = scorer(multilabel_to_2classes(y_test),multilabel_to_2classes(y_pred))
        test_score[i,4] = scorer(multilabel_to_4classes(y_test),multilabel_to_4classes(y_pred))
        test_score[i,5] = (scorer(y_test[:,0],y_pred[:,0])+scorer(y_test[:,1],y_pred[:,1]))/2
        test_score[i,6] = np.sum(y_test==y_pred)/y_test.size

      # 4-class formulation
      y_train,y_test=y_m_4[train_index],y_m_4[test_index]
      clasificador.fit(x_train, y_train)
      y_pred=clasificador.predict(x_test)

      test_score[i,1] = scorer(from_4_to_2classes(y_test),from_4_to_2classes(y_pred))
      test_score[i,2] = scorer(y_test,y_pred)

      # 2-class formulation
      y_train,y_test=y_m_2[train_index],y_m_2[test_index]
      clasificador.fit(x_train, y_train)
      y_pred=clasificador.predict(x_test)

      test_score[i,0] = scorer(y_test,y_pred)

    # One result row per combination: fold means followed by fold standard deviations.
    resultados[c] = np.concatenate((test_score.mean(axis=0),test_score.std(axis=0)),axis=0)
  return pd.DataFrame(data=resultados,index=index_values,columns=columns)
# Re-evaluate one KNN configuration (2 neighbours, distance weighting,
# ball tree, leaf size 30) with the function defined just above.
clasificador = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[2],
    weights=['distance'],
    algorithm=['ball_tree'],
    leaf_size=[30],
)
df_knn = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)
df_knn

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 2, 'weights': 'distance'}",0.820345,0.820345,0.751668,0.820345,0.751668,0.860673,0.860673,0.008418,0.008418,0.010415,0.008418,0.010415,0.006661,0.006661


# LOG


In [None]:
# Logistic regression with balanced class weights: sweep over the inverse
# regularisation strength C and two solvers.
clasificador = LogisticRegression(class_weight='balanced', random_state=12, max_iter=5000)
parameters = dict(
    C=[100, 200, 300, 500, 1000, 5000],
    solver=['newton-cg', 'lbfgs'],
)
df_log = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=False, n_split=5, random_state=12,
)
df_log

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'C': 100, 'solver': 'newton-cg'}",0.649661,0.633933,0.463125,0.0,0.0,0.0,0.0,0.009525,0.010394,0.011794,0.0,0.0,0.0,0.0
"{'C': 100, 'solver': 'lbfgs'}",0.64966,0.633913,0.463136,0.0,0.0,0.0,0.0,0.009924,0.010359,0.011812,0.0,0.0,0.0,0.0
"{'C': 200, 'solver': 'newton-cg'}",0.649994,0.63366,0.463145,0.0,0.0,0.0,0.0,0.00944,0.010628,0.009679,0.0,0.0,0.0,0.0
"{'C': 200, 'solver': 'lbfgs'}",0.649837,0.633483,0.462865,0.0,0.0,0.0,0.0,0.009614,0.010077,0.010382,0.0,0.0,0.0,0.0
"{'C': 300, 'solver': 'newton-cg'}",0.649857,0.633503,0.463009,0.0,0.0,0.0,0.0,0.009709,0.010496,0.009788,0.0,0.0,0.0,0.0
"{'C': 300, 'solver': 'lbfgs'}",0.649857,0.633356,0.463072,0.0,0.0,0.0,0.0,0.009928,0.010519,0.009823,0.0,0.0,0.0,0.0
"{'C': 500, 'solver': 'newton-cg'}",0.6497,0.633209,0.462571,0.0,0.0,0.0,0.0,0.010133,0.01073,0.010463,0.0,0.0,0.0,0.0
"{'C': 500, 'solver': 'lbfgs'}",0.649867,0.633796,0.462865,0.0,0.0,0.0,0.0,0.010186,0.011062,0.010526,0.0,0.0,0.0,0.0
"{'C': 1000, 'solver': 'newton-cg'}",0.649407,0.633032,0.461363,0.0,0.0,0.0,0.0,0.010664,0.010441,0.012048,0.0,0.0,0.0,0.0
"{'C': 1000, 'solver': 'lbfgs'}",0.649563,0.633356,0.462645,0.0,0.0,0.0,0.0,0.010557,0.010418,0.01121,0.0,0.0,0.0,0.0


# MFCCs

In [8]:
# MFCC-only experiment: reload the feature table and keep the 26 MFCC columns
# (13 ending in " m" and 13 ending in " s").
path='/content/drive/MyDrive/Respiratory Sounds Final/Resultados/Caracteristicas_50ms.csv'
df_data=pd.read_csv(path,index_col=0)
x = df_data.loc[:,['mfcc1 m', ' mfcc2 m', ' mfcc3 m', ' mfcc4 m', ' mfcc5 m', ' mfcc6 m',
       ' mfcc7 m', ' mfcc8 m', ' mfcc9 m', ' mfcc10 m', ' mfcc11 m',
       ' mfcc12 m', ' mfcc13 m', ' mfcc1 s', ' mfcc2 s', 
        ' mfcc3 s', ' mfcc4 s', ' mfcc5 s',
       ' mfcc6 s', ' mfcc7 s', ' mfcc8 s', ' mfcc9 s', ' mfcc10 s',
       ' mfcc11 s', ' mfcc12 s', ' mfcc13 s']].values
# Take the target from the frame loaded in this cell rather than from the
# global `data` left behind by an earlier cell, so this cell does not depend
# on hidden kernel state. Last two columns: multilabel target (crackles, wheezes).
y = df_data.values[:,-2:]
# Rescale every feature to [-1, 1]. The arguments are passed by keyword because
# recent scikit-learn releases accept `copy` only as a keyword argument.
scaler = MinMaxScaler(feature_range=(-1,1),copy=True)
x=scaler.fit_transform(x)

In [9]:
# KNN (2 neighbours, distance weighting, ball tree, leaf size 30) evaluated
# on the MFCC-only features.
clasificador = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[2],
    weights=['distance'],
    algorithm=['ball_tree'],
    leaf_size=[30],
)
df_knn = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)
df_knn

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 2, 'weights': 'distance'}",0.822093,0.822093,0.681023,0.822093,0.681023,0.825038,0.860218,0.004141,0.004141,0.010826,0.004141,0.010826,0.008234,0.004966


In [10]:
# Random forest (120 trees, min_samples_split 23, balanced class weights)
# evaluated on the MFCC-only features.
clasificador = RandomForestClassifier(class_weight='balanced', random_state=12)
parameters = dict(
    n_estimators=[120],
    criterion=['gini'],
    min_samples_split=[23],
    max_features=['auto'],
)
df_rf = kfold_model_selection(
    x, y, clasificador, parameters,
    multilabel=True, n_split=5, random_state=12,
)
df_rf

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'criterion': 'gini', 'max_features': 'auto', 'min_samples_split': 23, 'n_estimators': 120}",0.763344,0.741186,0.569607,0.738212,0.554682,0.744521,0.808898,0.008907,0.01421,0.006673,0.015479,0.014131,0.011878,0.011942


In [11]:
def KerasMLPClassifier(input_dim,output_dim,hidden_layer=10,activation='tanh',batch_size=32,learning_rate=0.001,epochs=500, class_weight={1:0.23,0:0.77},loss='categorical_crossentropy'):
  '''
  Build and compile a multilayer perceptron with a single hidden layer.

  NOTE: this repeats the KerasMLPClassifier defined earlier in the notebook.

  input_dim: number of input features
  output_dim: number of units of the output layer
  hidden_layer: number of units of the hidden layer (its activation is always ReLU)
  activation: activation of the output layer
  learning_rate: learning rate of the Adam optimizer
  loss: loss handed to compile
  batch_size, epochs, class_weight: accepted in the signature but not used
    while building the model
  Returns the compiled Sequential model.
  '''
  layers = [
      kr.layers.Dense(hidden_layer, input_dim=input_dim, activation='relu'),
      kr.layers.Dense(output_dim, activation=activation),
  ]
  model = kr.Sequential(layers)

  adam = kr.optimizers.Adam(learning_rate=learning_rate)
  model.compile(loss=loss, optimizer=adam, metrics=['binary_accuracy'])
  return model

In [12]:
# Cross-validate the Keras MLP on the MFCC-only features (26 inputs) for the
# multilabel, 4-class and 2-class formulations, scoring with balanced accuracy.
clasificador=kr.wrappers.scikit_learn.KerasClassifier(build_fn=KerasMLPClassifier,input_dim=26,output_dim=1,verbose=0,hidden_layer=183,activation='softmax',batch_size=60,class_weight={1:0.65,0:2.157},
                                                      learning_rate=0.001,epochs=500,callbacks=kr.callbacks.EarlyStopping(monitor='val_loss', patience=50,restore_best_weights=True),
                                                      validation_split=0.1,loss='categorical_crossentropy')
# Single configuration to evaluate.
parameters={'hidden_layer':[183], 
            'learning_rate':[0.001], 
            'batch_size':[60]}
n_split=5
random_state=12
multilabel=True
param_grid = ParameterGrid(parameters)
index_values=list(param_grid)
# Seven fold-mean columns followed by the matching fold-standard-deviation columns.
columns=['acc_2_2','acc_4_2','acc_4_4','acc_m_2','acc_m_4','acc_m_m1','acc_m_m2','std_acc_2_2','std_acc_4_2','std_acc_4_4','std_acc_m_2','std_acc_m_4','std_acc_m_m1','std_acc_m_m2']

y_m_4=multilabel_to_4classes(y)
y_m_2=multilabel_to_2classes(y)

rskf=StratifiedKFold(n_split,random_state=random_state,shuffle=True)
resultados=np.zeros((len(param_grid),len(columns)))

for c,params in enumerate(param_grid):
  clasificador.set_params(**params)
  test_score=np.zeros((n_split,7))
  i=0  # fold counter, used as the row index of test_score
  for train_index, test_index in rskf.split(x,y_m_4):# stratified with respect to the 4 classes
    x_train,x_test=x[train_index],x[test_index]
    
    # Multilabel: two sigmoid outputs trained with binary cross-entropy and no class weights
    if multilabel==True:
      y_train,y_test=y[train_index],y[test_index]
      clasificador.set_params(**{'output_dim':2,'class_weight':None,'activation':'sigmoid','loss':'binary_crossentropy'})
      clasificador.fit(x_train, y_train)
      # Threshold the predicted probabilities at 0.5 to get boolean labels.
      y_pred=clasificador.predict_proba(x_test)>0.5

      y_pred_m_4 = multilabel_to_4classes(y_pred)
      y_pred_m_2 = multilabel_to_2classes(y_pred)
      y_test_m_4 = multilabel_to_4classes(y_test)
      y_test_m_2 = multilabel_to_2classes(y_test)

      test_score[i,3] = balanced_accuracy_score(y_test_m_2,y_pred_m_2)
      test_score[i,4] = balanced_accuracy_score(y_test_m_4,y_pred_m_4)
      # Mean of the two per-label balanced accuracies.
      test_score[i,5] = (balanced_accuracy_score(y_test[:,0],y_pred[:,0])+balanced_accuracy_score(y_test[:,1],y_pred[:,1]))/2
      # Fraction of individual label entries predicted correctly.
      test_score[i,6] = np.sum(y_test==y_pred)/y_test.size
    # 4 classes: one-hot training targets, four softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_4[train_index]),y_m_4[test_index]
    clasificador.set_params(**{'output_dim':4,'class_weight':{3:0.484,2:0.9,1:1.92,0:3.28},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    y_pred_4_2 = from_4_to_2classes(y_pred)
    y_test_4_2 = from_4_to_2classes(y_test)

    test_score[i,1] = balanced_accuracy_score(y_test_4_2,y_pred_4_2)
    test_score[i,2] = balanced_accuracy_score(y_test,y_pred)  
    # 2 classes: one-hot training targets, two softmax outputs, explicit class weights
    y_train,y_test=np_utils.to_categorical(y_m_2[train_index]),y_m_2[test_index]
    clasificador.set_params(**{'output_dim':2,'class_weight':{1:0.97,0:1.03},'activation':'softmax','loss':'categorical_crossentropy'})
    clasificador.fit(x_train, y_train)
    y_pred=clasificador.predict(x_test)

    test_score[i,0] = balanced_accuracy_score(y_test,y_pred)    

    i+=1
  # One result row per combination: fold means followed by fold standard deviations.
  resultados[c] = np.concatenate((test_score.mean(axis=0),test_score.std(axis=0)),axis=0)
df_mlp = pd.DataFrame(data=resultados,index=index_values,columns=columns)

In [13]:
# Display the MLP cross-validation results for the MFCC-only features.
df_mlp

Unnamed: 0,acc_2_2,acc_4_2,acc_4_4,acc_m_2,acc_m_4,acc_m_m1,acc_m_m2,std_acc_2_2,std_acc_4_2,std_acc_4_4,std_acc_m_2,std_acc_m_4,std_acc_m_m1,std_acc_m_m2
"{'batch_size': 60, 'hidden_layer': 183, 'learning_rate': 0.001}",0.682861,0.628611,0.361578,0.688553,0.451835,0.667856,0.779184,0.036791,0.060629,0.052617,0.013822,0.017036,0.013862,0.009434
