In [1]:
#Research references:
#1) Dry/wet cough classification: https://link.springer.com/article/10.1007/s10439-013-0741-6
#2) Pneumonia classification: https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6987276

In [2]:
import numpy as np
import os
import sox
#import pywt #wavelets
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
from pydub.playback import play
import matplotlib.pyplot as plt
import seaborn as sn
import python_speech_features as spe_feats
import pandas as pd
from scipy.stats import kurtosis, skew, entropy
from scipy.signal import lfilter
import librosa
import math
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix

## Settings

In [3]:
##DATASET

#folder of training data set
FOLDER_PATH = 'data/YT_set/edited_wavs/'

#folder where normalized wavs are stored
NORM_FOLDER_PATH = 'data/YT_set/edited_wavs/norm/'
norm_skip = False #skip normalization step (if done previously)


##FEATUTRES

featExtr_skip = False

#Initialize data frame of features:

feats = pd.DataFrame([])

#tiny constant value
eps = sys.float_info.epsilon

#Features' settings:

fs_targ = 16000 # set all audios to this sampling frequency
n_channels_targ = 1

#framing
frame_len_s=0.025 #12 segments seemed adequeate in paper, since segments are no longer than 400ms (400ms/12=33.3ms)
frame_step_s=frame_len_s #according to paper: non-overlapping frames

frame_len = int(round(frame_len_s*fs_targ)) #in samples
frame_step = int(round(frame_step_s*fs_targ)) #in samples
win_func =np.hamming #at least for mfcc

#mfcc
cep_num= 13 #number of coefficients as in paper (https://link.springer.com/article/10.1007/s10439-013-0741-6)

#lp
lp_ord = int(round(2 + fs_targ/1000)) #standard rule of thumb for LP oder

#formants
nr_formants = 4 #as in paper, first 4 formants

## Functions

In [4]:
#Apply pre-emphasis (high-pass) filter
def apply_preEmph(x):
    x_filt = lfilter([1., -0.97], 1, x)
    return x_filt
        
#Obtain autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[int((result.size+1)/2):] #Note: other people use re.size/2:, but this does not work for me 
                                   # TODO: check consistency in other computers

#Compute zero-crossing rate
def get_zcr(x):
    zcr = (((x[:-1] * x[1:]) < 0).sum())/(len(x)-1)
    return zcr

#Compute log-energy
def get_logEnergy(x):
    logEnergy = np.log10( ( (np.power(x,2)).sum()/len(x) ) + eps)  
    return logEnergy

#Estimate fundamental frequency (F0)
def get_F0(x,fs):
    #autocorrelation-based method to extract F0
    xcorr_arr = autocorr(x)
    
    #looking for F0 in the frequency interval 50-500Hz, but we search in time domain
    min_ms = round(fs/500)
    max_ms = round(fs/50)
    
    xcorr_slot = xcorr_arr[max_ms+1:2*max_ms+1]
    xcorr_slot = xcorr_slot[min_ms:max_ms]
    t0 = np.argmax(xcorr_slot)
    F0 = fs/(min_ms+t0-1)
    return F0

#Estimate formants
def get_formants(x, lp_order, nr_formants):
    
    #compute lp coefficients
    a = librosa.lpc(x, lp_ord)
    

    #get roots from lp coefficients
    rts = np.roots(a)
    rts = [r for r in rts if np.imag(r) >= 0]

    #get angles
    angz = np.arctan2(np.imag(rts), np.real(rts))

    #get formant frequencies
    formants = sorted(angz * (fs_targ / (2 * math.pi)))
    
    return formants[0:nr_formants]

#Extract frequencies
def feature_extraction(x,fs,feats_df,lp_ord,ID,label):
#Extract features from signal x (identified as ID), and concatenate them to dataframe feats_df
#Features' reference: (see Appendix)
#[1]https://link.springer.com/article/10.1007/s10439-013-0741-6
#[2]https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf?dsi_version=c5434db897ab74b192ca295a9eeca041&Expires=1585086202&Key-Pair-Id=APKAJKNBJ4MJBJNC6NLQ&Signature=c8k8DmG~KIxg0ToTO8rebm2MzHneCzJGkjSFRB7BYTEQ-MHXEr0ocHmISrldP3hFf9qmeiL11ezyefcNeRVeKIQ9PVjOl9pn7rXWcjA1o2voPn1VnDd8n7G2cT31apdj0LNMclhlXRPnCsGD66qDRqa3d-xaqqXhEqU73aw3ZgBgroO213MfJOqFhJxxXo2QEia0bSlDRTeX9KhSczFK-IFTPC6GwFL2L04por8pQRI3HF7E3f26O9zp9OhkwxSU9qfJah20WxZLA4PxREdv7JGoVBinR6T0mTcIaQi~B4IzYjSPSsTTADMNk5znVYIvSqgtMT~DY~qwlfq4SRdFjQ__
  
    
    #do features in a frame-basis
    x_frames = spe_feats.sigproc.framesig(x,frame_len,frame_step,win_func) #DOUBT: should I use window or not?
                                                                        #at least for formant estimation i should

    nr_frames = x_frames.shape[0]
    #print(nr_frames)
        
    #0)Wavelets #TODO
    
    #DOUBT: if log-energy feature is included, should I also include the first mfcc coefficient (c0) ?
    #1)mfcc
    mfcc_feat = spe_feats.mfcc(x,fs, winlen=frame_len_s,winstep=frame_step_s, numcep=cep_num,winfunc=win_func)
    
    #deltas to capture 
    mfcc_delta_feat = spe_feats.delta(mfcc_feat,1) #mfcc_delta_feat = np.subtract(mfcc_feat[:-1], mfcc_feat[1:]) #same
    mfcc_deltadelta_feat = spe_feats.delta(mfcc_delta_feat,1)          
    
    #2)zero-crossing rate
    zcr_feat = np.apply_along_axis(get_zcr, 1, x_frames)
    
    #3)Formant frequencies
    #using LP-coeffcs-based method
    #formant_feat = np.apply_along_axis(get_formants, 1, x_frames, lp_ord, nr_formants)
    
    #Note: for the moment, it seems some frames are ill-conditioned for lp computing,
    #current solution - we skip those and fill with NaN values
    formants_feat= np.empty((nr_frames,4))
    formants_feat[:] = np.nan
    
    for i_frame in range(0,nr_frames):
        try: 
            formants_feat[i_frame] = get_formants(x_frames[i_frame], lp_ord, nr_formants)
        except:
            pass
    
    #4)Log-energy
    logEnergy_feat =  np.apply_along_axis(get_logEnergy, 1, x_frames)
    
    #5)Pitch (F0)
    F0_feat =  np.apply_along_axis(get_F0, 1, x_frames,fs)
    
    #TODO: compute also F0 with pysptk (a python wrapper for SPTK library), it probably gives better results
    #https://github.com/r9y9/pysptk/blob/master
    
    #6)Kurtosis
    kurt_feat =  np.apply_along_axis(kurtosis, 1, x_frames)
    
    #7)Bispectrum Score (BGS)
    #TODO: see PhD thesis for more info on this feature
    
    #8)Non-Gaussianity Score (NGS)
    #TODO: see PhD thesis for more info on this feature
   
    #9) Adding skewness as measure of non-gaussianity (not in paper)
    skew_feat =  np.apply_along_axis(skew, 1, x_frames)
    
    #DOUBT: 10) Shannon entropy GETTING -inf in all cases, WHY??? Don't include until fixed
    #entropy_feat = entropy(x)
    #Maybe compute directly to check
    
    mfcc_cols = ['mfcc_%s' % s for s in range(0,cep_num)]
    mfcc_delta_cols = ['mfcc_d%s' % s for s in range(0,cep_num)]
    mfcc_deltadelta_cols = ['mfcc_dd%s' % s for s in range(0,cep_num)]
    formants_cols = ['F%s' % s for s in range(1,nr_formants+1)]
          
    feats_segment = pd.concat([pd.DataFrame({'Id': ID, 'kurt': kurt_feat, 'logEnergy': logEnergy_feat,
                                                 'zcr': zcr_feat, 'F0': F0_feat,
                                                 'skewness': skew_feat, 'label': label}),
                               pd.DataFrame(mfcc_feat,columns=mfcc_cols), 
                            pd.DataFrame(formants_feat,columns=formants_cols)],axis=1)
    
    print(nr_frames)
    feats_df = feats_df.append(feats_segment,ignore_index=True, sort=False)
    
    return feats_df


# MAIN

## Reading recordings + feature extraction

In [5]:
#Read wav data set, apply pre-processing and extract features

if featExtr_skip is False:

    #only list files in FOLDER_PATH directory
    wav_files = [f for f in os.listdir(FOLDER_PATH) if os.path.isfile(os.path.join(FOLDER_PATH, f))]
    for file_name in wav_files:
    
        fname_noExt = os.path.splitext(file_name)[0] #file name without extension
    
        #full path file name
        full_fname = FOLDER_PATH+file_name
        #print(full_fname)
    
        #name for normalization
        norm_fname = NORM_FOLDER_PATH + os.path.splitext(file_name)[0] + '_NORM.wav'
    
        if norm_skip is False: 
        ## Normalization
        
            #level to same dB
            tfm = sox.Transformer()
            tfm.gain(gain_db=0.0, normalize=False, limiter=False, balance=None)
            #downsample to 16kHz and 1 channel
            tfm.convert(samplerate=fs_targ, n_channels=n_channels_targ, bitdepth=None) 
            #tfm.norm(db_level=0.0)
    
            # create the output normalized audio
        
            print(norm_fname)
            tfm.build(full_fname, norm_fname)
            tfm.effects_log
    
        # load normalized audio
        s = AudioSegment.from_wav(norm_fname)
        #sampling rate:
        info = mediainfo(norm_fname)
        fs = float(info['sample_rate'])
    
        #get ID of recording
        ID = fname_noExt.split('-')[-2] #for the current type of naming
        #print(file_name)
        #print(ID)
    
        #get label
        label = fname_noExt.split('-')[-1] #for the current type of naming
        #print(label)
    
        ## Segmentation of cough streams (silence-based)
        #min_silence_len in ms, silence_thresh in dB
        s_segments = split_on_silence (s, min_silence_len = 600, silence_thresh = -30)
        ## TODO: set more accurate thresholds, or find other way to split (variance-based?)
    
        #checks that segmentation and removal of silence is OK
        #print(len(s_segments))
        #for i in range(len(s_segments)):
        #    play(s_segments[i])
        #    input("Press Enter to continue...")
            
    
        #convert s_segments to numpy array format
        AudioSegment2numpy_arr = lambda x: np.asarray(x.get_array_of_samples())
        s_segments_np = list(map(AudioSegment2numpy_arr, s_segments))
    
        print('High-pass filtering...')
        #pre-emphasis filtering to each segment
        preEmph_filtering = lambda x: apply_preEmph(x)
        s_segments_filt = list(map(preEmph_filtering, s_segments_np))
    
        print('Computing features...')
        #Feature extraction for each segment
    
        #(lambda function doesn't work )
        #feat_extr_step = lambda x, fs, feats_df, lp_ord, ID: feature_extraction(x,fs,feats_df,lp_ord,ID)
        #feats = feat_extr_step(s_segments_filt,fs,feats,lp_ord,ID)
        for idx, seg_i in enumerate(s_segments_filt):
            print('\tSegment %d' % idx)
            feats = feature_extraction(seg_i,fs,feats,lp_ord,ID,label)
    
       

data/YT_set/edited_wavs/norm/edit_Spring Allergy Coughing-7Ez5Wc_esBg-Dry_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
60
data/YT_set/edited_wavs/norm/edit_Coughing 51-LkxvBb2VXbs-Dry_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
17
	Segment 1
30
	Segment 2
40
data/YT_set/edited_wavs/norm/edit_Wet coughing-0QQxKN-KC1U-Wet_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
24
	Segment 1
44
data/YT_set/edited_wavs/norm/edit_Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
31
	Segment 1
28
data/YT_set/edited_wavs/norm/edit_Coughing 77-2Mw-s5jnqXU-Wet_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
51
data/YT_set/edited_wavs/norm/edit_Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet_NORM.wav
High-pass filtering...
Computing features...
	Segment 0
61
data/YT_set/edited_wavs/norm/edit_Coughing 46-dg-I9j76-t8-Wet_NORM.wav
High-pass filtering...
Computing featu

## Load  (or store) features 

In [6]:

feats_fname = 'feats_df.pkl'

if featExtr_skip is False:
    #Store feature df
    feats.to_pickle(feats_fname)
else:
    #Load feature df
    feats = pd.read_pickle(feats_fname)

## Pre-processing of features

In [7]:
#1.Check which columns have NaNs values

#feats2 = feats.copy()

#sum(feats.isna().any())
#feats.columns[feats.isna().any()].tolist() --> We get just the ones we have inserted in formants
feats2 = feats.interpolate(method ='cubic')
#feats2 = feats.dropna(axis=0).copy()
#feats2.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True)

#feats2.columns[feats2.isna().any()].tolist()
#feats2.describe()


In [8]:
sum(feats2.isna().any())

0

In [9]:
#Grouping the frames from a same recording (Id) into chunks with the same number of frames.
#The training of the classifier will be based on these chunks

feats2['cum_IDidx'] = feats2.groupby('Id').cumcount()

def get_subidx(cum_Idx,batch_size):
    #batch needs to be an integer (or float like 3.0)
    return int(1.0*cum_Idx/batch_size)

feats2['subIdx'] = feats2.apply(lambda x: get_subidx(x['cum_IDidx'], 10), axis=1)
feats2 = feats2.drop(['cum_IDidx'],axis=1)

In [10]:
feats2

Unnamed: 0,Id,kurt,logEnergy,zcr,F0,skewness,label,mfcc_0,mfcc_1,mfcc_2,...,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,F1,F2,F3,F4,subIdx
0,7Ez5Wc_esBg,1.168570,2.405370,0.689223,484.848485,-0.105375,Dry,11.999428,-45.464501,6.818047,...,-16.963507,-6.881741,-13.180380,-3.492780,-21.028229,391.351112,1281.812825,2428.496341,3391.462908,0
1,7Ez5Wc_esBg,1.569533,2.214362,0.679198,457.142857,0.296251,Dry,11.503662,-45.137388,6.237469,...,-14.862523,1.838453,-15.617656,-13.608101,-21.572512,414.528611,1341.149532,2411.457709,3350.163245,0
2,7Ez5Wc_esBg,2.262644,2.073283,0.644110,333.333333,-0.255717,Dry,11.138907,-42.917045,-1.345602,...,-4.938680,-1.137625,-3.248927,-7.565779,-22.319002,591.076879,1275.152320,2288.820831,3305.290029,0
3,7Ez5Wc_esBg,3.417739,1.904354,0.714286,484.848485,0.058650,Dry,10.827765,-48.130098,-0.769200,...,-15.751260,-21.558265,-13.471156,-13.788487,-14.817594,0.000000,0.000000,939.580354,2328.960331,0
4,7Ez5Wc_esBg,2.230491,1.878562,0.699248,516.129032,0.060720,Dry,10.741783,-45.923854,6.025869,...,-4.128015,-7.916224,-14.731514,-14.746401,-14.500753,387.310873,1852.353697,2277.711294,3022.205605,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,ct3tHDfNKiQ,3.176705,6.818322,0.591479,516.129032,-0.336304,Wet,21.814211,-54.730372,1.318455,...,17.943542,22.944651,7.129800,7.599111,-9.795062,524.391642,1842.834631,2534.623086,3523.236822,5
2019,ct3tHDfNKiQ,5.542308,6.680917,0.593985,484.848485,-0.521756,Wet,21.630536,-54.268699,4.046790,...,31.217998,30.376779,-8.056270,-14.748922,-1.744284,583.837589,1678.182074,2869.777027,3614.657071,5
2020,ct3tHDfNKiQ,1.139484,5.746670,0.686717,444.444444,-0.069039,Wet,19.611167,-59.864258,-6.594004,...,9.127842,12.204370,8.759876,3.143983,-9.832523,601.454279,1872.906688,2512.884619,3340.146590,5
2021,ct3tHDfNKiQ,1.232797,5.711783,0.686717,390.243902,0.259294,Wet,19.570157,-63.365566,-7.314727,...,7.624828,26.349348,-10.205445,-3.653975,-2.570938,609.073950,1772.727321,2608.913676,3298.395489,5


In [11]:
mean_feats = feats2.groupby(['Id','subIdx']).aggregate('mean').reset_index()
std_feats = feats2.groupby(['Id','subIdx']).agg(lambda x: x.std(ddof=0)).reset_index() #ddof=0 to compute population std (rather than sample std)
keep_same = {'Id', 'subIdx'}
mean_feats.columns = ['{}{}'.format(c, '' if c in keep_same else '_m') for c in mean_feats.columns]
std_feats.columns = ['{}{}'.format(c, '' if c in keep_same else '_std') for c in std_feats.columns]

In [12]:
sum(std_feats.isna().any())

0

In [13]:
mean_std_feats = pd.merge(mean_feats, std_feats, on=['Id','subIdx'], how='outer')

In [14]:
#Make dictionary and add label column using it 
feats_unique = feats.drop_duplicates(subset=['Id'])
label_dict = dict(zip(feats_unique.Id, feats_unique.label))
mean_std_feats['label'] = mean_std_feats["Id"].map(label_dict)
#mean_std_feats[['Id','label']].head(50)

In [15]:
mean_std_feats.columns

Index(['Id', 'subIdx', 'kurt_m', 'logEnergy_m', 'zcr_m', 'F0_m', 'skewness_m',
       'mfcc_0_m', 'mfcc_1_m', 'mfcc_2_m', 'mfcc_3_m', 'mfcc_4_m', 'mfcc_5_m',
       'mfcc_6_m', 'mfcc_7_m', 'mfcc_8_m', 'mfcc_9_m', 'mfcc_10_m',
       'mfcc_11_m', 'mfcc_12_m', 'F1_m', 'F2_m', 'F3_m', 'F4_m', 'F0_std',
       'F1_std', 'F2_std', 'F3_std', 'F4_std', 'kurt_std', 'logEnergy_std',
       'mfcc_0_std', 'mfcc_1_std', 'mfcc_10_std', 'mfcc_11_std', 'mfcc_12_std',
       'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std', 'mfcc_5_std', 'mfcc_6_std',
       'mfcc_7_std', 'mfcc_8_std', 'mfcc_9_std', 'skewness_std', 'zcr_std',
       'label'],
      dtype='object')

In [16]:
mean_std_feats.describe()

Unnamed: 0,subIdx,kurt_m,logEnergy_m,zcr_m,F0_m,skewness_m,mfcc_0_m,mfcc_1_m,mfcc_2_m,mfcc_3_m,...,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,skewness_std,zcr_std
count,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0,...,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0,218.0
mean,2.715596,3.407311,4.252936,0.461233,441.213319,-0.006601,15.529144,-42.826797,-15.704705,-18.383076,...,6.120164,6.935951,8.268756,9.305648,9.439237,10.351931,9.380189,10.078636,0.259172,0.076492
std,2.052723,2.86536,2.657758,0.130557,26.060573,0.194675,6.602084,9.218986,8.607867,10.243488,...,2.825319,3.240529,3.542907,3.926046,4.015761,4.580077,4.22756,4.408859,0.204213,0.043776
min,0.0,-3.0,-15.65356,0.0,340.785851,-1.927648,-36.043653,-57.598442,-39.207942,-66.106803,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.243495,3.359031,0.384211,423.432401,-0.056251,13.32794,-49.850327,-21.462712,-23.623312,...,4.281148,4.715938,6.053916,6.478014,6.884485,6.98462,6.730671,7.359415,0.175593,0.045537
50%,3.0,2.571446,5.040823,0.484085,441.802036,0.011934,17.126364,-44.840213,-16.227323,-18.078689,...,5.870391,6.535495,7.931373,9.004429,9.122189,9.581584,8.860932,9.570507,0.222459,0.070866
75%,4.0,3.356258,5.714163,0.544236,455.300492,0.067576,19.238771,-37.560775,-9.861587,-12.732412,...,7.411207,8.927786,10.321373,11.233553,11.572127,12.990329,11.634667,12.412888,0.288909,0.095177
max,8.0,26.744905,7.02058,0.723559,516.129032,0.818531,22.277289,0.0,9.069258,18.857106,...,18.457188,20.738214,20.008697,26.31857,30.29664,24.804866,27.604,24.349278,2.603809,0.23108


In [17]:
mean_std_feats.columns[mean_std_feats.isna().any()].tolist() 

[]

In [18]:
mean_std_feats['Id']

0      1UDFq2InljM
1      1UDFq2InljM
2      1UDFq2InljM
3      1UDFq2InljM
4      4k0ziD0j5BI
          ...     
213    zjd4HrJbc8o
214    zjd4HrJbc8o
215    zjd4HrJbc8o
216    zjd4HrJbc8o
217    zjd4HrJbc8o
Name: Id, Length: 218, dtype: object

In [19]:
sum(mean_std_feats.isna().any())

0

In [20]:
#mean_std_feats = mean_std_feats.interpolate(method ='cubic')
#mean_std_feats.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True) #--> Doesn't work!? Still get NaN error

In [21]:
sum(mean_std_feats.isna().any())

0

In [22]:
mean_std_feats['Id']

0      1UDFq2InljM
1      1UDFq2InljM
2      1UDFq2InljM
3      1UDFq2InljM
4      4k0ziD0j5BI
          ...     
213    zjd4HrJbc8o
214    zjd4HrJbc8o
215    zjd4HrJbc8o
216    zjd4HrJbc8o
217    zjd4HrJbc8o
Name: Id, Length: 218, dtype: object

In [23]:
#2. Get feature set, labels, and recording IDs
X_train = mean_std_feats.drop(['label','Id','subIdx'], 1).copy()
y_train =  mean_std_feats['label'].copy()

ID_train = mean_std_feats['Id']
ID_list = ID_train.drop_duplicates()

#ID_train.size
ID_list.size

36

In [24]:
#3. Normalization in case some model requires it

scaler = StandardScaler()
scaler.fit(X_train)

#use same scaler for both, based on X_train data
X_trainNorm = scaler.transform(X_train.values)

In [25]:
sum(X_train.isna().any())

0

## Model training

### Train-test split (k-fold)

In [59]:
k = ID_list.values.size #number of folds

group_kfold = GroupKFold(n_splits=k)
group_kfold.get_n_splits(X_trainNorm, y_train, ID_train)

36

### Logistic regression

In [60]:
#Do cross-validation
pred_probs = pd.DataFrame([])

idx_acc = 0
for train_index, test_index in group_kfold.split(X_trainNorm,y_train,ID_train):
    X_train1, X_test1 = X_trainNorm[train_index], X_trainNorm[test_index]
    y_train1, y_test1 = y_train[train_index], y_train[test_index]
    
    #logReg = LogisticRegression()
    
    #loss=log --> logistic regression
    logReg = SGDClassifier(loss='log', penalty='l2')
    logReg.fit(X_train1, y_train1)
    y_hat_prob = logReg.predict_proba(X_test1)
    classes =logReg.classes_
    pred_probs = pred_probs.append(pd.DataFrame({'ID': ID_train[test_index], str(classes[0]): y_hat_prob[:,0], str(classes[1]): y_hat_prob[:,1]}),ignore_index=True, sort=False)    

In [61]:
#mean_feats = feats2.groupby(['Id','subIdx']).aggregate('mean').reset_index()
mean_pred_probs = pred_probs.groupby('ID').aggregate('mean').reset_index()

In [62]:
mean_pred_probs

Unnamed: 0,ID,Dry,Wet
0,1UDFq2InljM,0.75,0.25
1,4k0ziD0j5BI,0.1603508,0.8396492
2,5905FxXz9dI,0.8502661,0.1497339
3,6LK6yHtIung,0.5534188,0.4465812
4,7Ez5Wc_esBg,0.8333333,0.1666667
5,A5s2ZgwQ1VM,0.5141334,0.4858666
6,AQOeIVbhFm4,0.6472901,0.3527099
7,CTSLdNxN1cc,0.2000001,0.7999999
8,CsDXlt7Ei1c,0.2500016,0.7499984
9,DYfjPnty2Ho,1.554312e-15,1.0


In [63]:
def predict_class(prob_dry,prob_wet):
    if prob_dry > prob_wet :
        return 'Dry'
    else:
        return 'Wet'
    
mean_pred_probs['pred_class'] = mean_pred_probs.apply(lambda x: predict_class(x['Dry'], x['Wet']), axis=1)

In [64]:
#add actual classes
mean_pred_probs['label'] = mean_pred_probs["ID"].map(label_dict)

In [65]:
mean_pred_probs

Unnamed: 0,ID,Dry,Wet,pred_class,label
0,1UDFq2InljM,0.75,0.25,Dry,Dry
1,4k0ziD0j5BI,0.1603508,0.8396492,Wet,Wet
2,5905FxXz9dI,0.8502661,0.1497339,Dry,Wet
3,6LK6yHtIung,0.5534188,0.4465812,Dry,Dry
4,7Ez5Wc_esBg,0.8333333,0.1666667,Dry,Dry
5,A5s2ZgwQ1VM,0.5141334,0.4858666,Dry,Dry
6,AQOeIVbhFm4,0.6472901,0.3527099,Dry,Dry
7,CTSLdNxN1cc,0.2000001,0.7999999,Wet,Wet
8,CsDXlt7Ei1c,0.2500016,0.7499984,Wet,Wet
9,DYfjPnty2Ho,1.554312e-15,1.0,Wet,Wet


In [74]:
mean_pred_probs[(mean_pred_probs.pred_class != mean_pred_probs.label)]

Unnamed: 0,ID,Dry,Wet,pred_class,label
2,5905FxXz9dI,0.850266,0.149734,Dry,Wet
17,Pwsk,0.222263,0.777737,Wet,Dry
19,TK4CveeCWfY,0.564683,0.435317,Dry,Wet
20,Xe68,0.399972,0.600028,Wet,Dry
23,d2wkdrScerU,0.999504,0.000496,Dry,Wet
24,diuuEXKzNB8,0.571334,0.428666,Dry,Wet
25,h2FLCKMcEX0,0.5,0.5,Dry,Wet
27,oCg,0.161503,0.838497,Wet,Dry
33,tZtJaS2ZtME,0.259929,0.740071,Wet,Dry
35,zjd4HrJbc8o,0.4,0.6,Wet,Dry


## Evaluation

In [67]:
#Accuracy
acc = accuracy_score(mean_pred_probs['label'], mean_pred_probs['pred_class'])
print(acc)

0.7222222222222222


In [68]:
#TODO: Check if following measures are computed OK

In [69]:
#Precision
prec = precision_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(prec)

0.7142857142857142


In [70]:
#F1-score
f1 = f1_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(f1)

0.7142857142857142


In [71]:
#recall
recall = recall_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(recall)

0.7142857142857142


In [72]:
#confusion matrix
conf_mat_df = pd.crosstab(mean_pred_probs['label'], mean_pred_probs['pred_class'], margins=True)

In [73]:
conf_mat_df

pred_class,Dry,Wet,All
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry,10,5,15
Wet,5,16,21
All,15,21,36


In [57]:
#TODO: check confusion matrix?, to see where there was the most problems --> I can see directly in the df

In [58]:
#Cough sound
#Breathing rate
#Breathing rhytm (consistence smoothness)
#Cough rate
#Panic level
#Hoarseness