In [1]:
#Research references:
#1) Dry/wet cough classification: https://link.springer.com/article/10.1007/s10439-013-0741-6
#2) Pneumonia classification: https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6987276
#3) https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf?Expires=1585601065&Key-Pair-Id=APKAJKNBJ4MJBJNC6NLQ&Signature=Lnpf6wT8rkozSh9av7U9nGuC7WAH6KuI2Cj3Y7G366gkGlh8D-Ie1Kc~TyBAUu~uMsVltleJcSv3p6TCm6HdFnhpyoTgLcYh6eFfvQwIUqbk1Bf4JZldgB~BDKUOwY1G0pA-HoKjvIAu3avO98SMO35upakm9OEBByd4nC9aXsjKRThd6bTpq1qIuuD9gh1l5FaM6hNRB0c2lCf4Q3adx7C3FW0NMwdWhcuF45A9f~dO3zTWWSQamoo5Otc-PHMMt96TetNcML~jy9ghgJeCPY6DJLUIwQAt03fENBluS~TjTJ17WD~n51xiRofb94fEJHoRHh0d-430LLwr7BX4IA__

In [2]:
import numpy as np
import os
#import pywt #wavelets
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
from pydub.playback import play
import matplotlib.pyplot as plt
#import seaborn as sn
import python_speech_features as spe_feats
import pandas as pd
from scipy.stats import kurtosis, skew
from scipy.signal import lfilter
import librosa
import pysptk
import math
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix

## Settings

In [3]:
## RECORDINGS DATA SET

#CHANGE TO YOUR OWN TRAINING SET FOLDER
#folder of training data set
FOLDER_PATH = 'data/YT_set/edited_wavs/'

#RECORDINGS PRE-PROCESSING

fs_targ = 16000 # set all audios to this sampling frequency
n_channels_targ = 1

HPF_skip = False #skip applying pre-emphasis (high-pass) filtering
norm_skip = False #skip normalization step
dB_targ = -28.0 #target level for normalization

##FEATUTRE EXTRACTION

featExtr_skip = False #skip wavs reading + feature extraction steps (if feats pickle file already available)

#initialize data frame of features:
feats = pd.DataFrame([])

#tiny constant value
eps = sys.float_info.epsilon

#framing

frame_len_s=0.025 #12 segments seemed adequeate in paper, since segments are no longer than 400ms (400ms/12=33.3ms)
frame_step_s=frame_len_s #according to paper: non-overlapping frames

frame_len = int(round(frame_len_s*fs_targ)) #in samples
frame_step = int(round(frame_step_s*fs_targ)) #in samples
win_func =np.hamming #at least for mfcc, as in paper

#mfcc
cep_num= 13 #number of coefficients as in paper (https://link.springer.com/article/10.1007/s10439-013-0741-6)

#lp
lp_ord = int(round(2 + fs_targ/1000)) #standard rule of thumb for LP oder

#formants
nr_formants = 4 #as in paper, first 4 formants

## Functions

In [4]:
#compute RMS value of a signal and return it (in dB scale)
#seems not to work?
def get_RMS(s):
    s_rms = np.sqrt(np.mean(np.power(s,2)))
    #convert to dB scale
    #s_db = 20*np.log10(s_rms/1.0)
    return s_rms

#seems not to work either
#RMS-based normalization of a signal, based on a target level (in dB)
def RMS_normalization(s, dB_targ):
    
    #desired level is converted to linear scale
    rms_targ = 10**(dB_targ/20.0)
    
    #compute scaling factor
    scale = rms_targ/get_RMS(s)
    
    #scale amplitude of input signal
    scaled_s = scale*s
    
    return scaled_s

def match_target_amplitude(audioSegment_sound, target_dBFS):
    dBFS_diff = target_dBFS - audioSegment_sound.dBFS
    return audioSegment_sound.apply_gain(dBFS_diff)

#Apply pre-emphasis (high-pass) filter
def apply_preEmph(x):
    x_filt = lfilter([1., -0.97], 1, x)
    return x_filt
        
#Obtain autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[int((result.size+1)/2):] #Note: other people use re.size/2:, but this does not work for me 
                                   # TODO: check consistency in other computers

#Compute zero-crossing rate
def get_zcr(x):
    zcr = (((x[:-1] * x[1:]) < 0).sum())/(len(x)-1)
    return zcr

#Compute log-energy
def get_logEnergy(x):
    logEnergy = np.log10( ( (np.power(x,2)).sum()/len(x) ) + eps)  
    return logEnergy

#Estimate fundamental frequency (F0)
def get_F0(x,fs):
    #autocorrelation-based method to extract F0
    xcorr_arr = autocorr(x)
    
    #looking for F0 in the frequency interval 50-500Hz, but we search in time domain
    min_ms = round(fs/500)
    max_ms = round(fs/50)
    
    xcorr_slot = xcorr_arr[max_ms+1:2*max_ms+1]
    xcorr_slot = xcorr_slot[min_ms:max_ms]
    t0 = np.argmax(xcorr_slot)
    F0 = fs/(min_ms+t0-1)
    return F0

#Estimate formants
def get_formants(x, lp_order, nr_formants):
    
    #compute lp coefficients
    a = librosa.lpc(x, lp_ord)
    

    #get roots from lp coefficients
    rts = np.roots(a)
    rts = [r for r in rts if np.imag(r) >= 0]

    #get angles
    angz = np.arctan2(np.imag(rts), np.real(rts))

    #get formant frequencies
    formants = sorted(angz * (fs_targ / (2 * math.pi)))
    
    return formants[0:nr_formants]

def get_entropy(x, type='shannon'):
    #default shannon entropy since this is the one used by the phd thesis
    
    base = {'shannon' : 2., 'natural' : math.exp(1), 'hartley' : 10.}
    N = len(x)

    if N <= 1:
        return 0

    value,counts = np.unique(x, return_counts=True)
    probs = counts / N
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0

    ent = 0. #initialization

    # compute entropy
    for i in probs:
        ent -= i * math.log(i+eps, base[type])
    
    return ent

#Extract frequencies
def feature_extraction(x,fs,feats_df,lp_ord,ID,label):
#Extract features from signal x (identified as ID), and concatenate them to dataframe feats_df
#Features' reference: (see Appendix)
#[1]https://link.springer.com/article/10.1007/s10439-013-0741-6
#[2]https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf?dsi_version=c5434db897ab74b192ca295a9eeca041&Expires=1585086202&Key-Pair-Id=APKAJKNBJ4MJBJNC6NLQ&Signature=c8k8DmG~KIxg0ToTO8rebm2MzHneCzJGkjSFRB7BYTEQ-MHXEr0ocHmISrldP3hFf9qmeiL11ezyefcNeRVeKIQ9PVjOl9pn7rXWcjA1o2voPn1VnDd8n7G2cT31apdj0LNMclhlXRPnCsGD66qDRqa3d-xaqqXhEqU73aw3ZgBgroO213MfJOqFhJxxXo2QEia0bSlDRTeX9KhSczFK-IFTPC6GwFL2L04por8pQRI3HF7E3f26O9zp9OhkwxSU9qfJah20WxZLA4PxREdv7JGoVBinR6T0mTcIaQi~B4IzYjSPSsTTADMNk5znVYIvSqgtMT~DY~qwlfq4SRdFjQ__
  
    
    #do features in a frame-basis
    x_frames = spe_feats.sigproc.framesig(x,frame_len,frame_step,win_func) #DOUBT: should I use window or not?
                                                                        #at least for formant estimation i should

    nr_frames = x_frames.shape[0]
    #print(nr_frames)
        
    #0)Wavelets #TODO
    
    #DOUBT: if log-energy feature is included, should I also include the first mfcc coefficient (c0) ?
    #1)mfcc
    mfcc_feat = spe_feats.mfcc(x,fs, winlen=frame_len_s,winstep=frame_step_s, numcep=cep_num,winfunc=win_func)
    
    #deltas to capture 
    mfcc_delta_feat = spe_feats.delta(mfcc_feat,1) #mfcc_delta_feat = np.subtract(mfcc_feat[:-1], mfcc_feat[1:]) #same
    mfcc_deltadelta_feat = spe_feats.delta(mfcc_delta_feat,1)          
    
    #2)zero-crossing rate
    zcr_feat = np.apply_along_axis(get_zcr, 1, x_frames)
    
    #3)Formant frequencies
    #using LP-coeffcs-based method
    #formant_feat = np.apply_along_axis(get_formants, 1, x_frames, lp_ord, nr_formants)
    
    #Note: for the moment, it seems some frames are ill-conditioned for lp computing,
    #current solution - we skip those and fill with NaN values
    formants_feat= np.empty((nr_frames,4))
    formants_feat[:] = np.nan
    
    for i_frame in range(0,nr_frames):
        try: 
            formants_feat[i_frame] = get_formants(x_frames[i_frame], lp_ord, nr_formants)
        except:
            pass
    
    #4)Log-energy
    logEnergy_feat =  np.apply_along_axis(get_logEnergy, 1, x_frames)
    
    #5)Pitch (F0)
    F0_feat =  np.apply_along_axis(get_F0, 1, x_frames,fs)
    
    #TODO: compute also F0 with pysptk (a python wrapper for SPTK library), it probably gives better results
    #https://github.com/r9y9/pysptk/blob/master
    #F0_feat = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=frame_step, min=50, max=500, ,voice_bias=0.0 ,otype=\"f0\")
    #right frame size???
    
    #6)Kurtosis
    kurt_feat =  np.apply_along_axis(kurtosis, 1, x_frames)
    
    #7)Bispectrum Score (BGS)
    #TODO: see PhD thesis for more info on this feature
    
    #8)Non-Gaussianity Score (NGS)
    #TODO: see PhD thesis for more info on this feature
   
    #9) Adding skewness as measure of non-gaussianity (not in paper)
    skew_feat =  np.apply_along_axis(skew, 1, x_frames)
    
    #DOUBT: 10) Shannon entropy GETTING -inf in all cases, WHY??? Don't include until fixed
    entropy_feat = np.apply_along_axis(get_entropy, 1, x_frames)

    
    #TODO: add small value in all entries, this may fix the problem
    
    mfcc_cols = ['mfcc_%s' % s for s in range(0,cep_num)]
    mfcc_delta_cols = ['mfcc_d%s' % s for s in range(0,cep_num)]
    mfcc_deltadelta_cols = ['mfcc_dd%s' % s for s in range(0,cep_num)]
    formants_cols = ['F%s' % s for s in range(1,nr_formants+1)]
          
    feats_segment = pd.concat([pd.DataFrame({'Id': ID, 'kurt': kurt_feat, 'logEnergy': logEnergy_feat,
                                                 'zcr': zcr_feat, 'F0': F0_feat,
                                                 'skewness': skew_feat, 'label': label, 'entropy':entropy_feat}),
                               pd.DataFrame(mfcc_feat,columns=mfcc_cols), 
                            pd.DataFrame(formants_feat,columns=formants_cols)],axis=1)
    
    #print(nr_frames)
    feats_df = feats_df.append(feats_segment,ignore_index=True, sort=False)
    
    return feats_df


# MAIN

## Reading recordings

In [5]:
all_s=[]
all_label=[]
all_id=[]
all_fs=[]

In [6]:
#Read wav data set

if featExtr_skip is False:
    print("Readings wavs...")

    #only list files in FOLDER_PATH directory
    wav_files = [f for f in os.listdir(FOLDER_PATH) if os.path.isfile(os.path.join(FOLDER_PATH, f))]
    for file_name in wav_files:
    
        fname_noExt = os.path.splitext(file_name)[0] #file name without extension
    
        #full path file name
        full_fname = FOLDER_PATH+file_name
        print(full_fname)
    
        # load audio
        s = AudioSegment.from_wav(full_fname)
        print(full_fname)
        all_s.append(s)
        #sampling rate:
        info = mediainfo(full_fname)
        fs = float(info['sample_rate'])
        all_fs.append(fs)
    
        #get ID of recording
        ID = fname_noExt.split('-')[-2] #for the current type of naming
        #print(file_name)
        #print(ID)
        all_id.append(ID)
    
        #get label
        label = fname_noExt.split('-')[-1] #for the current type of naming
        #print(label)
        all_label.append(label)

Readings wavs...
data/YT_set/edited_wavs/edit_Spring Allergy Coughing-7Ez5Wc_esBg-Dry.wav
data/YT_set/edited_wavs/edit_Spring Allergy Coughing-7Ez5Wc_esBg-Dry.wav
data/YT_set/edited_wavs/edit_Coughing 51-LkxvBb2VXbs-Dry.wav
data/YT_set/edited_wavs/edit_Coughing 51-LkxvBb2VXbs-Dry.wav
data/YT_set/edited_wavs/edit_Wet coughing-0QQxKN-KC1U-Wet.wav
data/YT_set/edited_wavs/edit_Wet coughing-0QQxKN-KC1U-Wet.wav
data/YT_set/edited_wavs/edit_Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry.wav
data/YT_set/edited_wavs/edit_Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry.wav
data/YT_set/edited_wavs/edit_Coughing 77-2Mw-s5jnqXU-Wet.wav
data/YT_set/edited_wavs/edit_Coughing 77-2Mw-s5jnqXU-Wet.wav
data/YT_set/edited_wavs/edit_Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet.wav
data/YT_set/edited_wavs/edit_Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet.wav
data/YT_set/edited_wavs/edit_Coughing 46-dg-I9j76-t8-Wet.wav
data/YT_set/edited_wavs/edit_Coughing 46-dg-I9j76-t8-Wet.wav
data/YT_set/edited_

Listening to some of the audios

In [7]:
if featExtr_skip is False:
    np.where(np.array(all_label)=='Dry')

In [8]:
if featExtr_skip is False:
    np.where(np.array(all_label)=='Wet')

In [9]:
if featExtr_skip is False:
    s=all_s[15]
    s

## Feature extraction

In [10]:
if featExtr_skip is False:

    for s, ID, label in zip(all_s,all_id,all_label):

            #Pre-processing of the signals:

            ## 0 ) Resampling to target sampling frequency:
            s = s.set_frame_rate(fs_targ)
            fs= fs_targ

            ## 1)
            if norm_skip is False:
                s=match_target_amplitude(s, dB_targ)
                #print(s.rms)

            ## 2) Segmentation of cough streams (silence-based)
            #min_silence_len in ms, silence_thresh in dB
            s_segments = split_on_silence (s, min_silence_len = 600, silence_thresh =s.dBFS-10)

            #checks that segmentation and removal of silence is OK
            #print(len(s_segments))
            #play(s)
            #input("Press Enter to continue...")               
            #for i in range(len(s_segments)):
            #    play(s_segments[i])
            #    input("Press Enter to continue...")               

            ## 3) Convert s_segments to numpy array format
            AudioSegment2numpy_arr = lambda x: np.asarray(x.get_array_of_samples())
            s_segments_np = list(map(AudioSegment2numpy_arr, s_segments))


            ## 4) Pre-emphasis filtering on each segment
            if HPF_skip is False:
                print('High-pass filtering...')       
                preEmph_filtering = lambda x: apply_preEmph(x)
                s_segments_filt = list(map(preEmph_filtering, s_segments_np))
            else:
                s_segments_filt = s_segments_np

            print('Computing features...')
            #Feature extraction for each segment

            #(lambda function doesn't work )
            #feat_extr_step = lambda x, fs, feats_df, lp_ord, ID: feature_extraction(x,fs,feats_df,lp_ord,ID)
            #feats = feat_extr_step(s_segments_filt,fs,feats,lp_ord,ID)
            for idx, seg_i in enumerate(s_segments_filt):
                #print('\tSegment %d' % idx)
                feats = feature_extraction(seg_i,fs,feats,lp_ord,ID,label)
    
       

High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass filtering...
Computing features...
High-pass 

## Load  (or store) features 

In [11]:

feats_fname = 'feats_df.pkl'

if featExtr_skip is False:
    #Store feature df
    feats.to_pickle(feats_fname)
else:
    #Load feature df
    feats = pd.read_pickle(feats_fname)

## Pre-processing of features

In [12]:
feats

Unnamed: 0,Id,kurt,logEnergy,zcr,F0,skewness,label,entropy,mfcc_0,mfcc_1,...,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,F1,F2,F3,F4
0,7Ez5Wc_esBg,3.706714,2.510354,0.641604,457.142857,-0.350596,Dry,8.643856,12.154072,-53.889708,...,-13.203568,-3.480999,0.217583,-11.359071,-23.046762,9.816940,0.000000,0.000000,0.000000,1671.985365
1,7Ez5Wc_esBg,7.762735,2.599776,0.761905,432.432432,-0.312852,Dry,8.643856,12.473961,-56.181480,...,1.236424,0.631101,-5.085036,6.876460,-13.753278,4.849485,0.000000,178.242624,2006.956659,2836.432554
2,7Ez5Wc_esBg,1.893479,2.469615,0.729323,444.444444,0.127164,Dry,8.643856,12.188062,-54.491861,...,-7.354098,-2.208687,-4.561046,-5.488364,-12.229855,-4.461634,0.000000,0.000000,1604.030767,2521.100143
3,7Ez5Wc_esBg,1.919692,2.738322,0.789474,516.129032,-0.054807,Dry,8.643856,12.892992,-51.279293,...,-12.276520,6.892395,-1.984168,7.735193,-18.072184,-4.649738,0.000000,578.823866,1641.870397,2716.285168
4,7Ez5Wc_esBg,2.692210,2.450934,0.791980,500.000000,-0.125955,Dry,8.643856,12.171122,-52.351841,...,-9.025626,-2.758992,-10.323113,-4.270525,-14.132265,3.084379,0.000000,444.910143,2067.299451,3095.495183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,ct3tHDfNKiQ,1.596083,5.224528,0.626566,484.848485,-0.004247,Wet,8.643856,18.315415,-59.223086,...,2.850233,-6.934126,-5.118358,-13.225567,-15.709984,2.325254,964.850117,1866.621631,2409.248954,3057.439451
4466,ct3tHDfNKiQ,1.698796,5.326481,0.644110,500.000000,-0.173511,Wet,8.643856,18.526675,-58.655791,...,-8.142383,-27.868522,-5.177583,-15.276977,-21.546500,11.333826,0.000000,1462.848905,2426.322730,3056.951124
4467,ct3tHDfNKiQ,2.658432,5.181313,0.629073,457.142857,-0.000887,Wet,8.643856,18.197910,-61.197791,...,-10.318832,-16.866847,2.686336,-12.883587,-15.328687,9.527345,0.000000,0.000000,1448.977995,2264.160692
4468,ct3tHDfNKiQ,3.703883,5.159381,0.629073,457.142857,0.269426,Wet,8.643856,18.150308,-60.377438,...,-11.427861,-27.715255,-16.008879,-18.162084,-23.158347,14.375599,697.071838,1540.594508,2540.112508,3104.067576


In [13]:
#1.Check which columns have NaNs values

#feats2 = feats.copy()

#sum(feats.isna().any())
#feats.columns[feats.isna().any()].tolist() --> We get just the ones we have inserted in formants
feats2 = feats.interpolate(method ='cubic')
#feats2 = feats.dropna(axis=0).copy()
#feats2.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True)

#feats2.columns[feats2.isna().any()].tolist()
#feats2.describe()


In [14]:
sum(feats2.isna().any())

0

In [15]:
#Grouping the frames from a same recording (Id) into chunks with the same number of frames.
#The training of the classifier will be based on these chunks

feats2['cum_IDidx'] = feats2.groupby('Id').cumcount()

def get_subidx(cum_Idx,batch_size):
    #batch needs to be an integer (or float like 3.0)
    return int(1.0*cum_Idx/batch_size)

feats2['subIdx'] = feats2.apply(lambda x: get_subidx(x['cum_IDidx'], 10), axis=1)
feats2 = feats2.drop(['cum_IDidx'],axis=1)

In [16]:
feats2

Unnamed: 0,Id,kurt,logEnergy,zcr,F0,skewness,label,entropy,mfcc_0,mfcc_1,...,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,F1,F2,F3,F4,subIdx
0,7Ez5Wc_esBg,3.706714,2.510354,0.641604,457.142857,-0.350596,Dry,8.643856,12.154072,-53.889708,...,-3.480999,0.217583,-11.359071,-23.046762,9.816940,0.000000,0.000000,0.000000,1671.985365,0
1,7Ez5Wc_esBg,7.762735,2.599776,0.761905,432.432432,-0.312852,Dry,8.643856,12.473961,-56.181480,...,0.631101,-5.085036,6.876460,-13.753278,4.849485,0.000000,178.242624,2006.956659,2836.432554,0
2,7Ez5Wc_esBg,1.893479,2.469615,0.729323,444.444444,0.127164,Dry,8.643856,12.188062,-54.491861,...,-2.208687,-4.561046,-5.488364,-12.229855,-4.461634,0.000000,0.000000,1604.030767,2521.100143,0
3,7Ez5Wc_esBg,1.919692,2.738322,0.789474,516.129032,-0.054807,Dry,8.643856,12.892992,-51.279293,...,6.892395,-1.984168,7.735193,-18.072184,-4.649738,0.000000,578.823866,1641.870397,2716.285168,0
4,7Ez5Wc_esBg,2.692210,2.450934,0.791980,500.000000,-0.125955,Dry,8.643856,12.171122,-52.351841,...,-2.758992,-10.323113,-4.270525,-14.132265,3.084379,0.000000,444.910143,2067.299451,3095.495183,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,ct3tHDfNKiQ,1.596083,5.224528,0.626566,484.848485,-0.004247,Wet,8.643856,18.315415,-59.223086,...,-6.934126,-5.118358,-13.225567,-15.709984,2.325254,964.850117,1866.621631,2409.248954,3057.439451,11
4466,ct3tHDfNKiQ,1.698796,5.326481,0.644110,500.000000,-0.173511,Wet,8.643856,18.526675,-58.655791,...,-27.868522,-5.177583,-15.276977,-21.546500,11.333826,0.000000,1462.848905,2426.322730,3056.951124,11
4467,ct3tHDfNKiQ,2.658432,5.181313,0.629073,457.142857,-0.000887,Wet,8.643856,18.197910,-61.197791,...,-16.866847,2.686336,-12.883587,-15.328687,9.527345,0.000000,0.000000,1448.977995,2264.160692,11
4468,ct3tHDfNKiQ,3.703883,5.159381,0.629073,457.142857,0.269426,Wet,8.643856,18.150308,-60.377438,...,-27.715255,-16.008879,-18.162084,-23.158347,14.375599,697.071838,1540.594508,2540.112508,3104.067576,11


In [17]:
mean_feats = feats2.groupby(['Id','subIdx']).aggregate('mean').reset_index()
std_feats = feats2.groupby(['Id','subIdx']).agg(lambda x: x.std(ddof=0)).reset_index() #ddof=0 to compute population std (rather than sample std)
keep_same = {'Id', 'subIdx'}
mean_feats.columns = ['{}{}'.format(c, '' if c in keep_same else '_m') for c in mean_feats.columns]
std_feats.columns = ['{}{}'.format(c, '' if c in keep_same else '_std') for c in std_feats.columns]

In [18]:
sum(std_feats.isna().any())

0

In [19]:
mean_std_feats = pd.merge(mean_feats, std_feats, on=['Id','subIdx'], how='outer')

In [20]:
#Make dictionary and add label column using it 
feats_unique = feats.drop_duplicates(subset=['Id'])
label_dict = dict(zip(feats_unique.Id, feats_unique.label))
mean_std_feats['label'] = mean_std_feats["Id"].map(label_dict)
#mean_std_feats[['Id','label']].head(50)

In [21]:
mean_std_feats.columns

Index(['Id', 'subIdx', 'kurt_m', 'logEnergy_m', 'zcr_m', 'F0_m', 'skewness_m',
       'entropy_m', 'mfcc_0_m', 'mfcc_1_m', 'mfcc_2_m', 'mfcc_3_m', 'mfcc_4_m',
       'mfcc_5_m', 'mfcc_6_m', 'mfcc_7_m', 'mfcc_8_m', 'mfcc_9_m', 'mfcc_10_m',
       'mfcc_11_m', 'mfcc_12_m', 'F1_m', 'F2_m', 'F3_m', 'F4_m', 'F0_std',
       'F1_std', 'F2_std', 'F3_std', 'F4_std', 'entropy_std', 'kurt_std',
       'logEnergy_std', 'mfcc_0_std', 'mfcc_1_std', 'mfcc_10_std',
       'mfcc_11_std', 'mfcc_12_std', 'mfcc_2_std', 'mfcc_3_std', 'mfcc_4_std',
       'mfcc_5_std', 'mfcc_6_std', 'mfcc_7_std', 'mfcc_8_std', 'mfcc_9_std',
       'skewness_std', 'zcr_std', 'label'],
      dtype='object')

In [22]:
mean_std_feats.describe()

Unnamed: 0,subIdx,kurt_m,logEnergy_m,zcr_m,F0_m,skewness_m,entropy_m,mfcc_0_m,mfcc_1_m,mfcc_2_m,...,mfcc_2_std,mfcc_3_std,mfcc_4_std,mfcc_5_std,mfcc_6_std,mfcc_7_std,mfcc_8_std,mfcc_9_std,skewness_std,zcr_std
count,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,...,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0,465.0
mean,6.460215,3.89169,4.243611,0.676807,440.164773,-0.003837,8.499361,16.115227,-46.109345,-3.914013,...,4.363459,6.011631,6.492005,6.846066,6.89847,7.955112,8.459117,7.990919,0.236467,0.041937
std,4.458021,3.410781,1.985224,0.164188,26.851309,0.220395,0.6164,4.677118,8.536427,8.423421,...,1.878562,2.6607,2.787865,2.670177,2.827098,3.28476,3.295593,3.245578,0.214387,0.034886
min,0.0,0.605327,-0.921195,0.072682,337.732114,-1.783253,1.59431,4.137886,-61.395391,-27.441467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,1.94715,2.666927,0.493734,424.025976,-0.049218,8.631363,12.29616,-52.90306,-9.658168,...,3.095411,4.01928,4.780379,5.06503,5.179107,5.692797,6.188364,5.892106,0.100754,0.023867
50%,6.0,2.476054,5.01431,0.729574,442.252243,-0.005165,8.643856,17.793403,-47.318143,-4.219281,...,4.045961,5.417462,5.891193,6.37761,6.485465,7.430721,7.931842,7.554355,0.147863,0.033386
75%,10.0,5.825867,5.791059,0.81203,457.805752,0.040007,8.643856,19.863885,-40.583802,1.390664,...,5.268821,7.583406,7.615271,8.018979,8.180107,9.771119,10.348892,9.594265,0.31238,0.047184
max,20.0,35.066899,6.997471,0.945614,516.129032,3.600665,8.643856,22.621448,-6.754285,25.128835,...,13.459443,16.058676,20.07141,17.924307,20.027083,24.112487,21.593752,25.261439,2.012421,0.362797


In [23]:
mean_std_feats.columns[mean_std_feats.isna().any()].tolist() 

[]

In [24]:
mean_std_feats['Id']

0      1UDFq2InljM
1      1UDFq2InljM
2      1UDFq2InljM
3      1UDFq2InljM
4      1UDFq2InljM
          ...     
460    zjd4HrJbc8o
461    zjd4HrJbc8o
462    zjd4HrJbc8o
463    zjd4HrJbc8o
464    zjd4HrJbc8o
Name: Id, Length: 465, dtype: object

In [25]:
sum(mean_std_feats.isna().any())

0

In [26]:
#mean_std_feats = mean_std_feats.interpolate(method ='cubic')
#mean_std_feats.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True) #--> Doesn't work!? Still get NaN error

In [27]:
sum(mean_std_feats.isna().any())

0

In [28]:
mean_std_feats['Id']

0      1UDFq2InljM
1      1UDFq2InljM
2      1UDFq2InljM
3      1UDFq2InljM
4      1UDFq2InljM
          ...     
460    zjd4HrJbc8o
461    zjd4HrJbc8o
462    zjd4HrJbc8o
463    zjd4HrJbc8o
464    zjd4HrJbc8o
Name: Id, Length: 465, dtype: object

In [29]:
#2. Get feature set, labels, and recording IDs
X_train = mean_std_feats.drop(['label','Id','subIdx'], 1).copy()
y_train =  mean_std_feats['label'].copy()

ID_train = mean_std_feats['Id']
ID_list = ID_train.drop_duplicates()

#ID_train.size
ID_list.size

36

In [30]:
#3. Normalization in case some model requires it

scaler = StandardScaler()
scaler.fit(X_train)

#use same scaler for both, based on X_train data
X_trainNorm = scaler.transform(X_train.values)

In [31]:
sum(X_train.isna().any())

0

## Model training

### Train-test split (k-fold)

In [32]:
k = ID_list.values.size #number of folds

group_kfold = GroupKFold(n_splits=k)
group_kfold.get_n_splits(X_trainNorm, y_train, ID_train)

36

In [33]:
k

36

### Logistic regression

In [34]:
#Do cross-validation
pred_probs = pd.DataFrame([])

idx_acc = 0
for train_index, test_index in group_kfold.split(X_trainNorm,y_train,ID_train):
    X_train1, X_test1 = X_trainNorm[train_index], X_trainNorm[test_index]
    y_train1, y_test1 = y_train[train_index], y_train[test_index]
    
    #TODO: optimize the penaly weight
    #https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
    logReg = SGDClassifier(loss='log', penalty='elasticnet')
    logReg.fit(X_train1, y_train1)
    y_hat_prob = logReg.predict_proba(X_test1)
    classes =logReg.classes_
    pred_probs = pred_probs.append(pd.DataFrame({'ID': ID_train[test_index], str(classes[0]): y_hat_prob[:,0], str(classes[1]): y_hat_prob[:,1]}),ignore_index=True, sort=False)    

In [35]:
#mean_feats = feats2.groupby(['Id','subIdx']).aggregate('mean').reset_index()
mean_pred_probs = pred_probs.groupby('ID').aggregate('mean').reset_index()

In [36]:
mean_pred_probs

Unnamed: 0,ID,Dry,Wet
0,1UDFq2InljM,3e-06,0.999997
1,4k0ziD0j5BI,0.404324,0.595676
2,5905FxXz9dI,0.63143,0.36857
3,6LK6yHtIung,0.516275,0.483725
4,7Ez5Wc_esBg,0.732633,0.267367
5,A5s2ZgwQ1VM,0.829028,0.170972
6,AQOeIVbhFm4,0.782282,0.217718
7,CTSLdNxN1cc,0.470399,0.529601
8,CsDXlt7Ei1c,0.378254,0.621746
9,DYfjPnty2Ho,0.131271,0.868729


In [37]:
def predict_class(prob_dry,prob_wet):
    if prob_dry > prob_wet :
        return 'Dry'
    else:
        return 'Wet'
    
mean_pred_probs['pred_class'] = mean_pred_probs.apply(lambda x: predict_class(x['Dry'], x['Wet']), axis=1)

In [38]:
#add actual classes
mean_pred_probs['label'] = mean_pred_probs["ID"].map(label_dict)

In [39]:
mean_pred_probs

Unnamed: 0,ID,Dry,Wet,pred_class,label
0,1UDFq2InljM,3e-06,0.999997,Wet,Dry
1,4k0ziD0j5BI,0.404324,0.595676,Wet,Wet
2,5905FxXz9dI,0.63143,0.36857,Dry,Wet
3,6LK6yHtIung,0.516275,0.483725,Dry,Dry
4,7Ez5Wc_esBg,0.732633,0.267367,Dry,Dry
5,A5s2ZgwQ1VM,0.829028,0.170972,Dry,Dry
6,AQOeIVbhFm4,0.782282,0.217718,Dry,Dry
7,CTSLdNxN1cc,0.470399,0.529601,Wet,Wet
8,CsDXlt7Ei1c,0.378254,0.621746,Wet,Wet
9,DYfjPnty2Ho,0.131271,0.868729,Wet,Wet


In [40]:
mean_pred_probs[(mean_pred_probs.pred_class != mean_pred_probs.label)]

Unnamed: 0,ID,Dry,Wet,pred_class,label
0,1UDFq2InljM,3e-06,0.999997,Wet,Dry
2,5905FxXz9dI,0.63143,0.36857,Dry,Wet
10,Dc_aoUCqw2E,0.611396,0.388604,Dry,Wet
15,LkxvBb2VXbs,0.179263,0.820737,Wet,Dry
19,TK4CveeCWfY,0.602562,0.397438,Dry,Wet
20,Xe68,0.136984,0.863016,Wet,Dry
27,oCg,0.412808,0.587192,Wet,Dry
28,q6WsoL3J8U8,0.522757,0.477243,Dry,Wet
35,zjd4HrJbc8o,0.499668,0.500332,Wet,Dry


## Evaluation

In [41]:
#Accuracy
acc = accuracy_score(mean_pred_probs['label'], mean_pred_probs['pred_class'])
print(acc)

0.75


In [42]:
#TODO: Check if following measures are computed OK

In [43]:
#Precision
prec = precision_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(prec)

0.7435064935064934


In [44]:
#F1-score
f1 = f1_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(f1)

0.7401764234161988


In [45]:
#recall
recall = recall_score(mean_pred_probs['label'], mean_pred_probs['pred_class'],average="macro")
print(recall)

0.7380952380952381


In [46]:
#confusion matrix
conf_mat_df = pd.crosstab(mean_pred_probs['label'], mean_pred_probs['pred_class'], margins=True)

In [47]:
conf_mat_df

pred_class,Dry,Wet,All
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dry,10,5,15
Wet,4,17,21
All,14,22,36


In [48]:
#Cough sound
#Breathing rate
#Breathing rhytm (consistence smoothness)
#Cough rate
#Panic level
#Hoarseness