In [1]:
#Research references:
#1) Dry/wet cough classification: https://link.springer.com/article/10.1007/s10439-013-0741-6
#2) Pneumonia classification: https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6987276

In [96]:
import numpy as np
import os
import sox
#import pywt #wavelets
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
import matplotlib.pyplot as plt
import python_speech_features as spe_feats
import pandas as pd
from scipy.stats import kurtosis, skew, entropy
from scipy.signal import lfilter
import librosa
import math
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Settings

In [3]:
##DATASET

#folder of training data set
FOLDER_PATH = 'data/YT_set/wavs/'

#folder where normalized wavs are stored
NORM_FOLDER_PATH = 'data/YT_set/wavs/norm/'
norm_skip = False #skip normalization step (because it has been done previously)


##FEATUTRES
#Initialize data frame of features:

feats = pd.DataFrame([])

#tiny constant value
eps = sys.float_info.epsilon

#Features' settings:

fs_targ = 16000 # set all audios to this sampling frequency
n_channels_targ = 1

#framing
frame_len_s=0.025 #12 segments seemed adequeate in paper, since segments are no longer than 400ms (400ms/12=33.3ms)
frame_step_s=frame_len_s #according to paper: non-overlapping frames

frame_len = int(round(frame_len_s*fs_targ)) #in samples
frame_step = int(round(frame_step_s*fs_targ)) #in samples
win_func =np.hamming #at least for mfcc

#mfcc
cep_num= 13 #number of coefficients as in paper (https://link.springer.com/article/10.1007/s10439-013-0741-6)

#lp
lp_ord = int(round(2 + fs_targ/1000)) #standard rule of thumb for LP oder

#formants
nr_formants = 4 #as in paper, first 4 formants

## Functions

In [4]:
#Apply pre-emphasis (high-pass) filter
def apply_preEmph(x):
    x_filt = lfilter([1., -0.97], 1, x)
    return x_filt
        
#Obtain autocorrelation
def autocorr(x):
    result = np.correlate(x, x, mode='full')
    return result[int((result.size+1)/2):] #Note: other people use re.size/2:, but this does not work for me 
                                   # TODO: check consistency in other computers

#Compute zero-crossing rate
def get_zcr(x):
    zcr = (((x[:-1] * x[1:]) < 0).sum())/(len(x)-1)
    return zcr

#Compute log-energy
def get_logEnergy(x):
    logEnergy = np.log10( ( (np.power(x,2)).sum()/len(x) ) + eps)  
    return logEnergy

#Estimate fundamental frequency (F0)
def get_F0(x,fs):
    #autocorrelation-based method to extract F0
    xcorr_arr = autocorr(x)
    
    #looking for F0 in the frequency interval 50-500Hz, but we search in time domain
    min_ms = round(fs/500)
    max_ms = round(fs/50)
    
    xcorr_slot = xcorr_arr[max_ms+1:2*max_ms+1]
    xcorr_slot = xcorr_slot[min_ms:max_ms]
    t0 = np.argmax(xcorr_slot)
    F0 = fs/(min_ms+t0-1)
    return F0

#Estimate formants
def get_formants(x, lp_order, nr_formants):
    
    #compute lp coefficients
    a = librosa.lpc(x, lp_ord)
    

    #get roots from lp coefficients
    rts = np.roots(a)
    rts = [r for r in rts if np.imag(r) >= 0]

    #get angles
    angz = np.arctan2(np.imag(rts), np.real(rts))

    #get formant frequencies
    formants = sorted(angz * (fs_targ / (2 * math.pi)))
    
    return formants[0:nr_formants]

#Extract frequencies
def feature_extraction(x,fs,feats_df,lp_ord,ID,label):
#Extract features from signal x (identified as ID), and concatenate them to dataframe feats_df
#Features' reference: (see Appendix)
#[1]https://link.springer.com/article/10.1007/s10439-013-0741-6
#[2]https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf?dsi_version=c5434db897ab74b192ca295a9eeca041&Expires=1585086202&Key-Pair-Id=APKAJKNBJ4MJBJNC6NLQ&Signature=c8k8DmG~KIxg0ToTO8rebm2MzHneCzJGkjSFRB7BYTEQ-MHXEr0ocHmISrldP3hFf9qmeiL11ezyefcNeRVeKIQ9PVjOl9pn7rXWcjA1o2voPn1VnDd8n7G2cT31apdj0LNMclhlXRPnCsGD66qDRqa3d-xaqqXhEqU73aw3ZgBgroO213MfJOqFhJxxXo2QEia0bSlDRTeX9KhSczFK-IFTPC6GwFL2L04por8pQRI3HF7E3f26O9zp9OhkwxSU9qfJah20WxZLA4PxREdv7JGoVBinR6T0mTcIaQi~B4IzYjSPSsTTADMNk5znVYIvSqgtMT~DY~qwlfq4SRdFjQ__
  
    
    #do features in a frame-basis
    x_frames = spe_feats.sigproc.framesig(x,frame_len,frame_step,win_func) #DOUBT: should I use window or not?
                                                                        #at least for formant estimation i should

    nr_frames = x_frames.shape[0]
    #print(nr_frames)
        
    #0)Wavelets #TODO
    
    #DOUBT: if log-energy feature is included, should I also include the first mfcc coefficient (c0) ?
    #1)mfcc
    mfcc_feat = spe_feats.mfcc(x,fs, winlen=frame_len_s,winstep=frame_step_s, numcep=cep_num,winfunc=win_func)
    
    #deltas to capture 
    mfcc_delta_feat = spe_feats.delta(mfcc_feat,1) #mfcc_delta_feat = np.subtract(mfcc_feat[:-1], mfcc_feat[1:]) #same
    mfcc_deltadelta_feat = spe_feats.delta(mfcc_delta_feat,1)          
    
    #2)zero-crossing rate
    zcr_feat = np.apply_along_axis(get_zcr, 1, x_frames)
    
    #3)Formant frequencies
    #using LP-coeffcs-based method
    #formant_feat = np.apply_along_axis(get_formants, 1, x_frames, lp_ord, nr_formants)
    
    #Note: for the moment, it seems some frames are ill-conditioned for lp computing,
    #current solution - we skip those and fill with NaN values
    formants_feat= np.empty((nr_frames,4))
    formants_feat[:] = np.nan
    
    for i_frame in range(0,nr_frames):
        try: 
            formants_feat[i_frame] = get_formants(x_frames[i_frame], lp_ord, nr_formants)
        except:
            pass
    
    #4)Log-energy
    logEnergy_feat =  np.apply_along_axis(get_logEnergy, 1, x_frames)
    
    #5)Pitch (F0)
    F0_feat =  np.apply_along_axis(get_F0, 1, x_frames,fs)
    
    #TODO: compute also F0 with pysptk (a python wrapper for SPTK library), it probably gives better results
    #https://github.com/r9y9/pysptk/blob/master
    
    #6)Kurtosis
    kurt_feat =  np.apply_along_axis(kurtosis, 1, x_frames)
    
    #7)Bispectrum Score (BGS)
    #TODO: see PhD thesis for more info on this feature
    
    #8)Non-Gaussianity Score (NGS)
    #TODO: see PhD thesis for more info on this feature
   
    #9) Adding skewness as measure of non-gaussianity (not in paper)
    skew_feat =  np.apply_along_axis(skew, 1, x_frames)
    
    #DOUBT: 10) Shannon entropy GETTING -inf in all cases, WHY??? Don't include until fixed
    #entropy_feat = entropy(x)
    #Maybe compute directly to check
    
    mfcc_cols = ['mfcc_%s' % s for s in range(0,cep_num)]
    mfcc_delta_cols = ['mfcc_d%s' % s for s in range(0,cep_num)]
    mfcc_deltadelta_cols = ['mfcc_dd%s' % s for s in range(0,cep_num)]
    formants_cols = ['F%s' % s for s in range(1,nr_formants+1)]
          
    feats_segment = pd.concat([pd.DataFrame({'Id': ID, 'kurt': kurt_feat, 'logEnergy': logEnergy_feat,
                                                 'zcr': zcr_feat, 'F0': F0_feat,
                                                 'skewness': skew_feat, 'label': label}),
                               pd.DataFrame(mfcc_feat,columns=mfcc_cols), 
                            pd.DataFrame(formants_feat,columns=formants_cols)],axis=1)
    
    print(nr_frames)
    feats_df = feats_df.append(feats_segment,ignore_index=True, sort=False)
    
    return feats_df


# MAIN

## Reading recordings + feature extraction

In [5]:
#Read wav data set, apply pre-processing and extract features

#only list files in FOLDER_PATH directory
wav_files = [f for f in os.listdir(FOLDER_PATH) if os.path.isfile(os.path.join(FOLDER_PATH, f))]
for file_name in wav_files:
    
    fname_noExt = os.path.splitext(file_name)[0] #file name without extension
    
    #full path file name
    full_fname = FOLDER_PATH+file_name
    #print(full_fname)
    
    #name for normalization
    norm_fname = NORM_FOLDER_PATH + os.path.splitext(file_name)[0] + '_NORM.wav'
    
    if norm_skip is False: 
        ## Normalization
        
        #level to same dB
        tfm = sox.Transformer()
        tfm.gain(gain_db=0.0, normalize=False, limiter=False, balance=None)
        #downsample to 16kHz and 1 channel
        tfm.convert(samplerate=fs_targ, n_channels=n_channels_targ, bitdepth=None) 
        #tfm.norm(db_level=0.0)
    
        # create the output normalized audio
        
        print(norm_fname)
        tfm.build(full_fname, norm_fname)
        tfm.effects_log
    
    # load normalized audio
    s = AudioSegment.from_wav(norm_fname)
    #sampling rate:
    info = mediainfo(norm_fname)
    fs = float(info['sample_rate'])
    
    #get ID of recording
    ID = fname_noExt.split('-')[-2] #for the current type of naming
    print(file_name)
    print(ID)
    
    #get label
    label = fname_noExt.split('-')[-1] #for the current type of naming
    print(label)
    
    ## Segmentation of cough streams (silence-based)
    #min_silence_len in ms, silence_thresh in dB
    s_segments = split_on_silence (s, min_silence_len = 600, silence_thresh = -30)
    ## TODO: set more accurate thresholds, or find other way to split (variance-based?)
    
    #convert s_segments to numpy array format
    AudioSegment2numpy_arr = lambda x: np.asarray(x.get_array_of_samples())
    s_segments_np = list(map(AudioSegment2numpy_arr, s_segments))
    
    print('High-pass filtering...')
    #pre-emphasis filtering to each segment
    preEmph_filtering = lambda x: apply_preEmph(x)
    s_segments_filt = list(map(preEmph_filtering, s_segments_np))
    
    #TODO
    #2) the segment is divided into X non-overlapping subsegments (X=3 for dry/wet cough paper,
    #X=12 for pneumonia paper)
    #TODO: window framing: I think maybe the segments need to be windowed with non-overlapping frames? 
    #(see frame settings at beggining of notebook)
    
    print('Computing features...')
    #Feature extraction for each segment
    
    #(lambda function doesn't work )
    #feat_extr_step = lambda x, fs, feats_df, lp_ord, ID: feature_extraction(x,fs,feats_df,lp_ord,ID)
    #feats = feat_extr_step(s_segments_filt,fs,feats,lp_ord,ID)
    for idx, seg_i in enumerate(s_segments_filt):
        print('\tSegment %d' % idx)
        feats = feature_extraction(seg_i,fs,feats,lp_ord,ID,label)
    
       

output_file: data/YT_set/wavs/norm/More Allergy Coughing-NfKZNt25L-Q-Dry_NORM.wav already exists and will be overwritten on build


data/YT_set/wavs/norm/More Allergy Coughing-NfKZNt25L-Q-Dry_NORM.wav
More Allergy Coughing-NfKZNt25L-Q-Dry.wav
Q
Dry
High-pass filtering...
Computing features...
	Segment 0
9
	Segment 1
9
	Segment 2
22
	Segment 3
49
	Segment 4
29
	Segment 5
35
	Segment 6
38
	Segment 7
45
	Segment 8
39
	Segment 9
33
	Segment 10
12
	Segment 11
157
	Segment 12
23
	Segment 13
43
	Segment 14
46
	Segment 15
39
	Segment 16
15
	Segment 17
93
	Segment 18


output_file: data/YT_set/wavs/norm/Spring Allergy Coughing-7Ez5Wc_esBg-Dry_NORM.wav already exists and will be overwritten on build


38
	Segment 19
29
	Segment 20
80
data/YT_set/wavs/norm/Spring Allergy Coughing-7Ez5Wc_esBg-Dry_NORM.wav
Spring Allergy Coughing-7Ez5Wc_esBg-Dry.wav
7Ez5Wc_esBg
Dry
High-pass filtering...
Computing features...
	Segment 0
76
	Segment 1
21
	Segment 2
29
	Segment 3
48
	Segment 4
83
	Segment 5
57
	Segment 6
71
	Segment 7
9
	Segment 8
73
	Segment 9
27
	Segment 10
32
	Segment 11
29
	Segment 12


output_file: data/YT_set/wavs/norm/Spring Cold Coughing 2-AQOeIVbhFm4-Dry_NORM.wav already exists and will be overwritten on build


45
	Segment 13
44
data/YT_set/wavs/norm/Spring Cold Coughing 2-AQOeIVbhFm4-Dry_NORM.wav
Spring Cold Coughing 2-AQOeIVbhFm4-Dry.wav
AQOeIVbhFm4
Dry
High-pass filtering...
Computing features...
	Segment 0
27
	Segment 1
52
	Segment 2
60
	Segment 3
27
	Segment 4
66
	Segment 5
72
	Segment 6
40
	Segment 7
43
	Segment 8
9
	Segment 9
27
	Segment 10
51
	Segment 11
50
	Segment 12
24
	Segment 13
19
	Segment 14
32
	Segment 15
50
	Segment 16
58
	Segment 17
34
	Segment 18
56
	Segment 19
41
	Segment 20
31
	Segment 21
57
	Segment 22
82
	Segment 23


output_file: data/YT_set/wavs/norm/Spring Cold Coughing 3-tZtJaS2ZtME-Dry_NORM.wav already exists and will be overwritten on build


19
	Segment 24
33
	Segment 25
16
	Segment 26
27
data/YT_set/wavs/norm/Spring Cold Coughing 3-tZtJaS2ZtME-Dry_NORM.wav
Spring Cold Coughing 3-tZtJaS2ZtME-Dry.wav
tZtJaS2ZtME
Dry
High-pass filtering...
Computing features...
	Segment 0
31
	Segment 1
26
	Segment 2
86
	Segment 3
98
	Segment 4
63
	Segment 5
23
	Segment 6
13
	Segment 7
76
	Segment 8
60
	Segment 9
84
	Segment 10
93
	Segment 11
46
	Segment 12
33
	Segment 13
16
	Segment 14
237
	Segment 15
53
	Segment 16
108
	Segment 17
40
	Segment 18
61
	Segment 19
14
	Segment 20
9
	Segment 21
28
	Segment 22
30
	Segment 23
80
	Segment 24
62
	Segment 25
36
	Segment 26
12
	Segment 27
26
	Segment 28
20
	Segment 29
47
	Segment 30
40
	Segment 31
9
	Segment 32
10
	Segment 33
36
	Segment 34
35
	Segment 35
52
	Segment 36


output_file: data/YT_set/wavs/norm/Coughing 79-h2FLCKMcEX0-Wet_NORM.wav already exists and will be overwritten on build


38
	Segment 37
21
data/YT_set/wavs/norm/Coughing 79-h2FLCKMcEX0-Wet_NORM.wav
Coughing 79-h2FLCKMcEX0-Wet.wav
h2FLCKMcEX0
Wet
High-pass filtering...
Computing features...
	Segment 0
91
	Segment 1
50
	Segment 2
54
	Segment 3
38
	Segment 4
27
	Segment 5
9
	Segment 6
49
	Segment 7
9
	Segment 8
38
	Segment 9
25
	Segment 10


output_file: data/YT_set/wavs/norm/Male bronchitis cough-IzPMbIll3LE-Wet_NORM.wav already exists and will be overwritten on build


38
	Segment 11
37
	Segment 12
36
	Segment 13
30
	Segment 14
30
	Segment 15
9
	Segment 16
31
data/YT_set/wavs/norm/Male bronchitis cough-IzPMbIll3LE-Wet_NORM.wav
Male bronchitis cough-IzPMbIll3LE-Wet.wav
IzPMbIll3LE
Wet
High-pass filtering...
Computing features...
	Segment 0
79
	Segment 1
25
	Segment 2
54
	Segment 3
72
	Segment 4
67
	Segment 5
53
	Segment 6
9
	Segment 7
58
	Segment 8
97
	Segment 9
46
	Segment 10
91
	Segment 11
34
	Segment 12
9
	Segment 13
110
	Segment 14
86
	Segment 15
36
	Segment 16
65
	Segment 17
40
	Segment 18
114
	Segment 19
10
	Segment 20
35
	Segment 21
39
	Segment 22
31
	Segment 23
41
	Segment 24
23
	Segment 25


output_file: data/YT_set/wavs/norm/Coughing 77-2Mw-s5jnqXU-Wet_NORM.wav already exists and will be overwritten on build


166
	Segment 26
185
data/YT_set/wavs/norm/Coughing 77-2Mw-s5jnqXU-Wet_NORM.wav
Coughing 77-2Mw-s5jnqXU-Wet.wav
s5jnqXU
Wet
High-pass filtering...
Computing features...
	Segment 0
91
	Segment 1
72
	Segment 2
9
	Segment 3
35
	Segment 4
10
	Segment 5
34
	Segment 6
42
	Segment 7
20
	Segment 8
33
	Segment 9
34
	Segment 10
37
	Segment 11
31
	Segment 12
35
	Segment 13
11
	Segment 14
31
	Segment 15
27
	Segment 16
53
	Segment 17
39
	Segment 18
36
	Segment 19
15
	Segment 20
20
	Segment 21
34
	Segment 22
28
	Segment 23
26
	Segment 24
53
	Segment 25
75
	Segment 26
28
	Segment 27
25
	Segment 28
24
	Segment 29
25
	Segment 30
32
	Segment 31


output_file: data/YT_set/wavs/norm/November cold (wet coughing)-DYfjPnty2Ho-Wet_NORM.wav already exists and will be overwritten on build


9
	Segment 32
26
data/YT_set/wavs/norm/November cold (wet coughing)-DYfjPnty2Ho-Wet_NORM.wav
November cold (wet coughing)-DYfjPnty2Ho-Wet.wav
DYfjPnty2Ho
Wet
High-pass filtering...
Computing features...
	Segment 0
52
	Segment 1
24
	Segment 2
25
	Segment 3
24
	Segment 4
40
	Segment 5
25
	Segment 6
25
	Segment 7
32
	Segment 8
25
	Segment 9
35
	Segment 10
23
	Segment 11
25
	Segment 12
40
	Segment 13
25
	Segment 14
25
	Segment 15
32
	Segment 16
25
	Segment 17
28
	Segment 18
26
	Segment 19
30
	Segment 20
27
	Segment 21
25
	Segment 22
9
	Segment 23
26
	Segment 24
27
	Segment 25
25
	Segment 26


output_file: data/YT_set/wavs/norm/Dry Afternoon Cough-6LK6yHtIung-Dry_NORM.wav already exists and will be overwritten on build


20
	Segment 27
11
	Segment 28
12
	Segment 29
25
	Segment 30
35
	Segment 31
28
	Segment 32
24
	Segment 33
27
data/YT_set/wavs/norm/Dry Afternoon Cough-6LK6yHtIung-Dry_NORM.wav
Dry Afternoon Cough-6LK6yHtIung-Dry.wav
6LK6yHtIung
Dry
High-pass filtering...
Computing features...
	Segment 0
46
	Segment 1
40
	Segment 2
86
	Segment 3
29
	Segment 4
22
	Segment 5
11
	Segment 6
45
	Segment 7
110
	Segment 8
45
	Segment 9
20
	Segment 10
38
	Segment 11
30
	Segment 12
30
	Segment 13
38
	Segment 14
30
	Segment 15
14
	Segment 16
37
	Segment 17
28
	Segment 18
13
	Segment 19


output_file: data/YT_set/wavs/norm/Man Coughing Sound - Wet Cough Sound Effect-q6WsoL3J8U8-Wet_NORM.wav already exists and will be overwritten on build


21
data/YT_set/wavs/norm/Man Coughing Sound - Wet Cough Sound Effect-q6WsoL3J8U8-Wet_NORM.wav
Man Coughing Sound - Wet Cough Sound Effect-q6WsoL3J8U8-Wet.wav
q6WsoL3J8U8
Wet
High-pass filtering...
Computing features...
	Segment 0
35
	Segment 1
30
	Segment 2
43
	Segment 3
28
	Segment 4


output_file: data/YT_set/wavs/norm/Coughing 46-dg-I9j76-t8-Wet_NORM.wav already exists and will be overwritten on build


27
	Segment 5
16
	Segment 6
30
	Segment 7
43
	Segment 8
35
data/YT_set/wavs/norm/Coughing 46-dg-I9j76-t8-Wet_NORM.wav
Coughing 46-dg-I9j76-t8-Wet.wav
t8
Wet


output_file: data/YT_set/wavs/norm/# 55 gaggy wet cough-ct3tHDfNKiQ-Wet_NORM.wav already exists and will be overwritten on build


High-pass filtering...
Computing features...
	Segment 0
25
	Segment 1
9
	Segment 2
16
	Segment 3
18
	Segment 4
16
	Segment 5
30
	Segment 6
30
	Segment 7
21
data/YT_set/wavs/norm/# 55 gaggy wet cough-ct3tHDfNKiQ-Wet_NORM.wav
# 55 gaggy wet cough-ct3tHDfNKiQ-Wet.wav
ct3tHDfNKiQ
Wet
High-pass filtering...
Computing features...
	Segment 0
13
	Segment 1
44
	Segment 2
26
	Segment 3
17
	Segment 4
17
	Segment 5
47
	Segment 6
49
	Segment 7
38
	Segment 8
18
	Segment 9
51
	Segment 10
24
	Segment 11
35
	Segment 12
38
	Segment 13
55
	Segment 14
62
	Segment 15
29
	Segment 16


output_file: data/YT_set/wavs/norm/# 30 Chesty and wet cough-d2wkdrScerU-Wet_NORM.wav already exists and will be overwritten on build


41
data/YT_set/wavs/norm/# 30 Chesty and wet cough-d2wkdrScerU-Wet_NORM.wav
# 30 Chesty and wet cough-d2wkdrScerU-Wet.wav
d2wkdrScerU
Wet
High-pass filtering...
Computing features...
	Segment 0
19
	Segment 1
9
	Segment 2
9
	Segment 3
9
	Segment 4
18
	Segment 5
23
	Segment 6
33
	Segment 7
10
	Segment 8
35
	Segment 9
20
	Segment 10
21
	Segment 11
24
	Segment 12
21
	Segment 13
21
	Segment 14
25
	Segment 15
21
	Segment 16
22
	Segment 17
24
	Segment 18


output_file: data/YT_set/wavs/norm/Single wet cough-CTSLdNxN1cc-Wet_NORM.wav already exists and will be overwritten on build


19
	Segment 19
27
	Segment 20
11
	Segment 21
21
data/YT_set/wavs/norm/Single wet cough-CTSLdNxN1cc-Wet_NORM.wav
Single wet cough-CTSLdNxN1cc-Wet.wav
CTSLdNxN1cc
Wet
High-pass filtering...
Computing features...
	Segment 0
42
	Segment 1


output_file: data/YT_set/wavs/norm/Dry Early Morning Cough-XrpB4DTNQZw-Dry_NORM.wav already exists and will be overwritten on build


39
data/YT_set/wavs/norm/Dry Early Morning Cough-XrpB4DTNQZw-Dry_NORM.wav
Dry Early Morning Cough-XrpB4DTNQZw-Dry.wav
XrpB4DTNQZw
Dry
High-pass filtering...
Computing features...
	Segment 0
27
	Segment 1
62
	Segment 2
80
	Segment 3
49
	Segment 4
49
	Segment 5
35
	Segment 6
22
	Segment 7
26
	Segment 8
24
	Segment 9
19
	Segment 10
53
	Segment 11
33
	Segment 12
49
	Segment 13
67


output_file: data/YT_set/wavs/norm/Another Girl Coughing-iYxUHA-Pwsk-Dry_NORM.wav already exists and will be overwritten on build


	Segment 14
33
	Segment 15
12
	Segment 16
40
	Segment 17
26
	Segment 18
16
	Segment 19
12
	Segment 20
21
	Segment 21
12
data/YT_set/wavs/norm/Another Girl Coughing-iYxUHA-Pwsk-Dry_NORM.wav
Another Girl Coughing-iYxUHA-Pwsk-Dry.wav
Pwsk
Dry
High-pass filtering...
Computing features...
	Segment 0
25
	Segment 1
30
	Segment 2
36
	Segment 3
19
	Segment 4


output_file: data/YT_set/wavs/norm/Wet Throat Infection Cough-tfc5cXiXMDc-Wet_NORM.wav already exists and will be overwritten on build


52
data/YT_set/wavs/norm/Wet Throat Infection Cough-tfc5cXiXMDc-Wet_NORM.wav
Wet Throat Infection Cough-tfc5cXiXMDc-Wet.wav
tfc5cXiXMDc
Wet
High-pass filtering...
Computing features...
	Segment 0
48
	Segment 1
78
	Segment 2
32
	Segment 3
14
	Segment 4
46
	Segment 5
46
	Segment 6
42
	Segment 7
46
	Segment 8


output_file: data/YT_set/wavs/norm/# 61 morning phlegmy cough...again-qfpJg179YNk-Wet_NORM.wav already exists and will be overwritten on build


57
	Segment 9
46
	Segment 10
42
	Segment 11
34
	Segment 12
21
data/YT_set/wavs/norm/# 61 morning phlegmy cough...again-qfpJg179YNk-Wet_NORM.wav
# 61 morning phlegmy cough...again-qfpJg179YNk-Wet.wav
qfpJg179YNk
Wet
High-pass filtering...
Computing features...
	Segment 0
15
	Segment 1
35
	Segment 2
34
	Segment 3
31
	Segment 4
27
	Segment 5
23
	Segment 6
26
	Segment 7
31
	Segment 8
33
	Segment 9
38
	Segment 10
32
	Segment 11
36
	Segment 12
9
	Segment 13
16
	Segment 14
19
	Segment 15
12
	Segment 16
19
	Segment 17
9
	Segment 18
19
	Segment 19
36
	Segment 20
31
	Segment 21
15
	Segment 22
11
	Segment 23
10
	Segment 24
13
	Segment 25
19
	Segment 26
10
	Segment 27
43
	Segment 28
15
	Segment 29
58
	Segment 30
9


output_file: data/YT_set/wavs/norm/Wet coughing-0QQxKN-KC1U-Wet_NORM.wav already exists and will be overwritten on build


	Segment 31
29
	Segment 32
27
	Segment 33
27
	Segment 34
30
data/YT_set/wavs/norm/Wet coughing-0QQxKN-KC1U-Wet_NORM.wav
Wet coughing-0QQxKN-KC1U-Wet.wav
KC1U
Wet
High-pass filtering...
Computing features...
	Segment 0
40
	Segment 1
24
	Segment 2
53
	Segment 3
25
	Segment 4
29
	Segment 5
9
	Segment 6
42
	Segment 7
44
	Segment 8
27
	Segment 9
57


output_file: data/YT_set/wavs/norm/My deep wet cough-De4HdyocTHY-Wet_NORM.wav already exists and will be overwritten on build


	Segment 10
17
	Segment 11
44
	Segment 12
19
	Segment 13
27
data/YT_set/wavs/norm/My deep wet cough-De4HdyocTHY-Wet_NORM.wav
My deep wet cough-De4HdyocTHY-Wet.wav
De4HdyocTHY
Wet
High-pass filtering...
Computing features...
	Segment 0
22
	Segment 1
40
	Segment 2
54
	Segment 3
16
	Segment 4
43
	Segment 5
26
	Segment 6
32
	Segment 7
15
	Segment 8
17
	Segment 9
19
	Segment 10
32
	Segment 11
12
	Segment 12
13
	Segment 13
12
	Segment 14
14
	Segment 15
12
	Segment 16
10
	Segment 17
13
	Segment 18
9
	Segment 19
57
	Segment 20
43
	Segment 21
77
	Segment 22
43
	Segment 23
12
	Segment 24
22
	Segment 25
23
	Segment 26
27
	Segment 27
35
	Segment 28
10
	Segment 29
27
	Segment 30
25
	Segment 31
10
	Segment 32
47
	Segment 33
25
	Segment 34
21
	Segment 35
30
	Segment 36
40
	Segment 37
30
	Segment 38
40
	Segment 39
10
	Segment 40
38
	Segment 41
43
	Segment 42
27
	Segment 43
56
	Segment 44
43
	Segment 45
48
	Segment 46
12
	Segment 47
32
	Segment 48
11
	Segment 49
50
	Segment 50
9
	Segment 51
36
	Segment

output_file: data/YT_set/wavs/norm/Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet_NORM.wav already exists and will be overwritten on build


24
	Segment 76
38
	Segment 77
18
	Segment 78
48
data/YT_set/wavs/norm/Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet_NORM.wav
Wheezing Chest and Wet Cough 2-5905FxXz9dI-Wet.wav
5905FxXz9dI
Wet
High-pass filtering...
Computing features...
	Segment 0
10
	Segment 1
30
	Segment 2
17
	Segment 3
55
	Segment 4
27
	Segment 5
22
	Segment 6
30
	Segment 7
41
	Segment 8
19
	Segment 9
56
	Segment 10
35
	Segment 11
9
	Segment 12
24
	Segment 13
27
	Segment 14
13
	Segment 15
67
	Segment 16
23
	Segment 17
27
	Segment 18
13
	Segment 19
24
	Segment 20
36
	Segment 21
23
	Segment 22
47
	Segment 23
45
	Segment 24


output_file: data/YT_set/wavs/norm/Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry_NORM.wav already exists and will be overwritten on build


18
	Segment 25
42
data/YT_set/wavs/norm/Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry_NORM.wav
Dry Coughing Fit in the Afternoon.-A5s2ZgwQ1VM-Dry.wav
A5s2ZgwQ1VM
Dry
High-pass filtering...
Computing features...
	Segment 0
81
	Segment 1
23
	Segment 2
35
	Segment 3
27
	Segment 4
22
	Segment 5
25
	Segment 6
48
	Segment 7
67
	Segment 8
9
	Segment 9
62
	Segment 10
22
	Segment 11
27
	Segment 12
14
	Segment 13
14
	Segment 14
10
	Segment 15
42
	Segment 16
12
	Segment 17
20
	Segment 18
34
	Segment 19
9
	Segment 20
26
	Segment 21
20
	Segment 22
50
	Segment 23
46
	Segment 24
24
	Segment 25


output_file: data/YT_set/wavs/norm/Heavy cold and sore throat coughing.-NaOVmYoIjbs-Dry_NORM.wav already exists and will be overwritten on build


18
	Segment 26
24
data/YT_set/wavs/norm/Heavy cold and sore throat coughing.-NaOVmYoIjbs-Dry_NORM.wav
Heavy cold and sore throat coughing.-NaOVmYoIjbs-Dry.wav
NaOVmYoIjbs
Dry
High-pass filtering...
Computing features...
	Segment 0
25
	Segment 1
78
	Segment 2
59
	Segment 3
10
	Segment 4
29
	Segment 5
49
	Segment 6
39
	Segment 7
38
	Segment 8


output_file: data/YT_set/wavs/norm/Spring Cold Coughing.-u2KMBD5-oCg-Dry_NORM.wav already exists and will be overwritten on build


62
	Segment 9
26
	Segment 10
31
	Segment 11
28
	Segment 12
23
data/YT_set/wavs/norm/Spring Cold Coughing.-u2KMBD5-oCg-Dry_NORM.wav
Spring Cold Coughing.-u2KMBD5-oCg-Dry.wav
oCg
Dry
High-pass filtering...
Computing features...
	Segment 0
10
	Segment 1
41
	Segment 2
30
	Segment 3
9
	Segment 4
44
	Segment 5
23
	Segment 6
45
	Segment 7
48
	Segment 8
34
	Segment 9
47
	Segment 10
43
	Segment 11
12
	Segment 12
39
	Segment 13
54
	Segment 14
47
	Segment 15
21
	Segment 16
32
	Segment 17
11
	Segment 18
24
	Segment 19
31
	Segment 20
9
	Segment 21
56
	Segment 22


output_file: data/YT_set/wavs/norm/# 60 coughing still (deep and wet cough)-jxYNLCYTwZQ-Wet_NORM.wav already exists and will be overwritten on build


36
	Segment 23
30
data/YT_set/wavs/norm/# 60 coughing still (deep and wet cough)-jxYNLCYTwZQ-Wet_NORM.wav
# 60 coughing still (deep and wet cough)-jxYNLCYTwZQ-Wet.wav
jxYNLCYTwZQ
Wet
High-pass filtering...
Computing features...
	Segment 0
24
	Segment 1
11
	Segment 2
20
	Segment 3
26
	Segment 4
33
	Segment 5
46
	Segment 6
26
	Segment 7
32
	Segment 8
18
	Segment 9
9
	Segment 10
16
	Segment 11
40
	Segment 12


output_file: data/YT_set/wavs/norm/Coughing Woman Sound - Woman Cough Sound Effect-zjd4HrJbc8o-Dry_NORM.wav already exists and will be overwritten on build


33
data/YT_set/wavs/norm/Coughing Woman Sound - Woman Cough Sound Effect-zjd4HrJbc8o-Dry_NORM.wav
Coughing Woman Sound - Woman Cough Sound Effect-zjd4HrJbc8o-Dry.wav
zjd4HrJbc8o
Dry
High-pass filtering...
Computing features...
	Segment 0
27
	Segment 1
17
	Segment 2
10
	Segment 3
30
	Segment 4
24
	Segment 5
10
	Segment 6
9
	Segment 7
19
	Segment 8


output_file: data/YT_set/wavs/norm/Cough Around the Clock!-4k0ziD0j5BI-Wet_NORM.wav already exists and will be overwritten on build


20
data/YT_set/wavs/norm/Cough Around the Clock!-4k0ziD0j5BI-Wet_NORM.wav
Cough Around the Clock!-4k0ziD0j5BI-Wet.wav
4k0ziD0j5BI
Wet
High-pass filtering...
Computing features...
	Segment 0
40
	Segment 1
47
	Segment 2
32
	Segment 3
37
	Segment 4
23
	Segment 5
20
	Segment 6
16
	Segment 7
26
	Segment 8
56
	Segment 9
33
	Segment 10
29
	Segment 11
9
	Segment 12
14
	Segment 13
14
	Segment 14
35
	Segment 15
44
	Segment 16
42
	Segment 17
37
	Segment 18
37
	Segment 19
29
	Segment 20
15
	Segment 21
38
	Segment 22
39
	Segment 23
30
	Segment 24


output_file: data/YT_set/wavs/norm/#64 coughing, allergies, singing lungs-CsDXlt7Ei1c-Wet_NORM.wav already exists and will be overwritten on build


26
	Segment 25
34
	Segment 26
30
	Segment 27
32
data/YT_set/wavs/norm/#64 coughing, allergies, singing lungs-CsDXlt7Ei1c-Wet_NORM.wav
#64 coughing, allergies, singing lungs-CsDXlt7Ei1c-Wet.wav
CsDXlt7Ei1c
Wet
High-pass filtering...
Computing features...
	Segment 0
13
	Segment 1
30
	Segment 2
31
	Segment 3
9
	Segment 4
28
	Segment 5
32
	Segment 6
22
	Segment 7
50
	Segment 8
10
	Segment 9
25
	Segment 10
66
	Segment 11
9
	Segment 12
30
	Segment 13
13
	Segment 14
10
	Segment 15
34
	Segment 16
12
	Segment 17
33
	Segment 18
35
	Segment 19
31
	Segment 20
31
	Segment 21
33
	Segment 22
15
	Segment 23
20
	Segment 24
9
	Segment 25
35
	Segment 26
33


output_file: data/YT_set/wavs/norm/# 34 coughing up crap again-rkF_uMizqoc-Wet_NORM.wav already exists and will be overwritten on build


	Segment 27
13
	Segment 28
10
data/YT_set/wavs/norm/# 34 coughing up crap again-rkF_uMizqoc-Wet_NORM.wav
# 34 coughing up crap again-rkF_uMizqoc-Wet.wav
rkF_uMizqoc
Wet
High-pass filtering...
Computing features...
	Segment 0
20
	Segment 1
18
	Segment 2
19
	Segment 3
33
	Segment 4
11
	Segment 5
38
	Segment 6
27
	Segment 7
22
	Segment 8
20
	Segment 9
28
	Segment 10
24
	Segment 11
22
	Segment 12


output_file: data/YT_set/wavs/norm/Coughing 51-LkxvBb2VXbs-Dry_NORM.wav already exists and will be overwritten on build


24
data/YT_set/wavs/norm/Coughing 51-LkxvBb2VXbs-Dry_NORM.wav
Coughing 51-LkxvBb2VXbs-Dry.wav
LkxvBb2VXbs
Dry
High-pass filtering...
Computing features...
	Segment 0
15
	Segment 1
11
	Segment 2
12
	Segment 3
13
	Segment 4
22
	Segment 5
20
	Segment 6
9
	Segment 7
12
	Segment 8
9
	Segment 9
10
	Segment 10
11
	Segment 11
21
	Segment 12
15
	Segment 13
9
	Segment 14
9
	Segment 15
39
	Segment 16
38
	Segment 17
13
	Segment 18
13
	Segment 19
37
	Segment 20
14
	Segment 21
14
	Segment 22


output_file: data/YT_set/wavs/norm/Dry Morning Cough turns Chesty and Barking.-ekqLlw-Xe68-Dry_NORM.wav already exists and will be overwritten on build


17
	Segment 23
30
	Segment 24
40
data/YT_set/wavs/norm/Dry Morning Cough turns Chesty and Barking.-ekqLlw-Xe68-Dry_NORM.wav
Dry Morning Cough turns Chesty and Barking.-ekqLlw-Xe68-Dry.wav
Xe68
Dry
High-pass filtering...
Computing features...
	Segment 0
63
	Segment 1
34
	Segment 2
26
	Segment 3
33
	Segment 4
53
	Segment 5
30
	Segment 6
23
	Segment 7
38
	Segment 8
32
	Segment 9
38
	Segment 10
61
	Segment 11
30
	Segment 12
12
	Segment 13
10
	Segment 14
30
	Segment 15
29
	Segment 16
23
	Segment 17
42
	Segment 18
34
	Segment 19
33
	Segment 20
79
	Segment 21
41
	Segment 22
25
	Segment 23
28
	Segment 24
37
	Segment 25
59


output_file: data/YT_set/wavs/norm/# 31 night wet cough-Dc_aoUCqw2E-Wet_NORM.wav already exists and will be overwritten on build


	Segment 26
21
	Segment 27
27
	Segment 28
29
	Segment 29
31
	Segment 30
34
	Segment 31
9
data/YT_set/wavs/norm/# 31 night wet cough-Dc_aoUCqw2E-Wet_NORM.wav
# 31 night wet cough-Dc_aoUCqw2E-Wet.wav
Dc_aoUCqw2E
Wet
High-pass filtering...
Computing features...
	Segment 0
18
	Segment 1
32
	Segment 2
14
	Segment 3
34
	Segment 4
19
	Segment 5
24
	Segment 6
9
	Segment 7
31
	Segment 8
9
	Segment 9
36
	Segment 10
23
	Segment 11
29
	Segment 12
36
	Segment 13
23
	Segment 14


output_file: data/YT_set/wavs/norm/Coughing 60-diuuEXKzNB8-Wet_NORM.wav already exists and will be overwritten on build


10
	Segment 15
45
data/YT_set/wavs/norm/Coughing 60-diuuEXKzNB8-Wet_NORM.wav
Coughing 60-diuuEXKzNB8-Wet.wav
diuuEXKzNB8
Wet
High-pass filtering...
Computing features...
	Segment 0
70
	Segment 1
63
	Segment 2
57
	Segment 3
78
	Segment 4
9
	Segment 5
61
	Segment 6
11
	Segment 7
54
	Segment 8
66
	Segment 9
42
	Segment 10
31
	Segment 11
43
	Segment 12


output_file: data/YT_set/wavs/norm/Mid-morning Winter Coughing Fit-h-GtQfDCoaE-Dry_NORM.wav already exists and will be overwritten on build


52
data/YT_set/wavs/norm/Mid-morning Winter Coughing Fit-h-GtQfDCoaE-Dry_NORM.wav
Mid-morning Winter Coughing Fit-h-GtQfDCoaE-Dry.wav
GtQfDCoaE
Dry
High-pass filtering...
Computing features...
	Segment 0
56
	Segment 1
45
	Segment 2
30
	Segment 3
86
	Segment 4
26
	Segment 5
18
	Segment 6
33
	Segment 7
9
	Segment 8
49
	Segment 9
44
	Segment 10
84
	Segment 11
40
	Segment 12
48
	Segment 13
49
	Segment 14
40
	Segment 15


output_file: data/YT_set/wavs/norm/Residual Phlegmy Morning Coughing and Gagging-TK4CveeCWfY-Wet_NORM.wav already exists and will be overwritten on build


33
	Segment 16
50
	Segment 17
14
	Segment 18
15
data/YT_set/wavs/norm/Residual Phlegmy Morning Coughing and Gagging-TK4CveeCWfY-Wet_NORM.wav
Residual Phlegmy Morning Coughing and Gagging-TK4CveeCWfY-Wet.wav
TK4CveeCWfY
Wet
High-pass filtering...
Computing features...
	Segment 0
87
	Segment 1
27
	Segment 2
48
	Segment 3
9
	Segment 4
74
	Segment 5
73
	Segment 6
40
	Segment 7
37
	Segment 8
53
	Segment 9
51
	Segment 10
13
	Segment 11
31
	Segment 12
26
	Segment 13
47
	Segment 14
86
	Segment 15
38
	Segment 16
11
	Segment 17
24
	Segment 18
23
	Segment 19
90
	Segment 20
15
	Segment 21


output_file: data/YT_set/wavs/norm/Coughing 14 - After work-1UDFq2InljM-Dry_NORM.wav already exists and will be overwritten on build


31
	Segment 22
14
data/YT_set/wavs/norm/Coughing 14 - After work-1UDFq2InljM-Dry_NORM.wav
Coughing 14 - After work-1UDFq2InljM-Dry.wav
1UDFq2InljM
Dry
High-pass filtering...
Computing features...
	Segment 0
32
	Segment 1
31
	Segment 2
32
	Segment 3
33
	Segment 4
29
	Segment 5
30
	Segment 6
9
	Segment 7
40
	Segment 8
39
	Segment 9
9
	Segment 10
58
	Segment 11
23


In [6]:
feats

Unnamed: 0,Id,kurt,logEnergy,zcr,F0,skewness,label,mfcc_0,mfcc_1,mfcc_2,...,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,F1,F2,F3,F4
0,Q,2.130898,5.535231,0.493734,500.000000,-0.073570,Dry,18.726310,-49.164042,-21.943514,...,4.222008,-6.405460,-23.258862,-30.374339,21.403037,24.980936,0.000000,925.103724,1840.988979,2514.599705
1,Q,1.545643,5.650084,0.481203,355.555556,-0.023850,Dry,18.853876,-46.084848,-23.129777,...,1.274268,-21.431062,2.649611,-2.911352,23.824741,6.022679,984.918270,1336.004572,2393.647660,3457.490333
2,Q,3.455492,5.693125,0.398496,340.425532,0.060297,Dry,18.881794,-46.953603,-29.814847,...,14.064683,-33.612875,0.582441,-8.900234,18.914764,21.723671,1082.497991,1521.894130,2303.224957,3334.387099
3,Q,2.705560,5.950192,0.373434,390.243902,-0.028460,Dry,19.155061,-45.559214,-34.543965,...,14.530844,-55.256140,1.971662,4.050639,26.670061,15.960065,1137.112141,1159.306893,2339.993180,3205.991440
4,Q,1.262319,5.939354,0.270677,400.000000,0.261261,Dry,18.881250,-47.445340,-34.115887,...,17.923054,-40.710783,-5.962290,11.440152,25.719434,-5.918450,1157.686238,1232.201808,2319.610594,3314.860067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26066,1UDFq2InljM,3.047776,6.269337,0.182957,470.588235,-1.157113,Dry,18.547338,-27.690078,-19.460858,...,-42.477016,-24.957201,-35.001481,5.110327,-22.201085,-5.170684,0.000000,485.811290,1127.858286,2362.825897
26067,1UDFq2InljM,2.946299,4.816438,0.290727,444.444444,-0.049535,Dry,16.141393,-29.294489,-13.170555,...,-11.641517,-44.159174,-19.803384,16.631558,6.202915,-5.215586,403.598514,1156.610226,2403.494580,3549.297585
26068,1UDFq2InljM,2.435713,4.248565,0.310777,516.129032,0.039825,Dry,14.692141,-27.883082,-13.303023,...,-15.676712,-27.726053,-0.864033,4.824035,-10.168872,-22.813406,0.000000,565.937275,1223.541129,2561.363375
26069,1UDFq2InljM,2.741493,3.926815,0.421053,421.052632,0.242132,Dry,14.522135,-32.537145,-11.114888,...,-23.824930,-34.327166,-8.661193,-0.349616,-9.037070,-4.645845,0.000000,442.061606,1276.099474,2520.313825


In [7]:
#Store feature df:
feats_fname = 'feats_df.pkl'
feats.to_pickle(feats_fname)

## Pre-processing of features

In [21]:
#1.Check which columns have NaNs values

#feats.columns[feats.isna().any()].tolist() --> We get just the ones we have inserted in formants
feats2 = feats.interpolate(method ='cubic')

#feats2.columns[feats2.isna().any()].tolist()
feats2.describe()

Unnamed: 0,kurt,logEnergy,zcr,F0,skewness,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,F1,F2,F3,F4
count,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,...,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0,26071.0
mean,2.972985,5.064151,0.442942,436.271165,0.024222,17.310277,-41.131276,-16.316976,-18.061951,-11.628431,...,-11.854346,-9.242122,2.613025,-1.524919,-1.229979,-1.902861,456.206607,1230.832845,2112.192826,2992.705809
std,4.68212,1.792228,0.152986,66.628232,0.390001,4.345712,11.668752,11.661586,13.019689,13.704011,...,17.549195,16.889033,17.622099,15.268481,14.219725,13.735149,303.440971,449.343032,472.711375,456.585632
min,-3.0,-15.65356,0.0,210.526316,-7.3405,-36.043653,-76.124856,-64.24896,-91.176248,-83.995654,...,-88.29596,-84.558419,-69.139495,-70.65029,-60.882308,-63.796756,-357.092729,-446.875904,0.0,0.0
25%,1.666832,4.485423,0.338346,390.243902,-0.143412,15.464972,-49.83421,-23.877204,-26.674046,-20.430958,...,-23.562303,-19.788195,-9.004312,-11.191404,-10.604431,-10.772917,258.65079,1081.785667,1851.228675,2732.034992
50%,2.218974,5.508103,0.461153,444.444444,0.010905,18.311801,-42.401186,-16.273715,-17.690588,-11.384874,...,-11.247365,-8.213625,2.28227,-1.660474,-1.460138,-2.075129,500.30024,1336.956344,2200.911659,3095.411955
75%,2.988356,6.212164,0.561404,500.0,0.172778,20.18294,-33.69881,-8.661568,-9.432803,-2.708925,...,0.06711,1.985476,14.481733,8.217148,7.762313,6.689347,651.861895,1522.655123,2458.363723,3322.961469
max,276.587737,8.569225,0.814536,516.129032,15.525267,26.007117,11.409921,28.565271,41.85114,46.906596,...,62.767509,60.724107,74.775819,69.424473,73.729549,68.077051,1395.180747,2505.344668,3254.624397,4398.377803


In [93]:
#2. Get feature set, labels, and recording IDs
X_train = feats2.drop(['label','Id'], 1).copy()
y_train = feats2['label'].copy()

ID_train = feats2['Id'].copy()
ID_list = ID_train.drop_duplicates()

#ID_train.size
#ID_list.size

In [94]:
#3. Normalization in case some model requires it

scaler = StandardScaler()
scaler.fit(X_train)

#use same scaler for both, based on X_train data
X_trainNorm = scaler.transform(X_train.values)

## Model training

### Train-test split (k-fold)

In [95]:
k = ID_list.values.size #number of folds

group_kfold = GroupKFold(n_splits=k)
group_kfold.get_n_splits(X_trainNorm, y_train, ID_train)

36

### Logistic regression

In [101]:
#Do cross-validation
acc= np.empty([k, 1])

idx_acc = 0
for train_index, test_index in group_kfold.split(X_trainNorm,y_train,ID_train):
    X_train1, X_test1 = X_trainNorm[train_index], X_trainNorm[test_index]
    y_train1, y_test1 = Y_train[train_index], Y_train[test_index]
    
    logReg = LogisticRegression()
    logReg.fit(X_train1, y_train1)
    y_hat = train_predictions = logReg.predict(X_test1)
    acc[idx_acc] = accuracy_score(y_test1, y_hat)
    idx_acc = idx_acc +1
    



In [107]:
print(acc.mean())
print(acc.std())

0.6197853809784839
0.2140665483275574


In [108]:
acc #TODO check exactly the speech that has given the most problems

array([[0.94074074],
       [0.1987041 ],
       [0.7880597 ],
       [0.81504986],
       [0.94363636],
       [0.31170018],
       [0.4556962 ],
       [0.80705623],
       [0.47678369],
       [0.40972222],
       [0.75324675],
       [0.59802713],
       [0.31282051],
       [0.28737113],
       [0.63459038],
       [0.46675359],
       [0.62482947],
       [0.80055402],
       [0.57919255],
       [0.81161695],
       [0.8692053 ],
       [0.47587354],
       [0.28442029],
       [0.84507042],
       [0.7702407 ],
       [0.59161148],
       [0.71493213],
       [0.73979592],
       [0.38082192],
       [0.82634731],
       [0.68627451],
       [0.86062718],
       [0.39759036],
       [0.92121212],
       [0.37654321],
       [0.55555556]])

In [None]:
#TODO: check confusion matrix, to see where there was the most problems

In [10]:
#Cough sound
#Breathing rate
#Breathing rhytm (consistence smoothness)
#Cough rate
#Panic level
#Hoarseness