## 1. Volume normalization and silence-based cough segmentation

In [1]:
#Research references:
#1) Dry/wet cough classification: https://link.springer.com/article/10.1007/s10439-013-0741-6
#2) Pneumonia classification: https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6987276

In [2]:
#from scipy.io.wavfile import read
#import wave
import numpy as np
import os
import sox
#import pywt #wavelets
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
import matplotlib.pyplot as plt
import python_speech_features as spe_feats
import pandas as pd
from scipy.stats import kurtosis
import sys

In [3]:
#Initialize data frame of features:

feats = pd.DataFrame([])

#tiny constant value
eps = sys.float_info.epsilon

#Features' settings:

fs_targ = 16000 # set all audios to this sampling frequency
n_channels_targ = 1

#framing
winlen=0.025
winstep=0.01

#mfcc
mfcc_coeffcs= 12 #as paper (https://link.springer.com/article/10.1007/s10439-013-0741-6)

In [4]:
norm_skip = False #skip normalization step (because it has been done previously)

#s = read(audiofile)
FOLDER_PATH = 'data/YT_set/wavs/1/'
for file_name in os.listdir(FOLDER_PATH):
    
    fname_noExt = os.path.splitext(file_name)[0] #file name without extension
    
    #full path file name
    full_fname = FOLDER_PATH+file_name
    #print(full_fname)
    
    #TODO: put normalized wavs in other folder
    #name for normalization
    NORM_FOLDER_PATH = 'data/YT_set/wavs/norm/'
    norm_fname = NORM_FOLDER_PATH + os.path.splitext(file_name)[0] + '_NORM.wav'
    
    if norm_skip is False: 
        ## Normalization
        
        #level to same dB
        tfm = sox.Transformer()
        tfm.gain(gain_db=0.0, normalize=False, limiter=False, balance=None)
        #downsample to 16kHz and 1 channel
        tfm.convert(samplerate=fs_targ, n_channels=n_channels_targ, bitdepth=None) 
        #tfm.norm(db_level=0.0)
    
        # create the output normalized audio
        
        print(norm_fname)
        tfm.build(full_fname, norm_fname)
        tfm.effects_log
    
    # load normalized audio
    s = AudioSegment.from_wav(norm_fname)
    #sampling rate:
    info = mediainfo(norm_fname)
    fs = float(info['sample_rate'])
    
    #get ID of recording
    ID = fname_noExt.split('-')[1] #for the current type of naming
    
    #get label
    ## TODO: probably better to insert label in the file name, during creation of data set
    
    
    ## Segmentation of cough streams (silence-based)
    #min_silence_len in ms, silence_thresh in dB
    s_segments = split_on_silence (s, min_silence_len = 600, silence_thresh = -30)
    ## TODO: set more accurate thresholds, or find other way to split (variance-based?)
    

output_file: data/YT_set/wavs/norm/Dry Afternoon Cough-6LK6yHtIung_NORM.wav already exists and will be overwritten on build


data/YT_set/wavs/norm/Dry Afternoon Cough-6LK6yHtIung_NORM.wav


In [5]:
#TODO: prior to feature extraction, according to paper:

#1) high pass filter (I guess this is just a pre-emphasis filter)
#2) the segment is divided into X non-overlapping subsegments (X=3 for dry/wet cough paper, X=12 for pneumonia paper)

In [6]:
s

In [7]:
s_segments

[<pydub.audio_segment.AudioSegment at 0x7f55c1fc1470>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc16a0>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc16d8>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1cf8>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1d30>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1d68>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1da0>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1dd8>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1e10>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1e48>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1e80>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1eb8>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1ef0>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1f28>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1f60>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1f98>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fc1fd0>,
 <pydub.audio_segment.AudioSegment at 0x7f55c1fbc048>,
 <pydub.au

In [8]:
s_segments[14]

In [9]:
fs

16000.0

In [10]:
seg_array = np.asarray(s_segments[3].get_array_of_samples())


## Feature extraction

In [11]:
#TODO: window framing: I think maybe the segments need to be windowed with non-overlapping frames? 
#(see frame settings at beggining of notebook)

#TODO: Convert feature extraction into a function

print('Computing features...')
for idx, seg_i in enumerate(s_segments):
    seg_i_array = np.asarray(seg_i.get_array_of_samples())
    
    print(idx)
    #Feature extraction:
    #Reference: https://link.springer.com/article/10.1007/s10439-013-0741-6
    
    #TODO:
    #0)Wavelets
    
    #DOUBT: if log-energy feature is included, should I also include the first mfcc coefficient (c0) ?
    #1)mfcc
    mfcc_feat = spe_feats.mfcc(seg_i_array,fs)
          
    #2)zero-crossing rate
    zcr_feat = (((seg_i_array[:-1] * seg_i_array[1:]) < 0).sum())/len(seg_i_array)
    
    #3)Formant frequencies (first 4: F1,F2,F3,F4)
    #TODO: use LP method to find formants (or Praat?)
    
    #4)Log-energy
    logEnergy_feat = np.log10( ( (np.power(seg_i_array,2)).sum()/len(seg_i_array) ) + eps)    
    
    #5)Pitch (F0)
    #TODO: Check to use Praat for F0 (maybe also use it for other stuff?)
    
    #6)Kurtosis
    kurt_feat = kurtosis(seg_i_array)
    
    #7)Bispectrum Score (BGS)
    #TODO
    
    #8)Non-Gaussianity Score (NGS)
    #TODO
    
    
    
    feats = feats.append(pd.DataFrame({'Id': ID, 'mfcc': [mfcc_feat], 
                                       'kurtosis': kurt_feat, 'logEnergy': logEnergy_feat, 'zcr': zcr_feat},
                                      index=[0]), ignore_index=True, sort=False)

Computing features...
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [12]:
feats

Unnamed: 0,Id,mfcc,kurtosis,logEnergy,zcr
0,6LK6yHtIung,"[[12.36276530370189, -19.963203496598798, -17....",2.8104,2.87316,0.473482
1,6LK6yHtIung,"[[11.887527236226413, -24.915677748615128, -19...",2.717761,2.97319,0.489075
2,6LK6yHtIung,"[[12.014100104970545, -22.975764171061737, -16...",4.119877,2.964489,0.462441
3,6LK6yHtIung,"[[11.922238892260383, -23.466792610148442, -11...",3.498607,2.702768,0.47993
4,6LK6yHtIung,"[[11.955470067394275, -22.2358427134386, -15.6...",2.295001,2.977876,0.445105
5,6LK6yHtIung,"[[19.35201007910828, -33.953070006511346, -21....",3.589704,2.690761,0.508681
6,6LK6yHtIung,"[[14.338701087086028, -18.17531434751059, -8.9...",3.293099,2.786984,0.489986
7,6LK6yHtIung,"[[15.533184602190428, -25.73546663659931, -5.9...",8.980241,3.168842,0.479357
8,6LK6yHtIung,"[[12.008056194040137, -24.70512080454013, -15....",7.555325,3.16406,0.46415
9,6LK6yHtIung,"[[12.008452875258532, -25.972160770324447, -13...",3.619942,2.854311,0.463877


In [None]:
mfcc_feat.shape

In [None]:
seg_i_array

In [None]:
#get numpy array from sound file
samples = np.asarray(s.get_array_of_samples())

In [None]:
plt.plot(samples)

In [None]:
samples_0 = np.asarray(s_segments[0].get_array_of_samples())
plt.plot(samples_0)

In [None]:
#s = wave.open(norm_fname,"rb")

#sampling rate
#fs = s.getframerate()
#fs = sox.file_info.sample_rate(full_fname)

#number of channels
#channels = s.getnchannels()

## TODO: Cough detection and segmentation

## Pre-processing

In [None]:
#Cough sound
#Breathing rate
#Breathing rhytm (consistence smoothness)
#Cough rate
#Panic level
#Hoarseness