In [None]:
#Research references:
#1) Dry/wet cough classification: https://link.springer.com/article/10.1007/s10439-013-0741-6
#2) Pneumonia classification: https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6987276
#3) PhD thesis (UQ eSpace record UQ_344963; the signed download link originally pasted here expired in March 2020): https://espace.library.uq.edu.au/data/UQ_344963/s41943203_phd_submission.pdf

In [None]:
import numpy as np
import os
#import pywt #wavelets
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub.utils import mediainfo
from pydub.playback import play
import matplotlib.pyplot as plt
#import seaborn as sn
import python_speech_features as spe_feats
import pandas as pd
from scipy.stats import kurtosis, skew
from scipy.signal import lfilter
import librosa
import pysptk
import math
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier


#settings
import config

## Reading recordings

In [None]:
# Parallel accumulators appended to by the wav-reading cell:
# audio segment, class label, recording ID and sampling rate of each file.
all_s, all_label, all_id, all_fs = [], [], [], []

In [None]:
#Read wav data set

if config.featExtr_skip is False:
    print("Readings wavs...")

    # Start from empty accumulators so that re-running this cell does not
    # append every recording a second time.
    all_s.clear()
    all_label.clear()
    all_id.clear()
    all_fs.clear()

    # Only regular files directly inside FOLDER_PATH are considered. The listing
    # is sorted because os.listdir() returns entries in arbitrary order and a
    # later cell picks a recording by position (all_s[15]).
    wav_files = sorted(
        f for f in os.listdir(config.FOLDER_PATH)
        if os.path.isfile(os.path.join(config.FOLDER_PATH, f))
    )
    for file_name in wav_files:

        fname_noExt = os.path.splitext(file_name)[0] #file name without extension

        # Current naming scheme: "<...>-<ID>-<label>.<ext>". The name is parsed
        # before anything is appended so a bad file cannot leave the four
        # lists with different lengths.
        name_parts = fname_noExt.split('-')
        if len(name_parts) < 2:
            raise ValueError("Unexpected file name (expected '<...>-<ID>-<label>'): " + file_name)
        ID, label = name_parts[-2], name_parts[-1]

        # os.path.join also works when FOLDER_PATH has no trailing separator
        # (plain string concatenation produced e.g. "datafile.wav").
        full_fname = os.path.join(config.FOLDER_PATH, file_name)
        print(full_fname)

        # load audio
        s = AudioSegment.from_wav(full_fname)

        #sampling rate:
        info = mediainfo(full_fname)
        fs = float(info['sample_rate'])

        all_s.append(s)
        all_fs.append(fs)
        all_id.append(ID)
        all_label.append(label)

Listening to some of the recordings

In [None]:
# Positions of the recordings labelled 'Dry'.
# Written as the cell's last top-level expression so Jupyter renders it: an
# expression nested inside an `if` body is evaluated but never displayed.
# When feature extraction is skipped the value is None and nothing is shown.
np.where(np.array(all_label)=='Dry') if config.featExtr_skip is False else None

In [None]:
# Positions of the recordings labelled 'Wet'.
# Written as the cell's last top-level expression so Jupyter renders it: an
# expression nested inside an `if` body is evaluated but never displayed.
# When feature extraction is skipped the value is None and nothing is shown.
np.where(np.array(all_label)=='Wet') if config.featExtr_skip is False else None

In [None]:
# Pick one recording to inspect/listen to.
# NOTE(review): index 15 is hard-coded and raises IndexError when fewer than
# 16 recordings were loaded.
if config.featExtr_skip is False:
    s = all_s[15]

# The segment has to be the last top-level expression of the cell to be
# rendered; inside the `if` body above it would be evaluated silently.
s if config.featExtr_skip is False else None

## Feature extraction

In [None]:
# Project-local module providing feature_extraction_Step.
import featureExtractionFunctions as feat

# Features are only computed when extraction is enabled in config; otherwise
# `feats` is expected to be loaded from disk by the store/load cell.
if config.featExtr_skip is False:

    # Inputs are the parallel lists filled by the wav-reading cell
    # (audio segments, recording IDs, labels). all_fs is not passed.
    feats = feat.feature_extraction_Step(all_s,all_id,all_label)
    
       

## Store (or load) features

In [None]:

# Cache file for the feature table, relative to the working directory.
feats_fname = 'feats_df.pkl'

if config.featExtr_skip is not False:
    # Extraction was skipped: reuse the table cached by a previous run.
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # this notebook produced itself.
    feats = pd.read_pickle(feats_fname)
else:
    # Features were just computed: cache them for later runs.
    feats.to_pickle(feats_fname)

## Pre-processing of features

In [None]:
#1. Discard every column that contains at least one NaN.
#   According to the earlier check (feats.columns[feats.isna().any()]) only the
#   NaNs inserted for the formants are affected. Cubic interpolation and
#   row-wise dropping were tried as alternatives and are not used.
feats2 = feats.dropna(axis='columns').copy()


In [None]:
 
#Make dictionary and add label column using it 
def addLabel2df(feats):
    """Return a dict mapping each recording ``Id`` to its ``label``.

    Despite the name, ``feats`` itself is not modified: the first row of every
    distinct ``Id`` supplies the label stored in the returned dictionary.
    """
    first_rows = feats.drop_duplicates(subset=['Id'])
    return {
        rec_id: rec_label
        for rec_id, rec_label in zip(first_rows['Id'], first_rows['label'])
    }


def frame_mean_std_chunk_modeling (feats2, label_dict, batch_size=10):
    """Summarise the frames of each recording as per-chunk mean and std.

    Rows sharing the same ``Id`` are split, in row order, into chunks of
    ``batch_size`` consecutive rows (the last chunk of a recording may be
    shorter). Every numeric feature column is reduced per chunk to its mean
    (column suffix ``_m``) and its population standard deviation (``_std``).
    The classifier is trained on these chunk statistics.

    Parameters
    ----------
    feats2 : pandas.DataFrame
        One row per frame; must contain an ``Id`` column. The frame is not
        modified.
    label_dict : dict
        Maps each ``Id`` to its class label.
    batch_size : int, default 10
        Number of frames per chunk (a float such as 3.0 is also accepted).

    Returns
    -------
    pandas.DataFrame
        Columns ``Id``, ``subIdx``, all ``<feature>_m``, all ``<feature>_std``
        and ``label``; one row per (recording, chunk).

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {!r}".format(batch_size))

    key_cols = ['Id', 'subIdx']

    # Chunk number of every frame inside its own recording. assign() returns a
    # new DataFrame, so the caller's table does not gain helper columns.
    chunk_idx = (feats2.groupby('Id').cumcount() // batch_size).astype('int64')
    frames = feats2.assign(subIdx=chunk_idx)

    # Aggregate numeric features only. Non-numeric columns such as the string
    # 'label' cannot be averaged: older pandas silently dropped them, while
    # pandas >= 2.0 raises TypeError. The label is re-attached below instead.
    numeric_cols = frames.select_dtypes(include='number').columns
    feature_cols = [c for c in numeric_cols if c not in key_cols]
    grouped = frames.groupby(key_cols)[feature_cols]

    mean_feats = grouped.mean().add_suffix('_m').reset_index()
    # ddof=0 to compute population std (rather than sample std)
    std_feats = grouped.std(ddof=0).add_suffix('_std').reset_index()

    mean_std_feats = pd.merge(mean_feats, std_feats, on=key_cols, how='outer')
    mean_std_feats['label'] = mean_std_feats['Id'].map(label_dict)

    return mean_std_feats
   

#TODO: modeling of chunks using sequence models too

# Id -> label lookup built from the NaN-free frame table.
label_dict = addLabel2df(feats2)
# Per-chunk mean/std statistics; this is the table the classifier is trained on.
mean_std_feats = frame_mean_std_chunk_modeling (feats2,label_dict)


#sum(mean_std_feats.isna().any())
#mean_std_feats.describe()

In [None]:
#2. Get feature set, labels, and recording IDs
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (drop([...], 1)) raises TypeError since pandas 2.0.
X_train = mean_std_feats.drop(columns=['label','Id','subIdx']).copy()
y_train = mean_std_feats['label'].copy()

# Recording ID of every chunk (used as the CV group) and the distinct IDs.
ID_train = mean_std_feats['Id']
ID_list = ID_train.drop_duplicates()

#number of distinct recordings
ID_list.size

In [None]:
#3. Normalization in case some model requires it

scaler = StandardScaler()
scaler.fit(X_train)

# Transform the same DataFrame that was used for fitting. Fitting on a
# DataFrame and transforming a bare ndarray (X_train.values) makes recent
# scikit-learn warn "X does not have valid feature names". The result is a
# NumPy array either way.
# NOTE(review): the scaler is fitted on all chunks before cross-validation,
# so each held-out fold contributes to its own normalisation statistics.
X_trainNorm = scaler.transform(X_train)

In [None]:
# Number of feature columns that still contain at least one NaN.
int(X_train.isna().any().sum())

## Model training

### Train-test split (k-fold)

In [None]:
# One fold per distinct recording ID: every recording is held out exactly once.
k = len(ID_list) #number of folds

group_kfold = GroupKFold(n_splits=k)
group_kfold.get_n_splits(X=X_trainNorm, y=y_train, groups=ID_train)

In [None]:
#display the number of folds (one per distinct recording ID)
k

### Logistic regression

In [None]:
#Do cross-validation
import re
import sklearn

# SGDClassifier's logistic loss was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old name was removed in 1.3, so use whichever
# spelling the installed version understands.
_sk_version = tuple(int(p) for p in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
LOGISTIC_LOSS = 'log_loss' if _sk_version >= (1, 1) else 'log'

# Chunk-level probabilities of each fold. They are concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the accumulated
# frame on every iteration.
fold_probs = []

for train_index, test_index in group_kfold.split(X_trainNorm, y_train, ID_train):
    X_train1, X_test1 = X_trainNorm[train_index], X_trainNorm[test_index]
    # split() yields integer positions, hence .iloc rather than label-based [].
    y_train1, y_test1 = y_train.iloc[train_index], y_train.iloc[test_index]

    #TODO: optimize the penalty weight
    #https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
    # A fixed random_state makes the SGD shuffling, and therefore the
    # reported results, reproducible across runs.
    logReg = SGDClassifier(loss=LOGISTIC_LOSS, penalty='elasticnet', random_state=0)
    logReg.fit(X_train1, y_train1)
    y_hat_prob = logReg.predict_proba(X_test1)
    classes = logReg.classes_
    fold_probs.append(pd.DataFrame({
        'ID': ID_train.iloc[test_index].values,
        str(classes[0]): y_hat_prob[:,0],
        str(classes[1]): y_hat_prob[:,1],
    }))

pred_probs = pd.concat(fold_probs, ignore_index=True, sort=False) if fold_probs else pd.DataFrame([])

In [None]:
def predict_class(prob_dry,prob_wet):
    """Return 'Dry' when the dry probability is strictly larger, else 'Wet'.

    Ties, and comparisons involving NaN, therefore resolve to 'Wet'.
    """
    return 'Dry' if prob_dry > prob_wet else 'Wet'

#get probability per recording
def get_predClass_per_audio(pred_probs, label_dict):
    """Aggregate chunk-level probabilities into one prediction per recording.

    Parameters
    ----------
    pred_probs : pandas.DataFrame
        One row per chunk with columns ``ID``, ``Dry`` and ``Wet``.
    label_dict : dict
        Maps each recording ID to its actual class label.

    Returns
    -------
    pandas.DataFrame
        One row per ``ID`` holding the mean ``Dry`` and ``Wet`` probabilities,
        the predicted class (``pred_class``) and the actual class (``label``).
    """
    mean_pred_probs = pred_probs.groupby('ID').aggregate('mean').reset_index()

    # Vectorised form of the per-row rule "'Dry' if Dry > Wet else 'Wet'"
    # (ties and NaN give 'Wet'). Unlike DataFrame.apply(axis=1) it also works
    # when there are no rows.
    is_dry = mean_pred_probs['Dry'] > mean_pred_probs['Wet']
    mean_pred_probs['pred_class'] = np.where(is_dry, 'Dry', 'Wet')

    #add actual classes
    mean_pred_probs['label'] = mean_pred_probs["ID"].map(label_dict)
    return mean_pred_probs

# One row per recording: mean class probabilities, predicted class and actual label.
mean_pred_probs = get_predClass_per_audio(pred_probs, label_dict)

## Evaluation

In [None]:
# Imported under an alias that does not shadow Python's built-in eval().
import classifEvaluationFunctions as clf_eval

# Evaluate the per-recording predictions against the actual labels.
clf_eval.evaluation_Step(mean_pred_probs)
    