In [25]:
# general purpose libraries
import numpy as np
import datetime as dt
import pandas as pd
import os
import warnings
import pickle
from timeit import default_timer as timer
from collections import OrderedDict
from itertools import chain

from tqdm.notebook import tqdm
from scipy.signal import hilbert

pd.set_option("display.max_columns", None)


import warnings
warnings.filterwarnings('ignore', '.*PySoundFile failed. Trying audioread instead*.', )


In [2]:
# plots and visualisation
import matplotlib.pyplot as plt
import plotly.graph_objects as ply_go
import plotly.figure_factory as ply_ff
import plotly.colors as ply_colors #.sequential.Oranges as orange_palette
#print(plotly.colors.named_colorscales() )
#plotly.colors.sequential.swatches()
#ply_colors.sequential.Oranges

In [3]:
# DSP libraries
from scipy import signal
import librosa
import librosa.display as librosa_display

In [4]:
# ML and data modelling libraries
from sklearn.preprocessing   import MinMaxScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score,roc_curve, precision_recall_curve,confusion_matrix,precision_score, recall_score,average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

In [11]:
### Setup paths and directories
work_dir = r'dataset/' 
data_dir = work_dir + r"COUGHVID/"
audio_outdir = "out"

metadata_file = "metadata_compiled.csv"

In [12]:
metadata=pd.read_csv(data_dir+metadata_file,sep=",")
#print(metadata.columns)

# convert strings 'True'/'False' to genuine booleans
cols_to_boolean = (['respiratory_condition', 'fever_muscle_pain',
                     'dyspnea_1', 'wheezing_1', 'stridor_1','choking_1', 'congestion_1', 'nothing_1',
                     'dyspnea_2', 'wheezing_2', 'stridor_2','choking_2', 'congestion_2', 'nothing_2',
                     'dyspnea_3', 'wheezing_3', 'stridor_3','choking_3', 'congestion_3', 'nothing_3',
                     'dyspnea_4', 'wheezing_4', 'stridor_4','choking_4', 'congestion_4', 'nothing_4'])
for c in cols_to_boolean:
    metadata.loc[metadata[c].notnull(),c] = metadata.loc[metadata[c].notnull(),c].astype(bool) 

# remove entries where either status or age is NA
print("Metadata df entries before cleaning NAs: {}".format(metadata.shape[0]))
metadata = metadata.loc[~((metadata['status'].isnull() ) | (metadata['age'].isnull()) ),]
print("Metadata df entries after cleaning NAs: {}".format(metadata.shape[0]))



Metadata df entries before cleaning NAs: 27550
Metadata df entries after cleaning NAs: 15218


In [13]:
metadata['audio_class'] = 'X' # default, we should have none by the end of this classification process
metadata.loc[ (metadata['cough_detected'] >= 0.80) & (metadata['status'] == "COVID-19") ,'audio_class'] = 1
metadata.loc[ (metadata['cough_detected'] >= 0.80) & (metadata['status'] == "healthy") ,'audio_class'] = 0
metadata.loc[ (metadata['cough_detected'] >= 0.80) & (metadata['status'] == "symptomatic") ,'audio_class'] = 2

print("Entries subdivided in classes. Printing the number of entries for each class:")
print(metadata[['audio_class','uuid']].groupby(['audio_class']).count().rename(columns={'uuid':'N_entries'}) )

print("\n\n\nSplitting count by class and status:")
print(metadata[['audio_class','status','uuid']].groupby(['audio_class','status']).count().rename(columns={'uuid':'N_entries'}) )


Entries subdivided in classes. Printing the number of entries for each class:
             N_entries
audio_class           
0                 7905
1                  608
2                 1778
X                 4927



Splitting count by class and status:
                         N_entries
audio_class status                
0           healthy           7905
1           COVID-19           608
2           symptomatic       1778
X           COVID-19           357
            healthy           3827
            symptomatic        743


In [14]:
validdata = metadata.loc[(metadata['audio_class'] != 'X')]
print(validdata[['audio_class','uuid']].groupby(['audio_class']).count().rename(columns={'uuid':'N_entries'}) )

             N_entries
audio_class           
0                 7905
1                  608
2                 1778


In [15]:
# Balance the number of each classes
num = len(validdata.loc[validdata["audio_class"] == 1])
balanced_data = pd.concat([
    validdata.loc[validdata["audio_class"] == 1],
    validdata.loc[validdata["audio_class"] == 0].sample(n = num),
    validdata.loc[validdata["audio_class"] == 2].sample(n = num)
])

In [16]:
def movmean(values, window):
    weights = np.repeat(1.0, window)/window
    sma = np.convolve(values, weights, 'valid')
    return sma

def calarea(filename, indir, sr=None ):
    signal, sr = librosa.load(indir+filename, sr=sr, mono=True)
    signal = movmean(signal, 3)
    area = np.sum(np.abs(hilbert(signal)))

    return area

    

In [28]:
for uuid in tqdm(balanced_data["uuid"].values):
    filename = uuid + ".webm"
    if not os.path.exists(data_dir + filename):
        # try to look for a .ogg file
        filename = uuid + ".ogg"
        if not os.path.exists(data_dir + filename):
            print(f"could not find audio file for uuid: {uuid}")
            continue

    area = calarea(filename, data_dir)
    balanced_data.loc[balanced_data["uuid"] == uuid, "area"] = area

  0%|          | 0/1824 [00:00<?, ?it/s]

In [33]:
balanced_data.to_csv(data_dir + "balanced_data.csv")

In [41]:
X = balanced_data["area"].values.reshape(-1, 1).astype("float32")
Y = balanced_data["audio_class"].values.reshape(-1).astype("uint8")

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,accuracy_score,auc,roc_curve,cohen_kappa_score


scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)

print(classification_report(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))
print(cohen_kappa_score(Y_test, Y_pred))


              precision    recall  f1-score   support

           0       0.35      0.45      0.39       126
           1       0.30      0.28      0.29       120
           2       0.30      0.23      0.26       119

    accuracy                           0.32       365
   macro avg       0.32      0.32      0.32       365
weighted avg       0.32      0.32      0.32       365

0.3232876712328767
-0.018286346796783315
