# Data Pre-processing on Coughvid dataset

This notebook describes the pre-processing steps applied to the Coughvid dataset. These include:

1. Conversion to .wav format
2. Filtering out samples with cough_detected < 0.8.
3. Filtering out samples labelled "symptomatic" and samples without a label.
4. Downsampling to 16 kHz.
5. Standardising to 10 seconds by padding/cropping.
6. Adding binary labels (COVID-19 vs. not COVID-19) and saving everything into a single .npz file.
7. Importing data into a tensorflow format.


## Imports

In [None]:
########################################################################
#                            Imports                                   #
########################################################################

# Data processing
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%matplotlib inline

# Audio files processing
import wave
import librosa

# Manipulating file paths and running external commands
from pathlib import Path
import os
import re
import glob
import subprocess
import scipy.sparse as sparse

# Tensorflow
import tensorflow as tf

## Paths

In [None]:
########################################################################
#                           Data Paths                                 #
########################################################################

# Root folder of the Coughvid dataset; every other location is derived from it
# (pathlib keeps this portable across Linux and Windows).
dataset_root = Path(Path.cwd().parent, "datasets", "coughvid_dataset_updated")

# Folder containing the metadata CSV files
data_path = Path(dataset_root, "dataset_cleaned_labelled")
print(f"Data folder: {data_path}")

# Folder containing the recordings converted to .wav
data_wav = Path(dataset_root, "dataset_wav")
print(f"Data .wav folder: {data_wav}")

# Folder for the 16 kHz .npz output
data_npz_16k = Path(dataset_root, "dataset_npz_16k")
print(f"Data npz 16k folder: {data_npz_16k}")

# Unfiltered metadata table
metadata_summary_path = os.path.join(data_path, 'metadata.csv')
print(f"Data summary file: {metadata_summary_path}")

metadata = pd.read_csv(metadata_summary_path, delimiter = ',')

# Number of rows in the unfiltered metadata
n_samples = metadata.shape[0]

## Conversion to .wav files

The .webm and .ogg files are converted to .wav files by calling the `ffmpeg` command-line utility from Python.

In [None]:
## Convert .webm and .ogg recordings to .wav with ffmpeg

# Recordings are looked up in `data_path` and the converted files are written
# to `data_wav`, which is the folder the downsampling step reads from.
# NOTE(review): the previous version used an undefined `data_folder`; confirm
# that the raw .webm/.ogg recordings actually live in `data_path`.
data_wav.mkdir(parents=True, exist_ok=True)

file_names = metadata.uuid.to_numpy()
n = file_names.shape[0]

for counter, name in enumerate(file_names):

    if counter % 1000 == 0:
        print("Finished {0}/{1}".format(counter, n))

    wav_file = Path(data_wav, name + ".wav")
    if wav_file.is_file():
        # Already converted on a previous run; skipping keeps the cell
        # re-runnable (ffmpeg would otherwise ask before overwriting).
        continue

    # Prefer the .webm recording and fall back to .ogg, as before.
    candidates = (Path(data_path, name + ext) for ext in (".webm", ".ogg"))
    source_file = next((p for p in candidates if p.is_file()), None)
    if source_file is None:
        print("Error: No file name {0}".format(name))
        continue

    # Passing an argument list (no shell) works on both Linux and Windows and
    # is safe for paths that contain spaces.
    result = subprocess.run(
        ["ffmpeg", "-i", str(source_file), str(wav_file)],
        stdin=subprocess.DEVNULL,
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print("Error: ffmpeg failed for {0}".format(name))

## Filtering

Samples with cough_detected < 0.8, samples labelled "symptomatic", and samples without a label are removed in Excel. The filtered CSV is then imported here.

In [None]:
# Metadata table after the manual filtering step
metadata_filtered_path = os.path.join(data_path, 'metadata_filtered.csv')
print(f"Filtered data summary file: {metadata_filtered_path}")

# From here on `metadata` refers to the filtered table.
metadata = pd.read_csv(metadata_filtered_path, delimiter = ',')

## Downsampling and Standardising

In [None]:
########################################################################
#                       Data Downsampling                              #
########################################################################

# Target sampling rate (Hz) and fixed clip length in samples (10 seconds)
sample_rate = 16000
length = sample_rate * 10

# One fixed-length waveform per row of the metadata table, in the same order
data = []

for i, uuid in enumerate(metadata["uuid"]):

    # Progress report every 100 files
    if i % 100 == 0:
        print(f"Completed {i} files")

    # librosa resamples to `sample_rate` while loading
    signal, _ = librosa.load(Path(data_wav, uuid + ".wav"), sr=sample_rate)
    n_frames = signal.shape[0]

    if n_frames < length:
        # Too short: append zeros at the end up to the target length
        signal = np.pad(signal, (0, length - n_frames), 'constant')
    else:
        # Long enough: keep only the first `length` samples
        signal = signal[:length]

    data.append(signal)


## Augment binary labels and save as .npz file

In [None]:
# Binary target: 1 where the status is "COVID-19", 0 for every other status
labels = metadata["status"].eq("COVID-19").astype(int).to_numpy()

# Waveforms and labels go into one archive stored next to the npz folder.
# The path uses `data_npz_16k`, the only npz path defined in the Paths cell.
np.savez(Path(data_npz_16k.parent, "coughvid_16k.npz"), x=data, y=labels)

## Load data into a tensorflow format

The resulting dataset can then be split into train/validation/test sets.

In [None]:
########################################################################
#                           Data Loading                               #
########################################################################

# Read back the archive written by the save step ("coughvid_16k.npz").
# The handle is named `npz_file` so it does not shadow the `data` list.
with np.load(Path(data_npz_16k.parent, "coughvid_16k.npz")) as npz_file:
    X = npz_file["x"]  # waveforms
    y = npz_file["y"]  # binary labels

# One dataset element per (waveform, label) pair
train_dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [None]:
# Bare expression as the last line of the cell: the notebook displays the dataset's repr.
train_dataset