# Coswara Data Processing

This notebook contains the pre-processing of the Coswara dataset. Note that its output was not used in the final report: a cleaned and validated iteration of this dataset was released as the DiCOVA Challenge dataset and was used instead of this version.

## Imports

In [None]:
########################################################################
#                            Imports                                   #
########################################################################

# Data processing
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%matplotlib inline

# Audio files processing
import wave
import librosa

# Manipulating file paths
from pathlib import Path
import os
import re
import glob
import scipy.sparse as sparse

# Tensorflow
import tensorflow as tf

## Paths

In [None]:
########################################################################
#                           Data Paths                                 #
########################################################################

# Dataset folder, resolved relative to the parent of the current working
# directory; pathlib keeps the path valid on both Linux and Windows.
data_path = Path.cwd().parent / "datasets" / "coswara_dataset"
print(f"Data folder: {data_path}")

# Both CSV locations are kept as plain strings inside the data folder.
metadata_path = os.fspath(data_path / 'Coswara_coughs.csv')
print(f"Data summary file: {metadata_path}")

metadata_cd_path = os.fspath(data_path / 'coswara_metadata_summary.csv')
print(f"Data cough detection file: {metadata_cd_path}")

# Read the two comma-separated tables in one pass.
metadata, metadata_cd = (
    pd.read_csv(csv_file, delimiter=',')
    for csv_file in (metadata_path, metadata_cd_path)
)

# Names of the entries found directly inside the data folder.
files = os.listdir(data_path)

## Dropping unused columns

In [None]:
## Dropping unused columns

# Every column removed after the merge, one line per original cleanup pass.
unused_columns = [
    "Unnamed: 3", "Unnamed: 4", "Unnamed: 5", "Unnamed: 6",
    "ep", "l_l", "l_s", "rU",
    "asthma", "cough", "smoker", "ht", "cold",
    "diabetes", "um", "ihd", "bd", "st",
    "fever", "ftg", "mp", "loss_of_smell", "test_status", "pneumonia", "diarrhoea", "cld",
    "path", "copy_path",
]

# Left-join the second table onto the first by "id", then discard the
# unused columns in a single step.
df = metadata.merge(metadata_cd, how="left", on="id").drop(columns=unused_columns)
df.head()

## Adding the output of the cough detection algorithm to the dataframe

In [None]:
## Adding cough detection output

# Pair up the two detection scores row by row.
cough_detected = list(zip(df["cough_detected_heavy"], df["cough_detected_shallow"]))

# Even-positioned rows take the heavy score, odd-positioned rows the shallow one.
# NOTE(review): this presumably relies on rows alternating heavy/shallow
# recordings - confirm against the CSV ordering.
cd = [
    heavy_score if position % 2 == 0 else shallow_score
    for position, (heavy_score, shallow_score) in enumerate(cough_detected)
]

# Replace the two per-recording columns with the single selected score.
df = df.drop(columns=["cough_detected_heavy", "cough_detected_shallow"])
df["cough_detected"] = cd
df.head()

## Filtering out samples which will not be used

In [None]:
## Filtering

# Rows with any of these covid_status values are removed from the dataset.
excluded_statuses = [
    "resp_illness_not_identified",
    "no_resp_illness_exposed",
    "recovered_full",
]

print(f"Number of samples: {df.shape[0]}")

# Keep only rows whose cough-detection score exceeds 0.8. The explicit
# copy makes df2 independent of df, so later steps never operate on a
# slice of the original frame (avoids SettingWithCopyWarning).
df2 = df[df["cough_detected"] > 0.8].copy()
print(f"Number of samples: {df2.shape[0]}")

# Remove one status at a time and report the remaining row count after
# each step, building a new frame rather than dropping in place.
for status in excluded_statuses:
    df2 = df2[df2["covid_status"] != status]
    print(f"Number of samples: {df2.shape[0]}")


In [None]:
# Persist the filtered metadata. The path is relative, so the file is
# written to the kernel's current working directory, not to data_path.
df2.to_csv("metadata_coswara.csv")

## Downsampling to 16 kHz

In [None]:
########################################################################
#                       Data Downsampling                              #
########################################################################

sample_rate = 16000
length = sample_rate * 10 # 10 seconds


def _fit_to_length(signal, target_length):
    """Return `signal` cut or zero-padded at the end to `target_length` samples."""
    n_samples = signal.shape[0]
    if n_samples >= target_length:
        return signal[:target_length]
    return np.pad(signal, (0, target_length - n_samples), 'constant')


data = []
# None can never equal a real id, unlike a placeholder string.
prev_file = None

for i, file in enumerate(metadata["id"]):

    if i % 100 == 0:
        print(f"Completed {i} files")

    # An id that repeats the previous row's id is loaded as the shallow
    # cough recording; any other row is loaded as the heavy cough recording.
    recording = "cough-shallow.wav" if file == prev_file else "cough-heavy.wav"

    # Join path components instead of concatenating a '/' separator by hand.
    x, _ = librosa.load(Path(data_path, file, recording), sr=sample_rate)

    # Every waveform is stored with exactly `length` samples.
    data.append(_fit_to_length(x, length))

    prev_file = file

In [None]:
# Store the fixed-length waveforms under key "x" and the labels under key
# "y" in one .npz archive inside the data folder.
# FIXME(review): `labels` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError unless it is
# defined elsewhere - confirm where the labels should come from.
np.savez(Path(data_path,"coswara_16k.npz"), x=data, y=labels)

## Load data into tensorflow format

In [None]:
########################################################################
#                           Data Loading                               #
########################################################################

# The archive handle gets its own name so this cell does not rebind the
# notebook-level `data` list of waveforms to a closed file object, which
# would break re-running the save cell.
with np.load(Path(data_path, "coswara_16k.npz")) as archive:
    X = archive["x"]
    y = archive["y"]

# One dataset element per (waveform, label) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [None]:
# Show the dataset object as the cell output.
train_dataset

In [None]:
# Show the shape of the label array read from the archive.
y.shape