In [1]:
import os
import pandas as pd
import librosa
import librosa.display
import numpy as np
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import soundfile
import pickle

In [2]:
### Setup paths and directories
dataset_path = ".."
rawdata_path = os.path.join(dataset_path, "Extracted_data")
out_path = "raw"
metadata_path = os.path.join(dataset_path, "combined_data.csv")

In [3]:
metadata = pd.read_csv(metadata_path, sep = ",")

In [4]:
metadata["covid"] = 'X'
metadata.loc[metadata["covid_status"] == "healthy", "covid"] = 0
metadata.loc[(metadata["covid_status"] == "positive_mild") | (metadata["covid_status"] == "positive_moderate"), "covid"] = 1
metadata = metadata.loc[metadata["covid"] != 'X']

In [5]:
print(metadata[['covid','id']].groupby(['covid']).count().rename(columns={'id':'N_entries'}))

       N_entries
covid           
0           1433
1            591


In [6]:
# Balance the number of each classes
num = len(metadata.loc[metadata["covid"] == 1])

balanced_data = pd.concat([
    metadata.loc[metadata["covid"] == 1],
    metadata.loc[metadata["covid"] == 0].sample(n = num)
    # metadata.loc[metadata["covid"] == 2].sample(n = num)
])

print(balanced_data[['covid','id']].groupby(['covid']).count().rename(columns={'id':'N_entries'}))

       N_entries
covid           
0            591
1            591


In [7]:
data = {
    "filename": [],
    "signal": [],
    "label": []
}

sample_rate = 48000
fix_length = sample_rate * 3

for uuid, covid in tqdm(zip(balanced_data["id"].values, balanced_data["covid"].values), total = len(balanced_data)):

    file_path = os.path.join(rawdata_path, uuid, "cough-heavy.wav")
    
    if not os.path.exists(file_path):
        print(f"could not find audio file for uuid: {uuid}")
        continue

    signal, sr = librosa.load(file_path, sr = None)

    if signal.size == 0:
        print("Empty audio")
        continue

    if sr != sample_rate:
        print("Resampling...")
        signal = librosa.resample(signal, orig_sr = sr, target_sr = sample_rate)

    signal = librosa.util.fix_length(signal, size = fix_length)

    data["filename"].append(uuid)
    data["signal"].append(signal)
    data["label"].append(covid)


  0%|          | 0/1182 [00:00<?, ?it/s]

Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Resampling...
Resampling...
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Resampling...
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Empty audio
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Resampling...
Empty audio
Empty audio
Resampling...
Resampling...
Empty audio
Empty audio
Empty audio
Empty audio
Resampling..

In [8]:
print("covid: 0, {}".format(np.sum(np.asarray(data["label"]) == 0)))
print("covid: 1, {}".format(np.sum(np.asarray(data["label"]) == 1)))

covid: 0, 574
covid: 1, 589


In [9]:
signals = np.asarray(data["signal"])
signals.shape

(1163, 144000)

In [10]:
with open("signals_144000.pkl", "wb") as f:
    pickle.dump(data, f)

In [11]:
n_mfcc = 39
frame_length = 2048 # about 40ms per frame
hop_length = frame_length // 2


mfccs = librosa.feature.mfcc(y = signals, n_mfcc = n_mfcc, n_fft = frame_length, hop_length = hop_length)

In [12]:
mfccs.shape

(1163, 39, 141)

In [13]:
filename = f"mfcc39_fixlength_{fix_length}_nmfcc_{n_mfcc}_framelength_{frame_length}_hoplength_{hop_length}.npy"

In [14]:
np.save(filename, mfccs)