# Dataset Preprocessing
The Coughvid dataset comes with some corrupted wav files. This notebook finds the corrupted files and removes their entries from the dataset metadata.



In [None]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
# import audiomentations
# from audiomentations import Compose, AddGaussianNoise, PitchShift
# import torch_audiomentations
# from torch_audiomentations import Compose, AddGaussianNoise, PitchShift
import torchaudio
import subprocess

1. Find the bad wav files.

In [None]:
# NOTE(review): AUDIO_DIR is not defined in any cell visible in this notebook;
# it has to be set (e.g. in a configuration cell) before this cell can run.
train_df = pd.read_parquet(os.path.join(AUDIO_DIR, "train.parquet.gzip"))
all_df = pd.read_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid.parquet.gzip"))

# Try to load every sample listed in the metadata and remember the path and
# label of each one that torchaudio cannot read.
error_files = []
error_labels = []
for i in tqdm(range(len(all_df))):
    # Column 0 is used as the file stem and column 9 as the sample label
    # (positional access kept from the original code).
    path = os.path.join(AUDIO_DIR, all_df.iloc[i, 0]) + ".wav"
    label = all_df.iloc[i, 9]
    try:
        # The decoded audio is not needed; loading is only a readability probe.
        torchaudio.load(path)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit can still stop this long-running scan.
        error_files.append(path)
        error_labels.append(label)

2. Count how many bad wav files there are

In [None]:
print('Number of bad wav files in all:', len(error_files))

# Count the unreadable files per label.
healthy = error_labels.count('healthy')
covid = error_labels.count('COVID-19')
symptomatic = error_labels.count('symptomatic')
print('Number of bad healthy samples:', healthy)
print('Number of bad covid samples:', covid)
print('Number of bad symptomatic samples:', symptomatic)

# Write one file identifier per line to error_files.csv.
# The identifier is the path without its directory and without the ".wav"
# extension. It is derived with os.path instead of a fixed-width slice
# (previously item[14:-4]) so that it does not depend on the character length
# of the audio directory prefix.
# NOTE(review): assumes the file stems contain no path separators — confirm.
with open('error_files.csv', 'w') as f:
    for item in error_files:
        stem = os.path.splitext(os.path.basename(item))[0]
        f.write(stem)
        f.write('\n')

3. Try re-converting all the bad wav files from the original recordings with ffmpeg. Result: every bad wav file corresponds to a file that exists in the Coughvid database.

In [None]:
coughvid = '../../coughvid_20211012/'
current_dir = os.getcwd()

# Converted files go to <cwd>/temp_wav. os.path.join is used because
# os.getcwd() has no trailing separator, so plain concatenation with
# './temp_wav/' would yield "<cwd>./temp_wav/..." (a different directory).
temp_wav_dir = os.path.join(current_dir, 'temp_wav')
os.makedirs(temp_wav_dir, exist_ok=True)

# Read the identifiers of the bad files (one per line, no header row).
error_files = pd.read_csv('error_files.csv', header=None)
# error_files.csv is written with the directory prefix and ".wav" suffix
# already removed, so the values are used as-is; slicing them again would
# truncate the names.
error_uuids = error_files[0]

# Number of bad files for which no source recording (.webm or .ogg) was found.
count = 0
for file in tqdm(error_uuids):
    # Equivalent of running: ffmpeg -i "file.webm" "file.wav"
    for ext in ('.webm', '.ogg'):
        source = coughvid + file + ext
        if os.path.isfile(source):
            target = os.path.join(temp_wav_dir, file + '.wav')
            status = subprocess.call(["ffmpeg", "-i", source, target])
            if status != 0:
                # Surface failed conversions instead of ignoring the exit code.
                print("Error: ffmpeg exited with status {0} for {1}".format(status, source))
            break
    else:
        # Neither a .webm nor a .ogg source exists for this identifier.
        print("Error: No file name {0}".format(file))
        count += 1
print(count)

4. Create the new metadata files without the bad wav files.

In [None]:
# Reload the full metadata table.
# NOTE(review): AUDIO_DIR is not defined in any cell visible in this notebook;
# it has to be set before this cell can run.
all_df = pd.read_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid.parquet.gzip"))

# Read the identifiers of the bad files (one per line, no header row).
error_files = pd.read_csv('error_files.csv', header=None)

# Remove every row whose uuid is listed in error_files.csv. A single
# vectorised filter replaces the previous one-drop-per-file loop, which
# rescanned the whole frame for every bad file.
# Note: this cell only filters and saves the metadata; it does not create a
# new train/test split.
all_df = all_df[~all_df['uuid'].isin(error_files[0])]

# Save the filtered metadata as parquet and as csv.
all_df.to_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_edited.parquet.gzip"))
all_df.to_csv(os.path.join(AUDIO_DIR, "metadata_compiled_valid_edited.csv"))