In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
AUDIO_DIR = "../valid_data/"
ANNOTATIONS_FILE = os.path.join(AUDIO_DIR, "metadata_compiled_valid.parquet.gzip")

In [4]:
AUDIO_DIR = "../valid_data/"
ANNOTATIONS_FILE = os.path.join(AUDIO_DIR, "metadata_compiled_valid_edited.parquet.gzip")

In [5]:
df = pd.read_parquet(ANNOTATIONS_FILE)

# Creating a more even distribution of healthy v. covid samples

Creating a csv file with only 700 healthy and 700 covid samples

In [11]:
#for each entry in df, find the status of the file
print(df)
healthy = 0
covid = 0
for index, row in df.iterrows():
    if os.path.isfile(os.path.join(AUDIO_DIR, row['uuid'] + '.wav')):
        if row['status'] == 'healthy':
            if row['cough_detected'] > 0.98:
                if healthy < 700:
                    healthy += 1
                else:
                    #remove the row from the dataframe
                    df.drop(index, inplace=True)
            else:
                #remove the row from the dataframe
                df.drop(index, inplace=True)
        elif row['status'] == 'symptomatic':
            df.drop(index, inplace=True)
        else:
            covid += 1
#save the dataframe
df.to_csv(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.csv"))
df.to_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.parquet.gzip"))
print('healthy: ', healthy)
print('covid: ', covid)

                                       uuid                          datetime  \
2      001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f  2020-04-13T22:23:06.997578+00:00   
4      00291cce-36a0-4a29-9e2d-c1d96ca17242  2020-04-13T15:10:58.405156+00:00   
6      002d28bc-7806-4dfb-9c9b-afa8cb623cac  2020-04-16T22:10:24.107938+00:00   
11     0044cb7b-448c-44e5-8302-ad8bd106fe3e  2020-05-13T18:38:39.956383+00:00   
12     004c24d8-e8cd-4755-86f6-5a1d8c7920c7  2020-04-13T18:23:26.964464+00:00   
...                                     ...                               ...   
13409  fd849b72-f4bf-4852-9bc2-fd9becc9571e  2020-04-22T07:06:57.562437+00:00   
13414  fd8d6e46-75de-42b3-a7ef-30d5294b915d  2020-05-05T17:57:01.830977+00:00   
13420  fdbabb79-c296-4b93-9e02-06b290916fe7  2020-04-10T14:52:39.570226+00:00   
13505  ff8363d2-016d-4738-9499-4c62480886fb  2020-07-12T08:22:56.853133+00:00   
13530  ffe0658f-bade-4654-ad79-40a468aabb03  2020-04-14T01:58:32.200245+00:00   

       cough_detected  lati

Creating the train, test, val dataset

In [13]:
df_bal = pd.read_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.parquet.gzip"))
print(df_bal)

train_val_b, test_b = train_test_split(df_bal, test_size=0.1, random_state=0, stratify=df_bal[['status']])
train_b, val_b = train_test_split(train_val_b, test_size=0.1, random_state=0, stratify=train_val_b[['status']])

train_b = train_b.reset_index(drop=True)
val_b = val_b.reset_index(drop=True)
test_b = test_b.reset_index(drop=True)

train_b.to_parquet("train_balanced_700.parquet.gzip")
val_b.to_parquet("val_balanced_700.parquet.gzip")
test_b.to_parquet("test_balanced_700.parquet.gzip")

print(train_b)
print(val_b)

# train_b.to_csv("train_balanced.csv")
# val_b.to_csv("val_balanced.csv")
# test_b.to_csv("test_balanced.csv")

                                       uuid                          datetime  \
2      001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f  2020-04-13T22:23:06.997578+00:00   
4      00291cce-36a0-4a29-9e2d-c1d96ca17242  2020-04-13T15:10:58.405156+00:00   
6      002d28bc-7806-4dfb-9c9b-afa8cb623cac  2020-04-16T22:10:24.107938+00:00   
11     0044cb7b-448c-44e5-8302-ad8bd106fe3e  2020-05-13T18:38:39.956383+00:00   
12     004c24d8-e8cd-4755-86f6-5a1d8c7920c7  2020-04-13T18:23:26.964464+00:00   
...                                     ...                               ...   
13409  fd849b72-f4bf-4852-9bc2-fd9becc9571e  2020-04-22T07:06:57.562437+00:00   
13414  fd8d6e46-75de-42b3-a7ef-30d5294b915d  2020-05-05T17:57:01.830977+00:00   
13420  fdbabb79-c296-4b93-9e02-06b290916fe7  2020-04-10T14:52:39.570226+00:00   
13505  ff8363d2-016d-4738-9499-4c62480886fb  2020-07-12T08:22:56.853133+00:00   
13530  ffe0658f-bade-4654-ad79-40a468aabb03  2020-04-14T01:58:32.200245+00:00   

       cough_detected  lati

# Creating segmented audio dataset

In [16]:
import sys
import torch
import torchaudio
sys.path.insert(0, '../src')
from segmentation import segment_cough

NEW_DIR = "../balanced_segmented_data/"

if not os.path.exists(NEW_DIR):
   os.makedirs(NEW_DIR)

df = pd.read_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.parquet.gzip"))

#create a copy of df
df_seg = df.copy()

#for each entry in df, load and segment the audio file
for index, row in df.iterrows():
    if os.path.isfile(os.path.join(AUDIO_DIR, row['uuid'] + '.wav')):
        path = os.path.join(AUDIO_DIR, row['uuid'] + ".wav")
        signal, sr = torchaudio.load(path)
        segments, segmented_mask = segment_cough(signal.numpy()[0], sr)
        if len(segments) > 0:
            signal = torch.tensor(segments[0])
            signal = signal.unsqueeze(0)
            torchaudio.save(os.path.join(NEW_DIR, row['uuid'])+"_segmented.wav", signal, sr)
        else:
            signal = signal
            # write the unsegmented files to a file called unsegmented
            with open(os.path.join(NEW_DIR, "unsegmented.csv"), "a") as f:
                f.write(row['uuid'] + "_unsegmented.wav \n")
            # remove the row from the dataframe
            df_seg.drop(index, inplace=True)
            torchaudio.save(os.path.join(NEW_DIR, row['uuid'])+"_unsegmented.wav", signal, sr)        

df_seg.to_parquet(os.path.join(NEW_DIR, "metadata_compiled_valid_balanced_700_segmented.parquet.gzip"))
        

In [19]:
df_seg = pd.read_parquet(os.path.join(NEW_DIR, "metadata_compiled_valid_balanced_700_segmented.parquet.gzip"))
#print status = healthy number in the dataframe
print('healthy: ', df_seg[df_seg['status'] == 'healthy'].count())
#print status = covid number in the dataframe
print('covid: ', df_seg[df_seg['status'] == 'COVID-19'].count())

train_val_s, test_s = train_test_split(df_seg, test_size=0.1, random_state=0, stratify=df_seg[['status']])
train_s, val_s = train_test_split(train_val_s, test_size=0.1, random_state=0, stratify=train_val_s[['status']])

train_s = train_s.reset_index(drop=True)
val_s = val_s.reset_index(drop=True)
test_s = test_s.reset_index(drop=True)

train_s.to_parquet("train_balanced_700_segmented.parquet.gzip")
val_s.to_parquet("val_balanced_700_segmented.parquet.gzip")
test_s.to_parquet("test_balanced_700_segmented.parquet.gzip")



healthy:  uuid                       669
datetime                   669
cough_detected             669
latitude                   404
longitude                  404
age                        647
gender                     669
respiratory_condition      669
fever_muscle_pain          669
status                     669
status_SSL                 466
quality_1                    9
cough_type_1                 9
dyspnea_1                    9
wheezing_1                   9
stridor_1                    9
choking_1                    9
congestion_1                 9
nothing_1                    9
diagnosis_1                  9
severity_1                   9
quality_2                   15
cough_type_2                14
dyspnea_2                   15
wheezing_2                  15
stridor_2                   15
choking_2                   15
congestion_2                15
nothing_2                   15
diagnosis_2                 15
severity_2                  15
quality_3                   1

# Create Oversampled 3,500 Covid / 3,500 Healthy Dataset

In [None]:
#for each entry in df, find the status of the file
print(df)
healthy = 0
covid = 0
for index, row in df.iterrows():
    if os.path.isfile(os.path.join(AUDIO_DIR, row['uuid'] + '.wav')):
        if row['status'] == 'healthy':
            if row['cough_detected'] > 0.98:
                if healthy < 3500:
                    healthy += 1
                else:
                    #remove the row from the dataframe
                    df.drop(index, inplace=True)
            else:
                #remove the row from the dataframe
                df.drop(index, inplace=True)
        elif row['status'] == 'symptomatic':
            df.drop(index, inplace=True)
        else:
            covid += 1
#save the dataframe
df.to_csv(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.csv"))
df.to_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced_700.parquet.gzip"))
print('healthy: ', healthy)
print('covid: ', covid)

# Create the original dataset

In [10]:
train_val, test = train_test_split(df, test_size=0.1, random_state=0, stratify=df[['status']])
train, val = train_test_split(train_val, test_size=0.1, random_state=0, stratify=train_val[['status']])

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

In [11]:
train.to_parquet("train_edited.parquet.gzip")
val.to_parquet("val_edited.parquet.gzip")
test.to_parquet("test_edited.parquet.gzip")