In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
AUDIO_DIR = "../valid_data/"
ANNOTATIONS_FILE = os.path.join(AUDIO_DIR, "metadata_compiled_valid.parquet.gzip")

In [4]:
AUDIO_DIR = "../valid_data/"
ANNOTATIONS_FILE = os.path.join(AUDIO_DIR, "metadata_compiled_valid_edited.parquet.gzip")

In [5]:
df = pd.read_parquet(ANNOTATIONS_FILE)

# Creating a more even distribution of healthy v. covid samples

Creating a csv file with only 1000 healthy and 700 covid samples

In [6]:
#for each entry in df, find the status of the file
print(df)
healthy = 0
covid = 0
for index, row in df.iterrows():
    if os.path.isfile(os.path.join(AUDIO_DIR, row['uuid'] + '.wav')):
        if row['status'] == 'healthy':
            if row['cough_detected'] > 0.98:
                if healthy < 1000:
                    healthy += 1
                else:
                    #remove the row from the dataframe
                    df.drop(index, inplace=True)
            else:
                #remove the row from the dataframe
                df.drop(index, inplace=True)
        elif row['status'] == 'symptomatic':
            df.drop(index, inplace=True)
        else:
            covid += 1
#save the dataframe
df.to_csv(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced.csv"))
df.to_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced.parquet.gzip"))
print('healthy: ', healthy)
print('covid: ', covid)

                                       uuid                          datetime  \
0      00039425-7f3a-42aa-ac13-834aaa2b6b92  2020-04-13T21:30:59.801831+00:00   
1      0009eb28-d8be-4dc1-92bb-907e53bc5c7a  2020-04-12T04:02:18.159383+00:00   
2      001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f  2020-04-13T22:23:06.997578+00:00   
3      0028b68c-aca4-4f4f-bb1d-cb4ed5bbd952  2020-05-24T12:12:46.394647+00:00   
4      00291cce-36a0-4a29-9e2d-c1d96ca17242  2020-04-13T15:10:58.405156+00:00   
...                                     ...                               ...   
13529  ffdc1fbe-ae22-4488-ad01-307edd2912ed  2021-03-09T08:56:33.522312+00:00   
13530  ffe0658f-bade-4654-ad79-40a468aabb03  2020-04-14T01:58:32.200245+00:00   
13531  ffe13fcf-c5c2-4a6a-a9fc-e010f4f033c1  2020-04-13T21:08:50.708320+00:00   
13532  ffedc843-bfc2-4ad6-a749-2bc86bdac84a  2020-06-05T03:41:37.481463+00:00   
13533  ffeea120-92a4-40f9-b692-c3865c7a983f  2020-05-02T10:18:27.348859+00:00   

       cough_detected  lati

Creating the train, test, val dataset

In [7]:
df_bal = pd.read_parquet(os.path.join(AUDIO_DIR, "metadata_compiled_valid_balanced.parquet.gzip"))
print(df_bal)

train_val_b, test_b = train_test_split(df_bal, test_size=0.1, random_state=0, stratify=df_bal[['status']])
train_b, val_b = train_test_split(train_val_b, test_size=0.1, random_state=0, stratify=train_val_b[['status']])

train_b = train_b.reset_index(drop=True)
val_b = val_b.reset_index(drop=True)
test_b = test_b.reset_index(drop=True)

train_b.to_parquet("train_balanced.parquet.gzip")
val_b.to_parquet("val_balanced.parquet.gzip")
test_b.to_parquet("test_balanced.parquet.gzip")

print(train_b)
print(val_b)

train_b.to_csv("train_balanced.csv")
val_b.to_csv("val_balanced.csv")
test_b.to_csv("test_balanced.csv")

                                       uuid                          datetime  \
2      001328dc-ea5d-4847-9ccf-c5aa2a3f2d0f  2020-04-13T22:23:06.997578+00:00   
4      00291cce-36a0-4a29-9e2d-c1d96ca17242  2020-04-13T15:10:58.405156+00:00   
6      002d28bc-7806-4dfb-9c9b-afa8cb623cac  2020-04-16T22:10:24.107938+00:00   
11     0044cb7b-448c-44e5-8302-ad8bd106fe3e  2020-05-13T18:38:39.956383+00:00   
12     004c24d8-e8cd-4755-86f6-5a1d8c7920c7  2020-04-13T18:23:26.964464+00:00   
...                                     ...                               ...   
13409  fd849b72-f4bf-4852-9bc2-fd9becc9571e  2020-04-22T07:06:57.562437+00:00   
13414  fd8d6e46-75de-42b3-a7ef-30d5294b915d  2020-05-05T17:57:01.830977+00:00   
13420  fdbabb79-c296-4b93-9e02-06b290916fe7  2020-04-10T14:52:39.570226+00:00   
13505  ff8363d2-016d-4738-9499-4c62480886fb  2020-07-12T08:22:56.853133+00:00   
13530  ffe0658f-bade-4654-ad79-40a468aabb03  2020-04-14T01:58:32.200245+00:00   

       cough_detected  lati

# Creating segmented audio dataset

In [None]:
import sys
import torch
import torchaudio
sys.path.insert(0, '../src')
from segmentation import segment_cough

NEW_DIR = "../balanced_data/"

df = pd.read_parquet("metadata_compiled_valid_balanced.parquet.gzip")

#for each entry in df, load and segment the audio file
for index, row in df.iterrows():
    if os.path.isfile(os.path.join(AUDIO_DIR, row['uuid'] + '.wav')):
        path = os.path.join(AUDIO_DIR, df.iloc[index, 0])+".wav"
        signal, sr = torchaudio.load(path)
        segments, segmented_mask = segment_cough(signal, sr)
        if len(segments) > 0:
            signal = torch.tensor(segments[0])
            signal = signal.unsqueeze(0)
        else:
            signal = signal
        print('signal in self.do_segment: ', signal)
        torchaudio.save(os.path.join(NEW_DIR, df.iloc[index, 0])+"_segmented.wav", signal, sr)
        

In [10]:
train_val, test = train_test_split(df, test_size=0.1, random_state=0, stratify=df[['status']])
train, val = train_test_split(train_val, test_size=0.1, random_state=0, stratify=train_val[['status']])

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

In [11]:
train.to_parquet("train_edited.parquet.gzip")
val.to_parquet("val_edited.parquet.gzip")
test.to_parquet("test_edited.parquet.gzip")