In [2]:
import os
import random
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

In [4]:
# Set seed for reproducibility
seed_value= 32 
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# set variables
ROOT = 'F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012/'
class_names = ['healthy','COVID-19','symptomatic']
audio_length = 32306

# load coughvid meta
data_raw = pd.read_csv(ROOT+'metadata_compiled.csv', header=0, index_col=0)
data_raw.head(3)

Unnamed: 0,uuid,datetime,cough_detected,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,00014dcc-0f06-4c27-8c7b-737b18a2cf4c,2020-11-25T18:58:50.488301+00:00,0.0155,48.9,2.4,,,,,,...,,,,,,,,,,
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,0.9609,31.3,34.8,15.0,male,False,False,healthy,...,,,,,,,,,,
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,2020-10-18T15:38:38.205870+00:00,0.1643,,,46.0,female,False,False,healthy,...,,,,,,,,,,


In [5]:
# data_raw.groupby("status")["uuid"].count()
data_raw.status.value_counts()

healthy        15476
symptomatic     3873
COVID-19        1315
Name: status, dtype: int64

# 筛选优质的数据
首先取出专家标注的部分，这些才是真正用来做监督学习的。

然后
1. 去除没有status词条的行
2. 去除cough_detected小于0.8的行
3. 去除quality不是good的行
4. 仅保留专家标注的列，不再需要用户自己上报的列。

In [6]:
def split_by_physicians(df):
    column_names = ['uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude', 
                    'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status', 
                    'quality', 'cough_type', 'dyspnea', 'wheezing', 'stridor', 'choking', 
                    'congestion', 'nothing', 'diagnosis', 'severity' ]
    physician_01 = df.iloc[:, 0:21]
    physician_01 = physician_01[physician_01.quality_1.notna()].reset_index(drop=True)
    physician_01.columns = column_names

    physician_02 = pd.concat([df.iloc[:, 0:11], df.iloc[:, 21:31]], axis=1)
    physician_02 = physician_02[physician_02.quality_2.notna()].reset_index(drop=True)
    physician_02.columns = column_names

    physician_03 = pd.concat([df.iloc[:, 0:11], df.iloc[:, 31:41]], axis=1)
    physician_03 = physician_03[physician_03.quality_3.notna()].reset_index(drop=True)
    physician_03.columns = column_names

    physician_04 = pd.concat([df.iloc[:, 0:11], df.iloc[:, 41:51]], axis=1)
    physician_04 = physician_04[physician_04.quality_4.notna()].reset_index(drop=True)
    physician_04.columns = column_names
    return physician_01, physician_02, physician_03, physician_04
    
def process_csv(df):
    #split by physicians
    physician_01, physician_02, physician_03, physician_04 = split_by_physicians(df)
    # combine into one dataframe
    df = pd.concat([physician_01,physician_02,physician_03,physician_04]).reset_index(drop=True)  
    # drop null status
    df = df[df.status.notna()]
    # drop cough_detected < 0.8
    df = df[df.cough_detected >= 0.8 ]
    # select good and ok quality
    df = df[df.quality == 'good']
    # shuffle
    df = df.sample(frac=1).reset_index(drop=True) 
    df = df[['uuid', 'status','cough_type', 'dyspnea', 'wheezing', 'stridor', 'choking', 'congestion', 'severity']]
    return df

processed_df = process_csv(data_raw)
processed_df.head(3)

Unnamed: 0,uuid,status,cough_type,dyspnea,wheezing,stridor,choking,congestion,severity
0,de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
1,0733f882-d7fd-4dc5-a1b0-8aeec64fc112,healthy,dry,False,False,False,False,False,pseudocough
2,eaae5d4e-ca16-468f-a1d9-3567a396b6da,healthy,dry,False,False,False,False,False,mild


In [7]:
print(processed_df.shape)
processed_df.groupby("status")["uuid"].count()

(720, 9)


status
COVID-19    152
healthy     568
Name: uuid, dtype: int64

In [8]:
def segment_cough(x,fs, cough_padding=0.2,min_cough_len=0.2, th_l_multiplier = 0.1, th_h_multiplier = 2):
    #Preprocess the data by segmenting each file into individual coughs using a hysteresis comparator on the signal power                
    cough_mask = np.array([False]*len(x))
    
    #Define hysteresis thresholds
    rms = np.sqrt(np.mean(np.square(x)))
    seg_th_l = th_l_multiplier * rms
    seg_th_h =  th_h_multiplier*rms

    #Segment coughs
    coughSegments = []
    padding = round(fs*cough_padding)
    min_cough_samples = round(fs*min_cough_len)
    cough_start = 0
    cough_end = 0
    cough_in_progress = False
    tolerance = round(0.01*fs)
    below_th_counter = 0
    
    for i, sample in enumerate(x**2):
        if cough_in_progress:
            if sample<seg_th_l:
                below_th_counter += 1
                if below_th_counter > tolerance:
                    cough_end = i+padding if (i+padding < len(x)) else len(x)-1
                    cough_in_progress = False
                    if (cough_end+1-cough_start-2*padding>min_cough_samples):
                        coughSegments.append(x[cough_start:cough_end+1])
                        cough_mask[cough_start:cough_end+1] = True
            elif i == (len(x)-1):
                cough_end=i
                cough_in_progress = False
                if (cough_end+1-cough_start-2*padding>min_cough_samples):
                    coughSegments.append(x[cough_start:cough_end+1])
            else:
                below_th_counter = 0
        else:
            if sample>seg_th_h:
                cough_start = i-padding if (i-padding >=0) else 0
                cough_in_progress = True
    
    return coughSegments, cough_mask

In [9]:
# def insert_into_df(df, row, idx):
#     df2 = df.iloc[idx+1:, :]
#     df = df.iloc[:idx, :]
#     df.append(row)
#     df = pd.concat(df, df2)
#     return df

# https://blog.csdn.net/sunmingyang1987/article/details/105486710
def insert_addidx(df, row, idx):
    df = df.reindex(index=df.index.insert(idx, str(idx)))
    df.loc[str(idx)] = row
    return df

# 创建meta文件csv

# 切分音频并存储

In [10]:
audio_length = 32306
all_data, all_fname = [], []
all_sr = []
new_df = processed_df
for idx in tqdm(range(len(processed_df))):
    fname = processed_df.uuid.iloc[idx]
    for ext in ["webm", "wav", "ogg"]:
        path = ROOT+fname+'.'+ext
        if os.path.exists(path):
            break

    # load sound sample
    audio, sample_rate = librosa.load(path, mono=True)

    # Segment each audio into individual coughs using a hysteresis comparator on the signal power
    cough_segments, cough_mask = segment_cough(audio, sample_rate, min_cough_len=0.1, cough_padding=0.1, th_l_multiplier = 0.1, th_h_multiplier = 2)

    # For each segment, resize to the same length(11025)
    if len(cough_segments) > 0 :
        i = 0
        for audio in cough_segments:
            i+=1
            if len(audio) > 8000:
                if len(audio) < audio_length:
                    audio_pad = librosa.util.pad_center(data=audio, size=audio_length)
                else:
                    # audio_pad = audio[:audio_length] 
                    audio_pad = audio

            # feature = extract_features(audio_pad, sample_rate)
            #print(len(feature))
            # all_data.append(feature)
            all_data.append(audio_pad)
            all_fname.append(fname)
            all_sr.append(sample_rate)
            new_df = pd.concat([new_df, processed_df.iloc[[idx], :]], axis=0)

# uuid, X = np.array(all_fname), np.array(all_data)
# # This may take some time, so go watch some Korean dramas first.
# # uuid, X = load_features(processed_df)
# print(uuid.shape)
# print(X.shape)

new_df = new_df.iloc[len(processed_df):, :]

  audio, sample_rate = librosa.load(path, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sample_rate = librosa.load(path, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sample_rate = librosa.load(path, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sample_rate = librosa.load(path, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sample_rate = librosa.load(path, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offse

In [12]:
print(len(processed_df), processed_df.shape)
print(len(new_df), new_df.shape)

720 (720, 9)
2850 (2850, 9)


In [22]:
new_df["uuid"] = processed_df["uuid"]
new_df = new_df.reset_index(drop=True)

In [23]:
ind = 0
for index, row in tqdm(new_df.iterrows(), total=len(new_df)):
    # print(new_df.loc[ind,:]["uuid"])
    new_df.iloc[index, :]["uuid"] = "sound"+("000"+str(index))[-4:]+'_'+new_df.loc[index,:]["uuid"]
    ind += 1
new_df

100%|███████████████████████████████████████████████████████████████████████████| 2850/2850 [00:00<00:00, 14058.03it/s]


Unnamed: 0,uuid,status,cough_type,dyspnea,wheezing,stridor,choking,congestion,severity
0,sound0000_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
1,sound0001_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
2,sound0002_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
3,sound0003_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
4,sound0004_0733f882-d7fd-4dc5-a1b0-8aeec64fc112,healthy,dry,False,False,False,False,False,pseudocough
...,...,...,...,...,...,...,...,...,...
2845,sound2845_a084b953-189d-422d-a40a-b545b4f01618,healthy,dry,False,False,False,False,False,mild
2846,sound2846_a084b953-189d-422d-a40a-b545b4f01618,healthy,dry,False,False,False,False,False,mild
2847,sound2847_a084b953-189d-422d-a40a-b545b4f01618,healthy,dry,False,False,False,False,False,mild
2848,sound2848_a084b953-189d-422d-a40a-b545b4f01618,healthy,dry,False,False,False,False,False,mild


In [24]:
new_df.to_csv("F:/DATAS/COUGHVID-public_dataset_v3/waveinfo_fewtoml_split.csv", sep=',')

In [30]:
import soundfile
save_dir = "F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012_fine/"
for i in tqdm(range(len(all_data)), desc="save sound"):
    idx = "000"+str(i)
    soundfile.write(save_dir+f"sound{idx[-4:]}_{all_fname[i]}.wav", all_data[i], 22050)

save sound: 100%|█████████████████████████████████████████████████████████████████| 2850/2850 [00:05<00:00, 525.51it/s]


In [24]:
maxmi, mini, mean = 0, 99999, 0
for item in all_data:
    maxmi = max(maxmi, len(item))
    mini = min(mini, len(item))
    mean += len(item)
print(maxmi, mini, mean/len(all_data))

32306 32306 32306.0
