In [1]:
import os
import random
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
ROOT = "F:/DATAS/COUGHVID-public_dataset_v3/"

# 目录（按本 notebook 中出现的顺序排列；实际执行顺序为 Step1 → Step2 → Step3）：
## Step3 抽取特征存为pickle
## Step1 按条件筛选好数据
## Step2 按静音切分数据

In [13]:
# Sanity check: load one segmented clip and print its sample count, sample rate and duration.
# NOTE(review): the path is built as ROOT + "coughvid_20211012_fine/", so ROOT must be the
# dataset root from the import cell, not the sub-folder a later cell rebinds it to.
y, sr = librosa.load(ROOT+"coughvid_20211012_fine/sound2830_6c9a5f33-9e90-4c08-8782-20b0d86abc46.wav")
print(y.shape, sr)
print(y.shape[0]/sr)  # duration in seconds (samples / sample rate)

(32306,) 22050
1.465124716553288


In [3]:
# Set seed for reproducibility (Python hash seed, `random`, and NumPy's global RNG)
seed_value= 32  # single seed shared by all three generators below
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# set variables
# NOTE(review): this rebinds ROOT from the dataset root (import cell) to the raw-recording
# sub-folder. Cells that build paths such as ROOT+"coughvid_20211012_fine/" were written
# against the dataset-root value, so which value ROOT holds depends on execution order.
ROOT = 'F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012/'
class_names = ['healthy','COVID-19','symptomatic']
audio_length = 32306  # target clip length in samples (about 1.465 s at 22050 Hz)

# load coughvid meta (first CSV column is used as the index)
data_raw = pd.read_csv(ROOT+'metadata_compiled.csv', header=0, index_col=0)
data_raw.head(3)

Unnamed: 0,uuid,datetime,cough_detected,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,...,quality_4,cough_type_4,dyspnea_4,wheezing_4,stridor_4,choking_4,congestion_4,nothing_4,diagnosis_4,severity_4
0,00014dcc-0f06-4c27-8c7b-737b18a2cf4c,2020-11-25T18:58:50.488301+00:00,0.0155,48.9,2.4,,,,,,...,,,,,,,,,,
1,00039425-7f3a-42aa-ac13-834aaa2b6b92,2020-04-13T21:30:59.801831+00:00,0.9609,31.3,34.8,15.0,male,False,False,healthy,...,,,,,,,,,,
2,0007c6f1-5441-40e6-9aaf-a761d8f2da3b,2020-10-18T15:38:38.205870+00:00,0.1643,,,46.0,female,False,False,healthy,...,,,,,,,,,,


In [4]:
# Count rows per `status` value in the raw metadata (rows with a missing status are not counted).
print(data_raw.groupby("status")["uuid"].count())

status
COVID-19        1315
healthy        15476
symptomatic     3873
Name: uuid, dtype: int64


# Step3 读取切分后的数据，抽取特征，创建pkl

In [6]:
# Load the per-segment metadata table (one row per segmented cough clip).
# NOTE(review): this rebinds `data_raw`, which earlier held the raw metadata_compiled.csv frame;
# Step 1 must be re-fed the raw frame if it is run after this cell.
# NOTE(review): Step 2 writes this CSV under "F:/DATAS/COUGHVID-public_dataset_v3/", so ROOT
# has to point at that folder here — confirm against the execution order actually used.
data_raw = pd.read_csv(ROOT+'waveinfo_fewtoml_split.csv', header=0, index_col=0)
data_raw

Unnamed: 0,uuid,status,cough_type,dyspnea,wheezing,stridor,choking,congestion,severity
0,sound0000_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
1,sound0001_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
2,sound0002_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
3,sound0003_de543d13-541c-4ad7-bb3c-c5c302de3aaf,healthy,wet,False,False,False,False,False,mild
7,sound0007_eaae5d4e-ca16-468f-a1d9-3567a396b6da,healthy,dry,False,False,False,False,False,mild
...,...,...,...,...,...,...,...,...,...
2811,sound2811_5aa4dae0-fe23-4eb2-9d62-e6016a1c9f8a,COVID-19,unknown,False,False,False,False,False,unknown
819,sound0819_637d6498-1d52-4860-9976-5a7a94f2a0c3,COVID-19,dry,False,False,False,False,True,unknown
820,sound0820_637d6498-1d52-4860-9976-5a7a94f2a0c3,COVID-19,dry,False,False,False,False,True,unknown
821,sound0821_637d6498-1d52-4860-9976-5a7a94f2a0c3,COVID-19,dry,False,False,False,False,True,unknown


In [3]:
# Class balance of the segmented clips: number of rows per `status` value.
print("labels:")
print(data_raw.groupby("status")["uuid"].count())

labels:
status
COVID-19     774
healthy     2076
Name: uuid, dtype: int64


In [4]:
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
N_MEL = 128             # number of Mel bands to generate

def compute_melspectrogram_with_fixed_length(audio, sampling_rate, num_of_samples=86, fix_length=False):
    """Compute a log-mel spectrogram (dB relative to the clip maximum) for one audio clip.

    Parameters
    ----------
    audio : 1-D array of audio samples, passed to ``librosa.feature.melspectrogram`` as ``y``.
    sampling_rate : sample rate of ``audio`` in Hz.
    num_of_samples : target number of spectrogram frames (columns). Only used when
        ``fix_length`` is true.
    fix_length : when true, pad (with -80 dB) or truncate the time axis to exactly
        ``num_of_samples`` frames. Defaults to False, which keeps the previous behaviour of
        returning the spectrogram at its natural length.

    Returns
    -------
    The spectrogram array with ``N_MEL`` rows, or ``None`` if any step raised (the error is
    printed rather than propagated, so callers must be prepared to receive ``None``).
    """
    try:
        # compute a mel-scaled spectrogram
        melspectrogram = librosa.feature.melspectrogram(y=audio,
                                                        sr=sampling_rate,
                                                        hop_length=HOP_LENGTH,
                                                        win_length=WINDOW_LENGTH,
                                                        n_mels=N_MEL)

        # convert a power spectrogram to decibel units (log-mel spectrogram)
        melspectrogram_db = librosa.power_to_db(melspectrogram, ref=np.max)

        # optionally pad or trim the time axis; -80 dB is the floor power_to_db produces
        # with its default top_db, so padded frames read as silence
        if fix_length and melspectrogram_db.shape[1] != num_of_samples:
            melspectrogram_db = librosa.util.fix_length(melspectrogram_db,
                                                        size=num_of_samples,
                                                        axis=1,
                                                        constant_values=-80.0)
    except Exception as e:
        print("\nError encountered while parsing files\n>>", e)
        return None

    return melspectrogram_db

In [7]:
# Load every segmented clip listed in `data_raw` and encode its label/attributes as integers.
# The lists below are filled in lockstep: position i in each one describes the same clip.
audio_length = 32306
all_data = []      # waveform of each clip
all_fname = []     # full path of each clip
all_sr = []        # sample rate returned by librosa for each clip
all_labels = []    # 0 = healthy, 1 = COVID-19
term1 = []         # cough_type
term2 = []         # dyspnea
term3 = []         # wheezing
term4 = []         # stridor
term5 = []         # choking
term6 = []         # congestion
term7 = []         # severity
m2l = {"healthy":0, "COVID-19":1}
# NOTE(review): True is encoded as 0 and False as 1, the reverse of the usual convention.
# Left unchanged because the saved pickle was produced with this encoding.
bool2int = {True: 0, False: 1}
type2int={"dry": 0, "wet":1, "unknown": 2}
seve2int = {"mild": 0, "pseudocough": 1, "severe": 2, "unknown": 3}
maxi, mini = 0, 999999   # longest / shortest clip duration seen so far, in seconds
for row in tqdm(data_raw.itertuples(), total=len(data_raw)):
    # NOTE(review): expects ROOT to be the dataset root that contains coughvid_20211012_fine/
    fname = ROOT+"coughvid_20211012_fine/" + row.uuid + ".wav"
    # load sound sample; unreadable files are reported and skipped
    try:
        audio, sample_rate = librosa.load(fname, mono=True)
    except Exception as e:
        print("Error file:", fname)
        continue
    duration = audio.shape[0]/sample_rate
    maxi = max(maxi, duration)
    mini = min(mini, duration)

    # A missing severity falls back to the "unknown" category. The previous fallback was the
    # integer 3, which is not a key of seve2int and would have raised KeyError.
    severity = getattr(row, "severity", "unknown")
    if pd.isna(severity):
        severity = "unknown"

    all_data.append(audio)
    all_fname.append(fname)
    all_labels.append(m2l[row.status])
    all_sr.append(sample_rate)
    term1.append(type2int[row.cough_type])
    term2.append(bool2int[row.dyspnea])
    term3.append(bool2int[row.wheezing])
    term4.append(bool2int[row.stridor])
    term5.append(bool2int[row.choking])
    term6.append(bool2int[row.congestion])
    term7.append(seve2int[severity])

100%|████████████████████████████████████████████████████████████████████████████| 2850/2850 [00:00<00:00, 6093.97it/s]


In [8]:
# The two lists are filled in lockstep, so their lengths should match.
print(len(all_data), len(all_labels))

2850 2850


In [9]:
# Build one feature row per clip:
#   [melspectrogram, label, cough_type, dyspnea, wheezing, stridor, choking, congestion, severity, fold]
# NOTE(review): compute_melspectrogram_with_fixed_length returns None on failure and that
# None is stored as-is; confirm downstream code tolerates it.
features_1 = []   # rows for clips whose label is 0
features_2 = []   # rows for every other clip, stored with label 1
for i in tqdm(range(len(all_data)),desc="calc.."):
    melspect = compute_melspectrogram_with_fixed_length(all_data[i], all_sr[i])
    is_negative = all_labels[i] == 0
    bucket = features_1 if is_negative else features_2
    # the first 100 rows of each class go to fold 0, later rows get a random fold in 1..9
    fold = random.randint(1, 9) if len(bucket) >= 100 else 0
    attributes = [term1[i], term2[i], term3[i], term4[i], term5[i], term6[i], term7[i]]
    bucket.append([melspect, 0 if is_negative else 1, *attributes, fold])

calc..: 100%|█████████████████████████████████████████████████████████████████████| 2850/2850 [00:08<00:00, 355.24it/s]


In [10]:
# All label-0 rows first, then all label-1 rows.
features = features_1 + features_2

In [11]:
# Wrap the feature rows in a DataFrame and persist it as a pickle.
# NOTE(review): the variable is named us8k_df although it holds the COUGHVID features.
# NOTE(review): the file is written under ROOT, while the next cell reads it from
# "F:/DATAS/COUGHVID-public_dataset_v3/"; ROOT must equal that folder for the two to agree.
us8k_df = pd.DataFrame(features, columns=["melspectrogram", "label", "cough_type", "dyspnea", "wheezing", "stridor", "choking", "congestion", "severity", "fold"])
us8k_df.to_pickle(ROOT+"coughvid_split_specattri.pkl")

In [22]:
# Spot-check the saved pickle: reload it and inspect the second row.
coughvid_df = pd.read_pickle("F:/DATAS/COUGHVID-public_dataset_v3/coughvid_split_specattri.pkl")
# keep columns 0, 1, 2, 8, 9 = melspectrogram, label, cough_type, severity, fold
coughvid_df = coughvid_df.iloc[:, [0, 1, 2, 8, 9]]
spectrogram, label, ty, seve, fold = coughvid_df.iloc[1]
print(spectrogram.shape, label, ty, seve, fold)

(128, 64) 0 1 0 0


In [18]:
# Reload the feature pickle and print how many clips fall into each encoded attribute value.
df = pd.read_pickle(ROOT+"coughvid_split_specattri.pkl")
print(df.head())
# Row positions of the two classes in the pickle (label-0 rows are stored first).
neg_list = list(range(2076))
pos_list = list(range(2076, 2850))
attribute_columns = ("cough_type", "dyspnea", "wheezing", "stridor",
                     "choking", "congestion", "severity")
for attribute in attribute_columns:
    print(df.groupby(attribute)["melspectrogram"].count())

                                      melspectrogram  label  cough_type  \
0  [[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...      0           1   
1  [[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...      0           1   
2  [[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...      0           1   
3  [[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...      0           1   
4  [[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8...      0           0   

   dyspnea  wheezing  stridor  choking  congestion  severity  fold  
0        1         1        1        1           1         0     0  
1        1         1        1        1           1         0     0  
2        1         1        1        1           1         0     0  
3        1         1        1        1           1         0     0  
4        1         1        1        1           1         0     0  
cough_type
0    2039
1     612
2     199
Name: melspectrogram, dtype: int64
dyspnea
0     109
1    2741
Name: melspectrogram, dtype: in

# Step 1 筛选优质的数据
首先取出专家标注的部分，这些才是真正用来做监督学习的。

然后
1. 去除没有status词条的行
2. 去除cough_detected小于0.8的行
3. 去除quality不是good的行
4. 仅保留专家标注的列，不再需要用户自己上报的列。

In [7]:
def split_by_physicians(df):
    """Split the wide metadata table into one frame per annotating physician.

    The first 11 columns are shared recording metadata; each of the four physicians then
    owns a consecutive run of 10 annotation columns. For physician k the shared columns
    and that physician's run are combined, rows where ``quality_k`` is missing are
    dropped, the index is reset and the columns are renamed to the suffix-free names
    below.

    Returns a 4-tuple of DataFrames, in physician order 1..4.
    """
    column_names = ['uuid', 'datetime', 'cough_detected', 'SNR', 'latitude', 'longitude',
                    'age', 'gender', 'respiratory_condition', 'fever_muscle_pain', 'status',
                    'quality', 'cough_type', 'dyspnea', 'wheezing', 'stridor', 'choking',
                    'congestion', 'nothing', 'diagnosis', 'severity' ]
    n_shared, n_per_physician = 11, 10
    shared_part = df.iloc[:, 0:n_shared]

    per_physician = []
    for number in range(1, 5):
        first = n_shared + n_per_physician * (number - 1)
        annotations = df.iloc[:, first:first + n_per_physician]
        combined = pd.concat([shared_part, annotations], axis=1)
        # keep only the recordings this physician actually rated
        was_rated = getattr(combined, f"quality_{number}").notna()
        combined = combined[was_rated].reset_index(drop=True)
        combined.columns = column_names
        per_physician.append(combined)
    return tuple(per_physician)
    
def process_csv(df, random_state=None):
    """Filter the raw metadata down to well-recorded, physician-annotated coughs.

    Steps (the frame shape is printed after each of the first four):
      1. stack the four per-physician tables into one long table,
      2. drop rows without a ``status``,
      3. drop rows with ``cough_detected`` below 0.8,
      4. keep only rows whose physician-rated ``quality`` is ``'good'``,
      5. shuffle, and keep only ``uuid``, ``status`` and the physician annotation columns.

    Parameters
    ----------
    df : wide metadata frame in the layout ``split_by_physicians`` expects.
    random_state : forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible. The default ``None`` keeps the previous behaviour.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index.
    """
    # split by physicians, then combine into one long dataframe
    # NOTE(review): a recording rated by several physicians appears once per rating here,
    # and no de-duplication on uuid follows — confirm this is intended.
    per_physician = split_by_physicians(df)
    df = pd.concat(list(per_physician)).reset_index(drop=True)
    print(df.shape)
    # drop null status
    df = df[df.status.notna()]
    print(df.shape)
    # drop cough_detected < 0.8
    df = df[df.cough_detected >= 0.8 ]
    print(df.shape)
    # keep only 'good' quality (recordings rated 'ok' are dropped too)
    df = df[df.quality == 'good']
    print(df.shape)
    # shuffle
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    df = df[['uuid', 'status','cough_type', 'dyspnea', 'wheezing', 'stridor', 'choking', 'congestion', 'severity']]
    return df

# Build the filtered, physician-annotated table from the raw metadata.
# NOTE(review): `data_raw` must be the frame loaded from metadata_compiled.csv; the Step 3
# section rebinds `data_raw` to the per-segment CSV, which does not have this layout.
processed_df = process_csv(data_raw)
processed_df.head(3)

(3280, 21)
(1168, 21)
(1168, 21)
(720, 21)


Unnamed: 0,uuid,status,cough_type,dyspnea,wheezing,stridor,choking,congestion,severity
0,d57d4c31-4f34-41cd-9fdf-84e4df3f2dcc,healthy,dry,False,False,False,False,True,mild
1,6f5e643c-437e-4c63-9092-39c7ea186334,healthy,wet,False,False,False,False,False,mild
2,2e687a52-6e9b-4343-9688-b96132f52f87,healthy,dry,False,False,False,False,False,pseudocough


In [6]:
# Size and class balance of the filtered table.
print(processed_df.shape)
processed_df.groupby("status")["uuid"].count()

(720, 9)


status
COVID-19    152
healthy     568
Name: uuid, dtype: int64

In [None]:
def segment_cough(x,fs, cough_padding=0.2,min_cough_len=0.2, th_l_multiplier = 0.1, th_h_multiplier = 2):
    """Split a recording into individual coughs with a hysteresis comparator.

    Each squared sample is compared against two thresholds derived from the RMS of the
    whole signal. A cough starts when a squared sample exceeds the high threshold and
    ends once more than ``tolerance`` (0.01 s worth of) samples since the last loud
    sample have fallen below the low threshold.

    Parameters
    ----------
    x : 1-D array of audio samples.
    fs : sample rate in Hz.
    cough_padding : seconds of signal added before the onset and after the end.
    min_cough_len : minimum cough length in seconds, measured without the padding.
    th_l_multiplier, th_h_multiplier : low / high threshold as a multiple of the RMS.

    Returns
    -------
    coughSegments : list of 1-D array slices of ``x``, one per accepted cough.
    cough_mask : boolean array of ``len(x)``, True wherever an accepted cough lies.

    Fixes compared with the previous version:
      * a cough that runs into the end of the signal is now marked in ``cough_mask`` too;
      * a cough still open when the signal ends on a quiet sample is no longer dropped;
      * the below-threshold counter restarts with every new cough instead of carrying
        over from the previous one.
    """
    x = np.asarray(x)
    n_samples = len(x)
    cough_mask = np.zeros(n_samples, dtype=bool)
    coughSegments = []
    if n_samples == 0:
        # nothing to segment; also avoids taking the mean of an empty array
        return coughSegments, cough_mask

    #Define hysteresis thresholds
    rms = np.sqrt(np.mean(np.square(x)))
    seg_th_l = th_l_multiplier * rms
    seg_th_h = th_h_multiplier * rms

    padding = round(fs*cough_padding)
    min_cough_samples = round(fs*min_cough_len)
    tolerance = round(0.01*fs)

    def _store(start, end):
        # Accept the candidate only if it is long enough once both paddings are discounted.
        if end + 1 - start - 2*padding > min_cough_samples:
            coughSegments.append(x[start:end+1])
            cough_mask[start:end+1] = True

    #Segment coughs
    cough_start = 0
    cough_in_progress = False
    below_th_counter = 0
    for i, sample in enumerate(x**2):
        if cough_in_progress:
            if sample < seg_th_l:
                below_th_counter += 1
                if below_th_counter > tolerance:
                    cough_in_progress = False
                    _store(cough_start, min(i + padding, n_samples - 1))
            else:
                below_th_counter = 0
        elif sample > seg_th_h:
            cough_start = max(i - padding, 0)
            cough_in_progress = True
            below_th_counter = 0

    # a cough that is still open when the signal ends is closed at the last sample
    if cough_in_progress:
        _store(cough_start, n_samples - 1)

    return coughSegments, cough_mask

In [None]:
# def insert_into_df(df, row, idx):
#     df2 = df.iloc[idx+1:, :]
#     df = df.iloc[:idx, :]
#     df.append(row)
#     df = pd.concat(df, df2)
#     return df

# https://blog.csdn.net/sunmingyang1987/article/details/105486710
def insert_addidx(df, row, idx):
    """Return a copy of ``df`` with ``row`` inserted at position ``idx``.

    The new row is labelled with the string form of ``idx``; the input frame is left
    untouched.
    """
    new_label = str(idx)
    widened_index = df.index.insert(idx, new_label)
    widened = df.reindex(index=widened_index)   # adds an all-NaN row at the new label
    widened.loc[new_label] = row
    return widened

# Step 2 切分音频并存储并创建meta文件csv

In [None]:
# Segment every recording in `processed_df` into individual coughs.
# all_data / all_fname / all_sr are filled in lockstep (one entry per kept segment) and
# new_df receives one copy of the recording's metadata row per kept segment.
# NOTE(review): paths are built as ROOT + uuid + extension, so ROOT must be the folder
# that holds the raw recordings when this cell runs.
audio_length = 32306          # segments shorter than this are zero-padded up to it
min_segment_samples = 8000    # segments with this many samples or fewer are discarded
all_data, all_fname = [], []
all_sr = []
kept_rows = []                # position in processed_df of each kept segment's source row
for idx in tqdm(range(len(processed_df))):
    fname = processed_df.uuid.iloc[idx]
    # recordings come in one of several container formats; take the first that exists
    path = None
    for ext in ["webm", "wav", "ogg"]:
        candidate = ROOT+fname+'.'+ext
        if os.path.exists(candidate):
            path = candidate
            break
    if path is None:
        # previously the loop fell through with the last candidate and crashed on load
        print("Missing file:", fname)
        continue

    # load sound sample
    audio, sample_rate = librosa.load(path, mono=True)

    # Segment each audio into individual coughs using a hysteresis comparator on the signal power
    cough_segments, cough_mask = segment_cough(audio, sample_rate, min_cough_len=0.1, cough_padding=0.1, th_l_multiplier = 0.1, th_h_multiplier = 2)

    for segment in cough_segments:
        # Skip very short segments. Previously such a segment re-appended the padded audio
        # of the preceding segment (or raised NameError when it was the very first one).
        if len(segment) <= min_segment_samples:
            continue
        if len(segment) < audio_length:
            # centre the segment inside a zero-padded clip of audio_length samples
            audio_pad = librosa.util.pad_center(data=segment, size=audio_length)
        else:
            # longer segments are kept at full length (not truncated)
            audio_pad = segment

        all_data.append(audio_pad)
        all_fname.append(fname)
        all_sr.append(sample_rate)
        kept_rows.append(idx)

# One metadata row per kept segment, built in a single step instead of a concat per segment.
# The original index labels are kept, so they repeat when a recording yields several segments.
new_df = processed_df.iloc[kept_rows, :]

In [None]:
# Compare the number of source recordings with the number of segment rows.
print(len(processed_df), processed_df.shape)
print(len(new_df), new_df.shape)

In [None]:
# Give every segment row its own 0..n-1 index.
# The former `new_df["uuid"] = processed_df["uuid"]` line was removed. It aligned on index
# labels, so it changed nothing on the first run, but overwrote uuids with the wrong
# recordings (and NaN) when this cell was re-run after the index had been reset.
new_df = new_df.reset_index(drop=True)

In [None]:
# Rename each row to "sound<4-digit position>_<uuid>", matching the wav file names written
# by the save cell (which numbers clips by their position in all_data).
# The whole column is assigned at once: the former per-row chained assignment
# (new_df.iloc[i, :]["uuid"] = ...) wrote to a temporary row object and is not guaranteed
# to modify new_df.
# NOTE: this cell is not idempotent; re-running it prefixes the names a second time.
new_df["uuid"] = [f"sound{position:04d}_{uuid}" for position, uuid in enumerate(new_df["uuid"])]
new_df

In [None]:
# Persist the per-segment metadata; Step 3 reads this file back as `data_raw`.
new_df.to_csv("F:/DATAS/COUGHVID-public_dataset_v3/waveinfo_fewtoml_split.csv", sep=',')

In [None]:
import soundfile
# Write every kept segment to disk as "sound<4-digit position>_<uuid>.wav".
# NOTE(review): expects all_data / all_fname / all_sr as filled by the Step 2 segmentation
# cell; the Step 3 loading cell reuses the same names with full paths in all_fname.
save_dir = "F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012_fine/"
os.makedirs(save_dir, exist_ok=True)   # soundfile does not create missing directories
for i in tqdm(range(len(all_data)), desc="save sound"):
    # use the sample rate recorded when the clip was loaded instead of a hard-coded 22050
    soundfile.write(save_dir+f"sound{i:04d}_{all_fname[i]}.wav", all_data[i], all_sr[i])

In [None]:
# Longest, shortest and mean segment length, in samples.
# Built-in min/max replace the former 0 / 99999 sentinels, which reported a wrong minimum
# whenever every segment was longer than 99999 samples.
segment_lengths = [len(item) for item in all_data]
if segment_lengths:
    maxmi = max(segment_lengths)
    mini = min(segment_lengths)
    mean = sum(segment_lengths) / len(segment_lengths)
    print(maxmi, mini, mean)
else:
    # avoids the division by zero the old loop hit on an empty list
    print("all_data is empty")

# End