In [None]:
import os
import random
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
# Directory holding the "*_audiodata.wav" recordings and metainfo.csv that the
# following cells read.
# NOTE(review): ROOT is reassigned to a COUGHVID directory in a later cell, so any
# cell using ROOT depends on which assignment ran last.
ROOT = "F:/DATAS/NEUCOUGHDATA_COUGH/"

In [10]:
# Sanity check on a single recording: show its shape, its sample rate and its
# duration in seconds (sample count divided by sample rate).
sample_path = ROOT + "20240921195035_audiodata.wav"
y, sr = librosa.load(sample_path)
n_samples = y.shape[0]
print(y.shape, sr)
print(n_samples / sr)

(30078,) 22050
1.3640816326530611


# 按静音切分数据

In [None]:
# Keep the first 14 characters of every directory entry whose name ends in "wav".
name_list = [entry[:14] for entry in os.listdir(ROOT) if entry[-3:] == "wav"]

# Annotation table stored next to the recordings; the first row is the header and
# no column is used as the index.
metainfo_path = ROOT + "metainfo.csv"
annotated_df = pd.read_csv(metainfo_path, delimiter=',', header=0, index_col=None)

In [18]:
def segment_cough(x,fs, cough_padding=0.2,min_cough_len=0.2, th_l_multiplier = 0.1, th_h_multiplier = 2):
    """Split a recording into individual coughs with a hysteresis comparator.

    Each squared sample is compared against two thresholds, both multiples of
    the RMS of the whole signal. A cough opens at the first sample whose
    square exceeds the high threshold and closes once more than 10 ms worth
    of consecutive samples fall below the low threshold. Every cough is
    extended by ``cough_padding`` seconds on both sides (clipped to the signal
    bounds) and is kept only if its length without the two paddings exceeds
    ``min_cough_len`` seconds.

    Parameters
    ----------
    x : array-like
        One-dimensional audio signal.
    fs : int or float
        Sampling rate of ``x`` in Hz.
    cough_padding : float
        Seconds of context added before and after each detected cough.
    min_cough_len : float
        Minimum cough duration in seconds (padding excluded).
    th_l_multiplier : float
        Low (closing) threshold as a multiple of the signal RMS.
    th_h_multiplier : float
        High (opening) threshold as a multiple of the signal RMS.

    Returns
    -------
    coughSegments : list of np.ndarray
        Slices of ``x``, one per retained cough, in order of occurrence.
    cough_mask : np.ndarray of bool
        Same length as ``x``; True wherever a retained cough (padding
        included) lies.
    """
    x = np.asarray(x)
    n_samples = len(x)
    cough_mask = np.zeros(n_samples, dtype=bool)
    if n_samples == 0:
        # Nothing to segment; also avoids taking the mean of an empty array.
        return [], cough_mask

    # Hysteresis thresholds
    rms = np.sqrt(np.mean(np.square(x)))
    seg_th_l = th_l_multiplier * rms
    seg_th_h = th_h_multiplier * rms

    coughSegments = []
    padding = round(fs*cough_padding)
    min_cough_samples = round(fs*min_cough_len)
    tolerance = round(0.01*fs)

    def _close_segment(start, end):
        # Keep the segment [start, end] only if it is long enough once the
        # padding on both sides is discounted; always update the mask too.
        if end + 1 - start - 2*padding > min_cough_samples:
            coughSegments.append(x[start:end+1])
            cough_mask[start:end+1] = True

    cough_start = 0
    cough_in_progress = False
    below_th_counter = 0

    for i, sample in enumerate(x**2):
        if cough_in_progress:
            if sample < seg_th_l:
                below_th_counter += 1
                if below_th_counter > tolerance:
                    _close_segment(cough_start, min(i + padding, n_samples - 1))
                    cough_in_progress = False
            else:
                below_th_counter = 0
        elif sample > seg_th_h:
            cough_start = max(i - padding, 0)
            cough_in_progress = True
            # Start every cough with a fresh count; a count left over from the
            # previous cough would otherwise close this one on its first
            # quiet sample.
            below_th_counter = 0

    # A cough still open when the signal ends is closed at the last sample,
    # whether or not that last sample is quiet.
    if cough_in_progress:
        _close_segment(cough_start, n_samples - 1)

    return coughSegments, cough_mask

# Segment every recording listed in annotated_df and record, for each detected
# cough, its length in samples and the file name that segment would receive.
# Nothing is resized or written to disk here; the lists are only printed.
all_data = []   # segment lengths, in samples
all_file = []   # "<id>_audiodata_<k>.wav", k = position of the segment in its recording
for row in annotated_df.itertuples():
    recording_id = str(row[1])  # row[0] is the index, row[1] the first column
    fname = ROOT+recording_id+"_audiodata.wav"
    audio, sample_rate = librosa.load(fname, mono=True)

    # Hysteresis comparator on the signal power, one array per cough
    cough_segments, cough_mask = segment_cough(
        audio,
        sample_rate,
        min_cough_len=0.1,
        cough_padding=0.1,
        th_l_multiplier=0.1,
        th_h_multiplier=2,
    )

    for seg_no, segment in enumerate(cough_segments):
        all_data.append(len(segment))
        all_file.append(recording_id+"_audiodata_{}.wav".format(seg_no))

for seg_len, seg_name in zip(all_data, all_file):
    print(seg_len, seg_name)


6727 20240921104740_audiodata_0.wav
6761 20240921125652_audiodata_0.wav
8589 20240921150814_audiodata_0.wav
6747 20240921150814_audiodata_1.wav
7761 20240921150814_audiodata_2.wav
7246 20240921151655_audiodata_0.wav
7016 20240921162153_audiodata_0.wav
6893 20240927184318_audiodata_0.wav
8029 20240927190536_audiodata_0.wav
7212 20240927192134_audiodata_0.wav
7824 20240927194322_audiodata_0.wav
7914 20240927194322_audiodata_1.wav
7266 20240927194322_audiodata_2.wav
6779 20240930202847_audiodata_0.wav
8359 20241002110641_audiodata_0.wav
6804 20241002110641_audiodata_1.wav
6952 20241002110641_audiodata_2.wav
8007 20241003101029_audiodata_0.wav
7396 20241003101029_audiodata_1.wav
7110 20241003101029_audiodata_2.wav
9555 20241003102826_audiodata_0.wav
9103 20241003102826_audiodata_1.wav
6811 20241003102826_audiodata_2.wav
8210 20241003104654_audiodata_0.wav
6874 20241003104654_audiodata_1.wav
8232 20241003111239_audiodata_0.wav
8053 20241003111239_audiodata_1.wav
7905 20241003111239_audiodat

In [None]:
# Segment every recording listed in processed_df into individual coughs, keep the
# segments longer than 8000 samples, and centre-pad those shorter than
# audio_length. all_data / all_fname / all_sr hold one entry per kept segment;
# new_df is processed_df followed by one copy of the source row per kept segment.
# NOTE(review): processed_df is not created by any cell visible in this notebook,
# and ROOT must point at the directory holding the <uuid>.webm/.wav/.ogg files --
# confirm both before running on a fresh kernel.
audio_length = 32306
all_data, all_fname = [], []
all_sr = []
kept_rows = []  # positional index into processed_df of each kept segment's source row
for idx in tqdm(range(len(processed_df))):
    fname = processed_df.uuid.iloc[idx]

    # Use the first extension for which a file actually exists.
    path = None
    for ext in ["webm", "wav", "ogg"]:
        candidate = ROOT+fname+'.'+ext
        if os.path.exists(candidate):
            path = candidate
            break
    if path is None:
        # No recording under any supported extension: report it and move on
        # instead of trying to load a path that does not exist.
        print("Error file:", ROOT+fname)
        continue

    # load sound sample
    audio, sample_rate = librosa.load(path, mono=True)

    # Segment each audio into individual coughs using a hysteresis comparator on the signal power
    cough_segments, cough_mask = segment_cough(audio, sample_rate, min_cough_len=0.1, cough_padding=0.1, th_l_multiplier = 0.1, th_h_multiplier = 2)

    for segment in cough_segments:
        if len(segment) <= 8000:
            # Too short to keep. Skipping here guarantees that no padded array
            # left over from a previous segment is stored in its place.
            continue
        if len(segment) < audio_length:
            audio_pad = librosa.util.pad_center(data=segment, size=audio_length)
        else:
            # Longer segments are kept at their full length (not truncated).
            audio_pad = segment

        all_data.append(audio_pad)
        all_fname.append(fname)
        all_sr.append(sample_rate)
        kept_rows.append(idx)

# Build the frame in one step rather than concatenating once per segment.
new_df = pd.concat([processed_df, processed_df.iloc[kept_rows, :]], axis=0)

# uuid, X = np.array(all_fname), np.array(all_data)
# # This may take some time, so go watch some Korean dramas first.
# # uuid, X = load_features(processed_df)
# print(uuid.shape)
# print(X.shape)

import soundfile


def _segment_name(position, uuid):
    """Return "sound<NNNN>_<uuid>", the name shared by the CSV rows and the wav files.

    The position is zero-padded to at least four digits and never truncated,
    so names stay unique beyond 9999 segments.
    """
    return "sound{:04d}_{}".format(position, uuid)


# new_df starts with a full copy of processed_df; only the rows appended after it
# (one per kept segment) are wanted. Each of those rows already carries its
# source recording's uuid.
new_df = new_df.iloc[len(processed_df):, :].reset_index(drop=True)

print(len(processed_df), processed_df.shape)
print(len(new_df), new_df.shape)

# Assign the whole column at once: writing through new_df.iloc[i, :]["uuid"]
# only changes a temporary copy and leaves new_df untouched.
new_df["uuid"] = [_segment_name(pos, uuid) for pos, uuid in enumerate(new_df["uuid"])]

# NOTE(review): a later cell reads ROOT+'waveinfo_fewtoml_split.csv' and
# ROOT+"coughvid_20211012_fine/"; confirm those resolve to the paths written here.
new_df.to_csv("F:/DATAS/COUGHVID-public_dataset_v3/waveinfo_fewtoml_split.csv", sep=',')

save_dir = "F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012_fine/"
for i in tqdm(range(len(all_data)), desc="save sound"):
    # Write each segment at the sample rate it was loaded with.
    soundfile.write(save_dir+_segment_name(i, all_fname[i])+".wav", all_data[i], all_sr[i])

# Longest, shortest and mean length (in samples) of the kept segments.
segment_lengths = [len(item) for item in all_data]
if segment_lengths:
    maxmi = max(segment_lengths)
    mini = min(segment_lengths)
    mean = sum(segment_lengths) / len(segment_lengths)
    print(maxmi, mini, mean)
else:
    # Avoid dividing by zero when no segment was kept.
    print("no segments in all_data")

In [None]:
# def insert_into_df(df, row, idx):
#     df2 = df.iloc[idx+1:, :]
#     df = df.iloc[:idx, :]
#     df.append(row)
#     df = pd.concat(df, df2)
#     return df

# https://blog.csdn.net/sunmingyang1987/article/details/105486710
def insert_addidx(df, row, idx):
    """Return a copy of ``df`` with ``row`` inserted at position ``idx``.

    The new row is labelled with the string form of ``idx``. The input frame
    itself is left unchanged because the insertion is done on the reindexed
    copy.
    """
    label = str(idx)
    widened_index = df.index.insert(idx, label)
    # Reindexing creates an empty (NaN) row at the new label, filled in below.
    result = df.reindex(index=widened_index)
    result.loc[label] = row
    return result

In [None]:
# Set seed for reproducibility: the same value seeds Python's hash seed variable,
# the stdlib `random` module and NumPy's global generator.
seed_value= 32 
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# set variables
# NOTE(review): this replaces the ROOT assigned in the first cell; every cell that
# builds a path from ROOT therefore depends on which assignment ran last.
ROOT = 'F:/DATAS/COUGHVID-public_dataset_v3/coughvid_20211012/'
class_names = ['healthy','COVID-19','symptomatic']
audio_length = 32306

# load coughvid meta (first row is the header, first column becomes the index)
data_raw = pd.read_csv(ROOT+'metadata_compiled.csv', header=0, index_col=0)
data_raw.head(3)

In [None]:
# Number of uuid entries per status value in the metadata table.
status_counts = data_raw.groupby("status")["uuid"].count()
print(status_counts)

# Step3 读取切分后的数据，抽取特征，创建pkl

In [None]:
# Reload data_raw from the per-segment table (first column is the index).
# NOTE(review): the export cell writes this CSV to
# "F:/DATAS/COUGHVID-public_dataset_v3/"; confirm ROOT resolves to the same file.
split_csv_path = ROOT+'waveinfo_fewtoml_split.csv'
data_raw = pd.read_csv(split_csv_path, header=0, index_col=0)
data_raw

In [None]:
# Number of segments per status value in the per-segment table.
label_counts = data_raw.groupby("status")["uuid"].count()
print("labels:")
print(label_counts)

In [None]:
HOP_LENGTH = 512        # hop between successive frames, in samples
WINDOW_LENGTH = 512     # analysis window length, in samples
N_MEL = 128             # number of Mel bands

def compute_melspectrogram_with_fixed_length(audio, sampling_rate, num_of_samples=86):
    """Return the log-mel spectrogram of ``audio`` in dB relative to its peak.

    The spectrogram is computed with HOP_LENGTH, WINDOW_LENGTH and N_MEL and
    converted to decibels with the maximum power as reference.

    ``num_of_samples`` is accepted but currently unused: the spectrogram is
    neither padded nor cropped, so its second dimension depends on the length
    of ``audio``.

    Returns None, after printing the error, if the computation raises.
    """
    try:
        power_spectrogram = librosa.feature.melspectrogram(
            y=audio,
            sr=sampling_rate,
            hop_length=HOP_LENGTH,
            win_length=WINDOW_LENGTH,
            n_mels=N_MEL,
        )
        # power -> decibels, referenced to the loudest bin
        log_spectrogram = librosa.power_to_db(power_spectrogram, ref=np.max)
    except Exception as e:
        print("\nError encountered while parsing files\n>>", e)
        return None

    return log_spectrogram

In [None]:
# Load every exported segment listed in data_raw and encode its label and
# attributes as integers. All lists below are index-aligned: entry k of each
# list describes the k-th segment that loaded successfully.
audio_length = 32306
# sample_rate = 22050
all_data = []     # waveforms
all_fname = []    # full path of each wav file
all_sr = []       # sample rate returned by librosa for each file
all_labels = []   # status encoded through m2l
term1 = []        # cough_type
term2 = []        # dyspnea
term3 = []        # wheezing
term4 = []        # stridor
term5 = []        # choking
term6 = []        # congestion
term7 = []        # severity
m2l = {"healthy":0, "COVID-19":1}
# NOTE(review): True is encoded as 0 and False as 1 -- confirm this is intended.
bool2int = {True: 0, False: 1}
type2int={"dry": 0, "wet":1, "unknown": 2}
seve2int = {"mild": 0, "pseudocough": 1, "severe": 2, "unknown": 3}
maxi, mini = 0, 999999   # longest / shortest duration seen, in seconds
for row in tqdm(data_raw.itertuples(), total=len(data_raw)):
    fname = ROOT+"coughvid_20211012_fine/" +getattr(row, "uuid")+".wav"
    # load sound sample
    try:
        audio, sample_rate = librosa.load(fname, mono=True)
    except Exception:
        print("Error file:", fname)
        continue
    maxi = max(maxi, audio.shape[0]/sample_rate)
    mini = min(mini, audio.shape[0]/sample_rate)

    # Encode everything before appending, so that an unexpected value cannot
    # leave the lists with different lengths.
    status_code = m2l[getattr(row, "status")]
    encoded_terms = (
        type2int[getattr(row, "cough_type")],
        bool2int[getattr(row, "dyspnea")],
        bool2int[getattr(row, "wheezing")],
        bool2int[getattr(row, "stridor")],
        bool2int[getattr(row, "choking")],
        bool2int[getattr(row, "congestion")],
        # A missing severity falls back to the "unknown" key; the default must
        # be a key of seve2int, not the encoded integer.
        seve2int[getattr(row, "severity", "unknown")],
    )

    all_data.append(audio)
    all_fname.append(fname)
    all_labels.append(status_code)
    all_sr.append(sample_rate)
    for term_list, code in zip((term1, term2, term3, term4, term5, term6, term7), encoded_terms):
        term_list.append(code)

In [None]:
# The number of loaded waveforms and the number of labels should match.
n_waveforms, n_labels = len(all_data), len(all_labels)
print(n_waveforms, n_labels)

In [None]:
# One row per segment: [melspectrogram, label, the seven attribute codes, fold].
# Rows with label 0 go to features_1, all others to features_2 (stored as label 1).
# Within each list the first 100 rows get fold 0; later rows get a random fold
# between 1 and 9.
features_1 = []
features_2 = []
for i in tqdm(range(len(all_data)),desc="calc.."):
    melspect = compute_melspectrogram_with_fixed_length(all_data[i], all_sr[i])

    is_label_zero = all_labels[i] == 0
    bucket = features_1 if is_label_zero else features_2
    class_label = 0 if is_label_zero else 1
    # random.randint is only called once the bucket already holds 100 rows.
    fold = 0 if len(bucket) < 100 else random.randint(1, 9)

    bucket.append([
        melspect,
        class_label,
        term1[i],
        term2[i],
        term3[i],
        term4[i],
        term5[i],
        term6[i],
        term7[i],
        fold,
    ])

In [None]:
# All label-0 rows first, followed by all label-1 rows.
features = [*features_1, *features_2]

In [None]:
# Column order matches the layout of each row in `features`.
feature_columns = [
    "melspectrogram",
    "label",
    "cough_type",
    "dyspnea",
    "wheezing",
    "stridor",
    "choking",
    "congestion",
    "severity",
    "fold",
]
us8k_df = pd.DataFrame(features, columns=feature_columns)
pickle_path = ROOT+"coughvid_split_specattri.pkl"
us8k_df.to_pickle(pickle_path)

In [None]:
# Reload the feature table and keep columns 0, 1, 2, 8 and 9, then unpack and
# print the second row as a spot check.
# NOTE(review): the cell that writes this pickle uses
# ROOT+"coughvid_split_specattri.pkl"; confirm it is the same file as this path.
pickle_path = "F:/DATAS/COUGHVID-public_dataset_v3/coughvid_split_specattri.pkl"
kept_columns = [0, 1, 2, 8, 9]
coughvid_df = pd.read_pickle(pickle_path).iloc[:, kept_columns]
spectrogram, label, ty, seve, fold = coughvid_df.iloc[1]
print(spectrogram.shape, label, ty, seve, fold)

In [None]:
import pandas as pd

# Reload the feature table and print, for every encoded attribute, how many rows
# fall under each of its values.
df = pd.read_pickle(ROOT+"coughvid_split_specattri.pkl")
print(df.head())

# Row positions of the two label groups.
# NOTE(review): the boundaries 2076 and 2850 are hard-coded -- confirm they match
# the current pickle.
neg_list = list(range(2076))
pos_list = list(range(2076, 2850))

attribute_columns = (
    "cough_type",
    "dyspnea",
    "wheezing",
    "stridor",
    "choking",
    "congestion",
    "severity",
)
for attribute in attribute_columns:
    print(df.groupby(attribute)["melspectrogram"].count())

In [None]:
# Shape of processed_df and number of uuid entries per status value.
# NOTE(review): processed_df is not created by any cell visible in this notebook,
# so this cell fails on a fresh kernel unless it is defined elsewhere -- confirm.
print(processed_df.shape)
processed_df.groupby("status")["uuid"].count()

# End