In [None]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import librosa
import torch
import torch.nn as nn
import torchaudio
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

WAVE_ROOT = "G:/DATAS-Medical/BILIBILICOUGH/"
NOISE_ROOT = "G:/DATAS-Medical/BILINOISE/"
name2label = {"breathe": 0, "cough": 2, "clearthroat": 1, "exhale": 3, "hum": 4, "inhale": 5, "noise": 6, "silence": 7,
              "sniff": 8, "speech": 9, "vomit": 10, "whooping": 11}

In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from readers.bilicough_reader import BiliCoughReader
from readers.neucough_reader import NEUCoughReader
from readers.coughvid_reader import CoughVIDReader
from readers.noise_reader import load_bilinoise_dataset
from models.tdnncnn import WSFNN


def get_combined_data():
    print("Build the Dataset consisting of BiliCough, NeuCough, CoughVID19.")
    bcr = BiliCoughReader()
    ncr = NEUCoughReader()
    cvr = CoughVIDReader()
    sample_list, label_list = [], []
    tmp_sl, tmp_ll = bcr.get_sample_label_list(mode="sed")
    sample_list.extend(tmp_sl)
    label_list.extend(tmp_ll)
    print("bilicough:", len(label_list))
    tmp_sl, tmp_ll = ncr.get_sample_label_list(mode="cough")
    sample_list.extend(tmp_sl)
    label_list.extend(tmp_ll)
    print("bilicough+neucough:", len(label_list))
    tmp_sl, tmp_ll = cvr.get_sample_label_list()
    sample_list.extend(tmp_sl)
    label_list.extend(tmp_ll)
    print("bilicough+neucough+coughvid:", len(label_list))
    # shuffle
    tmplist = list(zip(sample_list, label_list))
    random.shuffle(tmplist)
    sample_list, label_list = zip(*tmplist)

    noise_list, _ = load_bilinoise_dataset(NOISE_ROOT="G:/DATAS-Medical/BILINOISE/", noise_length=bcr.data_length,
                                           number=100)
    print("Loader noise data.")
    return sample_list, label_list, noise_list


class BiliCoughDataset(Dataset):
    def __init__(self, audioseg, labellist, noises):
        self.audioseg = audioseg
        self.labellist = labellist
        self.noises = noises

    def __getitem__(self, ind):
        # When reading waveform data, add noise as data enhancement according to a 1/3 probability.
        if random.random() < 0.333:
            return self.audioseg[ind] + self.noises[random.randint(0, len(self.noises) - 1)], self.labellist[ind]
        else:
            return self.audioseg[ind], self.labellist[ind]

    def __len__(self):
        return len(self.audioseg)


class SEDModel(nn.Module):
    def __init__(self, class_num=10):
        super().__init__()
        self.model = WSFNN(class_num=class_num)

    def forward(self, x_wav):
        return self.wave_conv(x_wav)

In [2]:
configs = {"batch_size": 32, "lr": 0.001, "epoch_num": 30}
save_dir = "./runs/c2sedmodel/"
if not os.path.exists(save_dir):
    os.makedirs(save_dir, exist_ok=True)
run_save_dir = save_dir + time.strftime("%Y%m%d%H%M", time.localtime()) + '/'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
save_setting_str = "Model:{}, optimizer:{}, loss function:{}\n".format(
            "VADModel(wav TDNN + mel CNN + pool + mlp)", "Adam(lr={})".format(configs["lr"]),
            "nn.CrossEntropyLoss")
save_setting_str += "dataset:{}, batch_size:{}, noise_p:{}\n".format("BiliCough+BiliNoise",
                                                                                  configs["batch_size"], "0.333")
save_setting_str += "epoch_num:{},\n".format(configs["epoch_num"])

In [10]:
class SEDModel(nn.Module):
    def __init__(self, class_num=10):
        super().__init__()
        self.model = WSFNN(class_num=class_num)

    def forward(self, x):
        return self.model(x=x)
model = SEDModel().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=configs["lr"])

The first Layer of the TDNN: kernel_size:1024, stride:488, padding:512
Build TDNN Extractor with 6 Conv1d Layers.
Build 2 Convolutional Layer and 1 Pool2d Layer.
Pooling after Fusioning the TDNN and CNN.
Build 3-Layer MLP as Classifier for 10-class.


In [4]:
sample_list, label_list, noise_list = get_combined_data()
trte_rate = int(len(sample_list) * 0.9)

train_loader = DataLoader(
            BiliCoughDataset(audioseg=sample_list[:trte_rate], labellist=label_list[:trte_rate], noises=noise_list),
            batch_size=configs["batch_size"], shuffle=True)
valid_loader = DataLoader(
            BiliCoughDataset(audioseg=sample_list[trte_rate:], labellist=label_list[trte_rate:], noises=noise_list),
            batch_size=configs["batch_size"], shuffle=False)

Build the Dataset consisting of BiliCough, NeuCough, CoughVID19.
           filename       st       en  label
0     bilicough_000  00:01.0  00:01.7      2
1     bilicough_000  00:01.7  00:02.2      2
2     bilicough_000  00:02.2  00:02.7      2
3     bilicough_000  00:03.0  00:03.4      2
4     bilicough_000  00:03.4  00:04.0      2
...             ...      ...      ...    ...
1266  bilicough_018  01:51.3  01:52.3     11
1267  bilicough_018  01:52.5  01:52.8      2
1268  bilicough_018  01:52.8  01:53.1      2
1269  bilicough_018  01:53.1  01:53.5      2
1270  bilicough_018  01:54.1  01:55.4      9

[1271 rows x 4 columns]
sound count:5163, all count:1362.
[22050]
bilicough: 1362


100%|████████████████████████████████████████████████████████████████████████████████| 321/321 [00:35<00:00,  9.07it/s]


bilicough+neucough: 1683


100%|█████████████████████████████████████████████████████████████████████████████| 2850/2850 [00:10<00:00, 269.25it/s]


bilicough+neucough+coughvid: 4533
Loader noise data.


In [8]:
print("length of data:", len(sample_list), len(label_list), len(noise_list))

length of data: 4533 4533 100


In [11]:
print("Build Dataset...")
flag = False
Loss_Epoch_List = []
print("Start Training...")
for epoch_id in range(configs["epoch_num"]):
    model.train()
    Loss_Batch_List = []
    for batch_id, (x_wav, y_lab) in tqdm(enumerate(train_loader),
                                         desc="Epoch:{} Training ".format(epoch_id)):
        optimizer.zero_grad()
        x_wav, y_lab = x_wav.to(device).unsqueeze(1).to(torch.float32), y_lab.to(device)  # .to(torch.float32)
        if not flag:
            print("shape of x y:", x_wav.shape, y_lab.shape)
        y_pred = model(x=x_wav)
        if not flag:
            print("shape of pred:", y_pred.shape)
        loss_v = criterion(input=y_pred, target=y_lab)
        if not flag:
            print("shape of loss_v:", loss_v.shape)
            flag = True
        loss_v.backward()
        optimizer.step()
        Loss_Batch_List.append(loss_v.mean().item())
    Loss_Epoch_List.append(np.array(Loss_Batch_List).mean())

Build Dataset...
Start Training...


Epoch:0 Training : 0it [00:00, ?it/s]

shape of x y: torch.Size([32, 1, 22050]) torch.Size([32])
shape of pred: torch.Size([32, 10])
shape of loss_v: torch.Size([])


Epoch:0 Training : 128it [00:05, 25.34it/s] 
Epoch:1 Training : 128it [00:00, 132.50it/s]
Epoch:2 Training : 128it [00:00, 132.78it/s]
Epoch:3 Training : 128it [00:00, 131.46it/s]
Epoch:4 Training : 128it [00:00, 129.39it/s]
Epoch:5 Training : 128it [00:00, 131.69it/s]
Epoch:6 Training : 128it [00:00, 139.36it/s]
Epoch:7 Training : 128it [00:00, 139.79it/s]
Epoch:8 Training : 128it [00:00, 141.36it/s]
Epoch:9 Training : 128it [00:00, 137.31it/s]
Epoch:10 Training : 128it [00:00, 132.54it/s]
Epoch:11 Training : 128it [00:00, 135.65it/s]
Epoch:12 Training : 128it [00:00, 132.53it/s]
Epoch:13 Training : 128it [00:00, 135.68it/s]
Epoch:14 Training : 128it [00:00, 134.54it/s]
Epoch:15 Training : 128it [00:00, 133.57it/s]
Epoch:16 Training : 128it [00:00, 131.84it/s]
Epoch:17 Training : 128it [00:00, 128.97it/s]
Epoch:18 Training : 128it [00:01, 127.03it/s]
Epoch:19 Training : 128it [00:00, 131.87it/s]
Epoch:20 Training : 128it [00:00, 130.57it/s]
Epoch:21 Training : 128it [00:00, 131.22it/s

In [14]:
if not os.path.exists(run_save_dir):
    os.makedirs(run_save_dir, exist_ok=True)
settingf = open(run_save_dir + "train_settings.txt", 'w')
settingf.write(save_setting_str)
settingf.write("loss:[" + ",".join([str(it) for it in Loss_Epoch_List]) + ']\n')
settingf.close()
torch.save(model.state_dict(),
           run_save_dir + "vad_model_epoch{}.pth".format(configs["epoch_num"]))
torch.save(optimizer.state_dict(),
           run_save_dir + "vad_optimizer_epoch{}.pth".format(configs["epoch_num"]))
print("models were saved.")

models were saved.


In [15]:
vad_model = SEDModel()
vad_model.load_state_dict(torch.load("./runs/c2sedmodel/202502161815/sed_model_epoch30.pth"))
vad_model.eval()

The first Layer of the TDNN: kernel_size:1024, stride:488, padding:512
Build TDNN Extractor with 6 Conv1d Layers.
Build 2 Convolutional Layer and 1 Pool2d Layer.
Pooling after Fusioning the TDNN and CNN.
Build 3-Layer MLP as Classifier for 10-class.


SEDModel(
  (model): WSFNN(
    (mel_extractor): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
    (wave_conv): TDNN_Extractor(
      (wav2mel): Conv1d(1, 128, kernel_size=(1024,), stride=(488,), padding=(512,), bias=False)
      (layer_norm): LayerNorm((46,), eps=1e-05, elementwise_affine=True)
      (td_layer1): Conv1d(128, 512, kernel_size=(5,), stride=(1,))
      (bn1): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
      (td_layer2): Conv1d(512, 512, kernel_size=(3,), stride=(1,), dilation=(2,), groups=512)
      (bn2): LayerNorm((38,), eps=1e-05, elementwise_affine=True)
      (td_layer3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), dilation=(3,), groups=512)
      (bn3): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (td_layer4): Conv1d(512, 512, kernel_size=(1,), stride=(1,), groups=512)
      (bn4): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (td_layer5): Conv1d(512, 1024, kernel_size=(1,), stride=(1

In [21]:
# model.eval()
pre_list = []
rec_list = []
acc_list = []
from sklearn import metrics
for batch_id, (x_wav, y_lab) in tqdm(enumerate(valid_loader), desc="Testing..."):
    with torch.no_grad():
        x_wav = x_wav.to(device).unsqueeze(1).to(torch.float32)
        print(x_wav.shape)
        y_pred = model(x=x_wav)
        y_pred = np.argmax(y_pred.data.cpu().numpy(), axis=1)

        precision = metrics.precision_score(y_true=y_lab, y_pred=y_pred, average="micro")
        recall = metrics.recall_score(y_true=y_lab, y_pred=y_pred, average="micro")
        acc = metrics.accuracy_score(y_true=y_lab, y_pred=y_pred)
        pre_list.append(precision)
        rec_list.append(recall)
        acc_list.append(acc)
        print(pre_list)
        print(rec_list)
        print(acc_list)
print("precision:")
print(pre_list)
print("recall:")
print(rec_list)
print("accuracy:")
print(acc_list)
if not os.path.exists(run_save_dir):
    os.makedirs(run_save_dir, exist_ok=True)


plt.figure(0)
plt.plot(list(range(len(Loss_Epoch_List))), np.array(Loss_Epoch_List), c="black")
plt.savefig(run_save_dir + "vad_meanloss_epoch.png", dpi=300, format="png")
plt.close(0)

settingf = open(run_save_dir + "train_settings.txt", 'w')
settingf.write(save_setting_str)
settingf.write("loss:[" + ",".join([str(it) for it in Loss_Epoch_List]) + ']\n')
settingf.write('precision:{}['.format(np.mean(pre_list)) + ",".join([str(it) for it in pre_list]) + ']\n')
settingf.write('recall:{}['.format(np.mean(rec_list)) + ",".join([str(it) for it in rec_list]) + ']\n')
settingf.write('accuracy:{}['.format(np.mean(acc_list)) + ",".join([str(it) for it in acc_list]) + ']\n')
# plt.show()
settingf.close()

torch.save(model.state_dict(),
           run_save_dir + "vad_model_epoch{}.pth".format(configs["epoch_num"]))
torch.save(optimizer.state_dict(),
           run_save_dir + "vad_optimizer_epoch{}.pth".format(configs["epoch_num"]))
print("models were saved.")

Testing...: 7it [00:00, 61.07it/s]

torch.Size([32, 1, 22050])
[0.96875]
[0.96875]
[0.96875]
torch.Size([32, 1, 22050])
[0.96875, 0.96875]
[0.96875, 0.96875]
[0.96875, 0.96875]
torch.Size([32, 1, 22050])
[0.96875, 0.96875, 0.96875]
[0.96875, 0.96875, 0.96875]
[0.96875, 0.96875, 0.96875]
torch.Size([32, 1, 22050])
[0.96875, 0.96875, 0.96875, 0.96875]
[0.96875, 0.96875, 0.96875, 0.96875]
[0.96875, 0.96875, 0.96875, 0.96875]
torch.Size([32, 1, 22050])
[0.96875, 0.96875, 0.96875, 0.96875, 0.875]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875]
torch.Size([32, 1, 22050])
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0]
torch.Size([32, 1, 22050])
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0, 0.96875]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0, 0.96875]
[0.96875, 0.96875, 0.96875, 0.96875, 0.875, 1.0, 0.96875]





RuntimeError: stack expects each tensor to be equal size, but got [22050] at entry 0 and [23815] at entry 5

# Training End.
# Detection SED

In [None]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import torch
import torch.nn as nn
# sys.path.append(r'D:/PythonTorchVITS/MedicalSignal/SoundDL-CoughVID')
sys.path.append(r'C:/Program Files (zk)/PythonFiles/AClassification/SoundDL-CoughVID')
from chapter2_SEDmodel import SEDModel

In [2]:
save_dir = "./runs/c2sedmodel/"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
sed_model = SEDModel()
sed_model.load_state_dict(torch.load(save_dir+"202502161815/sed_model_epoch30.pth"))
sed_model.to(device)
sed_model.eval()

The first Layer of the TDNN: kernel_size:1024, stride:488, padding:512
Build TDNN Extractor with 6 Conv1d Layers.
Build 2 Convolutional Layer and 1 Pool2d Layer.
Pooling after Fusioning the TDNN and CNN.
Build 3-Layer MLP as Classifier for 10-class.


SEDModel(
  (model): WSFNN(
    (mel_extractor): MelSpectrogram(
      (spectrogram): Spectrogram()
      (mel_scale): MelScale()
    )
    (wave_conv): TDNN_Extractor(
      (wav2mel): Conv1d(1, 128, kernel_size=(1024,), stride=(488,), padding=(512,), bias=False)
      (layer_norm): LayerNorm((46,), eps=1e-05, elementwise_affine=True)
      (td_layer1): Conv1d(128, 512, kernel_size=(5,), stride=(1,))
      (bn1): LayerNorm((42,), eps=1e-05, elementwise_affine=True)
      (td_layer2): Conv1d(512, 512, kernel_size=(3,), stride=(1,), dilation=(2,), groups=512)
      (bn2): LayerNorm((38,), eps=1e-05, elementwise_affine=True)
      (td_layer3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), dilation=(3,), groups=512)
      (bn3): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (td_layer4): Conv1d(512, 512, kernel_size=(1,), stride=(1,), groups=512)
      (bn4): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (td_layer5): Conv1d(512, 1024, kernel_size=(1,), stride=(1

In [3]:
testwav, sr = librosa.load("F:/DATAS/NEUCOUGHDATA_FULL/20240921133332_audiodata_元音字母a.wav")
N = len(testwav)
maxv = max(testwav)
seg_list = []
st, step, overlap = 0, 22050, 22050//3
while st+step<=N:
    seg_list.append(testwav[st:st+step])
    st = st+step-overlap
tmp = testwav[st:]
new_tmp = np.zeros(step)
st = (step-len(tmp))//2
new_tmp[st:st+len(tmp)] = tmp
seg_list.append(new_tmp)
print(len(seg_list))

seg_list = [torch.from_numpy(it) for it in seg_list]

batch_size = 32
x_batchs = []
ind = 0
while ind+batch_size < len(seg_list):
    x_batchs.append(torch.stack(seg_list[ind:ind+batch_size], dim=0))
    ind += batch_size
x_batchs.append(torch.stack(seg_list[ind:], dim=0))
print("batch num:{}, batch shape:{}".format(len(x_batchs), x_batchs[0].shape))

273
batch num:9, batch shape:torch.Size([32, 22050])


In [5]:
pred_list = None
for batch_id, x_wav in enumerate(x_batchs):
    with torch.no_grad():
        y_pred = sed_model(x=x_wav.to(device).unsqueeze(1).to(torch.float32))
        if pred_list is None:
            pred_list = y_pred
        else:
            pred_list = torch.concat((pred_list, y_pred), dim=0)
pred_list = np.argmax(pred_list.data.cpu().numpy(), axis=1)


In [6]:
print(pred_list)
sed_label2name = {0: "breathe", 1: "clearthroat", 2: "cough", 3: "exhale", 4: "hum", 5: "inhale",
                               6: "sniff", 7: "speech", 8: "vomit", 9: "whooping"}
print(sed_label2name[it] for it in pred_list)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 7 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 7 2 2
 2 2 2 2 2 2 7 4 4 2 2 2 2 2 2 2 2 2 7 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7
 2 2 2 2 2 2 2 2 7 2 7 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 2 2 2 2 2 2 2 2 2
 2 2 7 7 2 2 2 2 2 2 2 2 2 2]
<generator object <genexpr> at 0x000002028A965430>
