In [None]:
# !wget https://github.com/karoldvl/ESC-50/archive/master.zip
# !mkdir -p data && cd data && unzip ../master.zip

In [1]:
import torch
import torchaudio
import numpy as np
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset

from utils.transform import UnsqueezeTransform, NoiseGeneratorTransform, FixedValueTransform, BinaryTransform
from utils.dataset import dump_dataset_info
from dataset.cry import CryDataset
from dataset.noise import NoiseDataset


# Hyperparameters
classes = 2
train_batch_size, test_batch_size = 200, 5

# Every pickle (.pkl) file used by this notebook lives under one output root.
dataset_root = 'e:/dataset/out'

# ESC-50 training / validation files and the donate-a-cry file.
train_data_esc50 = f'{dataset_root}/esc50pp/training128mel1.pkl'
valid_data_esc50 = f'{dataset_root}/esc50pp/validation128mel1.pkl'
train_data_donateacry = f'{dataset_root}/dnac/donateacry.pkl'

# UrbanSound8K training / validation files.
train_data_us8k = f'{dataset_root}/us8k/training128mel1.pkl'
valid_data_us8k = f'{dataset_root}/us8k/validation128mel1.pkl'

# One noise transform instance, shared by every training dataset factory below.
# NOTE(review): the same object is reused across datasets; if the transform
# keeps internal state (it takes `milestones`), that state is shared — confirm.
noise = NoiseGeneratorTransform(
    milestones=[10, 30, 60],
    gamma=10,
    noise_std=1e-6,
)

def get_noise_dataset(shape=(1, 128, 256), target_id=0, num_samples=6000, noise_std=0.1):
    """Build a `NoiseDataset` of synthetic samples.

    Args:
        shape: Sample shape forwarded to `NoiseDataset` as its first argument.
        target_id: Forwarded as `target_id` (presumably the label assigned to
            every generated sample — confirm against `NoiseDataset`).
        num_samples: Forwarded to `NoiseDataset` as its second argument.
        noise_std: Forwarded as `noise_std`. Defaults to 0.1, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The constructed `NoiseDataset`.
    """
    return NoiseDataset(shape, num_samples, target_id=target_id, noise_std=noise_std)

def get_donateacry_dataset():
    """Build the donate-a-cry `CryDataset` from `train_data_donateacry`.

    Samples pass through the shared `noise` transform followed by
    `UnsqueezeTransform`; targets pass through `FixedValueTransform(value=1)`
    (presumably forcing every label to 1 — confirm against the transform).
    """
    # NormalizeTransform() is currently disabled in this pipeline.
    sample_pipeline = torch.nn.Sequential(noise, UnsqueezeTransform())
    label_transform = FixedValueTransform(value=1)
    return CryDataset(
        train_data_donateacry,
        transform=sample_pipeline,
        target_transform=label_transform,
    )

def get_esc50_train_dataset():
    """Build the ESC-50 training `CryDataset` from `train_data_esc50`.

    Samples pass through the shared `noise` transform followed by
    `UnsqueezeTransform`; targets pass through `BinaryTransform(20)`
    (presumably 1 for class id 20 and 0 otherwise — confirm).
    """
    # NormalizeTransform() and ESC50LabelTransform() are currently disabled.
    sample_pipeline = torch.nn.Sequential(noise, UnsqueezeTransform())
    label_transform = BinaryTransform(20)
    return CryDataset(
        train_data_esc50,
        transform=sample_pipeline,
        target_transform=label_transform,
    )

def get_esc50_valid_dataset():
    """Build the ESC-50 validation `CryDataset` from `valid_data_esc50`.

    Unlike the training factory, no noise transform is applied: samples only
    pass through `UnsqueezeTransform`. Targets pass through
    `BinaryTransform(20)`, the same label transform the training factory uses.
    """
    # NormalizeTransform() and ESC50LabelTransform() are currently disabled.
    sample_pipeline = torch.nn.Sequential(UnsqueezeTransform())
    label_transform = BinaryTransform(20)
    return CryDataset(
        valid_data_esc50,
        transform=sample_pipeline,
        target_transform=label_transform,
    )

def get_us8k_train_dataset():
    """Build the UrbanSound8K training `CryDataset` from `train_data_us8k`.

    Samples pass through the shared `noise` transform followed by
    `UnsqueezeTransform`. No `target_transform` is supplied, so labels are
    whatever `CryDataset` produces by default.
    """
    # NormalizeTransform() and US8KLabelTransform() are currently disabled.
    sample_pipeline = torch.nn.Sequential(noise, UnsqueezeTransform())
    return CryDataset(train_data_us8k, transform=sample_pipeline)

def get_us8k_valid_dataset():
    """Build the UrbanSound8K validation `CryDataset` from `valid_data_us8k`.

    Samples only pass through `UnsqueezeTransform` (no noise). No
    `target_transform` is supplied, so labels are whatever `CryDataset`
    produces by default.
    """
    # NormalizeTransform() and US8KLabelTransform() are currently disabled.
    sample_pipeline = torch.nn.Sequential(UnsqueezeTransform())
    return CryDataset(valid_data_us8k, transform=sample_pipeline)
def get_big_valid_dataset():
    """Concatenate the ESC-50 and UrbanSound8K validation datasets, in that order.

    NOTE(review): the ESC-50 part applies `BinaryTransform(20)` to its targets
    while the US8K part applies no target transform — confirm the two label
    spaces are compatible before evaluating on the combined set.
    """
    esc50_valid = get_esc50_valid_dataset()
    us8k_valid = get_us8k_valid_dataset()
    return ConcatDataset([esc50_valid, us8k_valid])

# Training data: ESC-50 training split plus the donate-a-cry recordings.
train_parts = [
    get_esc50_train_dataset(),
    get_donateacry_dataset(),
    # get_noise_dataset(num_samples=300)
]
train_loader = DataLoader(
    ConcatDataset(train_parts),
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluation data: ESC-50 validation split only, iterated in a fixed order.
test_parts = [get_esc50_valid_dataset()]
test_loader = DataLoader(
    ConcatDataset(test_parts),
    shuffle=False,
    batch_size=test_batch_size,
)

# Report dataset sizes first, then the per-dataset info dump.
print(f'train_dataset:{len(train_loader.dataset)}')
print(f'test_dataset:{len(test_loader.dataset)}')
dump_dataset_info(train_loader.dataset)
dump_dataset_info(test_loader.dataset)

train_dataset:17837
test_dataset:2894
dataset info
records: 17837
classes: 2
class[0] items: 11360
class[1] items: 6477
dataset info
records: 2894
classes: 2
class[0] items: 2834
class[1] items: 60


In [2]:
import os
import torch
from tqdm.auto import tqdm

from torch.optim.lr_scheduler import MultiStepLR

# Mixed precision
from torch.cuda.amp import GradScaler
from torch.utils.tensorboard import SummaryWriter
from utils.earlystop import EarlyStopper
from utils.modelproxy import ModelProxy
from utils.focalloss import FocalLoss

from model.mobilenet_v3 import MobileNetV3
from model.mobilenet_v2 import MobileNetV2
from train import train

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model = MobileNetV2((1, 128, 100), 21).to(device)
model = MobileNetV3((1, 128, 256), classes, width_multiplier=1.0, dropout_rate=0.2).to(device)
# print(model)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)

# Run configuration.
# NOTE(review): `with_tqdm`, `with_l2_regularization`, `save_model_start` and
# `save_model_period` are not read anywhere in this cell (`l2_reg` is always
# passed to `train`, and the auto-save arguments below are literals) — confirm
# whether they are still meant to have an effect.
with_tqdm = False
with_early_stop = False
with_l2_regularization = True
with_grad_scaler = True
start_epoch = 0
end_epoch = 120
learning_rate = 1e-3
save_model_start = 10
save_model_period = 10
l2_reg = 0.05

model_name = 'mobilenetv3-binary'
# `{{epoch:03d}}` is escaped so the placeholder survives this f-string and is
# left in the pattern handed to ModelProxy.set_file_pattern.
model_file = f'./weights/{model_name}/epoch_{{epoch:03d}}.pt'

# criterion = nn.CrossEntropyLoss()
criterion = FocalLoss(classes, alpha=0.75, gamma=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The learning rate is multiplied by 0.1 at epochs 30, 60 and 90.
scheduler = MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)
grad_scaler = GradScaler() if with_grad_scaler else None

if with_early_stop:
    # NOTE(review): the stopper is created but not passed to `train` below.
    early_stopper = EarlyStopper(patience=3, min_delta=10)

model = ModelProxy(model).set_file_pattern(model_file).set_epoch(start_epoch).set_auto_save([10, 5], [])
model.load_checkpoint()

# Create the writer only once everything else is set up, and close it in a
# `finally` so the TensorBoard event file is flushed even when training
# raises or is interrupted from the keyboard.
writer = SummaryWriter()
try:
    train(model, criterion, scheduler, start_epoch, end_epoch, train_loader, test_loader, writer=writer, grad_scaler=grad_scaler, l2_reg=l2_reg)
finally:
    writer.close()


  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/120] LR:[0.001] Loss/train:3.4475 Loss/val:695.1843 Accuracy/val:2.90%
Epoch [2/120] LR:[0.001] Loss/train:1.8363 Loss/val:6551.3565 Accuracy/val:2.07%
Epoch [3/120] LR:[0.001] Loss/train:1.8870 Loss/val:5382.5084 Accuracy/val:2.07%


KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot as plt

# Load one ESC-50 clip, resample it to 8 kHz and plot its log-mel spectrogram.
waveform, sr = torchaudio.load('E:/dataset/ESC-50-master/audio/1-100032-A-0.wav')
resample = torchaudio.transforms.Resample(sr, 8000)
spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=8000,
    n_fft=512,
    win_length=20,
    hop_length=10,
    n_mels=128)

# Small offset that keeps log() finite where the mel energy is exactly zero.
eps = torch.Tensor([1e-6])

# torchaudio.load returns a (channels, samples) tensor, so the sample limit
# must be applied to the second axis. The previous `waveform[:44100]` sliced
# the channel axis and therefore never truncated the clip.
# NOTE(review): 44100 samples is one second only if the file is 44.1 kHz — confirm.
out = spectrogram(resample(waveform[:, :44100]))
out += eps
out = out.log()

# out[0] is the first channel, laid out as (mel bins, frames).
plt.pcolormesh(out[0], cmap='gray')
plt.xlabel('frame')
plt.ylabel('mel bin')
plt.title('log-mel spectrogram')
plt.show()

In [None]:
import torch
import torchaudio
import librosa
import numpy as np

def get_spec(waveform, sampling_rate=24000, n_fft=512, window_length=20, hop_length=10):
	"""Split a waveform into overlapping, min-max normalised log-mel windows.

	The waveform is converted to a 128-band mel spectrogram, log-compressed,
	rescaled to [0, 1] over the whole clip, and cut into windows of 100 frames
	taken every 50 frames along axis 1 (the frame axis when `waveform` is 1-D).

	Args:
		waveform: Audio samples; anything `torch.Tensor(...)` accepts.
		sampling_rate: Passed to `MelSpectrogram` as `sample_rate`.
		n_fft: FFT size passed to `MelSpectrogram`.
		window_length: Passed unchanged as `win_length`, which torchaudio
			counts in samples (no millisecond conversion happens here).
		hop_length: Passed unchanged as `hop_length`, also in samples.

	Returns:
		A list of numpy arrays, each exactly 100 frames wide. Trailing frames
		that cannot fill a whole window are dropped so the list can always be
		stacked into one array; a clip shorter than 100 frames yields [].
	"""
	window_frames = 100
	stride_frames = 50
	eps = 1e-6  # keeps log() finite for zero-energy bins

	waveform = torch.Tensor(waveform)
	transform = torchaudio.transforms.MelSpectrogram(
		sample_rate=sampling_rate,
		n_fft=n_fft,
		win_length=window_length,
		hop_length=hop_length,
		n_mels=128)
	spec = np.log(transform(waveform).numpy() + eps)

	# Min-max normalise over the whole clip. A constant spectrogram (e.g.
	# digital silence) has zero range; map it to zeros instead of dividing by 0.
	x_min = spec.min()
	x_max = spec.max()
	value_range = x_max - x_min
	if value_range == 0:
		spec = np.zeros_like(spec)
	else:
		spec = (spec - x_min) / value_range

	# Only start a window where a full `window_frames` slice still fits.
	specs = []
	for start in range(0, spec.shape[1] - window_frames + 1, stride_frames):
		specs.append(spec[:, start:start + window_frames])
	return specs

def extract_spectrogram(values, clip, entries, sampling_rate=None):
	"""Append one record of stacked log-mel windows to `values` per entry.

	Two mel spectrograms of `clip` are computed, with window/hop sizes of
	20/10 and 40/20 (converted to samples as `size * sampling_rate / 1000`,
	i.e. treated as milliseconds). Each is log-compressed and cut into
	100-frame windows taken every 50 frames; the windows from both settings
	are collected into one list.

	Args:
		values: List that receives one dict per entry, with keys "values"
			(the stacked windows as a numpy array) and "target".
		clip: Audio samples; anything `torch.Tensor(...)` accepts.
		entries: Iterable of mappings that each provide a "target" key.
		sampling_rate: Sampling rate of `clip`. When omitted, the global
			`args.sampling_rate` is used, as the original code did.

	Returns:
		None. `values` is extended in place.
	"""
	entries = list(entries)
	if not entries:
		return

	if sampling_rate is None:
		# Backward-compatible fallback: requires an `args` object in scope.
		sampling_rate = args.sampling_rate

	window_sizes = [20, 40]
	hop_sizes = [10, 20]
	window_frames = 100
	stride_frames = 50
	eps = 1e-6  # keeps log() finite for zero-energy bins

	# The spectrograms depend only on `clip`, so compute them once rather
	# than once per entry.
	clip = torch.Tensor(clip)
	specs = []
	for window_size, hop_size in zip(window_sizes, hop_sizes):
		window_length = int(round(window_size * sampling_rate / 1000))
		hop_length = int(round(hop_size * sampling_rate / 1000))

		# NOTE(review): n_fft is fixed at 512, so the 40-unit window exceeds
		# it for sampling rates above 12.8 kHz — confirm torchaudio accepts that.
		transform = torchaudio.transforms.MelSpectrogram(
			sample_rate=sampling_rate,
			n_fft=512,
			win_length=window_length,
			hop_length=hop_length,
			n_mels=128)
		spec = np.log(transform(clip).numpy() + eps)

		# Keep full-width windows only, so np.array() below gets a regular shape.
		for start in range(0, spec.shape[1] - window_frames + 1, stride_frames):
			specs.append(spec[:, start:start + window_frames])

	for data in entries:
		# np.array() builds a fresh array per entry, so records share no storage.
		values.append({
			"values": np.array(specs),
			"target": data["target"],
		})

# Load a sample recording, resampled to 24 kHz by librosa.
clip, sr = librosa.load("d:\\code\\jupyter\\audio\\positive\\baby_cry_16bit_8k.wav", sr=24000)
# clip, sr = librosa.load("E:\\dataset\\bilibili\\cry2.m4s", sr=24000)
print(clip.shape, sr)
# Trim the clip to a whole multiple of 1000 samples.
clip = clip[:len(clip) // 1000 * 1000]
print(clip.shape)
# entries = audios.loc[audios["filename"]==audio].to_dict(orient="records")
values = get_spec(clip, sampling_rate=sr)
# Give each window a leading axis and cap the batch at 1000 windows.
values = [np.expand_dims(value, 0) for value in values[:1000]]
# Stack into one ndarray first: building a tensor straight from a Python list
# of ndarrays takes a much slower element-wise path.
values = torch.Tensor(np.array(values)).to(device)
print(values.size())

# NOTE(review): `model` and `device` come from the training cell; `model` is a
# ModelProxy there, so confirm it is in evaluation mode (dropout disabled)
# before trusting these predictions.
# Inference needs no gradients; skip building the autograd graph.
with torch.no_grad():
	predict = model(values).detach().cpu().numpy()
predict = [np.argmax(p) for p in predict]
print(predict)