In [8]:
import os
import torch
import numpy as np

from model.mobilenet_v3 import MobileNetV3
# from model.mobilenet_v2 import MobileNetV2
# from model.AudioClassifier import AudioClassifier

def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: array-like of scores (logits).
        axis: axis along which to normalise. ``None`` (the default) normalises
            over the whole array, exactly as before; pass ``-1`` to normalise
            every row of a batch independently.

    Returns:
        np.ndarray of the same shape as ``x`` whose entries sum to 1 along
        ``axis`` (or over the whole array when ``axis`` is ``None``).
    """
    x = np.asarray(x)
    # Subtracting the maximum does not change the result but keeps exp() from
    # overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims lets the denominator broadcast back against exp_x for any axis.
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = 2
model_file = f'./weights/class{num_classes}/epoch_070.pt'

# model = AudioClassifier(w=256, h=128, classes=21, num_conv_layers=3).to(device)
# model = MobileNetV2((1, 128, 100), 21).to(device)
model = MobileNetV3((1, 128, 256), num_classes, width_multiplier=1.0, dropout_rate=0).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved on a CUDA machine still loads when only the CPU is available.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model.load_state_dict(torch.load(model_file, map_location=device))
# Switch to evaluation mode before running inference.
model.eval()
print('done')


done


In [9]:
import librosa
from utils.audio import *

def predict(model, file, conf_threshold=0.6, sample_rate=24000):
    """Classify one audio file slice by slice and print the per-slice labels.

    The file is loaded with librosa, trimmed with ``wav_trim``, converted with
    ``wav_to_spectrogram`` and cut into slices with ``spectrogram_split``.
    Every slice is passed through ``model`` on its own and the softmax of the
    output gives that slice a confidence (top score) and a label (arg-max).

    Relies on the notebook-level ``device`` and ``softmax`` names.

    Args:
        model: network invoked as ``model(x)``, where ``x`` is one slice with
            two leading singleton dimensions added, moved to ``device``.
        file: path of the audio file to load.
        conf_threshold: slices whose top softmax score is below this value are
            labelled ``-1`` and omitted from the printed label list.
        sample_rate: sampling rate passed to ``librosa.load``.

    Returns:
        ``(cls, cnf)``: the most frequent label over all slices and the
        fraction of slices carrying it. The ``-1`` "low confidence" label takes
        part in this vote. ``(-1, 0.0)`` is returned when the file produces no
        slices.
    """
    wav, _ = librosa.load(file, sr=sample_rate)
    wav, _ = wav_trim(wav)
    spec = wav_to_spectrogram(torch.Tensor(wav), 512, 512, 256, 128)
    slices = spectrogram_split(spec.detach().cpu().numpy(), [256], [128], 256)

    result = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for piece in slices:
            x = torch.from_numpy(piece).unsqueeze(0).unsqueeze(0).to(device)
            result.append(softmax(model(x).detach().cpu().numpy()))

    confs = np.array([np.max(m) for m in result])
    label = np.array([np.argmax(m) for m in result], dtype=np.int64)
    # Mark slices the model is not sure about.
    label[confs < conf_threshold] = -1

    # Labels of the confidently classified slices only (used for the report).
    filtered_label = label[label >= 0]

    if label.size:
        # Count how often each label occurs and keep the most frequent one.
        values, counts = np.unique(label, return_counts=True)
        top = np.argmax(counts)
        cls = int(values[top])
        cnf = float(counts[top] / len(label))
    else:
        # No slices at all (e.g. the clip is empty after trimming): nothing to vote on.
        cls, cnf = -1, 0.0

    print(f'file:{file} slices: {len(slices)} confs len:{len(confs)} class:{filtered_label}')
    return cls, cnf

# Run the classifier on each numbered sample clip (1.wav .. 33.wav);
# numbers without a matching file on disk are skipped.
for i in range(1, 34):
    file = f'e:/dataset/BabyCryDetectorSamples/{i}.wav'
    if not os.path.exists(file):
        continue
    predict(model, file)

# predict(model, 'e:/dataset/testing/cry1.mp3')
# predict(model, 'e:/dataset/testing/cry2.mp3')
# predict(model, 'e:/dataset/testing/dogbark1.wav')
# predict(model, 'e:/dataset/testing/shortbark.wav')
# predict(model, 'e:/dataset/BabyCryDetectorSamples/30.wav')
# predict(model, 'e:/dataset/BabyCryDetectorSamples/31.wav')


file:e:/dataset/BabyCryDetectorSamples/1.wav slices: 16 confs len:16 class:[1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1]
file:e:/dataset/BabyCryDetectorSamples/2.wav slices: 8 confs len:8 class:[0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/3.wav slices: 5 confs len:5 class:[0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/4.wav slices: 17 confs len:17 class:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/5.wav slices: 12 confs len:12 class:[0 0 0 0 0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/6.wav slices: 30 confs len:30 class:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/7.wav slices: 26 confs len:26 class:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/8.wav slices: 21 confs len:21 class:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
file:e:/dataset/BabyCryDetectorSamples/9.wav slices: 28 confs len:28 class:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
test_file = 'e:/dataset/testing/cry2.mp3'

# Same preprocessing chain as predict(): load -> trim -> spectrogram -> split.
wav, sr = librosa.load(test_file, sr=24000)
# wav_trim's result is unpacked as a pair in predict(); keep only the waveform.
wav, _ = wav_trim(wav)
# Feed the trimmed waveform (not the file path) so the loaded audio is actually used,
# matching the wav_to_spectrogram call that runs successfully in predict().
spec = wav_to_spectrogram(torch.Tensor(wav), 512, 512, 256, 128)
slices = spectrogram_split(spec.detach().cpu().numpy(), [256], [128], 256)

print(len(slices))
# print(slices[0].shape)
# print(slices[1].shape)