In [4]:
import os
import torch
import numpy as np

from model.mobilenet_v3 import MobileNetV3
# from model.mobilenet_v2 import MobileNetV2
from utils.modelproxy import ModelProxy

def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores (logits).
        axis: Axis along which to normalise. ``None`` (the default)
            normalises over every element of ``x`` at once, so the whole
            output sums to 1. Pass e.g. ``axis=-1`` to normalise each row
            of a batched input independently.

    Returns:
        numpy.ndarray with the same shape as ``x`` whose entries along
        ``axis`` are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    # keepdims=True lets the denominator broadcast back against exp_x.
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is also read by predict() when it moves input slices.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Binary classifier: two output classes.
num_classes = 2
# Checkpoint to restore (path is relative to the notebook's working directory).
model_file = './weights/mobilenetv3-binary/epoch_120.pt'
# NOTE(review): (1, 128, 256) is presumably the input shape
# (channels, mel bins, frames) -- confirm against MobileNetV3's signature.
model = ModelProxy(MobileNetV3((1, 128, 256), num_classes, width_multiplier=1.0, dropout_rate=0))
# Move the model to the target device, load the checkpoint, then switch to
# evaluation mode before running any predictions.
model.to(device)
model.load(model_file)
model.eval()

# Alternative architectures tried earlier (kept for reference, not executed):
# model = AudioClassifier(w=256, h=128, classes=21, num_conv_layers=3).to(device)
# model = MobileNetV2((1, 128, 100), 21).to(device)

# Report the parameter count as given by ModelProxy.count_parameters().
print(f'The model has {model.count_parameters():,} trainable parameters')
# Optional TorchScript export (disabled):
# model.save_torchscript('torchscript.pt', trace_input_shape=(1, 1, 128, 256))

In [2]:
import librosa
from utils.audio import *

def predict(model, file):
    """Classify one audio file slice by slice and print the per-slice labels.

    The file is loaded at 24 kHz, trimmed with ``wav_trim``, converted to a
    spectrogram with ``wav_to_spectrogram`` and cut into slices with
    ``spectrogram_split``. Each slice is run through ``model`` and the output
    is passed through ``softmax``. Slices whose top probability is below 0.6
    are labelled ``-1`` (uncertain).

    Args:
        model: Callable model; invoked as ``model(x)`` on a tensor built from
            one slice with two leading singleton dimensions added.
        file: Path of the audio file to classify.

    Returns:
        tuple ``(cls, cnf)``: ``cls`` is the most frequent per-slice label
        (``-1`` counts as a label, so an uncertain file yields ``-1``) and
        ``cnf`` is the fraction of slices carrying that label. When the file
        produces no slices the result is ``(-1, 0.0)``.
    """
    wav, _ = librosa.load(file, sr=24000)
    wav, _ = wav_trim(wav)
    spec = wav_to_spectrogram(torch.Tensor(wav), 512, 512, 256, 128)
    slices = spectrogram_split(spec.detach().cpu().numpy(), [256], [128], 256)

    # Inference only: skip autograd bookkeeping for the forward passes.
    probs = []
    with torch.no_grad():
        for piece in slices:
            x = torch.from_numpy(piece).unsqueeze(0).unsqueeze(0).to(device)
            probs.append(softmax(model(x).detach().cpu().numpy()))

    confs = np.array([np.max(p) for p in probs])
    # Explicit integer dtype so an empty result is still an integer array.
    label = np.array([np.argmax(p) for p in probs], dtype=np.intp)
    # Low-confidence slices are marked as "uncertain".
    label[confs < 0.6] = -1

    # Labels of the slices that passed the confidence threshold.
    filtered_label = label[label >= 0]

    if len(label) == 0:
        # Nothing to vote on (e.g. the clip is too short to yield a slice).
        cls, cnf = -1, 0.0
    else:
        # Count how often each label occurs and pick the most frequent one.
        # np.unique returns labels in ascending order, so on a tie the
        # smallest label wins.
        values, counts = np.unique(label, return_counts=True)
        top = int(np.argmax(counts))
        cls = int(values[top])
        cnf = float(counts[top] / len(label))

    print(f'file:{file} slices: {len(slices)} confs len:{len(confs)} class:{filtered_label}')
    return cls, cnf

def walk_dir(dir):
    """Run ``predict`` on every audio file found under ``dir``.

    Directories are traversed iteratively with an explicit stack (the most
    recently discovered sub-directory is visited next). A file is treated as
    audio when its extension, compared case-insensitively, is ``.wav``,
    ``.ogg`` or ``.mp3``.

    Args:
        dir: Root directory to scan.

    Uses the module-level ``model`` for prediction.
    """
    audio_extentions = ('.wav', '.ogg', '.mp3')
    pending = [dir]
    while pending:
        current = pending.pop()
        for entry in os.listdir(current):
            path = os.path.join(current, entry)
            if os.path.isdir(path):
                # Defer sub-directories; they are processed after this listing.
                pending.append(path)
            elif os.path.splitext(entry)[1].lower() in audio_extentions:
                # .lower() so files such as "CRY.WAV" are not silently skipped.
                predict(model, path)

# Classify every audio file under the dataset root; predict() prints one line
# per file.
# NOTE(review): hard-coded absolute local path -- adjust for your machine.
walk_dir('e:/dataset/baby_cry_detection_data')


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
# Scratch check: run the preprocessing pipeline on a single file and report
# how many slices it produces.
# NOTE(review): hard-coded absolute local path -- adjust for your machine.
test_file = 'e:/dataset/testing/cry2.mp3'

# Same preprocessing steps and helper call conventions as predict():
# wav_trim's result is unpacked, wav_to_spectrogram receives the waveform
# tensor, and the spectrogram is passed to spectrogram_split as a numpy array.
wav, sr = librosa.load(test_file, sr=24000)
wav, _ = wav_trim(wav)
spec = wav_to_spectrogram(torch.Tensor(wav), 512, 512, 256, 128)
slices = spectrogram_split(spec.detach().cpu().numpy(), [256], [128], 256)

print(len(slices))
# print(slices[0].shape)
# print(slices[1].shape)