In [1]:
from utils.audio import *

# def spectrogram_split(spec, spec_win, spec_hop, spec_resize=-1):
#     y, x = spec.shape
#     slice = []
#     n_frame = spec.shape[1]
#     for i in range(len(spec_win)):
#         win = spec_win[i]
#         hop = spec_hop[i]
#         # n_slice = (n_frame - win) // hop + 1
#         # remain  = n_frame - (n_slice * hop - hop + win)
#         # print(f'spec[{i}] win:{win} hop:{hop} n_slice:{n_slice} remain:{remain}')

#         start = 0
#         end = 0
#         while end < n_frame:
#             end = min(start + win, n_frame)
#             # merge next frame if next frame < win // 2
#             if n_frame - start - hop < win // 2:
#                 end = n_frame
            
#             # print(f'  slice = [{start} - {end}] = {end - start}')
#             img = Image.fromarray(spec[:, start:end] * 255)
#             if spec_resize != end - start:
#                 img = img.resize((spec_resize, y))
#             slice.append(np.array(img))
#             start += hop

#     return slice

# def wav_remove_silent(waveform, top_db=30, frame_length=2048, hop_length=512):
#     non_silent_intervals = librosa.effects.split(waveform, top_db=top_db, frame_length=frame_length, hop_length=hop_length)
#     return np.concatenate([waveform[start:end] for start, end in non_silent_intervals])

# def wav_trim(waveform, top_db=30, frame_length=2048, hop_length=512):
#     return librosa.effects.trim(waveform, top_db=top_db, frame_length=frame_length, hop_length=hop_length)

# def wav_to_spectrogram(wav, n_fft, win_length, hop_length, n_mels=128):
#     mel_spectrogram = torchaudio.transforms.MelSpectrogram(
#         n_fft=n_fft, 
#         win_length=win_length,
#         hop_length=hop_length,
#         n_mels=n_mels,
#     )

#     preprocess = torch.nn.Sequential(
#         mel_spectrogram,
#         Log2Transform(), 
#         NormalizeTransform(),
#     )

#     return preprocess(wav) 


In [None]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Settings consumed by extract_features_us8k / preprocess_us8k.
us8k_config = dict(
    # Folds 1..num_folds are each used once as the validation split.
    num_folds=1,
    # Sample rate passed to librosa.load when reading each clip.
    new_freq=24000,
    # Mel-spectrogram parameters forwarded to wav_to_spectrogram.
    n_fft=512,
    hop_length=256,
    win_length=512,
    n_mels=128,
    # Slicing parameters forwarded to spectrogram_split (paired by index).
    spec_win=[192, 256, 320],
    spec_hop=[96, 128, 160],
    spec_resize=256,
    # Metadata CSV, audio root (contains fold<N> sub-directories) and output directory.
    csv_file='E:/dataset/UrbanSound8K/metadata/UrbanSound8K.csv',
    data_dir='E:/dataset/UrbanSound8K/audio',
    store_dir='E:/dataset/out/us8k',
    # Alternative relative paths kept from the original cell (they point at an
    # ESC-50 checkout, so presumably a leftover -- confirm before using):
    #   csv_file='../data/ESC-50-master/meta/esc50.csv'
    #   data_dir='../data/ESC-50-master/audio'
    #   store_dir='./dataset'
    # Not read by the extraction code in this cell; same value as new_freq.
    sampling_rate=24000,
)

def extract_features_us8k(config, audios):
    """Turn every clip listed in an UrbanSound8K metadata frame into labelled spectrogram slices.

    Args:
        config: dict providing 'data_dir', 'new_freq', 'n_fft', 'win_length',
            'hop_length', 'n_mels', 'spec_win', 'spec_hop' and 'spec_resize'.
        audios: DataFrame of metadata rows; the columns 'slice_file_name',
            'fold' and 'classID' are read.

    Returns:
        A list of ``{'value': <slice>, 'target': <classID>}`` dicts, one per
        slice returned by ``spectrogram_split``.
    """
    values = []
    for file_name in tqdm(list(audios.slice_file_name.unique())):
        # First metadata row for this clip; it supplies both the fold
        # sub-directory and the class label.
        entry = audios.loc[audios["slice_file_name"] == file_name].to_dict(orient="records")[0]
        full_path = f"{config['data_dir']}/fold{entry['fold']}/{entry['slice_file_name']}"

        clip, _ = librosa.load(full_path, sr=config['new_freq'])
        clip, _ = wav_trim(clip)
        clip = wav_remove_silent(clip)
        spec = wav_to_spectrogram(torch.Tensor(clip), config['n_fft'], config['win_length'], config['hop_length'], config['n_mels'])
        slices = spectrogram_split(spec.numpy(), config['spec_win'], config['spec_hop'], config['spec_resize'])

        # The UrbanSound8K metadata has no 'filename' / 'target' columns (those
        # are ESC-50 names); the numeric label lives in 'classID'. The output
        # key stays 'target' so downstream consumers see the same record layout
        # as the ESC-50 pickles.
        target = entry['classID']
        values.extend({'value': v, 'target': target} for v in slices)
    return values

def preprocess_us8k(config):
    """Build and pickle the per-fold training/validation feature sets for UrbanSound8K.

    Reads the metadata CSV at ``config['csv_file']``, creates
    ``config['store_dir']`` if it does not exist, and for every fold number in
    ``1..config['num_folds']`` writes ``training128mel<fold>.pkl`` (all other
    folds) and ``validation128mel<fold>.pkl`` (that fold) into the store
    directory.
    """
    metadata = pd.read_csv(config['csv_file'], skipinitialspace=True)
    fold_count = config['num_folds']

    out_dir = config['store_dir']
    if not os.path.exists(out_dir):
        print(f"creating directory: {out_dir}")
        os.makedirs(out_dir)

    for fold in range(1, fold_count + 1):
        train_rows = metadata.loc[metadata["fold"] != fold]
        valid_rows = metadata.loc[metadata["fold"] == fold]
        print(f'Fold {fold} training size: {len(train_rows)} validation size: {len(valid_rows)}')

        # Training split first, then validation; each is extracted and written
        # before the next one starts.
        for prefix, rows in (("training", train_rows), ("validation", valid_rows)):
            features = extract_features_us8k(config, rows)
            with open(f"{out_dir}/{prefix}128mel{fold}.pkl", "wb") as handler:
                pkl.dump(features, handler, protocol=pkl.HIGHEST_PROTOCOL)

# Run the UrbanSound8K preprocessing; the pickles are written under us8k_config['store_dir'].
preprocess_us8k(us8k_config)

In [2]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Settings consumed by extract_features_esc50 / preprocess_esc50.
esc50_config = dict(
    # Folds 1..num_folds are each used once as the validation split.
    num_folds=1,
    # Sample rate passed to librosa.load when reading each clip.
    new_freq=24000,
    # Mel-spectrogram parameters forwarded to wav_to_spectrogram.
    n_fft=512,
    hop_length=256,
    win_length=512,
    n_mels=128,
    # Slicing parameters forwarded to spectrogram_split (paired by index).
    spec_win=[192, 256, 320],
    spec_hop=[96, 128, 160],
    spec_resize=256,
    # Metadata CSV, audio directory and output directory.
    csv_file='E:/dataset/ESC-50-master/meta/esc50.csv',
    data_dir='E:/dataset/ESC-50-master/audio',
    store_dir='E:/dataset/out/esc50pp',
    # Alternative relative layout kept from the original cell:
    #   csv_file='../data/ESC-50-master/meta/esc50.csv'
    #   data_dir='../data/ESC-50-master/audio'
    #   store_dir='./dataset'
    # Not read by the extraction code in this cell; same value as new_freq.
    sampling_rate=24000,
)

def extract_features_esc50(config, audios):
    """Turn every clip listed in an ESC-50 metadata frame into labelled spectrogram slices.

    Args:
        config: dict providing 'data_dir', 'new_freq', 'n_fft', 'win_length',
            'hop_length', 'n_mels', 'spec_win', 'spec_hop' and 'spec_resize'.
        audios: DataFrame of metadata rows; the columns 'filename' and
            'target' are read.

    Returns:
        A list of ``{'value': <slice>, 'target': <target>}`` dicts, one per
        slice returned by ``spectrogram_split``.
    """
    clip_names = list(audios.filename.unique())
    values = []
    for file_name in tqdm(clip_names):
        # Load at the configured rate, then strip silence before the
        # spectrogram is computed.
        samples, _ = librosa.load(f"{config['data_dir']}/{file_name}", sr=config['new_freq'])
        trimmed, _ = wav_trim(samples)
        voiced = wav_remove_silent(trimmed)

        mel = wav_to_spectrogram(torch.Tensor(voiced), config['n_fft'], config['win_length'], config['hop_length'], config['n_mels'])
        slices = spectrogram_split(mel.numpy(), config['spec_win'], config['spec_hop'], config['spec_resize'])

        # The label comes from the first metadata row matching this file name.
        matching_rows = audios.loc[audios["filename"] == file_name].to_dict(orient="records")
        target = matching_rows[0]['target']
        values.extend({'value': v, 'target': target} for v in slices)
    return values

def preprocess_esc50(config):
    """Build and pickle the per-fold training/validation feature sets for ESC-50.

    Reads the metadata CSV at ``config['csv_file']``, creates
    ``config['store_dir']`` if it does not exist, and for every fold number in
    ``1..config['num_folds']`` writes ``training128mel<fold>.pkl`` (all other
    folds) and ``validation128mel<fold>.pkl`` (that fold) into the store
    directory.
    """
    metadata = pd.read_csv(config['csv_file'], skipinitialspace=True)
    fold_count = config['num_folds']

    out_dir = config['store_dir']
    if not os.path.exists(out_dir):
        print(f"creating directory: {out_dir}")
        os.makedirs(out_dir)

    for fold in range(1, fold_count + 1):
        train_rows = metadata.loc[metadata["fold"] != fold]
        valid_rows = metadata.loc[metadata["fold"] == fold]

        # Training split first, then validation; each is extracted and written
        # before the next one starts.
        for prefix, rows in (("training", train_rows), ("validation", valid_rows)):
            features = extract_features_esc50(config, rows)
            with open(f"{out_dir}/{prefix}128mel{fold}.pkl", "wb") as handler:
                pkl.dump(features, handler, protocol=pkl.HIGHEST_PROTOCOL)

# Run the ESC-50 preprocessing; the pickles are written under esc50_config['store_dir'].
preprocess_esc50(esc50_config)


  0%|          | 0/1600 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

In [16]:
import os
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Settings consumed by file_to_spectrogram / preprocess_donateacry.
dnac_config = dict(
    # Sample rate passed to librosa.load when reading each clip.
    new_freq=24000,
    # Mel-spectrogram parameters forwarded to wav_to_spectrogram.
    n_fft=512,
    hop_length=256,
    win_length=512,
    n_mels=128,
    # Slicing parameters forwarded to spectrogram_split (paired by index).
    spec_win=[192, 256, 320],
    spec_hop=[96, 128, 160],
    spec_resize=256,
    # Dataset root (one sub-directory per class) and output directory.
    data_dir='E:/dataset/donate-a-cry-corpus',
    store_dir='E:/dataset/out/dnac',
    # Not read by the code in this cell; same value as new_freq.
    sampling_rate=24000,
)

def file_to_spectrogram(file, config):
    """Load one audio file and return its list of spectrogram slices.

    Args:
        file: path of the audio file, handed to ``librosa.load``.
        config: dict providing 'new_freq', 'n_fft', 'win_length',
            'hop_length', 'n_mels', 'spec_win', 'spec_hop' and 'spec_resize'.

    Returns:
        Whatever ``spectrogram_split`` returns for the file's spectrogram.
    """
    samples, _ = librosa.load(file, sr=config['new_freq'])
    # Strip silence before the spectrogram is computed.
    trimmed, _ = wav_trim(samples)
    voiced = wav_remove_silent(trimmed)
    mel = wav_to_spectrogram(
        torch.Tensor(voiced),
        config['n_fft'],
        config['win_length'],
        config['hop_length'],
        config['n_mels'],
    )
    return spectrogram_split(
        mel.numpy(),
        config['spec_win'],
        config['spec_hop'],
        config['spec_resize'],
    )

def preprocess_donateacry(config):
    """Convert a directory-per-class audio corpus into one pickle of labelled spectrogram slices.

    Every sub-directory of ``config['data_dir']`` is treated as one class. The
    integer label is the directory's position in a case-insensitive
    alphabetical ordering of the sub-directory names, so the label mapping is
    the same on every run and every platform. Only files whose name ends with
    'wav' are processed.

    NOTE(review): a sub-directory consumes a label even when it contains no
    matching files, so labels present in the output may not be contiguous.

    Args:
        config: dict providing 'data_dir', 'store_dir' and the keys read by
            ``file_to_spectrogram``.

    Side effects:
        Creates ``config['store_dir']`` if needed and writes
        ``donateacry.pkl`` there: a list of ``{'value': <slice>, 'target': <int>}``.
    """
    values = []

    if not os.path.exists(config['store_dir']):
        print(f"creating directory: {config['store_dir']}")
        os.makedirs(config['store_dir'])

    # os.listdir() returns names in arbitrary order, so deriving labels from
    # its raw order is not reproducible. Sort case-insensitively (with the raw
    # name as a tie-breaker) to pin the label of each class directory.
    class_dirs = sorted(
        (
            name
            for name in os.listdir(config['data_dir'])
            if os.path.isdir(f"{config['data_dir']}/{name}")
        ),
        key=lambda name: (name.casefold(), name),
    )

    for target, entry in enumerate(class_dirs):
        dirpath = f"{config['data_dir']}/{entry}"
        print(f"Processing {entry}...")

        # Sorted as well so the order of records in the pickle is stable.
        for file in tqdm(sorted(os.listdir(dirpath))):
            if not file.endswith('wav'):
                continue

            full_path = f"{dirpath}/{file}"
            slices = file_to_spectrogram(full_path, config)
            values.extend({'value': piece, 'target': target} for piece in slices)

    with open(f"{config['store_dir']}/donateacry.pkl", "wb") as f:
        pkl.dump(values, f, protocol=pkl.HIGHEST_PROTOCOL)

# Run the donate-a-cry preprocessing; donateacry.pkl is written under dnac_config['store_dir'].
preprocess_donateacry(dnac_config)

Processing baby-cry-detection-ogg...


  0%|          | 0/108 [00:00<?, ?it/s]

Processing BabyCryDetectorSamples...


  0%|          | 0/18 [00:00<?, ?it/s]

Processing belly_pain...


  0%|          | 0/16 [00:00<?, ?it/s]

Processing burping...


  0%|          | 0/8 [00:00<?, ?it/s]

Processing discomfort...


  0%|          | 0/27 [00:00<?, ?it/s]

Processing hungry...


  0%|          | 0/382 [00:00<?, ?it/s]

Processing tired...


  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
import os
import torch

from model.mobilenet_v3 import MobileNetV3
# from model.mobilenet_v2 import MobileNetV2
# from model.AudioClassifier import AudioClassifier


In [None]:
from collections import Counter

# Count how many times each element occurs.
# NOTE(review): `result` is not defined in any visible cell of this notebook;
# it must already exist in the kernel for this cell to run.
counter = Counter(result)

# Order the (element, count) pairs from most to least frequent.
sorted_items = counter.most_common()

# Print one "element: count" line per distinct element.
for item in sorted_items:
    print(f"{item[0]}: {item[1]}")