In [1]:
import argparse
import glob
import os
import sys
from typing import Dict, List, Union
import tqdm

import numpy as np
import pandas as pd
import librosa
import soundfile as sf
import tqdm
from multiprocessing import Pool
from tqdm import tqdm
from multiprocessing import Pool
import json

In [6]:
class LogMelIntensityExtractor:
    def __init__(self, sr, nfft, n_mels, fmin = 0, fmax=24000, duration=5, resample=True, save_dir = '', dataset_name='original'):
        self.n_mels = n_mels
        self.nfft = nfft
        self.sr = sr
        self.melW = librosa.filters.mel(
            sr=self.sr,
            n_fft=nfft,
        )
        self.fmin = fmin
        self.fmax = self.sr//2
        self.duration = duration
        self.audio_length = self.duration * self.sr
        self.step = int(self.duration*0.666*self.sr)
        self.resample = resample
        self.save_dir = save_dir
        self.dataset_name = dataset_name
        
    def logmel(self, sig):
        melspec = librosa.feature.melspectrogram(
            y=sig, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax,
        )
        melspec = librosa.power_to_db(melspec).astype(np.float32)
        return melspec
    
    def transform(self, sig_path):
        sound, original_sr = sf.read(sig_path)
        if (self.resample==True) & (original_sr != self.sr):
            sound = librosa.resample(sound, original_sr, self.sr, res_type="kaiser_fast")
        sounds = [sound[i:i+self.audio_length] for i in range(0, max(1, len(sound) - self.audio_length + 1), self.step)]
        sounds[-1] = crop_or_pad(sounds[-1] , length=self.audio_length)
        images = [self.logmel(sound) for sound in sounds]
        images = np.stack(images)
        return images
    
    def save_soundfile(self, sig_path):
        print(sig_path)
        sound, original_sr = sf.read(sig_path)
        if len(sound.shape) > 1:
            sound = sound[:,0]
        if (self.resample==True) & (original_sr != self.sr):
            sound = librosa.resample(sound, original_sr, self.sr, res_type="kaiser_fast")
        print(sound.shape)
        sounds = [sound[i:i+self.audio_length] for i in range(0, max(1, len(sound) - self.audio_length + 1), self.step)]
        sounds[-1] = crop_or_pad(sounds[-1] , length=self.audio_length)
        print(sounds[0].shape)
        images = [self.logmel(sound) for sound in sounds]
        #print(images.shape)
        images = np.stack(images)
        print(images.shape)
        label = sig_path.split('/')[-2]
        file_id = sig_path.split('/')[-1].split('.')[0]
        #os.makedirs(os.path.join(self.save_dir, self.dataset_name, label), exist_ok=True)
        #np.save(os.path.join(self.save_dir, self.dataset_name, label, file_id + '.npy'), images)
        return 

def crop_or_pad(y, length, is_train=True, start=None):
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length - len(y))])
        
        n_repeats = length // len(y)
        epsilon = length % len(y)
        
        y = np.concatenate([y]*n_repeats + [y[:epsilon]])
        
    elif len(y) > length:
        if not is_train:
            start = start or 0
        else:
            start = start or np.random.randint(len(y) - length)

        y = y[start:start + length]

    return y

sig_path = "../data/train_audio/abethr1/XC128013.ogg"
sr = 32000
duration = 5
audio_length = sr*duration

f = LogMelIntensityExtractor(sr=sr, nfft=2048, n_mels=128)

In [7]:
f.save_soundfile(sig_path)

../data/train_audio/abethr1/XC128013.ogg
(1459513,)
(160000,)
(13, 128, 313)


In [11]:
file_path = "../dataset/logmel/abethr1/XC363503.npy"

sound = np.load(file_path)
print(sound.shape)
#sounds = [np.load(bird_samples[sample]) for sample in bird_samples]
#target = self.bird_label_dict[self.files[idx].split('/')[-2]]
#labels = np.zeros(len(self.bird_label_dict.keys()), dtype=float)
#labels[target] = 1.0
duration = 5
sound_size = int(duration//5)

start_idx = np.random.choice(sound.shape[0])
print(start_idx, sound_size)
if start_idx + sound_size > sound.shape[0]:
    pad_size = start_idx+sound_size-sound.shape[0]
    sound = np.concatenate([sound[start_idx:], np.zeros((pad_size, sound.shape[1], sound.shape[2]))])
    print(sound.shape)
else:
    sound = sound[start_idx:start_idx+sound_size, :, :]
    print(sound.shape)
sound = sound.transpose(0, 2, 1)
print(sound.shape)
sound = sound.reshape([-1, sound.shape[-1]]).T
print(sound.shape)

(5, 128, 313)
1 1
(1, 128, 313)
(1, 313, 128)
(128, 313)


In [19]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    mean = X.mean()
    std = X.std()
    X = (X - mean) / (std + eps)
    print(X.shape)
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = (V - _min) / (_max - _min)
        # V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)
    return V

In [15]:
from multiprocessing import Pool
import torch

In [32]:
sounds = np.load(file_path)
#target = self.bird_label_dict[file_path.split('/')[-2]]
#labels = np.zeros(len(self.bird_label_dict.keys()), dtype=float)
#labels[target] = 1.0
sound_size = 474 #474: データ数の最大値
if sound_size > sounds.shape[0]:
    pad_size = sound_size-sounds.shape[0]
    sounds = np.concatenate([sounds, np.zeros((pad_size, sounds.shape[1], sounds.shape[2]))])

p = Pool(10)
sounds = [sound for sound in sounds]
#if (np.random.rand() > 0):
#    noise_level = 0.05
#    noise = (np.random.sample((len(sounds), sounds[0].shape[0], sounds[0].shape[1])) + 9) * np.mean(sounds) * noise_level * (np.random.sample() + 0.3)
#    sounds = sounds + noise

sounds = p.map(mono_to_color, sounds)
sounds = np.array(sounds).astype(float)
sounds = np.stack([sounds, sounds, sounds], 1)
sounds = torch.from_numpy(sounds)

(128, 313)
(128, 313)
(128, 313)
(128, 313)
(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)
(128, 313)

(128, 313)(128, 313)(128, 313)


(128, 313)(128, 313)(128, 313)


(128, 313)
(128, 313)
(128, 313)
(128, 313)(128, 313)(128, 313)


(128, 313)(128, 313)(128, 313)


(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)

(128, 313)(128, 313)
(128, 313)

(128, 313)
(128, 313)(128, 313)

(128, 313)
(128, 313)(128, 313)

(128, 313)
(128, 313)
(128, 313)
(128, 313)
(128, 313)(128, 313)

(128, 313)(128, 313)
(128, 313)
(128, 313)
(128, 313)

(128, 313)
(128, 313)
(128, 313)(128, 313)

(128, 313)(128, 313)
(128, 313)

(128, 313)(128, 313)(128, 313)


(128, 313)(128, 313)

(128, 313)(128, 313)
(128, 313)
(128, 313)

(128, 313)(128, 313)

(128, 313)
(128, 313)(128, 313)

(128, 313)
(128, 313)(128, 313)

(128, 313)
(128, 313)(128, 313)(128, 313)
(128, 313)


(128, 313)
(128, 313)(128, 313)

(128, 313)

In [33]:
sounds.shape

torch.Size([474, 3, 128, 313])