In [47]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydub
import librosa

import multiprocessing
from joblib import Parallel, delayed

import IPython.display

In [2]:
# Root directory that holds the UrbanSound8K dataset and the cached .pkl/.npy files.
DATA_ROOT = '../data'

# CPU count reported by the system; used as n_jobs for joblib.Parallel in extract_features.
CPU_COUNT = multiprocessing.cpu_count()
print(CPU_COUNT)

4


## 特徴量データの作成

In [3]:
def load_audio(path, duration, sample_rate=22050):
    """Load an audio file as a fixed-length mono waveform scaled to [-1, 1].

    The clip is resampled to `sample_rate`, mixed down to one channel,
    converted to 16-bit samples and overlaid onto a silent segment of exactly
    `duration` milliseconds, so short clips are padded with silence and long
    clips are truncated.

    Parameters
    ----------
    path : str
        Path of the audio file to decode.
    duration : int
        Length of the returned clip in milliseconds.
    sample_rate : int, optional
        Target sampling rate in Hz (default 22050).

    Returns
    -------
    numpy.ndarray or None
        1-D float array of `duration / 1000 * sample_rate` samples, or None
        when the file could not be read or decoded.
    """
    # Silent canvas that fixes the output length.
    canvas = pydub.AudioSegment.silent(duration=duration, frame_rate=sample_rate)
    try:
        clip = (pydub.AudioSegment.from_file(path)
                .set_frame_rate(sample_rate)
                .set_channels(1)
                # Force 16-bit samples so the int16 view below is always valid.
                .set_sample_width(2))
        audio = canvas.overlay(clip)[:duration]
    except Exception:
        # Best effort: unreadable files are reported to the caller as None.
        # (Unlike a bare `except:`, this lets KeyboardInterrupt propagate.)
        return None
    # Signed 16-bit samples: -32768 .. 32767
    samples = np.frombuffer(audio.raw_data, dtype='int16')
    # Shift by half a step so the asymmetric int16 range maps exactly onto [-1, +1].
    return (samples + 0.5) / (32767 + 0.5)

# Smoke test on a single clip. The path must point at a file: a directory
# always makes load_audio return None, so nothing would ever be printed.
raw = load_audio(os.path.join(DATA_ROOT, 'UrbanSound8K', 'audio', 'fold5', '100032-3-0-0.wav'), 4000)
if raw is not None:
    # 4000 ms = 4 s
    # 4 s x 22050 samples/s = 88200 samples
    print(raw.shape, np.min(raw), np.max(raw))

In [4]:
def load_urbansound():
    """Load raw audio and metadata from the UrbanSound8K dataset.

    Results are cached under DATA_ROOT ('urban_meta.pkl', 'urban_audio.npy').
    A cache is reused only when it holds one metadata row per audio row;
    otherwise it is rebuilt from the dataset.

    Returns
    -------
    rows_meta : pandas.DataFrame
        One row per successfully loaded clip with columns 'filename', 'fold',
        'category' and 'category_name'. Row i describes rows_audio[i].
    rows_audio : numpy.ndarray
        2-D array (clips x samples) of 4-second waveforms from load_audio.

    Raises
    ------
    ValueError
        If not a single clip could be loaded.
    """
    meta_cache = os.path.join(DATA_ROOT, 'urban_meta.pkl')
    audio_cache = os.path.join(DATA_ROOT, 'urban_audio.npy')
    if os.path.isfile(meta_cache) and os.path.isfile(audio_cache):
        # NOTE: read_pickle can execute arbitrary code; only load caches you created.
        rows_meta = pd.read_pickle(meta_cache)
        rows_audio = np.load(audio_cache)
        if len(rows_meta) == len(rows_audio):
            return rows_meta, rows_audio
        # A cache with mismatched lengths has metadata that no longer lines
        # up with the audio rows, so it is discarded and rebuilt.
        print('Cached metadata/audio are inconsistent ({} vs {} rows); rebuilding'.format(
            len(rows_meta), len(rows_audio)))

    metadata = pd.read_csv(os.path.join(DATA_ROOT, 'UrbanSound8K', 'metadata', 'UrbanSound8K.csv'))

    batch_size = 1000  # only controls how often progress is reported
    meta_records = []
    audio_clips = []
    for batch_no, start in enumerate(range(0, len(metadata), batch_size), start=1):
        for _, row in metadata.iloc[start:start + batch_size].iterrows():
            filename = row['slice_file_name']
            fold = row['fold']
            audio_path = os.path.join(DATA_ROOT, 'UrbanSound8K', 'audio', 'fold%d' % fold, filename)
            audio = load_audio(audio_path, 4000)
            if audio is None:
                # Skip metadata too, so that both outputs stay row-aligned.
                continue
            meta_records.append({'filename': filename,
                                 'fold': fold,
                                 'category': row['classID'],
                                 'category_name': row['class']})
            audio_clips.append(audio)

        # for debug
        IPython.display.clear_output(wait=True)
        print('Loaded batch {} ({} / {})'.format(
            batch_no, min(start + batch_size, len(metadata)), len(metadata)))

    if not audio_clips:
        raise ValueError('No audio clip could be loaded from {}'.format(
            os.path.join(DATA_ROOT, 'UrbanSound8K', 'audio')))

    rows_meta = pd.DataFrame(meta_records)
    rows_meta[['category', 'fold']] = rows_meta[['category', 'fold']].astype(int)
    rows_audio = np.vstack(audio_clips)

    # save to files
    rows_meta.to_pickle(meta_cache)
    np.save(audio_cache, rows_audio)

    return rows_meta, rows_audio

In [5]:
# Load (or read from the on-disk cache) the clip metadata and the raw waveforms.
urban_meta, urban_audio = load_urbansound()

In [6]:
# Sanity check: container types, and the (clips, samples) shape of the audio array.
print(type(urban_meta))
print(type(urban_audio), urban_audio.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'> (8170, 88200)


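- 1つの音声ファイルから、WINDOW_SIZE サンプル単位のセグメントをオーバーラップさせながら複数抽出する
- 1セグメントは41フレームから成る（フレーム間隔（hop）は512サンプル ≈ 23ms）
- FRAMES_PER_SEGMENTで1を引くのはなぜ？ → librosa の melspectrogram は既定で信号の両端をパディングする（`center=True`）ため、フレーム数は 1 + サンプル数 // hop になる。41フレームを得るには 512 x 40 サンプルで足りる
- 1セグメント = 512 x 40 = 20480 samples ≈ 930ms
- オーバーラップはセグメントの半分（STEP_SIZE = 10240 samples）
- 1セグメント（20480 samples）を512hopでSTFTするので、melspecの長さは 1 + 20480 / 512 = 41 フレーム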

In [55]:
def extract_segments(clip, filename, fold, category, category_name, frames):
    """Cut a clip into half-overlapping segments and compute log-mel features.

    The clip is peak-normalised, then windows of 512 * (frames - 1) samples
    are taken every half window. Each window is turned into a 60-band
    log-mel spectrogram, flattened frame by frame into one row. Windows whose
    mean level is not above -70 dB are treated as silence and dropped.

    Parameters
    ----------
    clip : numpy.ndarray
        1-D waveform.
    filename, fold, category, category_name
        Metadata copied into every output row.
    frames : int
        Number of spectrogram frames per segment (must be >= 2).

    Returns
    -------
    pandas.DataFrame
        One row per kept segment: the metadata, 's_begin' / 's_end' sample
        offsets and float32 columns 'logspec_b{band}_f{frame}'. The frame has
        zero rows when the clip is too short or entirely silent.

    Raises
    ------
    ValueError
        If `frames` is smaller than 2 (the window would be empty).
    """
    if frames < 2:
        raise ValueError('frames must be at least 2, got {}'.format(frames))

    HOP_LENGTH = 512  # ~23 ms per frame at 22050 Hz
    # One hop fewer than the requested frame count: the spectrogram of a
    # 512 * 40 sample window has 41 frames (see the f0..f40 column names).
    FRAMES_PER_SEGMENT = frames - 1
    WINDOW_SIZE = HOP_LENGTH * FRAMES_PER_SEGMENT  # 512 * 40 = 20480 samples ~= 930 ms
    STEP_SIZE = WINDOW_SIZE // 2  # 512 * 20 = 10240
    BANDS = 60
    SILENCE_THRESHOLD_DB = -70.0

    clip = np.asarray(clip)
    # Peak-normalise; an empty or all-zero clip is left untouched instead of
    # being multiplied by inf (which would turn it into NaNs).
    peak = np.max(np.abs(clip)) if len(clip) else 0.0
    if peak > 0:
        clip = clip * (1 / peak)

    begins = []
    rows = []
    for begin in range(0, len(clip) - WINDOW_SIZE + 1, STEP_SIZE):
        signal = clip[begin:begin + WINDOW_SIZE]

        melspec = librosa.feature.melspectrogram(y=signal, sr=22050, n_fft=1024,
                                                 hop_length=HOP_LENGTH, n_mels=BANDS)
        # to dB (power_to_db is the successor of the removed librosa.logamplitude)
        logspec = librosa.power_to_db(melspec)
        # (60, 41) => (41, 60) => (2460,): frame-major, so index i is band i % 60 of frame i // 60
        row = logspec.T.flatten().astype('float32')
        # Keep the segment only if it is not silent.
        if np.mean(row) > SILENCE_THRESHOLD_DB:
            begins.append(begin)
            rows.append(row)

    if rows:
        spec_values = np.vstack(rows)
    else:
        spec_values = np.empty((0, BANDS * frames), dtype='float32')

    # The spectrogram was flattened to 1-D, so encode band and frame in the column name.
    columns = ['logspec_b{}_f{}'.format(i % BANDS, i // BANDS)
               for i in range(spec_values.shape[1])]
    logspec_frame = pd.DataFrame(spec_values, columns=columns)

    begins = np.asarray(begins, dtype='int64')
    segment_meta = pd.DataFrame({
        'filename': filename,
        'fold': fold,
        'category': category,
        'category_name': category_name,
        's_begin': begins,
        's_end': begins + WINDOW_SIZE})

    return pd.concat((segment_meta, logspec_frame), axis=1)

# Demo: extract the 41-frame segments of a single clip (row 1) and display them.
audio = urban_audio
meta = urban_meta
i = 1
segments = extract_segments(audio[i, :], meta.loc[i, 'filename'], meta.loc[i, 'fold'],
                            meta.loc[i, 'category'], meta.loc[i, 'category_name'], 41)
segments

Unnamed: 0,category,category_name,filename,fold,s_begin,s_end,logspec_b0_f0,logspec_b1_f0,logspec_b2_f0,logspec_b3_f0,...,logspec_b50_f40,logspec_b51_f40,logspec_b52_f40,logspec_b53_f40,logspec_b54_f40,logspec_b55_f40,logspec_b56_f40,logspec_b57_f40,logspec_b58_f40,logspec_b59_f40
0,2,children_playing,100263-2-0-117.wav,5,0,20480,6.414443,-0.361258,-2.975967,-4.922975,...,-26.464653,-27.23357,-28.729954,-30.092625,-30.949137,-31.129772,-26.349237,-28.812599,-31.269625,-32.441116
1,2,children_playing,100263-2-0-117.wav,5,10240,30720,11.452924,4.923715,2.62729,-1.936336,...,-27.970486,-27.33279,-31.293703,-32.892082,-33.766247,-31.744202,-32.746559,-31.5186,-34.717014,-35.787029
2,2,children_playing,100263-2-0-117.wav,5,20480,40960,12.906493,12.350329,3.822859,-0.204639,...,-36.578815,-33.66943,-34.304409,-34.944786,-41.183483,-39.317562,-38.574184,-35.847332,-38.859806,-37.057201
3,2,children_playing,100263-2-0-117.wav,5,30720,51200,8.772851,8.680677,0.762579,-8.624339,...,-34.325726,-36.923904,-27.000769,-26.86294,-33.240536,-30.955284,-30.780109,-31.558142,-32.331989,-38.378609
4,2,children_playing,100263-2-0-117.wav,5,40960,61440,14.067381,12.390487,4.13842,-2.183446,...,-37.453663,-29.349934,-21.445765,-30.398357,-29.598888,-31.015837,-32.468075,-29.275682,-29.938704,-34.791977
5,2,children_playing,100263-2-0-117.wav,5,51200,71680,18.085091,12.41004,-1.477143,-1.044332,...,-28.65773,-25.135166,-25.209436,-25.282755,-32.643047,-30.84001,-28.228683,-29.27854,-28.549952,-29.055716
6,2,children_playing,100263-2-0-117.wav,5,61440,81920,-0.235098,2.772634,2.479538,-3.50808,...,-29.365555,-26.208691,-27.423306,-30.855757,-33.454422,-30.381878,-31.832542,-37.492947,-34.879707,-33.678204


In [72]:
def extract_features(meta, audio, frames=41):
    """Extract log-mel segment features for every clip, in parallel.

    Parameters
    ----------
    meta : pandas.DataFrame
        Clip metadata with columns 'filename', 'fold', 'category' and
        'category_name'; row i (by position) must describe audio[i].
    audio : numpy.ndarray
        2-D array (clips x samples) of waveforms.
    frames : int, optional
        Spectrogram frames per segment, forwarded to extract_segments.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the extract_segments output of all clips, with a
        fresh 0..n-1 index. Empty when `audio` has no rows.
    """
    import warnings

    np.random.seed(20170927)

    total = len(audio)
    if len(meta) != total:
        # Rows are paired by position, so a length mismatch means at least
        # some clips would be labelled with another clip's metadata.
        warnings.warn('meta has {} rows but audio has {}; labels may be misaligned'.format(
            len(meta), total))

    batch_size = 100
    batch_frames = []
    # Extract features in parallel for the clips of each batch (start .. end).
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch_segments = Parallel(n_jobs=CPU_COUNT)(delayed(extract_segments)(
            audio[i, :],
            meta['filename'].iloc[i],
            meta['fold'].iloc[i],
            meta['category'].iloc[i],
            meta['category_name'].iloc[i],
            frames) for i in range(start, end))
        # Merge per batch (keeps the number of live frames small) and do the
        # final concat once, instead of re-copying the growing result each time.
        batch_frames.append(pd.concat(batch_segments, ignore_index=True))

        # for debug
        IPython.display.clear_output(wait=True)
        print('{} / {}'.format(end, total))

    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames, ignore_index=True)

In [80]:
# Compute the segment features once and keep them on disk; later runs just
# read the pickle back (only load pickles that you created yourself).
if not os.path.isfile(os.path.join(DATA_ROOT, 'urban_features.pkl')):
    urban_features = extract_features(urban_meta, urban_audio)
    urban_features.to_pickle(os.path.join(DATA_ROOT, 'urban_features.pkl'))
else:
    urban_features = pd.read_pickle(os.path.join(DATA_ROOT, 'urban_features.pkl'))

In [81]:
# Display the per-segment feature table (metadata columns followed by logspec_* columns).
urban_features

Unnamed: 0,category,category_name,filename,fold,s_begin,s_end,logspec_b0_f0,logspec_b1_f0,logspec_b2_f0,logspec_b3_f0,...,logspec_b50_f40,logspec_b51_f40,logspec_b52_f40,logspec_b53_f40,logspec_b54_f40,logspec_b55_f40,logspec_b56_f40,logspec_b57_f40,logspec_b58_f40,logspec_b59_f40
0,3,dog_bark,100032-3-0-0.wav,5,0,20480,-35.910690,-36.086739,-38.480804,-38.514992,...,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312,-54.884312
1,2,children_playing,100263-2-0-117.wav,5,0,20480,6.414443,-0.361258,-2.975967,-4.922975,...,-26.464653,-27.233570,-28.729954,-30.092625,-30.949137,-31.129772,-26.349237,-28.812599,-31.269625,-32.441116
2,2,children_playing,100263-2-0-117.wav,5,10240,30720,11.452924,4.923715,2.627290,-1.936336,...,-27.970486,-27.332790,-31.293703,-32.892082,-33.766247,-31.744202,-32.746559,-31.518600,-34.717014,-35.787029
3,2,children_playing,100263-2-0-117.wav,5,20480,40960,12.906493,12.350329,3.822859,-0.204639,...,-36.578815,-33.669430,-34.304409,-34.944786,-41.183483,-39.317562,-38.574184,-35.847332,-38.859806,-37.057201
4,2,children_playing,100263-2-0-117.wav,5,30720,51200,8.772851,8.680677,0.762579,-8.624339,...,-34.325726,-36.923904,-27.000769,-26.862940,-33.240536,-30.955284,-30.780109,-31.558142,-32.331989,-38.378609
5,2,children_playing,100263-2-0-117.wav,5,40960,61440,14.067381,12.390487,4.138420,-2.183446,...,-37.453663,-29.349934,-21.445765,-30.398357,-29.598888,-31.015837,-32.468075,-29.275682,-29.938704,-34.791977
6,2,children_playing,100263-2-0-117.wav,5,51200,71680,18.085091,12.410040,-1.477143,-1.044332,...,-28.657730,-25.135166,-25.209436,-25.282755,-32.643047,-30.840010,-28.228683,-29.278540,-28.549952,-29.055716
7,2,children_playing,100263-2-0-117.wav,5,61440,81920,-0.235098,2.772634,2.479538,-3.508080,...,-29.365555,-26.208691,-27.423306,-30.855757,-33.454422,-30.381878,-31.832542,-37.492947,-34.879707,-33.678204
8,2,children_playing,100263-2-0-121.wav,5,0,20480,5.768313,8.482841,6.417264,3.030607,...,-28.374607,-27.107002,-27.039322,-26.465120,-25.947153,-29.787098,-31.734148,-29.067501,-29.339235,-30.059938
9,2,children_playing,100263-2-0-121.wav,5,10240,30720,15.693308,9.420144,0.473559,-4.553211,...,-34.190639,-24.745256,-23.924131,-25.468491,-30.881887,-32.087357,-27.852497,-29.973022,-27.721975,-28.146887


## Datasetクラス

In [127]:
class Dataset:
    """Train / validation / test split of the segment features.

    Attributes
    ----------
    X, X_validation, X_test : numpy.ndarray
        Feature tensors reshaped to `shape` with a second channel appended
        that holds the delta of the first channel.
    y, y_validation, y_test : numpy.ndarray
        One-hot encoded labels, one row per segment.
    shape : tuple
        Target shape passed to the constructor.
    start, end : str
        First and last feature column names.
    """

    def __init__(self, features, fold_testing, fold_validation, shape):
        """Split `features` by fold and prepare normalised model inputs.

        Parameters
        ----------
        features : pandas.DataFrame
            Table with 'fold' and 'category' columns followed by the feature
            columns, which run from 'logspec_b0_f0' to the last column.
        fold_testing : int
            Fold used as the test set.
        fold_validation : int
            Fold used as the validation set; all other folds are training data.
        shape : tuple
            Shape each split is reshaped to, e.g. (-1, 60, 41, 1).
        """
        is_test = features['fold'] == fold_testing
        is_validation = features['fold'] == fold_validation
        train = features[~is_test & ~is_validation]
        validation = features[is_validation]
        test = features[is_test]
        print('train:', train.shape)
        print('validation:', validation.shape)
        print('test:', test.shape)

        self.shape = shape
        self.start = 'logspec_b0_f0'
        self.end = features.columns[-1]

        # Width of the one-hot encoding. Taking the largest label into account
        # keeps the encoding valid even when some class ids are missing.
        class_count = len(pd.unique(features['category']))
        if len(features):
            class_count = max(class_count, int(features['category'].max()) + 1)

        def split_xy(part):
            # Feature matrix and one-hot labels of one split.
            X_part = part.loc[:, self.start:self.end].values
            y_part = Dataset.to_one_hot(part['category'].values, class_count)
            return X_part, y_part

        X, y = split_xy(train)
        X_validation, y_validation = split_xy(validation)
        X_test, y_test = split_xy(test)

        # Standardise the mel-spectrogram dB values to zero mean and unit std.
        X_mean = np.mean(X)
        X_std = np.std(X)

        # The training (mean, std) is also applied to the validation and test data.
        X = (X - X_mean) / X_std
        X_validation = (X_validation - X_mean) / X_std
        X_test = (X_test - X_mean) / X_std

        # Fortran order: the flat columns are frame-major (band varies fastest),
        # so this maps column b + 60 * f to position [b, f] for shape (-1, 60, 41, 1).
        X = np.reshape(X, self.shape, order='F')
        X_validation = np.reshape(X_validation, self.shape, order='F')
        X_test = np.reshape(X_test, self.shape, order='F')
        print(X.shape)
        print(X_validation.shape)
        print(X_test.shape)

        # generate delta
        X = self.generate_deltas(X)
        X_validation = self.generate_deltas(X_validation)
        X_test = self.generate_deltas(X_test)
        print(X.shape)
        print(X_validation.shape)
        print(X_test.shape)

        self.X, self.y = X, y
        self.X_validation, self.y_validation = X_validation, y_validation
        self.X_test, self.y_test = X_test, y_test

    def generate_deltas(self, X):
        """Append a channel holding librosa's delta of channel 0.

        Parameters
        ----------
        X : numpy.ndarray
            4-D array whose last axis is the channel axis.

        Returns
        -------
        numpy.ndarray
            `X` with as many zero-initialised channels appended as it had;
            channel 1 of every sample is then filled with the delta of channel 0.
        """
        X = np.concatenate((X, np.zeros(np.shape(X))), axis=3)  # 3 = channel

        for i in range(len(X)):
            X[i, :, :, 1] = librosa.feature.delta(X[i, :, :, 0])

        return X

    @classmethod
    def to_one_hot(cls, labels, class_count):
        """One-hot encode integer labels.

        Parameters
        ----------
        labels : array-like of int
            Class index of every sample.
        class_count : int
            Number of columns of the encoding.

        Returns
        -------
        numpy.ndarray
            Float array of shape (len(labels), class_count) with a single 1 per row.
        """
        labels = np.asarray(labels).astype(int)
        one_hot_enc = np.zeros((len(labels), class_count))
        one_hot_enc[np.arange(len(labels)), labels] = 1
        return one_hot_enc

In [128]:
# Fold 1 is held out for testing, fold 10 for validation; all other folds are training data.
dataset = Dataset(urban_features, fold_testing=1, fold_validation=10, shape=(-1, 60, 41, 1))

train: (42546, 2466)
validation: (4816, 2466)
test: (5337, 2466)
(42546, 60, 41, 1)
(4816, 60, 41, 1)
(5337, 60, 41, 1)
(42546, 60, 41, 2)
(4816, 60, 41, 2)
(5337, 60, 41, 2)


In [129]:
# Training split: feature tensor shape and one-hot label shape.
print(dataset.X.shape, dataset.y.shape)

(42546, 60, 41, 2) (42546, 10)


In [130]:
# Validation split: feature tensor shape and one-hot label shape.
print(dataset.X_validation.shape, dataset.y_validation.shape)

(4816, 60, 41, 2) (4816, 10)


In [131]:
# Test split: feature tensor shape and one-hot label shape.
print(dataset.X_test.shape, dataset.y_test.shape)

(5337, 60, 41, 2) (5337, 10)
