# wavescoredata のデータ形式を決める上で重要な実験．

In [2]:
import numpy as np
import h5py

In [3]:
# Load the memory_profiler IPython extension so that the %memit magic used in
# the following cells is available.
%load_ext memory_profiler

In [8]:
# Synthetic "wave" test signal: a 1-D array of 44100 * 1000 = 44,100,000
# standard-normal samples. %memit reports the memory cost of creating it.
# NOTE(review): 44100 presumably stands for a 44.1 kHz sample rate (i.e. 1000 s
# of audio) — confirm.
%memit wave = np.random.randn(44100 * 1000)

peak memory: 769.21 MiB, increment: 336.46 MiB


In [9]:
%memit score = np.random.randint(0, 2, [128, 44100 * 1000 // 512])

peak memory: 850.37 MiB, increment: 81.15 MiB


In [10]:
# Sanity check: show the dimensions of the two test arrays.
print(np.shape(wave), np.shape(score))

(44100000,) (128, 86132)


## (1) 辞書を pickle で保存．（今のwsdataはこれ）

In [11]:
import pickle

# Bundle both arrays into one dict and serialize it to a single pickle file.
dic = dict(wave=wave, score=score)
with open('test.pkl', 'wb') as f:
    pickle.dump(dic, f)

In [19]:
%%time
for i in range(10):
    with open('test.pkl', 'rb') as f:
        d = pickle.load(f)
    _ = np.mean(d['wave'][5000000:5000000+65536])

CPU times: user 1.28 s, sys: 5.29 s, total: 6.58 s
Wall time: 6.72 s


## (2) npz で保存．

In [14]:
# Write both arrays into a single .npz archive under the keys 'wave' and 'score'.
np.savez('test.npz', wave=wave, score=score)

In [20]:
%%time
for i in range(10):
    d = np.load('test.npz')
    _ = np.mean(d['wave'][5000000:5000000+65536])

CPU times: user 7.69 s, sys: 2.33 s, total: 10 s
Wall time: 10.2 s


## (2.1) npzで保存．読み出し時に mmap_mode='r' 指定．

In [21]:
%%time
for i in range(10):
    d = np.load('test.npz', mmap_mode='r')
    _ = np.mean(d['wave'][5000000:5000000+65536])

CPU times: user 7.77 s, sys: 2.28 s, total: 10 s
Wall time: 10.2 s


mmap_mode は少なくとも npz ファイルには無効のようだ． npy なら使えるらしいが，配列ごとにファイルが分かれるのは避けたい．

## (3) hdf5

In [23]:
import h5py

# Store each array as its own named dataset inside a single HDF5 file.
with h5py.File('test.h5', 'w') as f:
    for name, data in (('wave', wave), ('score', score)):
        f.create_dataset(name, data=data)

In [27]:
%%time
for i in range(10):
    with h5py.File('test.h5', 'r') as f:
        _ = np.mean(f['wave'][5000000:5000000+65536])

CPU times: user 13.5 ms, sys: 3.77 ms, total: 17.3 ms
Wall time: 17 ms


必要最小限にしかストレージにアクセスしないようだ．

In [30]:
%%time
for i in range(10):
    with h5py.File('test.h5', 'r') as f:
        _ = np.mean(f['score'][:,30000:30000+128])

CPU times: user 22.4 ms, sys: 24.7 ms, total: 47.1 ms
Wall time: 85.8 ms


In [31]:
%%time
for i in range(10):
    with h5py.File('test.h5', 'r') as f:
        _ = np.mean(f['score'][30:31, 30000:30000+16384])

CPU times: user 13.9 ms, sys: 3.36 ms, total: 17.2 ms
Wall time: 16.9 ms


ストレージ上に連続して並んでいる方がさらに速いっぽいのだが，そこまではあまり気にしなくて良さそう？

# 結果を踏まえて， `.h5` 形式を採用する．

In [8]:
# Fetch the 'wave' dataset handle and inspect its type.
# Note: the handle belongs to the file opened here, and the `with` block
# closes that file on exit, so `wave` can no longer be read from afterwards
# (only its Python type is still inspectable).
with h5py.File('test.h5', mode='r') as f:
    wave = f['wave']
print(type(wave))

<class 'h5py._hl.dataset.Dataset'>


In [9]:
# `wave` is an h5py Dataset handle whose file was already closed when the
# `with` block that produced it exited, so slicing it raises ValueError.
# The error is caught and printed so this pitfall stays documented while the
# notebook still runs from top to bottom. To actually read data, slice the
# dataset while its h5py.File is still open.
try:
    np.sum(wave[3000000:3000000+2000])
except ValueError as err:
    print('ValueError:', err)

ValueError: Not a dataset (not a dataset)