/
stft_aa_dataset.py
97 lines (79 loc) · 3.05 KB
/
stft_aa_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
A Pylearn2 Dataset object for accessing TIMIT with all the preprocessing that I want
"""
__authors__ = 'David Krueger'
__copyright__ = "Copyright 2014, Universite de Montreal"
__credits__ = ["David Krueger"]
__license__ = "3-clause BSD"
__maintainer__ = "David Krueger"
############
import numpy
np = numpy
from numpy import array as A
from pylearn2.datasets.dense_design_matrix import DefaultViewConverter
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.space import VectorSpace, Conv2DSpace, CompositeSpace
#from stft import stft
import scipy
from segmentaxis import segment_axis
def stft(x, fs, framesz, hop):
"""
x - signal
fs - sample rate
framesz - frame size
hop - hop size (frame size = overlap + hop size)
"""
framesamp = int(framesz*fs)
hopsamp = int(hop*fs)
w = scipy.hamming(framesamp)
X = scipy.array([scipy.fft(w*x[i:i+framesamp])
for i in range(0, len(x)-framesamp, hopsamp)])
return X
class AA(DenseDesignMatrix):
""""""
def __init__(self,
which_set='train',
seq_len=3010,
frame_size=320,
hop_size=32,
axes=('b', 0, 1, 'c'),
noutput_frames=1,
ninput_frames=9,
overlap=False, # not implemented yet
preprocessing=True):
self.__dict__.update(locals())
del self.self
# cut sequences down to seq_len
dat = np.load('/data/lisa_ubi/speech/onomatopoeia/dataset/per_phone_timit/wav_aa.npy')
lengths = [len(i) for i in dat]
daat = A([A(dat[i][:seq_len]) for i in range(len(dat)) if lengths[i] > seq_len])
if preprocessing:
self.mean = np.mean(daat)
daat -= self.mean
self.std = np.std(daat)
daat /= self.std
# Convert to Spectral
daat = A([np.abs(stft(ex,1,frame_size,hop_size)).flatten() for ex in daat])
#print daat.shape
#aat = A([arr.reshape(len(arr),-1) for arr in daat])
print daat.shape
# Stride and flatten
#print ninput_frames*frame_size
daat = segment_axis(daat, ninput_frames*frame_size, (ninput_frames-1)*frame_size, axis=1)
print daat.shape
if which_set == 'train':
daat = daat[:int(.8*len(daat))]
if which_set == 'valid':
daat = daat[int(.8*len(daat)):int(.9*len(daat))]
if which_set == 'test':
daat = daat[int(.9*len(daat)):]
features = daat[:,:-noutput_frames,:].reshape(-1,frame_size*ninput_frames)
targets = daat[:,noutput_frames:,-frame_size*noutput_frames:].reshape(-1,frame_size*noutput_frames)
# FIXME BELOW HERE for CNNs
#--------------------------
IMAGES_SHAPE = features.shape + (1,)
print targets.shape
print features.shape
X, y = features, targets
view_converter = DefaultViewConverter(shape=IMAGES_SHAPE, axes=axes)
super(AA, self).__init__(X=X, y=y, view_converter=view_converter)