/
timitlong.py
117 lines (98 loc) · 3.92 KB
/
timitlong.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
A Pylearn2 Dataset object for accessing TIMIT raw audio, standardized and
sliced into fixed-length input and target sequences.
"""
# Module metadata: authorship, copyright and licensing information.
__authors__ = 'David Krueger'
__copyright__ = "Copyright 2014, Universite de Montreal"
__credits__ = ["David Krueger"]
__license__ = "3-clause BSD"
__maintainer__ = "David Krueger"
############
import os
import os.path
import numpy
from pylearn2.utils import serial
import csv
from itertools import izip
import math
import time
import numpy as np
import theano.tensor as T
from theano import function
from pylearn2.datasets.dense_design_matrix import DefaultViewConverter
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
class TIMIT(DenseDesignMatrix):
    """
    A Pylearn2 Dataset object for accessing TIMIT w/preprocessing

    The dataset is built from every raw sequence whose length is between
    stop+1 and stop+window samples (inclusive). Each selected sequence is
    standardized with the dataset-wide mean and standard deviation and then
    sliced twice:

    * the input sequence is samples ``start .. stop-1`` (anything past
      ``stop`` is dropped);
    * the output sequence is samples ``start+frame_width .. stop``, i.e.
      the input with its first frame_width samples removed and one extra
      sample appended. Those first frame_width samples are only used to
      predict the 1st output (which is the frame_width+1st sample).
    """

    # Mean and standard deviation of the acoustic samples from the whole
    # dataset (train, valid, test).
    _mean = 0.0035805809921434142
    _std = 542.48824133746177

    def __init__(self,
                 which_set='train',
                 transformer=None,
                 start=0,
                 stop=45000,
                 window=2000,
                 frame_width=200,
                 preprocessor=None,
                 fit_preprocessor=False,
                 axes=('b', 0, 1, 'c'),
                 fit_test_preprocessor=False,
                 space_preserving=False):
        """
        Load one TIMIT split and turn it into a dense design matrix.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        transformer : object, optional
            Accepted and recorded in ``test_args`` but not used here.
        start : int
            The sample to start from for all input sequences
        stop : int
            The sample to stop at for all input and output sequences.
            Inputs end just before this index; outputs include it.
        window : int
            How many samples longer than ``stop`` a sequence may be and
            still be selected (see class docstring).
        frame_width : int
            Number of leading samples dropped from each output sequence.
        preprocessor : object, optional
            Accepted and recorded in ``test_args`` but not used here.
        fit_preprocessor : bool, optional
            Accepted and recorded in ``test_args`` but not used here.
        axes : tuple
            Axis ordering handed to ``DefaultViewConverter``.
        fit_test_preprocessor : bool, optional
            Accepted and recorded in ``test_args`` but not used here.
        space_preserving : bool, optional
            Accepted and recorded in ``test_args`` but not used here.

        Raises
        ------
        ValueError
            If `which_set` is not recognized, or if no sequence in the
            split has a length in the accepted range.
        """
        # Must remain the first statement: at this point locals() holds
        # exactly the constructor arguments (plus self, removed below).
        self.test_args = locals()
        del self.test_args['self']

        # Load data from disk
        self._load_data(which_set)
        self.frame_width = frame_width

        # Standardize and slice data
        features = []
        targets = []
        for sequence in self.raw_wav:
            # A sequence needs at least stop + 1 samples because the last
            # target is the sample at index ``stop``. Accepting a length of
            # exactly ``stop`` would produce a target one sample short and
            # therefore a ragged target array.
            if stop < len(sequence) <= stop + window:
                standardized = (sequence - TIMIT._mean) / TIMIT._std
                features.append(standardized[start:stop])
                targets.append(standardized[start + frame_width:stop + 1])

        if not features:
            raise ValueError(
                "No sequence in the '%s' set has a length between %d and %d "
                "samples." % (which_set, stop + 1, stop + window))

        features = numpy.array(features)
        targets = numpy.array(targets)
        self.raw_wav = features

        # Diagnostic output. Single-argument parenthesised prints produce
        # the same text under Python 2 and also parse under Python 3.
        print("self.raw_wav.shape, self.raw_wav[0].shape =  %s %s"
              % (self.raw_wav.shape, self.raw_wav[0].shape))
        images_shape = [int(stop - start), 1, 1]
        print(images_shape)
        print(targets.shape)
        print(features.shape)

        view_converter = DefaultViewConverter(shape=images_shape, axes=axes)
        super(TIMIT, self).__init__(X=features, y=targets,
                                    view_converter=view_converter)

    def _load_data(self, which_set):
        """
        Load the raw waveforms of one split into ``self.raw_wav``.

        The file read is ``<PYLEARN2_DATA_PATH>/timit/readable/
        <which_set>_x_raw.npy``.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"

        Raises
        ------
        ValueError
            If `which_set` is not one of the recognized values.
        KeyError
            If the PYLEARN2_DATA_PATH environment variable is not set.
        """
        # Check which_set
        if which_set not in ['train', 'valid', 'test']:
            raise ValueError(which_set + " is not a recognized value. " +
                             "Valid values are ['train', 'valid', 'test'].")

        # Create file paths
        timit_base_path = os.path.join(os.environ["PYLEARN2_DATA_PATH"],
                                       "timit/readable")
        raw_wav_path = os.path.join(timit_base_path, which_set + "_x_raw.npy")

        # Load data.
        self.raw_wav = serial.load(raw_wav_path)