# NOTE: scraped from a GitHub file view (original file: 256 lines, 190 sloc, 8.7 KB); page-navigation text removed.
from dataset.timit import TIMIT
from experiments import utils
import sys
from scikits.talkbox import segment_axis
import numpy as np
def build_aa_dataset(in_samples, out_samples, shift, n_train=100, n_valid=10):
    """Build train/valid sample-prediction sets from the 'aa' phone waveforms.

    Loads the per-phone 'aa' waveform sequences from a fixed ``.npy`` file,
    standardizes them with the mean/std computed over all loaded samples,
    cuts every sequence into frames of ``in_samples + out_samples`` samples
    and splits each frame into an input part and a target part.

    Parameters
    ----------
    in_samples : int
        Number of leading samples of each frame used as input.
    out_samples : int
        Number of trailing samples of each frame used as target.
    shift : int
        Offset between the starts of consecutive frames, i.e. frames overlap
        by ``in_samples + out_samples - shift`` samples.
    n_train : int
        Number of sequences, taken from the start of the file, used for
        training.
    n_valid : int
        Number of sequences, taken right after the training ones, used for
        validation.

    Returns
    -------
    tuple
        ``(train_x, train_y, valid_x, valid_y)``, each one wrapped by
        ``utils.shared_dataset``. Training frames are randomly permuted;
        validation frames keep their order.
    """
    aa_seqs = np.load('/data/lisa/data/timit/readable/per_phone/wav_aa.npy')

    # Concatenate once: the statistics are global over every loaded sample.
    all_samples = np.hstack(aa_seqs)
    mean = np.mean(all_samples)
    std = np.std(all_samples)
    print("mean:%f , std:%f" % (mean, std))

    # Kept as a plain list: the sequences may differ in length, and wrapping
    # ragged data with np.asarray is rejected by recent NumPy releases.
    norm_seqs = [(seq.astype('float32') - mean) / std for seq in aa_seqs]

    train_aa_seqs = norm_seqs[:n_train]
    valid_aa_seqs = norm_seqs[n_train:n_train + n_valid]
    print('train sequences: %d' % len(train_aa_seqs))
    print('valid sequences: %d' % len(valid_aa_seqs))

    frame_len = in_samples + out_samples
    overlap = frame_len - shift

    # One row per frame; only the training frames are shuffled.
    train_samples = np.vstack([segment_axis(wav_seq, frame_len, overlap)
                               for wav_seq in train_aa_seqs])
    train_samples = np.random.permutation(train_samples)
    valid_samples = np.vstack([segment_axis(wav_seq, frame_len, overlap)
                               for wav_seq in valid_aa_seqs])
    print('train examples: %s' % (train_samples.shape,))
    print('valid examples: %s' % (valid_samples.shape,))

    train_x = train_samples[:, :in_samples]
    train_y = train_samples[:, in_samples:]
    print('%s %s' % (train_x.shape, train_y.shape))

    valid_x = valid_samples[:, :in_samples]
    valid_y = valid_samples[:, in_samples:]
    print('%s %s' % (valid_x.shape, valid_y.shape))

    # NOTE(review): the scraped source ended this return with a dangling line
    # continuation after valid_x; valid_y is restored as the fourth element.
    return (utils.shared_dataset(train_x),
            utils.shared_dataset(train_y),
            utils.shared_dataset(valid_x),
            utils.shared_dataset(valid_y))
def build_one_user_data(dataset, in_samples, out_samples, shift,
                        win_width, shuffle, usr_id=0):
    """Build the training and validation sets for one training-set user.

    The ten consecutive 'train' entries starting at ``usr_id * 10`` are
    split 9/1: the first nine go to the training set and the tenth to the
    validation set. Both parts are framed by ``_build_frames_w_phn``.

    Returns ``(train_x, train_y1, train_y2, valid_x, valid_y1, valid_y2)``.
    """
    print("building datasets for user %d" % usr_id)
    subset = 'train'

    # Boundaries of this user's entries inside the 'train' arrays.
    first = usr_id * 10
    split = usr_id * 10 + 9
    last = (usr_id + 1) * 10

    train_x, train_y1, train_y2 = _build_frames_w_phn(
        dataset, subset,
        dataset.train_raw_wav[first:split],
        dataset.train_seq_to_phn[first:split],
        in_samples, out_samples, shift,
        win_width, shuffle)

    valid_x, valid_y1, valid_y2 = _build_frames_w_phn(
        dataset, subset,
        dataset.train_raw_wav[split:last],
        dataset.train_seq_to_phn[split:last],
        in_samples, out_samples, shift,
        win_width, shuffle)

    return train_x, train_y1, train_y2, valid_x, valid_y1, valid_y2
def build_data_sets(dataset, subset, n_spkr, n_utts,
                    in_samples, out_samples, shift,
                    win_width, shuffle):
    """Build the framed data for one subset of the TIMIT wrapper object.

    Takes the first ``n_utts * n_spkr`` entries of the ``<subset>_raw_wav``
    and ``<subset>_seq_to_phn`` attributes of ``dataset`` and hands them to
    ``_build_frames_w_phn``, whose result is returned unchanged.
    """
    print("building %s dataset..." % subset)

    n_seqs = n_utts * n_spkr
    wav_key = subset + "_raw_wav"
    phn_key = subset + "_seq_to_phn"
    wav_seqs = dataset.__dict__[wav_key][:n_seqs]
    seqs_to_phns = dataset.__dict__[phn_key][:n_seqs]

    return _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns,
                               in_samples, out_samples, shift,
                               win_width, shuffle)
def _build_frames_w_phn(dataset, subset, wav_seqs, seqs_to_phns,
                        in_samples, out_samples, shift,
                        win_width, shuffle):
    """Cut waveforms into frames and attach phone-window information.

    Every sequence is standardized with ``utils.standardize`` and segmented
    into frames of ``in_samples + out_samples`` samples whose starts are
    ``shift`` samples apart. Each frame is paired with a window of
    ``win_width`` consecutive phone codes (one-hot encoded over 39 labels)
    and with a flag telling whether the phone window moved at that frame.

    Parameters
    ----------
    dataset : object
        TIMIT wrapper; its ``<subset>_phn`` attribute is read here. Column 1
        of that table is used as the phone end sample and column 2 as the
        phone code.
    subset : str
        Attribute prefix selecting the phone table (e.g. ``'train'``).
    wav_seqs : sequence of arrays
        Raw waveforms, one per utterance.
    seqs_to_phns : sequence of pairs
        For each utterance, the (start, stop) row range of its phones in the
        ``<subset>_phn`` table.
    in_samples, out_samples : int
        Number of input and target waveform samples per frame.
    shift : int
        Offset between the starts of consecutive frames.
    win_width : int
        Number of consecutive phones in a phone window.
    shuffle : bool
        If true, the rows (frames) are randomly permuted.

    Returns
    -------
    tuple
        ``(data_x, data_y1, data_y2)``, each wrapped by
        ``utils.shared_dataset``: ``data_x`` holds the input samples followed
        by the flattened one-hot phone window, ``data_y1`` the target
        samples and ``data_y2`` the window-shift flag of each frame.
    """
    from pylearn2.format.target_format import OneHotFormatter

    # CAUTION!: the reduced phone set is used here. The full set could be
    # used too, but then phn+1 must be stored, because 0 would no longer
    # refer to 'h#' (no speech).
    n_phones = 39

    norm_seqs = utils.standardize(wav_seqs)
    frame_len = in_samples + out_samples
    overlap = frame_len - shift
    phn_table = dataset.__dict__[subset + "_phn"]

    samples = []
    seqs_phn_info = []
    seqs_phn_shift = []

    for wav_seq, phn_seq in zip(norm_seqs, seqs_to_phns):
        phn_start_end = phn_table[phn_seq[0]:phn_seq[1]]

        # Create a matrix with consecutive phone windows. Phones are padded
        # by h# (code 0), because each window is shifted once the samples of
        # its first phone have passed.
        # NOTE(review): the padding argument was missing from the scraped
        # source; win_width - 1 zeros yields exactly one window per phone
        # (this equals the single zero of the commented-out variant when
        # win_width == 2) -- confirm against the original file.
        phones = np.append(phn_start_end[:, 2].astype('int16'),
                           np.zeros((win_width - 1,), dtype='int16'))
        phn_windows = segment_axis(phones, win_width, win_width - 1)

        # Array with the ending sample of each phone. Copied so that the
        # assignment below does not write into the dataset's phone table.
        phn_ends = phn_start_end[:, 1].copy()
        # Extend the last phone till the end; this is not wrong as long as
        # the last phone is the no-speech phone (h#).
        phn_ends[-1] = wav_seq.shape[0] - 1

        # Map each sample to the index of its phone window: a running count
        # of the phone endings seen so far.
        phn_win_shift = np.zeros_like(wav_seq, dtype='int16')
        phn_win_shift[phn_ends] = 1
        phn_win = phn_win_shift.cumsum(dtype='int16')
        # Minor correction: the last sample is itself a phone ending and
        # would otherwise point one window past the last one.
        phn_win[-1] = phn_win[-2]

        # Segment samples into frames.
        samples.append(segment_axis(wav_seq, frame_len, overlap))

        # For phones only one value per frame matters. A new phone window
        # starts in a frame once all samples of the previous phone have
        # passed, so 'min' picks the current window of the frame.
        phn_frames = segment_axis(phn_win, frame_len, overlap).min(axis=1)
        # Replace the window index with the window itself.
        win_frames = phn_windows[phn_frames]

        # Window shift of each frame: difference to the previous frame's
        # window index, with 0 for the first frame.
        shift_frames_aux = np.roll(phn_frames, 1)
        shift_frames_aux[0] = 0
        shift_frames = phn_frames - shift_frames_aux
        # Mark the ending of the sequence - countering the first correction!
        shift_frames[-1] = 1

        # NOTE(review): these two appends were missing from the scraped
        # source; without them the accumulators stacked below stay empty.
        seqs_phn_info.append(win_frames)
        seqs_phn_shift.append(shift_frames)

    # Stack all data in one matrix, each row is a frame.
    samples_data = np.vstack(samples)
    phn_data = np.vstack(seqs_phn_info)
    shift_data = np.hstack(seqs_phn_shift)

    # Convert phone data to one-hot and flatten each window to one row.
    # NOTE(review): the second reshape argument was missing from the scraped
    # source; win_width * n_phones matches the column split of data_x below.
    fmt = OneHotFormatter(max_labels=n_phones, dtype='float32')
    phn_data = fmt.format(phn_data)
    phn_data = phn_data.reshape(phn_data.shape[0], win_width * n_phones)

    full_data = np.hstack([samples_data[:, :in_samples], phn_data,      # input
                           samples_data[:, in_samples:],                # out1
                           shift_data.reshape(shift_data.shape[0], 1)])  # out2

    if shuffle:
        full_data = np.random.permutation(full_data)

    n_inputs = in_samples + win_width * n_phones
    data_x = full_data[:, :n_inputs]
    data_y1 = full_data[:, n_inputs:-1]
    data_y2 = full_data[:, -1]

    print('Done')
    print('There are %d examples in %s set' % (data_x.shape[0], subset))
    print("--------------")
    print('data_x.shape %s' % (data_x.shape,))
    print('data_y1.shape %s' % (data_y1.shape,))

    # NOTE(review): the scraped source was cut after the first element of
    # this return; build_one_user_data unpacks three values from it.
    return (utils.shared_dataset(data_x),
            utils.shared_dataset(data_y1),
            utils.shared_dataset(data_y2))
if __name__ == "__main__":
    print('loading data...')

    # Create the wrapper object for the TIMIT dataset. Whatever it prints
    # while loading goes to 'timit.log'; the log file is closed and the
    # real stdout restored even if construction fails.
    save_stdout = sys.stdout
    with open('timit.log', 'w') as log_file:
        sys.stdout = log_file
        try:
            dataset = TIMIT()
        finally:
            sys.stdout = save_stdout

    # Each training example has 'in_samples' inputs and 'out_samples'
    # outputs, and examples are shifted by 'shift'.
    in_samples = 240
    out_samples = 1
    shift = 1
    win_width = 2
    shuffle = False

    build_one_user_data(dataset, in_samples, out_samples, shift,
                        win_width, shuffle)

    ## code for loading AA data
    # in_samples = 240
    # out_samples = 1
    # shift = 1
    # build_aa_dataset(in_samples, out_samples, shift)