From d2bd53d5ab8968584bfb65a835f8f3bab2715f18 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Wed, 19 Jun 2019 17:45:58 +0000
Subject: [PATCH 1/5] fixing var-seq-len rnn backward() operator

---
 src/operator/rnn-inl.h             | 18 +++++++++++++++---
 tests/python/gpu/test_gluon_gpu.py | 20 +++++++++++++++++---
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 1046f01cf6e2..328e28de8537 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -1583,8 +1583,11 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs,
   int dtype = in_types[rnn_enum::kData];
   int itype = dtype;
   if (param.use_sequence_length) {
-    itype = in_types[rnn_enum::kSequenceLength];
-    if (param.mode == rnn_enum::kLstm) itype -= 1;
+    size_t seq_len_input_idx = rnn_enum::kSequenceLength;
+    if (param.mode != rnn_enum::kLstm) {
+      seq_len_input_idx -= 1;
+    }
+    itype = in_types[seq_len_input_idx];
   }

   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
@@ -1649,7 +1652,7 @@ void RNNStatefulGradCompute(const OpStatePtr& state,
   // Hacky. This relies on fact that seq-len type is either the last input,
   // or we aren't using seq-len input and this type should be same as dtype.
   // Would prefer direct access to RNNParam object here but not sure how to get.
-  int itype = inputs[inputs.size()-1].type_flag_;
+  int itype = outputs[outputs.size()-1].type_flag_;

   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     MSHADOW_TYPE_SWITCH(itype, IType, {
@@ -1669,6 +1672,15 @@ void RNNStatefulGradCompute(const OpStatePtr& state,
         }
       }

+
+      if (param.use_sequence_length) {
+        size_t seq_len_input_idx = rnn_enum::kSequenceLength;
+        if (param.mode != rnn_enum::kLstm) {
+          seq_len_input_idx -= 1;
+        }
+        in_data.push_back(outputs[seq_len_input_idx]);
+      }
+
       op.Backward(ctx, out_grad, in_data, out_data, req, in_grad);
     });
   });
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index b60814a47a81..5361e2e02d1c 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -265,15 +265,29 @@ def forward(self, inpt, sequence_length):
     # TODO: figure out why int32 doesn't work here
     sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")

-    net_output = net(data, sequence_length=sequence_length).asnumpy()
-    ref_net_output = ref_net(data, sequence_length).asnumpy()
+    with autograd.record():
+        net_output = net(data.copy(), sequence_length=sequence_length.copy())
+        ref_net_output = ref_net(data.copy(), sequence_length.copy())
+
+    net_output_np = net_output.asnumpy()
+    ref_net_output_np = ref_net_output.asnumpy()
     sequence_length_np = sequence_length.asnumpy().astype("int32")

     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output[:sequence_length_np[b], b], ref_net_output[:sequence_length_np[b], b])
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output_np[:sequence_length_np[b], b])

+    # Now test backward
+    net_output.backward()
+    ref_net_output.backward()
+
+    for k in weights:
+        net_grad = net_params[k].grad()
+        ref_net_grad = ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].grad()
+        sys.stderr.write("checking gradient for {}\n".format(k))
+        assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
+                            rtol=1e-2, atol=1e-2)
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
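Note on the index arithmetic in patch 1: it relies on the RNN operator's input ordering, where `sequence_length` is the last input and only LSTM mode carries a cell-state input before it, so every other mode shifts the index down by one. Below is a minimal Python sketch of that lookup; the concrete enum values are assumptions written out for illustration (the real ones live in `rnn_enum` in `rnn-inl.h`).

```python
# Sketch of the sequence_length input lookup performed in C++ above.
# Assumed input ordering, mirroring rnn_enum: data=0, params=1, state=2,
# state_cell=3, sequence_length=4; non-LSTM modes carry no cell-state
# input, so sequence_length sits one slot earlier.
K_SEQUENCE_LENGTH = 4  # assumed value of rnn_enum::kSequenceLength


def seq_len_input_index(mode_is_lstm):
    idx = K_SEQUENCE_LENGTH
    if not mode_is_lstm:
        idx -= 1  # no kStateCell input, so the last input moves up by one
    return idx


assert seq_len_input_index(mode_is_lstm=True) == 4
assert seq_len_input_index(mode_is_lstm=False) == 3
```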
From f20e4d5b9556c07bee9a26c8cb3f81102b880eae Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Wed, 19 Jun 2019 23:55:14 +0000
Subject: [PATCH 2/5] updating var-length lstm to test backward pass

---
 tests/python/gpu/test_gluon_gpu.py | 45 +++++++++++++++---------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 5361e2e02d1c..59e58fb76488 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -227,19 +227,6 @@ def forward(self, inpt):


 def check_layer_bidirectional_varseqlen(size, in_size):
-    class RefBiLSTMVarSeqLen(gluon.Block):
-        def __init__(self, size, **kwargs):
-            super(RefBiLSTMVarSeqLen, self).__init__(**kwargs)
-            with self.name_scope():
-                self._lstm_fwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='l0')
-                self._lstm_bwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='r0')
-
-        def forward(self, inpt, sequence_length):
-            fwd = self._lstm_fwd(inpt)
-            bwd_inpt = nd.SequenceReverse(inpt, sequence_length=sequence_length, use_sequence_length=True)
-            bwd = self._lstm_bwd(bwd_inpt)
-            bwd = nd.SequenceReverse(bwd, sequence_length=sequence_length, use_sequence_length=True)
-            return nd.concat(fwd, bwd, dim=2)
     weights = {}
     for d in ['l', 'r']:
         weights['lstm_{}0_i2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, in_size))
         weights['lstm_{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,))
         weights['lstm_{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,))

     net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True, prefix='lstm_')
-    ref_net = RefBiLSTMVarSeqLen(size, prefix='lstm_')
+    ref_net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=False, prefix='lstm_ref_')
     net.initialize()
     ref_net.initialize()
     net_params = net.collect_params()
     ref_net_params = ref_net.collect_params()
     for k in weights:
         net_params[k].set_data(weights[k])
-        ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].set_data(weights[k])
-
+        ref_net_params[k.replace("lstm_", "lstm_ref_")].set_data(weights[k])
     batch_size = 10
     num_timesteps = 11
     data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
+    data_np = data.asnumpy()

     # TODO: figure out why int32 doesn't work here
     sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")
+    sequence_length_np = sequence_length.asnumpy().astype("int32")

+    # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
+    # Because of that, we need to accumulate gradients in reference net.
+    for p in ref_net.collect_params().values():
+        p.grad_req = 'add'
+
+    ref_net_output = []
     with autograd.record():
         net_output = net(data.copy(), sequence_length=sequence_length.copy())
-        ref_net_output = ref_net(data.copy(), sequence_length.copy())
+
+    for b in range(batch_size):
+        data_slice = mx.nd.array(data_np[:sequence_length_np[b], b, :]).reshape(sequence_length_np[b], 1, in_size)
+        ref_output_slice = ref_net(data_slice)
+        ref_net_output.append(ref_output_slice)

     net_output_np = net_output.asnumpy()
-    ref_net_output_np = ref_net_output.asnumpy()
-    sequence_length_np = sequence_length.asnumpy().astype("int32")

     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output_np[:sequence_length_np[b], b])
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1))

     # Now test backward
     net_output.backward()
-    ref_net_output.backward()
+
+    for ref_output_slice in ref_net_output:
+        ref_output_slice.backward()
+
+    ref_net_params = ref_net.collect_params()

     for k in weights:
         net_grad = net_params[k].grad()
-        ref_net_grad = ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].grad()
+        ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
         sys.stderr.write("checking gradient for {}\n".format(k))
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
                             rtol=1e-2, atol=1e-2)

+
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 def test_layer_bidirectional():
From 20b79e85fca19c7d9050a97551ff036e82e301ce Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 00:04:23 +0000
Subject: [PATCH 3/5] removing bit of dbg print to stderr i forgot to remove earlier

---
 tests/python/gpu/test_gluon_gpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 59e58fb76488..3562050249aa 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -285,7 +285,6 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     for k in weights:
         net_grad = net_params[k].grad()
         ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
-        sys.stderr.write("checking gradient for {}\n".format(k))
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
                             rtol=1e-2, atol=1e-2)

From 60f277e381a7b907e404e524e880cd94b28c4da4 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 00:21:26 +0000
Subject: [PATCH 4/5] resolving TODO about using int32 for sequence_length

---
 tests/python/gpu/test_gluon_gpu.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 3562050249aa..8b31c753194e 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -249,8 +249,7 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
     data_np = data.asnumpy()

-    # TODO: figure out why int32 doesn't work here
-    sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")
+    sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("int32")
     sequence_length_np = sequence_length.asnumpy().astype("int32")

     # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
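A quick note on the accumulation trick behind patch 2's comment: the reference net runs once per batch element, so its parameter gradients must sum across those calls instead of being overwritten. The following is a minimal sketch of the `grad_req = 'add'` mechanism, using a hypothetical `Dense` block as a stand-in for the reference LSTM.

```python
import mxnet as mx
from mxnet import autograd, gluon

net = gluon.nn.Dense(4)  # hypothetical stand-in for the reference LSTM
net.initialize()
for p in net.collect_params().values():
    p.grad_req = 'add'   # accumulate gradients instead of overwriting them

with autograd.record():
    out_a = net(mx.nd.ones((1, 3)))
    out_b = net(2 * mx.nd.ones((1, 3)))
out_a.backward()
out_b.backward()         # with the default 'write', this call would clobber the first gradient

# Reset the accumulated gradients before the parameters are reused.
for p in net.collect_params().values():
    p.zero_grad()
```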
From 25e8e5137021b49ae0de6a6fc1ac52545b8b0854 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 01:37:50 +0000
Subject: [PATCH 5/5] setting rtol and atol similar to other tests in this file

---
 tests/python/gpu/test_gluon_gpu.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 8b31c753194e..fc650294a538 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -271,7 +271,8 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1))
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1),
+                        rtol=1e-2, atol=1e-6)

     # Now test backward
     net_output.backward()
@@ -285,7 +286,7 @@ def check_layer_bidirectional_varseqlen(size, in_size):
         net_grad = net_params[k].grad()
         ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
-                            rtol=1e-2, atol=1e-2)
+                            rtol=1e-2, atol=1e-6)


 @with_seed()
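Taken together, the series makes the following end-to-end pattern work: a bidirectional `gluon.rnn.LSTM` with `use_sequence_length=True` run forward and backward with per-example lengths. Below is a minimal sketch along the lines of the test above; the layer sizes are illustrative rather than taken from the patch, and a GPU context is assumed since the test lives in test_gluon_gpu.py.

```python
import mxnet as mx
from mxnet import autograd, gluon, nd

ctx = mx.gpu(0)
size, in_size = 7, 5                # illustrative sizes, not taken from the patch
num_timesteps, batch_size = 11, 10

net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True, prefix='lstm_')
net.initialize(ctx=ctx)

data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size), ctx=ctx)
sequence_length = nd.random.randint(1, num_timesteps + 1, shape=(batch_size,), ctx=ctx).astype("int32")

with autograd.record():
    out = net(data, sequence_length=sequence_length)  # forward with per-example lengths
out.backward()                                         # exercises the fixed backward() path

grads = {name: p.grad(ctx) for name, p in net.collect_params().items()}
```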