From d2bd53d5ab8968584bfb65a835f8f3bab2715f18 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Wed, 19 Jun 2019 17:45:58 +0000
Subject: [PATCH 1/5] fixing var-seq-len rnn backward() operator

---
 src/operator/rnn-inl.h             | 18 +++++++++++++++---
 tests/python/gpu/test_gluon_gpu.py | 20 +++++++++++++++++---
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 1046f01cf6e2..328e28de8537 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -1583,8 +1583,11 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs,
   int dtype = in_types[rnn_enum::kData];
   int itype = dtype;
   if (param.use_sequence_length) {
-    itype = in_types[rnn_enum::kSequenceLength];
-    if (param.mode == rnn_enum::kLstm) itype -= 1;
+    size_t seq_len_input_idx = rnn_enum::kSequenceLength;
+    if (param.mode != rnn_enum::kLstm) {
+      seq_len_input_idx -= 1;
+    }
+    itype = in_types[seq_len_input_idx];
   }

   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
@@ -1649,7 +1652,7 @@ void RNNStatefulGradCompute(const OpStatePtr& state,
   // Hacky. This relies on fact that seq-len type is either the last input,
   // or we aren't using seq-len input and this type should be same as dtype.
   // Would prefer direct access to RNNParam object here but not sure how to get.
-  int itype = inputs[inputs.size()-1].type_flag_;
+  int itype = outputs[outputs.size()-1].type_flag_;

   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     MSHADOW_TYPE_SWITCH(itype, IType, {
@@ -1669,6 +1672,15 @@ void RNNStatefulGradCompute(const OpStatePtr& state,
         }
       }

+
+      if (param.use_sequence_length) {
+        size_t seq_len_input_idx = rnn_enum::kSequenceLength;
+        if (param.mode != rnn_enum::kLstm) {
+          seq_len_input_idx -= 1;
+        }
+        in_data.push_back(outputs[seq_len_input_idx]);
+      }
+
       op.Backward(ctx, out_grad, in_data, out_data, req, in_grad);
     });
   });
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index b60814a47a81..5361e2e02d1c 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -265,15 +265,29 @@ def forward(self, inpt, sequence_length):
     # TODO: figure out why int32 doesn't work here
     sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")

-    net_output = net(data, sequence_length=sequence_length).asnumpy()
-    ref_net_output = ref_net(data, sequence_length).asnumpy()
+    with autograd.record():
+        net_output = net(data.copy(), sequence_length=sequence_length.copy())
+        ref_net_output = ref_net(data.copy(), sequence_length.copy())
+
+    net_output_np = net_output.asnumpy()
+    ref_net_output_np = ref_net_output.asnumpy()
     sequence_length_np = sequence_length.asnumpy().astype("int32")

     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output[:sequence_length_np[b], b], ref_net_output[:sequence_length_np[b], b])
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output_np[:sequence_length_np[b], b])

+    # Now test backward
+    net_output.backward()
+    ref_net_output.backward()
+
+    for k in weights:
+        net_grad = net_params[k].grad()
+        ref_net_grad = ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].grad()
+        sys.stderr.write("checking gradient for {}\n".format(k))
+        assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
+                            rtol=1e-2, atol=1e-2)
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
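Note on the index arithmetic in patch 1: it relies on the RNN operator's input ordering, where `sequence_length` is the last input and only LSTM mode carries a cell-state input before it, so every other mode shifts the index down by one. Below is a minimal Python sketch of that lookup; the concrete enum values are assumptions written out for illustration (the real ones live in `rnn_enum` in `rnn-inl.h`).

```python
# Sketch of the sequence_length input lookup performed in C++ above.
# Assumed input ordering, mirroring rnn_enum: data=0, params=1, state=2,
# state_cell=3, sequence_length=4; non-LSTM modes carry no cell-state
# input, so sequence_length sits one slot earlier.
K_SEQUENCE_LENGTH = 4  # assumed value of rnn_enum::kSequenceLength


def seq_len_input_index(mode_is_lstm):
    idx = K_SEQUENCE_LENGTH
    if not mode_is_lstm:
        idx -= 1  # no kStateCell input, so the last input moves up by one
    return idx


assert seq_len_input_index(mode_is_lstm=True) == 4
assert seq_len_input_index(mode_is_lstm=False) == 3
```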
From f20e4d5b9556c07bee9a26c8cb3f81102b880eae Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Wed, 19 Jun 2019 23:55:14 +0000
Subject: [PATCH 2/5] updating var-length lstm to test backward pass

---
 tests/python/gpu/test_gluon_gpu.py | 45 +++++++++++++++---------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 5361e2e02d1c..59e58fb76488 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -227,19 +227,6 @@ def forward(self, inpt):


 def check_layer_bidirectional_varseqlen(size, in_size):
-    class RefBiLSTMVarSeqLen(gluon.Block):
-        def __init__(self, size, **kwargs):
-            super(RefBiLSTMVarSeqLen, self).__init__(**kwargs)
-            with self.name_scope():
-                self._lstm_fwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='l0')
-                self._lstm_bwd = gluon.rnn.LSTM(size, bidirectional=False, prefix='r0')
-
-        def forward(self, inpt, sequence_length):
-            fwd = self._lstm_fwd(inpt)
-            bwd_inpt = nd.SequenceReverse(inpt, sequence_length=sequence_length, use_sequence_length=True)
-            bwd = self._lstm_bwd(bwd_inpt)
-            bwd = nd.SequenceReverse(bwd, sequence_length=sequence_length, use_sequence_length=True)
-            return nd.concat(fwd, bwd, dim=2)
     weights = {}
     for d in ['l', 'r']:
         weights['lstm_{}0_i2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, in_size))
         weights['lstm_{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,))
         weights['lstm_{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,))

     net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True, prefix='lstm_')
-    ref_net = RefBiLSTMVarSeqLen(size, prefix='lstm_')
+    ref_net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=False, prefix='lstm_ref_')
     net.initialize()
     ref_net.initialize()
     net_params = net.collect_params()
     ref_net_params = ref_net.collect_params()
     for k in weights:
         net_params[k].set_data(weights[k])
-        ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].set_data(weights[k])
-
+        ref_net_params[k.replace("lstm_", "lstm_ref_")].set_data(weights[k])
     batch_size = 10
     num_timesteps = 11
     data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
+    data_np = data.asnumpy()

     # TODO: figure out why int32 doesn't work here
     sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")
+    sequence_length_np = sequence_length.asnumpy().astype("int32")

+    # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
+    # Because of that, we need to accumulate gradients in reference net.
+    for p in ref_net.collect_params().values():
+        p.grad_req = 'add'
+
+    ref_net_output = []
     with autograd.record():
         net_output = net(data.copy(), sequence_length=sequence_length.copy())
-        ref_net_output = ref_net(data.copy(), sequence_length.copy())
+
+    for b in range(batch_size):
+        data_slice = mx.nd.array(data_np[:sequence_length_np[b], b, :]).reshape(sequence_length_np[b], 1, in_size)
+        ref_output_slice = ref_net(data_slice)
+        ref_net_output.append(ref_output_slice)

     net_output_np = net_output.asnumpy()
-    ref_net_output_np = ref_net_output.asnumpy()
-    sequence_length_np = sequence_length.asnumpy().astype("int32")

     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output_np[:sequence_length_np[b], b])
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1))

     # Now test backward
     net_output.backward()
-    ref_net_output.backward()
+
+    for ref_output_slice in ref_net_output:
+        ref_output_slice.backward()
+
+    ref_net_params = ref_net.collect_params()

     for k in weights:
         net_grad = net_params[k].grad()
-        ref_net_grad = ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].grad()
+        ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
         sys.stderr.write("checking gradient for {}\n".format(k))
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
                             rtol=1e-2, atol=1e-2)

+
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 def test_layer_bidirectional():
From 20b79e85fca19c7d9050a97551ff036e82e301ce Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 00:04:23 +0000
Subject: [PATCH 3/5] removing bit of dbg print to stderr i forgot to remove earlier

---
 tests/python/gpu/test_gluon_gpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 59e58fb76488..3562050249aa 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -285,7 +285,6 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     for k in weights:
         net_grad = net_params[k].grad()
         ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
-        sys.stderr.write("checking gradient for {}\n".format(k))
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
                             rtol=1e-2, atol=1e-2)

From 60f277e381a7b907e404e524e880cd94b28c4da4 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 00:21:26 +0000
Subject: [PATCH 4/5] resolving TODO about using int32 for sequence_length

---
 tests/python/gpu/test_gluon_gpu.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 3562050249aa..8b31c753194e 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -249,8 +249,7 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
     data_np = data.asnumpy()

-    # TODO: figure out why int32 doesn't work here
-    sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("float")
+    sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("int32")
     sequence_length_np = sequence_length.asnumpy().astype("int32")

     # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
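A quick note on the accumulation trick behind patch 2's comment: the reference net runs once per batch element, so its parameter gradients must sum across those calls instead of being overwritten. The following is a minimal sketch of the `grad_req = 'add'` mechanism, using a hypothetical `Dense` block as a stand-in for the reference LSTM.

```python
import mxnet as mx
from mxnet import autograd, gluon

net = gluon.nn.Dense(4)  # hypothetical stand-in for the reference LSTM
net.initialize()
for p in net.collect_params().values():
    p.grad_req = 'add'   # accumulate gradients instead of overwriting them

with autograd.record():
    out_a = net(mx.nd.ones((1, 3)))
    out_b = net(2 * mx.nd.ones((1, 3)))
out_a.backward()
out_b.backward()         # with the default 'write', this call would clobber the first gradient

# Reset the accumulated gradients before the parameters are reused.
for p in net.collect_params().values():
    p.zero_grad()
```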
From 25e8e5137021b49ae0de6a6fc1ac52545b8b0854 Mon Sep 17 00:00:00 2001
From: Stephen Rawls
Date: Thu, 20 Jun 2019 01:37:50 +0000
Subject: [PATCH 5/5] setting rtol and atol similar to other tests in this file

---
 tests/python/gpu/test_gluon_gpu.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index 8b31c753194e..fc650294a538 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -271,7 +271,8 @@ def check_layer_bidirectional_varseqlen(size, in_size):
     # TODO: test state return value as well output
     # Only compare the valid sections for each batch entry
     for b in range(batch_size):
-        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1))
+        assert_allclose(net_output_np[:sequence_length_np[b], b], ref_net_output[b].asnumpy().squeeze(1),
+                        rtol=1e-2, atol=1e-6)

     # Now test backward
     net_output.backward()
@@ -285,7 +286,7 @@ def check_layer_bidirectional_varseqlen(size, in_size):
         net_grad = net_params[k].grad()
         ref_net_grad = ref_net_params[k.replace('lstm_', 'lstm_ref_')].grad()
         assert_almost_equal(net_grad.asnumpy(), ref_net_grad.asnumpy(),
-                            rtol=1e-2, atol=1e-2)
+                            rtol=1e-2, atol=1e-6)


 @with_seed()
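Taken together, the series makes the following end-to-end pattern work: a bidirectional `gluon.rnn.LSTM` with `use_sequence_length=True` run forward and backward with per-example lengths. Below is a minimal sketch along the lines of the test above; the layer sizes are illustrative rather than taken from the patch, and a GPU context is assumed since the test lives in test_gluon_gpu.py.

```python
import mxnet as mx
from mxnet import autograd, gluon, nd

ctx = mx.gpu(0)
size, in_size = 7, 5                # illustrative sizes, not taken from the patch
num_timesteps, batch_size = 11, 10

net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True, prefix='lstm_')
net.initialize(ctx=ctx)

data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size), ctx=ctx)
sequence_length = nd.random.randint(1, num_timesteps + 1, shape=(batch_size,), ctx=ctx).astype("int32")

with autograd.record():
    out = net(data, sequence_length=sequence_length)  # forward with per-example lengths
out.backward()                                         # exercises the fixed backward() path

grads = {name: p.grad(ctx) for name, p in net.collect_params().items()}
```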