[SYSTEMML-1408] Add padding parameters to max-pooling layers
This adds padding parameters to the max-pooling layers, along with the
associated tests.  Also, there are some general code formatting updates.

Closes #434.
dusenberrymw committed Mar 22, 2017
1 parent 16e9909 commit 15ccb7c
Showing 26 changed files with 537 additions and 247 deletions.
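For context, the sketch below is not part of this commit's diff; it is a rough illustration of how a pooling call could use the new padding parameters. It assumes max_pool::forward takes padh/padw arguments after the stride arguments, mirroring the conv layers, and the shapes and values are purely hypothetical.

# Hypothetical usage sketch: 2x2 max pooling with stride 2 and 1 pixel of zero
# padding, applied to N images stored row-wise as an (N, C*Hin*Win) matrix.
source("nn/layers/max_pool.dml") as max_pool

N = 4
C = 3
Hin = 28
Win = 28
X = rand(rows=N, cols=C*Hin*Win)
[out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf=2, Wf=2,
                                      strideh=2, stridew=2, padh=1, padw=1)
# With padding, the output dims follow (Hin + 2*padh - Hf) / strideh + 1,
# so Hout = Wout = 15 here.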
8 changes: 4 additions & 4 deletions scripts/staging/SystemML-NN/examples/get_mnist_data.sh
@@ -8,9 +8,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -23,6 +23,6 @@
DIR="$(cd "$(dirname "$0")" && pwd)"
mkdir -p $DIR/data/mnist/
cd $DIR/data/mnist/
curl -O http://pjreddie.com/media/files/mnist_train.csv
curl -O http://pjreddie.com/media/files/mnist_test.csv
curl -O https://pjreddie.com/media/files/mnist_train.csv
curl -O https://pjreddie.com/media/files/mnist_test.csv

4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/examples/mnist_lenet-predict.dml
@@ -41,7 +41,7 @@
# Outputs:
# - probs: File containing class probability predictions for each
# image.
#
#
# Data:
# The X file should contain images of handwritten digits,
# where each example is a 28x28 pixel image of grayscale values in
@@ -79,7 +79,7 @@ b3 = read($model_dir+"/b3")
W4 = read($model_dir+"/W4")
b4 = read($model_dir+"/b4")

# Predict classes
# Predict classes
probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)

# Output results
2 changes: 1 addition & 1 deletion scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml
@@ -41,7 +41,7 @@
# - W1, W2, W3, W4: Files containing the trained weights of the model.
# - b1, b2, b3, b4: Files containing the trained biases of the model.
# - accuracy: File containing the final accuracy on the test data.
#
#
# Data:
# The MNIST dataset contains labeled images of handwritten digits,
# where each example is a 28x28 pixel image of grayscale values in
41 changes: 27 additions & 14 deletions scripts/staging/SystemML-NN/examples/mnist_lenet.dml
@@ -114,13 +114,17 @@ train = function(matrix[double] X, matrix[double] y,

# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
[outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
[outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
[outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
[outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3 -> dropout
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
@@ -146,7 +150,8 @@ train = function(matrix[double] X, matrix[double] y,
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val))

# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}

# Compute data backward pass
@@ -160,13 +165,17 @@ train = function(matrix[double] X, matrix[double] y,
douta3 = relu::backward(doutr3, outa3)
[doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
## layer 2: conv2 -> relu2 -> pool2
doutr2 = max_pool::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
doutr2 = max_pool::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc2 = relu::backward(doutr2, outc2)
[doutp1, dW2, db2] = conv::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
[doutp1, dW2, db2] = conv::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
## layer 1: conv1 -> relu1 -> pool1
doutr1 = max_pool::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
doutr1 = max_pool::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc1 = relu::backward(doutr1, outc1)
[dX_batch, dW1, db1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
[dX_batch, dW1, db1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
Hf, Wf, stride, stride, pad, pad)

# Compute regularization backward pass
dW1_reg = l2_reg::backward(W1, lambda)
@@ -251,13 +260,17 @@ predict = function(matrix[double] X, int C, int Hin, int Win,

# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
[outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
[outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
[outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
[outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
@@ -281,7 +294,7 @@ eval = function(matrix[double] probs, matrix[double] y)
*
* Inputs:
* - probs: Class probabilities, of shape (N, K).
* - y: Target matrix, of shape (N,
* - y: Target matrix, of shape (N, K).
*
* Outputs:
* - loss: Scalar loss, of shape (1).
@@ -37,7 +37,7 @@
# Outputs:
# - probs: File containing class probability predictions for each
# image.
#
#
# Data:
# The X file should contain images of handwritten digits,
# where each example is a 28x28 pixel image of grayscale values in
@@ -66,7 +66,7 @@ X = X / 255.0
W = read($model_dir+"/W")
b = read($model_dir+"/b")

# Predict classes
# Predict classes
probs = mnist_softmax::predict(X, W, b)

# Output results
@@ -38,7 +38,7 @@
# - W: File containing the trained weights of the model.
# - b: File containing the trained biases of the model.
# - accuracy: File containing the final accuracy on the test data.
#
#
# Data:
# The MNIST dataset contains labeled images of handwritten digits,
# where each example is a 28x28 pixel image of grayscale values in
4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/nn/layers/affine.dml
@@ -69,7 +69,7 @@ init = function(int D, int M)
*
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
*
* We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
* which limits the magnification of inputs/gradients during
* forward/backward passes by scaling unit-Gaussian weights by a
@@ -84,6 +84,6 @@ init = function(int D, int M)
* - b: Biases vector, of shape (1, M).
*/
W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D)
b = matrix(0, rows=1, cols=M)
b = matrix(0, rows=1, cols=M)
}

12 changes: 6 additions & 6 deletions scripts/staging/SystemML-NN/nn/layers/conv.dml
@@ -69,7 +69,7 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
F = nrow(W)
Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)

# Create output volume
out = matrix(0, rows=N, cols=F*Hout*Wout)

@@ -124,7 +124,7 @@ backward = function(matrix[double] dout, int Hout, int Wout,
*/
N = nrow(X)
F = nrow(W)

# Create gradient volumes
# Note: Create convenience gradient volumes for dW and db that will
# allow for one gradient to be stored per example, allowing for
@@ -151,8 +151,8 @@

# Compute dX
dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout)
dXn_padded =
util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew, "add")
dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
strideh, stridew, "add")
dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape
}
@@ -170,7 +170,7 @@ init = function(int F, int C, int Hf, int Wf)
*
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
*
* We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
* which limits the magnification of inputs/gradients during
* forward/backward passes by scaling unit-Gaussian weights by a
@@ -187,6 +187,6 @@ init = function(int F, int C, int Hf, int Wf)
* - b: Biases vector, of shape (F, 1).
*/
W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
b = matrix(0, rows=F, cols=1)
b = matrix(0, rows=F, cols=1)
}

9 changes: 4 additions & 5 deletions scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
@@ -60,10 +60,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
*/
N = nrow(X)
F = nrow(W)
# TODO: We should eliminate this in a separate PR
Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)

# Convolution - built-in implementation
out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
stride=[strideh,stridew], padding=[padh,padw])
@@ -105,7 +104,7 @@ backward = function(matrix[double] dout, int Hout, int Wout,
*/
N = nrow(X)
F = nrow(W)

# Partial derivatives for convolution - built-in implementation
dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
@@ -123,7 +122,7 @@ init = function(int F, int C, int Hf, int Wf)
*
* Note: This is just a convenience function, and parameters
* may be initialized manually if needed.
*
*
* We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
* which limits the magnification of inputs/gradients during
* forward/backward passes by scaling unit-Gaussian weights by a
@@ -140,6 +139,6 @@ init = function(int F, int C, int Hf, int Wf)
* - b: Biases vector, of shape (F, 1).
*/
W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
b = matrix(0, rows=F, cols=1)
b = matrix(0, rows=F, cols=1)
}

4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml
@@ -26,7 +26,7 @@
* vectors of class probs.
* L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
forward = function(matrix[double] pred, matrix[double] y)
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for a cross-entropy loss function. The
@@ -50,7 +50,7 @@ forward = function(matrix[double] pred, matrix[double] y)
loss = sum(losses) / N
}

backward = function(matrix[double] pred, matrix[double] y)
backward = function(matrix[double] pred, matrix[double] y)
return (matrix[double] dpred) {
/*
* Computes the backward pass of a cross-entropy loss function. The
4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/nn/layers/dropout.dml
@@ -47,10 +47,10 @@ forward = function(matrix[double] X, double p, int seed)
# to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on
# the `rand` function that allows us to create a mask directly.
if (seed == -1) {
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p)
}
else {
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed)
}
out = X * mask / p
}
4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/nn/layers/l1_loss.dml
@@ -25,7 +25,7 @@
* L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j.
* L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
forward = function(matrix[double] pred, matrix[double] y)
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for an L1 loss function. The inputs
@@ -46,7 +46,7 @@ forward = function(matrix[double] pred, matrix[double] y)
loss = sum(losses) / N
}

backward = function(matrix[double] pred, matrix[double] y)
backward = function(matrix[double] pred, matrix[double] y)
return (matrix[double] dpred) {
/*
* Computes the backward pass for an L1 loss function. The inputs
2 changes: 1 addition & 1 deletion scripts/staging/SystemML-NN/nn/layers/l1_reg.dml
@@ -46,7 +46,7 @@ backward = function(matrix[double] X, double lambda) return (matrix[double] dX)
* - lambda: Regularization strength.
*
* Outputs:
* - dX: Gradient wrt X, of same shape as X.
* - dX: Gradient wrt X, of same shape as X.
*/
dX = lambda * sign(X)
}
4 changes: 2 additions & 2 deletions scripts/staging/SystemML-NN/nn/layers/l2_loss.dml
@@ -25,7 +25,7 @@
* L_i = (1/2) 2norm(pred_i - y_i)^2
* L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
forward = function(matrix[double] pred, matrix[double] y)
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for an L2 loss function. The inputs
@@ -46,7 +46,7 @@ forward = function(matrix[double] pred, matrix[double] y)
loss = sum(losses) / N
}

backward = function(matrix[double] pred, matrix[double] y)
backward = function(matrix[double] pred, matrix[double] y)
return (matrix[double] dpred) {
/*
* Computes the backward pass for an L2 loss function. The inputs
2 changes: 1 addition & 1 deletion scripts/staging/SystemML-NN/nn/layers/l2_reg.dml
@@ -46,7 +46,7 @@ backward = function(matrix[double] X, double lambda) return (matrix[double] dX)
* - lambda: Regularization strength.
*
* Outputs:
* - dX: Gradient wrt X, of same shape as X.
* - dX: Gradient wrt X, of same shape as X.
*/
dX = lambda * X
}
6 changes: 3 additions & 3 deletions scripts/staging/SystemML-NN/nn/layers/log_loss.dml
@@ -23,10 +23,10 @@
* Log loss function.
*
* L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i), where y_i is a
* binary target, and pred_i is a probability of y=1.
* binary target, and pred_i is a probability of y=1.
* L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples.
*/
forward = function(matrix[double] pred, matrix[double] y)
forward = function(matrix[double] pred, matrix[double] y)
return (double loss) {
/*
* Computes the forward pass for a log loss function.
@@ -48,7 +48,7 @@ forward = function(matrix[double] pred, matrix[double] y)
loss = sum(losses) / N
}

backward = function(matrix[double] pred, matrix[double] y)
backward = function(matrix[double] pred, matrix[double] y)
return (matrix[double] dpred) {
/*
* Computes the backward pass for a log loss function.
