Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/mxnet/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,7 @@ def ones(shape, ctx=None, dtype=mx_real_t):

Parameters
----------
shape : int or tuple of int
shape : int or tuple of int or list of int
The shape of the empty array.
ctx : Context, optional
An optional device context.
Expand Down
113 changes: 66 additions & 47 deletions src/operator/optimizer_op-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ struct SGDParam : public dmlc::Parameter<SGDParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(SGDParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the the correct use of .describe() ? Should it take the formal variable name or a common language string?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they use .describe to document other parameters.

DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -90,21 +92,23 @@ struct SGDMomParam : public dmlc::Parameter<SGDMomParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(SGDMomParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
DMLC_DECLARE_FIELD(momentum)
.set_default(0.0f)
.describe("momentum");
.describe("The decay rate of momentum estimates at each epoch.");
DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -159,27 +163,29 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(AdamParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
DMLC_DECLARE_FIELD(beta1)
.set_default(0.9f)
.describe("beta1");
.describe("The decay rate for the 1st moment estimates.");
DMLC_DECLARE_FIELD(beta2)
.set_default(0.999f)
.describe("beta2");
.describe("The decay rate for the 2nd moment estimates.");
DMLC_DECLARE_FIELD(epsilon)
.set_default(1e-8f)
.describe("epsilon");
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -233,24 +239,31 @@ struct RMSPropAlexParam : public dmlc::Parameter<RMSPropAlexParam> {
float clip_gradient;
float clip_weights;
DMLC_DECLARE_PARAMETER(RMSPropAlexParam) {
DMLC_DECLARE_FIELD(lr).describe("learning_rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f).describe("gamma1");
DMLC_DECLARE_FIELD(gamma2).set_default(0.9f).describe("gamma2");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f).describe("epsilon");
DMLC_DECLARE_FIELD(wd).set_default(0.0f).describe("weight decay");
DMLC_DECLARE_FIELD(lr)
.describe("Learning rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f)
.describe("Decay rate.");
DMLC_DECLARE_FIELD(gamma2).set_default(0.9f)
.describe("Decay rate.");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f)
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd).set_default(0.0f)
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
DMLC_DECLARE_FIELD(clip_weights)
.set_default(-1.0f)
.describe("If greater than 0, clip weights to "
"weights = max(min(weights, -clip_weights), clip_weights). "
"Otherwise turned off.");
.set_default(-1.0f)
.describe("Clip weights to the range of [-clip_weights, clip_weights]. "
"If clip_weights <= 0, weight clipping is turned off. "
"weights = max(min(weights, clip_weights), -clip_weights).");
}
};

Expand Down Expand Up @@ -320,23 +333,29 @@ struct RMSPropParam : public dmlc::Parameter<RMSPropParam> {
float clip_gradient;
float clip_weights;
DMLC_DECLARE_PARAMETER(RMSPropParam) {
DMLC_DECLARE_FIELD(lr).describe("learning_rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f).describe("gamma1");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f).describe("epsilon");
DMLC_DECLARE_FIELD(wd).set_default(0.0f).describe("weight decay");
DMLC_DECLARE_FIELD(lr)
.describe("Learning rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f)
.describe("The decay rate of momentum estimates.");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f)
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd).set_default(0.0f)
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
DMLC_DECLARE_FIELD(clip_weights)
.set_default(-1.0f)
.describe("If greater than 0, clip weights to "
"weights = max(min(weights, -clip_weights), clip_weights). "
"Otherwise turned off.");
.set_default(-1.0f)
.describe("Clip weights to the range of [-clip_weights, clip_weights]. "
"If clip_weights <= 0, weight clipping is turned off. "
"weights = max(min(weights, clip_weights), -clip_weights).");
}
};

Expand Down
64 changes: 53 additions & 11 deletions src/operator/optimizer_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,43 @@ DMLC_REGISTER_PARAMETER(RMSPropParam);
DMLC_REGISTER_PARAMETER(RMSPropAlexParam);

NNVM_REGISTER_OP(sgd_update)
.describe("Updater function for sgd optimizer")
.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer.

It updates the weights using::

weight = weight - learning_rate * gradient

)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SGDParam>)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
.set_attr<FCompute>("FCompute<cpu>", SGDUpdate<cpu>)
.add_argument("weight", "NDArray-or-Symbol", "Weight")
.add_argument("grad", "NDArray-or-Symbol", "gradient")
.add_argument("grad", "NDArray-or-Symbol", "Gradient")
.add_arguments(SGDParam::__FIELDS__());

NNVM_REGISTER_OP(sgd_mom_update)
.describe("Updater function for sgd optimizer")
.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer.

Momentum update has better convergence rates on neural networks. Mathematically it looks
like below:

.. math::

v_1 = -\alpha * \nabla J(W_0)\\
v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\
W_t = W_{t-1} + v_t

It updates the weights using::

v = momentum * v - learning_rate * gradient
weight += v

Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch.

)code" ADD_FILELINE)
.set_num_inputs(3)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SGDMomParam>)
Expand All @@ -45,7 +69,26 @@ NNVM_REGISTER_OP(sgd_mom_update)
.add_arguments(SGDMomParam::__FIELDS__());

NNVM_REGISTER_OP(adam_update)
.describe("Updater function for adam optimizer")
.describe(R"code(Update function for Adam optimizer. Adam is seen as a generalization
of AdaGrad.

Adam update consists of the following steps, where g represents gradient and m, v
are 1st and 2nd order moment estimates (mean and variance).

.. math::

g_t = \nabla J(W_{t-1})\\
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
W_t = W_{t-1} - \alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon }

It updates the weights using::

m = beta1*m + (1-beta1)*grad
v = beta2*v + (1-beta2)*(grad**2)
w += - learning_rate * m / (sqrt(v) + epsilon)

)code" ADD_FILELINE)
.set_num_inputs(4)
.set_num_outputs(1)
.set_attr_parser(ParamParser<AdamParam>)
Expand All @@ -63,10 +106,9 @@ NNVM_REGISTER_OP(adam_update)
.add_arguments(AdamParam::__FIELDS__());

NNVM_REGISTER_OP(rmsprop_update)
.describe("Updater function for RMSProp optimizer."
" The RMSProp code follows the version in"
" http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf "
"Tieleman & Hinton, 2012.")
.describe(R"code(Update function for RMSProp optimizer. The RMSProp code follows the version in
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf Tieleman & Hinton, 2012.
)code" ADD_FILELINE)
.set_num_inputs(3)
.set_num_outputs(1)
.set_attr_parser(ParamParser<RMSPropParam>)
Expand All @@ -83,9 +125,9 @@ NNVM_REGISTER_OP(rmsprop_update)
.add_arguments(RMSPropParam::__FIELDS__());

NNVM_REGISTER_OP(rmspropalex_update)
.describe("Updater function for RMSPropAlex optimizer."
" The RMSPropAlex code follows the version in"
" http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.")
.describe(R"code(Update function for RMSPropAlex optimizer. The RMSPropAlex code follows the version in
http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
)code" ADD_FILELINE)
.set_num_inputs(5)
.set_num_outputs(1)
.set_attr_parser(ParamParser<RMSPropAlexParam>)
Expand Down
16 changes: 11 additions & 5 deletions src/operator/tensor/broadcast_reduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,14 @@ struct ReduceAxesParam : public dmlc::Parameter<ReduceAxesParam> {
bool keepdims;
DMLC_DECLARE_PARAMETER(ReduceAxesParam) {
DMLC_DECLARE_FIELD(axis).set_default(TShape())
.describe("The axes to perform the reduction.");
.describe("The axis or axes along which to perform the reduction. "
"The default, `axis=()`, will compute over all elements into a "
"scalar array with shape `(1,)`.\n\nIf axis is int, "
"a reduction is performed on a particular axis.\n\n"
"If axis is a tuple of ints, a reduction is performed "
"on all the axes specified in the tuple.");
DMLC_DECLARE_FIELD(keepdims).set_default(false)
.describe("If true, the axes which are reduced are left "
.describe("If this is set to `True`, the reduced axes are left "
"in the result as dimension with size one.");
}
};
Expand All @@ -34,11 +39,12 @@ struct ReduceAxisParam : public dmlc::Parameter<ReduceAxisParam> {
bool keepdims;
DMLC_DECLARE_PARAMETER(ReduceAxisParam) {
DMLC_DECLARE_FIELD(axis).set_default(dmlc::optional<int>())
.describe("int or None. The axis to perform the reduction. "
.describe("The axis along which to perform the reduction. "
"Negative values mean indexing from right to left. "
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to mention this ? i do not see this mentioned for other parameters that have this behavior

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?

"If is `None`, a global reduction will be performed.");
"``Requires axis to be set as int, because global reduction "
"is not supported yet.``");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used for multiple ops. Are all of them not supporting None?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is for argmax,argmin and pick operator and these 3 don't support None yet.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it is only for argmax, argmin, and pick. These three does not support global reduction.

DMLC_DECLARE_FIELD(keepdims).set_default(false)
.describe("If true, the axis which is reduced is left "
.describe("If this is set to `True`, the reduced axis is left "
"in the result as dimension with size one.");
}
};
Expand Down
24 changes: 3 additions & 21 deletions src/operator/tensor/broadcast_reduce_op_value.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,6 @@ DMLC_REGISTER_PARAMETER(BroadcastToParam);
inline std::string get_reduce_axes_description(const std::string& op_name, int line) {
std::string doc = R"code(Compute the __op__ of array elements over given axes.

The argument ``axis`` specifies the axes to compute over:

- **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
the default option.
- **int**: compute over along a particular axis. If input has shape ``(n, m, k)``,
use ``axis=0`` will result in an array with shape ``(m, k)``.
- **tuple of int**: compute over multiple axes. Again assume input shape ``(n, m,
k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.

If ``keepdims = 1``, then the result array will has the same number of dimensions
as the input, while the reduced axes will have size 1.


Defined in )code";
doc += std::string(__FILE__) + std::string(":L") + std::to_string(line);
Expand Down Expand Up @@ -68,9 +56,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_prod)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut< cpu, mshadow_op::rdiv>);

MXNET_OPERATOR_REGISTER_REDUCE(nansum)
.describe(R"code(Compute the sum of array elements over given axes with ``NaN`` ignored

Refer to ``sum`` for more details.
.describe(R"code(Compute the sum of array elements over given axes, treating Not a Number (``NaN``) values as zero.

)code" ADD_FILELINE)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nansum>)
Expand All @@ -81,9 +67,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nansum)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nansum_grad>);

MXNET_OPERATOR_REGISTER_REDUCE(nanprod)
.describe(R"code(Compute the product of array elements over given axes with ``NaN`` ignored

Refer to ``prod`` for more details.
.describe(R"code(Compute the product of array elements over given axes, treating Not a Number (``NaN``) values as one.

)code" ADD_FILELINE)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nanprod>)
Expand Down Expand Up @@ -171,9 +155,7 @@ NNVM_REGISTER_OP(_broadcast_backward)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum>);

NNVM_REGISTER_OP(norm)
.describe(R"code(Computes the L2 norm of the input array.

Flattens the input array and then computes the l2 norm.
.describe(R"code(Flattens the input array and then computes the l2 norm.

Examples::

Expand Down
Loading