Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/mxnet/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,7 @@ def ones(shape, ctx=None, dtype=mx_real_t):

Parameters
----------
shape : int or tuple of int
shape : int or tuple of int or list of int
The shape of the empty array.
ctx : Context, optional
An optional device context.
Expand Down
113 changes: 66 additions & 47 deletions src/operator/optimizer_op-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ struct SGDParam : public dmlc::Parameter<SGDParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(SGDParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the the correct use of .describe() ? Should it take the formal variable name or a common language string?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they use .describe to document other parameters.

DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -90,21 +92,23 @@ struct SGDMomParam : public dmlc::Parameter<SGDMomParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(SGDMomParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
DMLC_DECLARE_FIELD(momentum)
.set_default(0.0f)
.describe("momentum");
.describe("The decay rate of momentum estimates at each epoch.");
DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -159,27 +163,29 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
float clip_gradient;
DMLC_DECLARE_PARAMETER(AdamParam) {
DMLC_DECLARE_FIELD(lr)
.describe("learning_rate");
.describe("Learning rate");
DMLC_DECLARE_FIELD(beta1)
.set_default(0.9f)
.describe("beta1");
.describe("The decay rate for the 1st moment estimates.");
DMLC_DECLARE_FIELD(beta2)
.set_default(0.999f)
.describe("beta2");
.describe("The decay rate for the 2nd moment estimates.");
DMLC_DECLARE_FIELD(epsilon)
.set_default(1e-8f)
.describe("epsilon");
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd)
.set_default(0.0f)
.describe("weight decay");
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
}
};

Expand Down Expand Up @@ -233,24 +239,31 @@ struct RMSPropAlexParam : public dmlc::Parameter<RMSPropAlexParam> {
float clip_gradient;
float clip_weights;
DMLC_DECLARE_PARAMETER(RMSPropAlexParam) {
DMLC_DECLARE_FIELD(lr).describe("learning_rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f).describe("gamma1");
DMLC_DECLARE_FIELD(gamma2).set_default(0.9f).describe("gamma2");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f).describe("epsilon");
DMLC_DECLARE_FIELD(wd).set_default(0.0f).describe("weight decay");
DMLC_DECLARE_FIELD(lr)
.describe("Learning rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f)
.describe("Decay rate.");
DMLC_DECLARE_FIELD(gamma2).set_default(0.9f)
.describe("Decay rate.");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f)
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd).set_default(0.0f)
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
DMLC_DECLARE_FIELD(clip_weights)
.set_default(-1.0f)
.describe("If greater than 0, clip weights to "
"weights = max(min(weights, -clip_weights), clip_weights). "
"Otherwise turned off.");
.set_default(-1.0f)
.describe("Clip weights to the range of [-clip_weights, clip_weights]. "
"If clip_weights <= 0, weight clipping is turned off. "
"weights = max(min(weights, clip_weights), -clip_weights).");
}
};

Expand Down Expand Up @@ -320,23 +333,29 @@ struct RMSPropParam : public dmlc::Parameter<RMSPropParam> {
float clip_gradient;
float clip_weights;
DMLC_DECLARE_PARAMETER(RMSPropParam) {
DMLC_DECLARE_FIELD(lr).describe("learning_rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f).describe("gamma1");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f).describe("epsilon");
DMLC_DECLARE_FIELD(wd).set_default(0.0f).describe("weight decay");
DMLC_DECLARE_FIELD(lr)
.describe("Learning rate");
DMLC_DECLARE_FIELD(gamma1).set_default(0.95f)
.describe("The decay rate of momentum estimates.");
DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f)
.describe("A small constant for numerical stability.");
DMLC_DECLARE_FIELD(wd).set_default(0.0f)
.describe("Weight decay augments the objective function with a "
"regularization term that penalizes large weights. "
"The penalty scales with the square of the magnitude of each weight.");
DMLC_DECLARE_FIELD(rescale_grad)
.set_default(1.0f)
.describe("rescale gradient as grad = rescale_grad*grad.");
.describe("Rescale gradient to grad = rescale_grad*grad.");
DMLC_DECLARE_FIELD(clip_gradient)
.set_default(-1.0f)
.describe("If greater than 0, clip gradient to "
"grad = max(min(grad, -clip_gradient), clip_gradient). "
"Otherwise turned off.");
.describe("Clip gradient to the range of [-clip_gradient, clip_gradient]. "
"If clip_gradient <= 0, gradient clipping is turned off. "
"grad = max(min(grad, clip_gradient), -clip_gradient).");
DMLC_DECLARE_FIELD(clip_weights)
.set_default(-1.0f)
.describe("If greater than 0, clip weights to "
"weights = max(min(weights, -clip_weights), clip_weights). "
"Otherwise turned off.");
.set_default(-1.0f)
.describe("Clip weights to the range of [-clip_weights, clip_weights]. "
"If clip_weights <= 0, weight clipping is turned off. "
"weights = max(min(weights, clip_weights), -clip_weights).");
}
};

Expand Down
64 changes: 53 additions & 11 deletions src/operator/optimizer_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,43 @@ DMLC_REGISTER_PARAMETER(RMSPropParam);
DMLC_REGISTER_PARAMETER(RMSPropAlexParam);

NNVM_REGISTER_OP(sgd_update)
.describe("Updater function for sgd optimizer")
.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer.

It updates the weights using::

weight = weight - learning_rate * gradient

)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SGDParam>)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<2, 1>)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
.set_attr<FCompute>("FCompute<cpu>", SGDUpdate<cpu>)
.add_argument("weight", "NDArray-or-Symbol", "Weight")
.add_argument("grad", "NDArray-or-Symbol", "gradient")
.add_argument("grad", "NDArray-or-Symbol", "Gradient")
.add_arguments(SGDParam::__FIELDS__());

NNVM_REGISTER_OP(sgd_mom_update)
.describe("Updater function for sgd optimizer")
.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer.

Momentum update has better convergence rates on neural networks. Mathematically it looks
like below:

.. math::

v_1 = -\alpha * \nabla J(W_0)\\
v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\
W_t = W_{t-1} + v_t

It updates the weights using::

v = momentum * v - learning_rate * gradient
weight += v

Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch.

)code" ADD_FILELINE)
.set_num_inputs(3)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SGDMomParam>)
Expand All @@ -45,7 +69,26 @@ NNVM_REGISTER_OP(sgd_mom_update)
.add_arguments(SGDMomParam::__FIELDS__());

NNVM_REGISTER_OP(adam_update)
.describe("Updater function for adam optimizer")
.describe(R"code(Update function for Adam optimizer. Adam is seen as a generalization
of AdaGrad.

Adam update consists of the following steps, where g represents gradient and m, v
are 1st and 2nd order moment estimates (mean and variance).

.. math::

g_t = \nabla J(W_{t-1})\\
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
W_t = W_{t-1} - \alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon }

It updates the weights using::

m = beta1*m + (1-beta1)*grad
v = beta2*v + (1-beta2)*(grad**2)
w += - learning_rate * m / (sqrt(v) + epsilon)

)code" ADD_FILELINE)
.set_num_inputs(4)
.set_num_outputs(1)
.set_attr_parser(ParamParser<AdamParam>)
Expand All @@ -63,10 +106,9 @@ NNVM_REGISTER_OP(adam_update)
.add_arguments(AdamParam::__FIELDS__());

NNVM_REGISTER_OP(rmsprop_update)
.describe("Updater function for RMSProp optimizer."
" The RMSProp code follows the version in"
" http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf "
"Tieleman & Hinton, 2012.")
.describe(R"code(Update function for RMSProp optimizer. The RMSProp code follows the version in
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf Tieleman & Hinton, 2012.
)code" ADD_FILELINE)
.set_num_inputs(3)
.set_num_outputs(1)
.set_attr_parser(ParamParser<RMSPropParam>)
Expand All @@ -83,9 +125,9 @@ NNVM_REGISTER_OP(rmsprop_update)
.add_arguments(RMSPropParam::__FIELDS__());

NNVM_REGISTER_OP(rmspropalex_update)
.describe("Updater function for RMSPropAlex optimizer."
" The RMSPropAlex code follows the version in"
" http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.")
.describe(R"code(Update function for RMSPropAlex optimizer. The RMSPropAlex code follows the version in
http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
)code" ADD_FILELINE)
.set_num_inputs(5)
.set_num_outputs(1)
.set_attr_parser(ParamParser<RMSPropAlexParam>)
Expand Down
16 changes: 11 additions & 5 deletions src/operator/tensor/broadcast_reduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,14 @@ struct ReduceAxesParam : public dmlc::Parameter<ReduceAxesParam> {
bool keepdims;
DMLC_DECLARE_PARAMETER(ReduceAxesParam) {
DMLC_DECLARE_FIELD(axis).set_default(TShape())
.describe("The axes to perform the reduction.");
.describe("The axis or axes along which to perform the reduction. "
"The default, `axis=()`, will compute over all elements into a "
"scalar array with shape `(1,)`.\n\nIf axis is int, "
"a reduction is performed on a particular axis.\n\n"
"If axis is a tuple of ints, a reduction is performed "
"on all the axes specified in the tuple.");
DMLC_DECLARE_FIELD(keepdims).set_default(false)
.describe("If true, the axes which are reduced are left "
.describe("If this is set to `True`, the reduced axes are left "
"in the result as dimension with size one.");
}
};
Expand All @@ -34,11 +39,12 @@ struct ReduceAxisParam : public dmlc::Parameter<ReduceAxisParam> {
bool keepdims;
DMLC_DECLARE_PARAMETER(ReduceAxisParam) {
DMLC_DECLARE_FIELD(axis).set_default(dmlc::optional<int>())
.describe("int or None. The axis to perform the reduction. "
.describe("The axis along which to perform the reduction. "
"Negative values mean indexing from right to left. "
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to mention this ? i do not see this mentioned for other parameters that have this behavior

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do you mean?

"If is `None`, a global reduction will be performed.");
"``Requires axis to be set as int, because global reduction "
"is not supported yet.``");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used for multiple ops. Are all of them not supporting None?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is for argmax,argmin and pick operator and these 3 don't support None yet.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it is only for argmax, argmin, and pick. These three does not support global reduction.

DMLC_DECLARE_FIELD(keepdims).set_default(false)
.describe("If true, the axis which is reduced is left "
.describe("If this is set to `True`, the reduced axis is left "
"in the result as dimension with size one.");
}
};
Expand Down
24 changes: 3 additions & 21 deletions src/operator/tensor/broadcast_reduce_op_value.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,6 @@ DMLC_REGISTER_PARAMETER(BroadcastToParam);
inline std::string get_reduce_axes_description(const std::string& op_name, int line) {
std::string doc = R"code(Compute the __op__ of array elements over given axes.

The argument ``axis`` specifies the axes to compute over:

- **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
the default option.
- **int**: compute over along a particular axis. If input has shape ``(n, m, k)``,
use ``axis=0`` will result in an array with shape ``(m, k)``.
- **tuple of int**: compute over multiple axes. Again assume input shape ``(n, m,
k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.

If ``keepdims = 1``, then the result array will has the same number of dimensions
as the input, while the reduced axes will have size 1.


Defined in )code";
doc += std::string(__FILE__) + std::string(":L") + std::to_string(line);
Expand Down Expand Up @@ -68,9 +56,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_prod)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut< cpu, mshadow_op::rdiv>);

MXNET_OPERATOR_REGISTER_REDUCE(nansum)
.describe(R"code(Compute the sum of array elements over given axes with ``NaN`` ignored

Refer to ``sum`` for more details.
.describe(R"code(Compute the sum of array elements over given axes, treating Not a Number (``NaN``) values as zero.

)code" ADD_FILELINE)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nansum>)
Expand All @@ -81,9 +67,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nansum)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesBackwardUseInOut<cpu, mshadow_op::nansum_grad>);

MXNET_OPERATOR_REGISTER_REDUCE(nanprod)
.describe(R"code(Compute the product of array elements over given axes with ``NaN`` ignored

Refer to ``prod`` for more details.
.describe(R"code(Compute the product of array elements over given axes, treating Not a Number (``NaN``) values as one.

)code" ADD_FILELINE)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow_op::nanprod>)
Expand Down Expand Up @@ -171,9 +155,7 @@ NNVM_REGISTER_OP(_broadcast_backward)
.set_attr<FCompute>("FCompute<cpu>", ReduceAxesCompute<cpu, mshadow::red::sum>);

NNVM_REGISTER_OP(norm)
.describe(R"code(Computes the L2 norm of the input array.

Flattens the input array and then computes the l2 norm.
.describe(R"code(Flattens the input array and then computes the l2 norm.

Examples::

Expand Down
Loading