# Fundamentals of MXNet Numpy Module

## Operator Namespaces for Imperative Programming
- `mxnet.numpy`: Regular NumPy operators
- `mxnet.numpy.random`: NumPy random operators
- `mxnet.numpy.linalg`: NumPy linear algebra operators
- `mxnet.numpy.ext`: Operators implemented in MXNet that do not exist in official NumPy

## Operator Namespaces for Gluon
`F` can be either `mxnet.ndarray` or `mxnet.symbol`.
- `F.np`: Regular NumPy operators
- `F.np.random`: NumPy random operators
- `F.np.linalg`: NumPy linear algebra operators
- `F.np.ext`: Operators implemented in MXNet that do not exist in official NumPy

## New `ndarray` and `symbol`
`mxnet.numpy.ndarray` and `mxnet.symbol.numpy._NumpySymbol` (not visible to users)
- Same name as in the official NumPy package
- Dispatch convenience fluent method calls to MXNet Numpy operators
- Override many convenience fluent methods that do not exist in the official NumPy ndarray
- Make the behavior of built-in methods consistent with the official NumPy
    - Indexing: `__getitem__` and `__setitem__`
    - Many binary element-wise operations with broadcasting, which are not supported in `mxnet.symbol.Symbol`
    
## Examples of ndarray and symbol Basics
### Scalar and zero-size tensors

In [None]:
import mxnet as mx
from mxnet import numpy as np

# Use numpy-compatible semantics to support scalar tensors.
mx.set_np_compat(True)

# Create a scalar tensor holding the single value 3.14.
x = np.array(3.14)
print(x)

In [None]:
# x.item() copies the element from the scalar tensor to a python scalar.
s = x.item()
print('s = {}'.format(s))

In [None]:
# Create a scalar tensor whose only element is 1.0 (the shape is the empty tuple).
y = np.ones(())
print(y)

In [None]:
# Create a zero-size tensor: the third axis of the shape has length 0.
x = np.ones((5, 4, 0, 6))
print(x)

In [None]:
# Transpose the zero-size tensor (np.transpose is called without an axes argument).
y = np.transpose(x)
print(y)

### Conversion between classic and numpy ndarrays

In [None]:
# Create a classic MXNet NDArray of shape (2, 3) from mx.nd.random.uniform.
x = mx.nd.random.uniform(shape=(2, 3))
print(x)

In [None]:
# Convert the classic NDArray to mxnet.numpy.ndarray with zero-copy:
# y refers to the same data as x rather than a copy of it.
y = x.as_np_ndarray()
print(y)

In [None]:
# Changing y's content changes x's content too, because the conversion was zero-copy.
y[:] = 1
print(x)  # print x (not y) to show that the write is visible through it

In [None]:
# Convert mxnet.numpy.ndarray back to a classic NDArray with zero-copy:
# z refers to the same data as y rather than a copy of it.
z = y.as_classic_ndarray()
print(z)

In [None]:
# Changing z's content changes y's content too, because the conversion was zero-copy.
z[:] = 2
print(y)  # print y (not z) to show that the write is visible through it

### Binary element-wise operations with broadcasting in new and old symbols

In [None]:
from mxnet import gluon
class TestBinaryBroadcast(gluon.HybridBlock):
    """Add two inputs with `+`, printing the type each input arrives as."""

    def hybrid_forward(self, F, x1, x2):
        # Report the array/symbol type received for each operand.
        for label, operand in (("x1 type:", x1), ("x2 type:", x2)):
            print(label, str(type(operand)))
        total = x1 + x2
        return total

# Run the block imperatively (it has not been hybridized yet) on two classic
# NDArrays whose shapes differ and therefore require broadcasting.
net = TestBinaryBroadcast()
x1 = mx.nd.ones((2, 1))  # classic NDArray of shape (2, 1)
x2 = mx.nd.ones((1, 3))  # classic NDArray of shape (1, 3)
out = net(x1, x2)  # ok: imperative execution supports broadcasting
print(out)

In [None]:
net.hybridize()  # mark the block for execution using a computational graph
try:
    # Keep only the call that is expected to fail inside the try body.
    out = net(x1, x2)  # error: old symbol `+` operation does not support broadcasting
except mx.MXNetError:
    print("ERROR: cannot perform broadcast add for two symbols of mxnet.sym.Symbol")
else:
    # Should not reach here. An explicit raise is used instead of `assert False`
    # so the check is not stripped when Python runs with -O.
    raise AssertionError("expected mx.MXNetError from broadcast add on classic symbols")

In [None]:
class TestBinaryBroadcast2(gluon.HybridBlock):
    """Add two inputs with `+` after converting the first one via as_np_ndarray()."""

    def hybrid_forward(self, F, x1, x2):
        # Report the array/symbol type received for each operand.
        for label, operand in (("x1 type:", x1), ("x2 type:", x2)):
            print(label, str(type(operand)))
        np_x1 = x1.as_np_ndarray()  # convert x1 to new numpy ndarray/symbol
        return np_x1 + x2

# Hybridize before the first call so the forward pass is executed
# using a computational graph.
net2 = TestBinaryBroadcast2()
net2.hybridize()

out = net2(x1, x2)
print(out)

In [None]:
net = TestBinaryBroadcast()  # Create a new block object to clear the graph
net.hybridize()  # mark the block for execution using a computational graph

# Rebind x1 and x2 to their np.ndarray views; from here on these names no
# longer refer to the classic NDArrays.
x1 = x1.as_np_ndarray()  # convert x1 to np.ndarray so that _NumpySymbol will be used in graph construction
x2 = x2.as_np_ndarray()  # convert x2 to np.ndarray so that _NumpySymbol will be used in graph construction
out = net(x1, x2)  # ok: `+` operation supports broadcasting for _NumpySymbol
print(out)  # mxnet.numpy.ndarray type, because it's from a np operator

## A Simple Linear Regression Model
Let's consider a simple linear regression model as follows.
Given dataset `{x, y}`, where `x`s represent input examples and `y`s represent observed data, find the parameters `w1` and `w2` for the following model.
```
y_pred = np.dot(np.maximum(np.dot(x, w1), 0), w2)
```

## MXNet Numpy Operators in Imperative Programming

In [None]:
import mxnet as mx
from mxnet import numpy as np
from mxnet import autograd
try:
    from mxboard import SummaryWriter
except ImportError:
    SummaryWriter = None

# Create a summary writer for visualization. sw stays None when mxboard could
# not be imported, and every later use of sw is guarded by an `is not None` check.
sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None

# Use numpy-compatible semantics to support scalar tensors
mx.set_np_compat(True)

# N is number of examples; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = mx.nd.random.normal(shape=(N, D_in)).as_np_ndarray()  # x is of type mxnet.numpy.ndarray
y = mx.nd.random.normal(shape=(N, D_out)).as_np_ndarray()  # y is of type mxnet.numpy.ndarray

# Randomly initialize weights and attach gradient buffers to them so that
# w1.grad and w2.grad can be read after loss.backward() in the training loop.
w1 = mx.nd.random.normal(shape=(D_in, H)).as_np_ndarray()  # w1 is of type mxnet.numpy.ndarray
w1.attach_grad()  # w1.grad is of type mxnet.numpy.ndarray
w2 = mx.nd.random.normal(shape=(H, D_out)).as_np_ndarray()  # w2 is of type mxnet.numpy.ndarray
w2.attach_grad()  # w2.grad is of type mxnet.numpy.ndarray

# Step size used for the manual weight updates in the training loop.
learning_rate = 1e-6


# Train for 1000 iterations with manual gradient-descent updates. The loop is
# wrapped in try/finally so the summary writer is closed even when training
# raises or is interrupted part-way through.
try:
    for t in range(1000):
        with autograd.record():
            # Forward pass: compute predicted y
            h = x.dot(w1)  # equivalent to np.dot(x, w1)
            h_relu = np.ext.relu(h)  # equivalent to mx.nd.relu(h)
            y_pred = h_relu.dot(w2)  # equivalent to np.dot(h_relu, w2)

            # Compute loss
            # (y_pred - y) ** 2 calls np.ndarray.__pow__
            # sum() calls np.sum() which should return a scalar tensor
            loss = ((y_pred - y) ** 2).sum()
        # Note that the print function will invoke loss.asnumpy()
        print(t, loss)  # loss is a scalar tensor of type mxnet.numpy.ndarray
        loss.backward()

        # Update weights in place using the gradients computed by backward()
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        if sw is not None:
            # loss.item() copies the tensor element to a python scalar
            sw.add_scalar('loss', loss.item(), global_step=t)
            # Log weight histograms only every 50th iteration
            if t % 50 == 0:
                sw.add_histogram(tag='w1', values=w1, global_step=t)
                sw.add_histogram(tag='w2', values=w2, global_step=t)
finally:
    if sw is not None:
        sw.close()

## MXNet Numpy Operators in Gluon `HybridBlock`

In [None]:
import mxnet as mx
from mxnet import gluon, autograd
try:
    from mxboard import SummaryWriter
except ImportError:
    SummaryWriter = None

# Create a summary writer for visualization. sw stays None when mxboard could
# not be imported, and every later use of sw is guarded by an `is not None` check.
sw = SummaryWriter(logdir='./logs', flush_secs=2) if SummaryWriter is not None else None

# Use numpy-compatible semantics to support scalar tensors
mx.set_np_compat(True)


class LinearRegression(gluon.HybridBlock):
    """Model computing `relu(x . w1) . w2` with two weight parameters.

    Parameters
    ----------
    num_input_dim : int
        Number of rows of `w1`.
    num_hidden_dim : int
        Number of columns of `w1` and rows of `w2`.
    num_output_dim : int
        Number of columns of `w2`.
    """

    def __init__(self, num_input_dim=1000, num_hidden_dim=100, num_output_dim=10):
        super(LinearRegression, self).__init__()
        w1_shape = (num_input_dim, num_hidden_dim)
        w2_shape = (num_hidden_dim, num_output_dim)
        with self.name_scope():
            self.w1 = self.params.get('w1', shape=w1_shape, allow_deferred_init=True)
            self.w2 = self.params.get('w2', shape=w2_shape, allow_deferred_init=True)

    def hybrid_forward(self, F, x, w1, w2):
        """Return the prediction for input `x` given the weights `w1` and `w2`."""
        hidden = x.dot(w1)  # equivalent to F.np.dot(x, w1)
        activated = F.np.ext.relu(hidden)  # equivalent to F.relu(hidden)
        return activated.dot(w2)  # equivalent to F.np.dot(activated, w2)


class TotalLoss(gluon.HybridBlock):
    """Sum of squared differences between a prediction and its label."""

    def hybrid_forward(self, F, pred, label):
        residual = pred - label
        # equivalent to F.np.sum(F.np.square(pred - label))
        return (residual ** 2).sum()


# Build the model with its default dimensions, initialize its parameters with
# mx.init.Normal(), and hybridize it.
regressor = LinearRegression()
regressor.initialize(mx.init.Normal())
regressor.hybridize()

# Create random input and output data; the shapes (64, 1000) and (64, 10)
# match the default input and output dimensions of LinearRegression.
x = mx.nd.random.normal(shape=(64, 1000)).as_np_ndarray()  # x is of type mxnet.numpy.ndarray
y = mx.nd.random.normal(shape=(64, 10)).as_np_ndarray()  # y is of type mxnet.numpy.ndarray

# Loss block and an SGD trainer over all of the regressor's parameters.
total_loss = TotalLoss()
trainer = gluon.Trainer(regressor.collect_params(), 'sgd', {'learning_rate': 1e-3, 'momentum': 0.9})

# Train for 1000 iterations using the Gluon trainer. The loop is wrapped in
# try/finally so the summary writer is closed even when training raises or is
# interrupted part-way through.
try:
    for t in range(1000):
        with autograd.record():
            output = regressor(x)  # output is of type np.ndarray because np.dot is the last op in the network
            loss = total_loss(output, y)  # loss is a scalar np.ndarray
        loss.backward()
        print(t, loss)  # note that loss.asnumpy() is called
        trainer.step(1)
        if sw is not None:
            # loss.item() copies the tensor element to a python scalar
            sw.add_scalar('loss', loss.item(), global_step=t)
            # Log a histogram of every parameter only every 50th iteration
            if t % 50 == 0:
                for k, v in regressor.collect_params().items():
                    sw.add_histogram(tag=k, values=v.data(), global_step=t)
finally:
    if sw is not None:
        sw.close()