[View in Colaboratory](https://colab.research.google.com/github/brucecmd/learn_gluon/blob/master/l2regular.ipynb)

In [0]:
from mxnet import nd, autograd
from mxnet.gluon import data as gdata
from mxnet import gluon
from mxnet.gluon import loss as gloss

In [0]:
# Build a synthetic linear-regression dataset and wrap it in Gluon data loaders.
# The training set (num_train samples) is much smaller than the feature
# dimension (num_inputs), and the test set is larger than the training set.
num_train = 20
num_test = 100
num_inputs = 200
batch_size = 1

# Features: one row per sample, drawn with loc 0 and scale 1.
x = nd.random_normal(0, 1, shape=(num_train + num_test, num_inputs))

# Ground-truth model: every weight is 0.01 and the bias is 2.8.
true_w = nd.ones(shape=(num_inputs, 1)) * 0.01
true_b = 2.8

# Labels are the exact linear response plus small noise (scale 0.01).
y = nd.dot(x, true_w) + true_b
y += nd.random_normal(0, 0.01, shape=y.shape)

# The first num_train rows are used for training, the remainder for testing.
feature_train, feature_test = x[:num_train, :], x[num_train:, :]
label_train, label_test = y[:num_train], y[num_train:]

train_iter = gdata.DataLoader(
    gdata.ArrayDataset(feature_train, label_train), batch_size, shuffle=True)
# Evaluation data does not need shuffling; a fixed order keeps evaluation
# deterministic for a given dataset.
test_iter = gdata.DataLoader(
    gdata.ArrayDataset(feature_test, label_test), batch_size, shuffle=False)

即使 num_train 很大、num_test 很小，训练也有可能出问题：越训练 loss 越大。
这时候要先看一下 batch_size 是多少：当 batch_size 为 1、lr 为 0.01 时，表现不好。
如果把 batch_size 调成 10，情况会有很大好转。
经过调参实验，发现出现这种情况的原因可能是 lr 偏大。因为 sgd 更新时会把梯度除以 batch_size，所以 batch_size 为 10 时，相当于实际的学习率变小了。
如果直接把 lr 调为 0.001，效果也会有很大的改善。

In [0]:
# Trainable parameters of the linear model: a (num_inputs, 1) weight column
# with random initial values and a single bias initialised to zero.
w = nd.random_normal(0, 1, shape=(num_inputs, 1))
b = nd.zeros(shape=(1,))
params = [w, b]

# Allocate a gradient buffer for every parameter so that autograd can
# store the result of backward() in `.grad`.
for param in params:
    param.attach_grad()

In [0]:
# define net
def net(x):
    """Linear model: multiply ``x`` by the global weights ``w`` and add the global bias ``b``."""
    projection = nd.dot(x, w)
    return projection + b

In [0]:
# define loss function
# Gluon's built-in L2Loss is used here in place of a hand-written squared
# error of the form (y_hat - y.reshape(y_hat.shape))**2 / 2.
loss_func = gloss.L2Loss()

In [0]:
# define the trainer(sgd)
def sgd(params, batch_size, lr):
    """Apply one in-place gradient-descent step to every parameter.

    Each parameter is moved against its ``.grad`` by ``lr``, with the
    gradient divided by ``batch_size``.

    Args:
        params: iterable of arrays that expose a ``.grad`` attribute.
        batch_size: divisor applied to the gradient.
        lr: learning rate.
    """
    for param in params:
        step = param.grad * lr / batch_size
        # Slice assignment writes into the existing array instead of
        # rebinding the name, so the caller's parameter object is updated.
        param[:] = param - step

In [0]:
# define l2 penalty function
def l2_penalty(w):
    """Return half of the sum of squared entries of ``w``."""
    sum_of_squares = (w ** 2).sum()
    return sum_of_squares / 2

In [147]:
# Train with mini-batch SGD and report the mean train/test loss after each epoch.
epochs = 10
lr = 0.001
# Weight-decay strength. 0.0 disables the L2 penalty (plain squared-error
# training); a positive value such as 0.1 enables it.
lambd = 0.0
for epoch in range(epochs):
    for data, label in train_iter:
        with autograd.record():
            y_hat = net(data)
            # Data loss plus the weighted L2 penalty on the weights `w`.
            loss = loss_func(y_hat, label) + lambd * l2_penalty(w)
        loss.backward()
        # Normalise by the size of the batch actually drawn, so the step
        # stays correct whatever batch size the data loader was built with
        # (including a smaller final batch).
        sgd(params, data.shape[0], lr)
    # Evaluate on the full train and test sets outside autograd.record().
    train_loss_value = loss_func(net(feature_train), label_train).mean().asscalar()
    test_loss_value = loss_func(net(feature_test), label_test).mean().asscalar()
    print('epoch[%d], train loss value[%f], test loss value[%f]'
          % (epoch, train_loss_value, test_loss_value))

epoch[0], train loss value[25.414448], test loss value[108.884552]
epoch[1], train loss value[16.409306], test loss value[108.569626]
epoch[2], train loss value[10.836321], test loss value[108.380280]
epoch[3], train loss value[7.297836], test loss value[108.287041]
epoch[4], train loss value[4.986907], test loss value[108.238487]
epoch[5], train loss value[3.482398], test loss value[108.225052]
epoch[6], train loss value[2.469655], test loss value[108.217209]
epoch[7], train loss value[1.776655], test loss value[108.204277]
epoch[8], train loss value[1.290132], test loss value[108.203407]
epoch[9], train loss value[0.948414], test loss value[108.199142]
