In [23]:
import numpy as np
import chainer as ch
# from chainer.backends import cuda
# from chainer import Function, gradient_check, report, training, utils, Variable
# from chainer import datasets, iterators, optimizers, serializers
# from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
# from chainer.training import extensions

## Variables & Grad

In [24]:
# Wrap a one-element array in a Variable so gradients can flow back to it.
x = ch.Variable(np.array([5.0]))
y = x ** 2 - 2 * x + 1  # y = (x - 1)^2

In [25]:
# Backpropagate from y; the gradient w.r.t. x is read from x.grad in the next cell.
y.backward()

In [26]:
# dy/dx = 2*x - 2, which evaluates to 8 at x = 5.
x.grad

array([8.])

In [27]:
# Route part of the expression through an intermediate variable z so that
# its gradient can be inspected after backprop.
z = 2 * x
y = x ** 2 - z + 1
# retain_grad=True keeps the gradients of intermediate variables such as z.
y.backward(retain_grad=True)
z.grad  # dy/dz = -1

array([-1.])

In [28]:
x = ch.Variable(np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32))
y = x ** 2 - 2 * x + 1
# y is not a scalar here, so the upstream gradient is seeded explicitly
# (all ones) before running backprop.
y.grad = np.ones((2, 3), dtype=np.float32)
y.backward()
x.grad  # elementwise 2*x - 2

array([[ 0.,  2.,  4.],
       [ 6.,  8., 10.]], dtype=float32)

## Links

In [29]:
# Linear link with 3 inputs and 2 outputs (W is 2x3, b has length 2, as shown below).
f = L.Linear(3,2)

In [30]:
# Inspect the link's parameters: weight matrix W and bias vector b.
f.W.data,f.b.data

(array([[ 0.5211857 , -0.52129376,  0.2944199 ],
        [ 0.9014918 , -1.3782756 , -0.08024161]], dtype=float32),
 array([0., 0.], dtype=float32))

In [31]:
# x is still the 2x3 variable defined in the "Variables & Grad" section.
x

variable([[1., 2., 3.],
          [4., 5., 6.]])

In [32]:
# Apply the linear link to x.
y = f(x)

In [33]:
# Clear the link's parameter gradients.
f.cleargrads()

In [34]:
# Start from cleared parameter gradients and seed y.grad with ones, since y
# is not a scalar.
f.cleargrads()
y.grad = np.ones((2,2)).astype(np.float32)
# Two backward passes with no cleargrads() in between: parameter gradients
# accumulate, so the first printed values are double the second ones.
y.backward()
y.backward()
print(f.W.grad,f.b.grad)
# After clearing, a single backward pass yields the un-accumulated gradients.
f.cleargrads()
y.backward()
print(f.W.grad,f.b.grad)

[[10. 14. 18.]
 [10. 14. 18.]] [4. 4.]
[[5. 7. 9.]
 [5. 7. 9.]] [2. 2.]


## Defining Functions

In [35]:
class MulAdd(ch.Function):
    """Elementwise multiply-add ``w = x * y + z`` as a custom Chainer function."""

    def forward(self, inputs):
        """Compute ``x * y + z``.

        Args:
            inputs: Tuple ``(x, y, z)`` of input arrays.

        Returns:
            One-element tuple holding the array ``x * y + z``.
        """
        x, y, z = inputs
        # Chainer requires forward() to return a tuple of output arrays,
        # even when there is only one output.
        return x * y + z,

    def backward(self, inputs, grad_next):
        """Compute the gradients with respect to ``x``, ``y`` and ``z``.

        Args:
            inputs: Tuple ``(x, y, z)`` of the arrays given to ``forward``.
            grad_next: One-element tuple holding the gradient ``dw`` of the
                loss with respect to the output ``w``.

        Returns:
            Tuple ``(dx, dy, dz)`` of gradients, one per input.
        """
        x, y, z = inputs
        dw, = grad_next

        dx = y * dw  # d(x*y + z)/dx = y
        dy = x * dw  # d(x*y + z)/dy = x
        dz = dw      # d(x*y + z)/dz = 1
        return dx, dy, dz
    
#     forward_cpu/backward_cpu
#         use if inputs of type np.ndarray
    
#     forward_gpu/backward_gpu
        # similar but inputs of type cupy.ndarray
    
#     forward/backward
#         use if gpu and cpu execution the same

# ch.backends.cuda.cupy ~= numpy for many functions, use in gpu forward/backward pass

# can abstract w/ xp = ch.backends.cuda.get_array_module()
# now np.exp() or cp.exp() is xp.exp() 

# can define elementwise ops in cuda by ch.backends.cuda.elementwise

In [36]:
# different train-time and test-time behavior
def dropout(x):
    """Apply dropout with a drop probability of 0.5 during training.

    In test mode (``ch.config.train`` is false) ``x`` is returned unchanged.
    In training mode each element is either zeroed or doubled, so the
    expected value of the output matches the input.

    Args:
        x: Input exposing ``data``, ``shape`` and ``dtype`` (e.g. a Variable).

    Returns:
        ``x`` itself in test mode, otherwise ``x`` multiplied by a random mask.
    """
    if not ch.config.train:  # thread-specific global
        return x

    # Use numpy or cupy depending on where the input array lives.
    xp = ch.backends.cuda.get_array_module(x.data)
    # Kept elements are scaled by 2 to compensate for the dropped half.
    mask = 2 * (xp.random.rand(*x.shape) > 0.5).astype(x.dtype)
    return x * mask

In [37]:
from chainer import gradient_check

In [38]:
# parameterized function implemented as a Link, for use in a Chain
class EltwiseParamProd(ch.Link):
    """Link that multiplies its input elementwise by a learnable parameter ``W``."""

    def __init__(self, shape):
        """Create the link.

        Args:
            shape: Shape of the parameter ``W``.
        """
        super().__init__()
        # Parameters must be registered inside init_scope() to belong to the link.
        with self.init_scope():
            initializer = ch.initializers.Normal(scale=1.0)
            self.W = ch.Parameter(initializer, shape)

    def __call__(self, x):
        """Return the elementwise product ``W * x``."""
        return self.W * x

In [39]:
# gradient_check.numerical_grad computes a finite-difference approximation of the gradient, for checking backward()
# ch.testing.assert_allclose() same as numpy.testing.assert_allclose()

## Defining Models

In [40]:
# Two stand-alone linear links (4 -> 3 -> 2) composed by a plain function.
l1 = L.Linear(4, 3)
l2 = L.Linear(3, 2)


def forward(x):
    """Apply ``l1`` and then ``l2`` to ``x``."""
    hidden = l1(x)
    return l2(hidden)

In [41]:
class MyProc:
    """Plain Python class bundling two linear links (4 -> 3 -> 2)."""

    def __init__(self):
        """Create the two linear links."""
        self.l1 = L.Linear(4, 3)
        self.l2 = L.Linear(3, 2)

    def forward(self, x):
        """Apply ``l1`` and then ``l2`` to ``x``."""
        hidden = self.l1(x)
        return self.l2(hidden)

In [42]:
class MyChain(ch.Chain):
    """Chain of two linear links (4 -> 3 -> 2) registered as child links."""

    def __init__(self):
        """Create and register the two linear links."""
        super().__init__()
        # Links assigned inside init_scope() are registered as children of the chain.
        with self.init_scope():
            self.l1 = L.Linear(4, 3)
            self.l2 = L.Linear(3, 2)

    def __call__(self, x):
        """Apply ``l1`` and then ``l2`` to ``x``."""
        hidden = self.l1(x)
        return self.l2(hidden)

In [43]:
class MyChain2(ch.ChainList):
    """ChainList holding two linear links (4 -> 3 -> 2), accessed by index."""

    def __init__(self):
        """Register the two linear links in application order."""
        first = L.Linear(4, 3)
        second = L.Linear(3, 2)
        super().__init__(first, second)

    def __call__(self, x):
        """Apply the link at index 0 and then the link at index 1 to ``x``."""
        hidden = self[0](x)
        return self[1](hidden)

## Optimizers

In [44]:
model = MyChain()
# Adam with default hyperparameters, bound to the model's parameters.
opt = ch.optimizers.Adam()
opt.setup(model)
# Hooks are registered in this order: gradient clipping first, then weight decay.
opt.add_hook(ch.optimizer_hooks.GradientClipping(1))
opt.add_hook(ch.optimizer_hooks.WeightDecay(0.0005))

<chainer.optimizers.adam.Adam at 0x118c7e668>

In [47]:
# Direct update: clear the gradients, compute the loss, backpropagate, then
# let the optimizer apply one step.
x = np.random.uniform(low=-1, high=1, size=(2, 4)).astype(np.float32)
model.cleargrads()
loss = F.sum(model(ch.Variable(x)))
loss.backward()
opt.update()

In [48]:
# Alternatively, pass a loss function (plus its arguments) to update().
def lossfun(arg1, arg2):
    """Return the sum of the model's outputs for the input ``arg1 - arg2``."""
    difference = arg1 - arg2
    return F.sum(model(difference))


y = np.random.uniform(low=-1, high=1, size=(2, 4)).astype(np.float32)
yhat = np.random.uniform(low=-1, high=1, size=(2, 4)).astype(np.float32)
opt.update(lossfun, y, yhat)

### Trainer
Training loop
(1) Iterations over training datasets
(2) Preprocessing of extracted mini-batches
(3) Forward/backward computations of the neural networks
(4) Parameter updates
(5) Evaluations of the current parameters on validation datasets
(6) Logging and printing of the intermediate results

dataset abstraction handles (1),(2)

trainer abstraction handles (3) - (6)

updater handles (3),(4)

extension handles (5),(6)