In [1]:
import vugrad as vg
import numpy as np

# Question 6
1) The operation of an OpNode object is defined by the Op abstract class, which is subclassed to define specific operations. Each subclass has to implement the *forward* and *backward* methods declared in the Op interface.

2) It is defined in core.py:287
```python
    class Add(Op):
        """
        Op for element-wise matrix addition.
        """
        @staticmethod
        def forward(context, a, b):
            assert a.shape == b.shape, f'Arrays not the same sizes ({a.shape} {b.shape}).'
287         return a + b

```
3) This is done because the computation graph is executed eagerly: it is built on the fly. The graph only defines the flow of the computations, not their results. The OpNode is connected to its output nodes in core.py:212

```python
    outputs = [TensorNode(value=output, source=opnode) for output in outputs_raw]
212 opnode.outputs = outputs
```

# Question 7
```python
    # compute the gradients over the inputs
132 ginputs_raw = self.op.backward(self.context, *goutputs_raw)
```

# Question 8

TODO

# Question 9



In [4]:
class ReLU(vg.ops.Op):
    """
    Op for element-wise application of the ReLU function, max(x, 0).
    """

    @staticmethod
    def forward(context, input):
        """
        Apply ReLU element-wise.

        :param context: Dict-like store shared with `backward`; the positivity mask is saved in it.
        :param input: Raw array of pre-activations (assumed to be a numpy array -- TODO confirm).
        :return: Array of the same shape with every non-positive entry replaced by zero.
        """
        # Only the positions of the positive entries are needed for the backward pass, so cache
        # the boolean mask rather than the full output.
        context['mask'] = input > 0

        # np.maximum instead of `input * mask`: the product evaluates -inf * 0, which is nan.
        return np.maximum(input, 0)

    @staticmethod
    def backward(context, goutput):
        """
        Propagate the gradient through the ReLU.

        :param context: The store filled in by `forward`.
        :param goutput: Gradient of the loss with respect to the output of this op.
        :return: Gradient of the loss with respect to the input: `goutput` where the input was
            positive, zero elsewhere.
        """
        # The local derivative is 1 for positive inputs and 0 otherwise; multiplying by the
        # boolean mask applies it directly.
        return goutput * context['mask']

In [8]:
import numpy as np
import vugrad as vg

# Hyperparameters (hard-coded here in place of parsed command line arguments)

class args:
    """
    Namespace holding the settings of the experiment; read as `args.<name>`.
    """
    # which dataset to load: 'synth' or 'mnist'
    data = 'synth'

    # number of passes over the training data
    epochs = 20
    # number of instances per gradient step
    batch_size = 128
    # step size for gradient descent
    lr = 0.01

def relu(x):
    """
    Functional wrapper around the ReLU op (just for symmetry with the softmax).

    :param x: The node to pass through the ReLU op.
    :return: The result of `ReLU.do_forward` applied to `x`.
    """
    activated = ReLU.do_forward(x)
    return activated

class MLP_ReLU(vg.MLP):
    """
    Variant of `vg.MLP` whose forward pass uses a ReLU between the two layers.
    """

    def forward(self, input):
        """
        Run the network on a batch.

        :param input: Node holding the batch; its size must have exactly two dimensions.
        :return: The softmax of the second layer's output.
        """
        assert len(input.size()) == 2

        # first layer followed by the ReLU non-linearity
        activations = relu(self.layer1(input))

        # second layer, normalized with a softmax
        logits = self.layer2(activations)
        return vg.functions.softmax(logits)

## Load the data
# Reject unknown dataset names up front, then dispatch to the matching loader.
if args.data not in ('synth', 'mnist'):
    raise Exception(f'Dataset {args.data} not recognized.')

if args.data == 'mnist':
    (xtrain, ytrain), (xval, yval), num_classes = vg.load_mnist(final=False, flatten=True)
else:
    (xtrain, ytrain), (xval, yval), num_classes = vg.load_synth()

# Short summary of what was loaded: split sizes and label counts per class.
print('## loaded data:')
print(f'         number of instances: {xtrain.shape[0]} in training, {xval.shape[0]} in validation')
print(f' training class distribution: {np.bincount(ytrain)}')
print(f'     val. class distribution: {np.bincount(yval)}')

# Rows are instances, columns are features.
num_instances, num_features = xtrain.shape

from collections import defaultdict
# One record per (non-linearity, epoch), kept as plain dicts so they can be tabulated afterwards.
results = list()

# The batch geometry is the same for every model, so compute it once outside the loops.
# (`m`, the number of features, is not used below; it is kept for any later cell that reads it.)
n, m = xtrain.shape
b = args.batch_size

## Train one model per non-linearity on the same data, so that the results can be compared.
for nonlinearity, model in [('sigmoid', vg.MLP), ('relu', MLP_ReLU)]:
    ## Create the model.
    mlp = model(input_size=num_features, output_size=num_classes)

    print('\n## Starting training')

    for epoch in range(args.epochs):

        print(f'epoch {epoch:03}')

        ## Compute validation accuracy
        # -- This happens at the start of every epoch, so the accuracy recorded for epoch k reflects the model
        #    _before_ the updates of epoch k.
        o = mlp(vg.TensorNode(xval))
        oval = o.value

        predictions = np.argmax(oval, axis=1)
        num_correct = (predictions == yval).sum()
        acc = num_correct / yval.shape[0]

        o.clear() # gc the computation graph

        print(f'       accuracy: {acc:.4}')

        cl = 0.0 # running sum of the training loss

        # We loop over the data in batches of size `b`
        for fr in range(0, n, b):

            # The end index of the batch
            to = min(fr + b, n)

            # Slice out the batch and its corresponding target values
            batch, targets = xtrain[fr:to, :], ytrain[fr:to]

            # Wrap the inputs in a Node
            batch = vg.TensorNode(value=batch)

            outputs = mlp(batch)
            loss = vg.celoss(outputs, targets)
            # -- The computation graph is now complete. It consists of the mlp, together with the computation of
            #    the scalar loss.
            # -- The variable `loss` is the TensorNode at the very top of our computation graph. This means we can call
            #    it to perform operations on the computation graph, like clearing the gradients, starting the
            #    backpropagation and clearing the graph.

            cl += loss.value
            # -- We must be careful here to extract the _raw_ value for the running loss. What would happen if we kept
            #    a running sum using the TensorNode?

            # Start the backpropagation
            loss.backward()

            # Apply gradient descent
            for parm in mlp.parameters():
                parm.value -= args.lr * parm.grad
                # -- Note that we are directly manipulating the members of the parm TensorNode. This means that for this
                #    part, we are not building up a computation graph.

            # -- In Pytorch, the gradient descent is abstracted away into an Optimizer. This allows us to build slightly
            #    more complex optimizers than plain gradient descent.

            # Finally, we need to reset the gradients to zero ...
            loss.zero_grad()
            # ... and delete the parts of the computation graph we don't need to remember.
            loss.clear()

        print(f'   running loss: {cl:.4}')

        # Record this epoch's metrics together with the setting that produced them.
        results.append({
            'epoch' : epoch,
            'accuracy' : acc,
            'loss' : cl,
            'nonlinearity' : nonlinearity,
            'dataset' : args.data
        })

import pandas as pd

# Tabulate the per-epoch records (a list of dicts with identical keys): one row per record,
# one column per key.
df = pd.DataFrame(results)


## loaded data:
         number of instances: 60000 in training, 10000 in validation
 training class distribution: [32631 27369]
     val. class distribution: [5541 4459]

## Starting training
epoch 000
       accuracy: 0.5356
   running loss: 5.238e+03
epoch 001
       accuracy: 0.9839
   running loss: 2.528e+03
epoch 002
       accuracy: 0.9862
   running loss: 2.286e+03
epoch 003
       accuracy: 0.9878
   running loss: 2.18e+03
epoch 004
       accuracy: 0.9898
   running loss: 2.123e+03
epoch 005
       accuracy: 0.9907
   running loss: 2.094e+03
epoch 006
       accuracy: 0.9909
   running loss: 2.069e+03
epoch 007
       accuracy: 0.9909
   running loss: 2.045e+03
epoch 008
       accuracy: 0.9912
   running loss: 2e+03
epoch 009
       accuracy: 0.9917
   running loss: 1.994e+03
epoch 010
       accuracy: 0.9921
   running loss: 1.988e+03
epoch 011
       accuracy: 0.9925
   running loss: 1.986e+03
epoch 012
       accuracy: 0.9924
   running loss: 1.977e+03
epoch 013
       ac