### Revisiting Micrograd just to have a better understanding

In [1]:
import math

class Value:
    """Scalar that records the operations applied to it so gradients can be
    computed by reverse-mode automatic differentiation.

    Attributes:
        data: the wrapped Python number.
        grad: accumulated derivative of the final output w.r.t. this node;
            starts at 0.0 and is only ever added to by ``_backward`` closures.
        _backward: closure that propagates this node's ``grad`` to its operands.
        _prev: set of operand ``Value`` objects this node was computed from.
        _op: short string naming the producing operation ('' for leaves).
    """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # leaves have nothing to propagate
        self._prev = set(_children)
        self._op = _op

    def __repr__(self):
        return f'Value(data = {self.data})'

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # by chain rule, self.grad = local_derivative * output.grad
            self.grad += 1 * out.grad
            other.grad += 1 * out.grad

        out._backward = _backward

        return out

    def __radd__(self, other):
        """Support ``number + Value`` (also what makes ``sum()`` work)."""
        return self + other

    def __neg__(self):
        return self * (-1)

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        """Support ``number - Value``."""
        return other + (-self)

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Support ``number * Value``."""
        return self * other

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float))
        out = Value(self.data**other, (self, ), f'**{other}')

        def _backward():
            # Accumulate (+=) like every other op: a node that feeds several
            # operations must sum the contributions, not keep only the last.
            self.grad += (other * self.data**(other-1)) * out.grad

        out._backward = _backward

        return out

    def __truediv__(self, other):
        """Return ``self / other`` expressed as ``self * other**-1``."""
        return self * other**-1

    def __rtruediv__(self, other):
        """Support ``number / Value``."""
        return other * self**-1

    def tanh(self):
        """Return ``tanh(self)`` as a new node."""
        # math.tanh saturates cleanly at +/-1, whereas the explicit
        # (e^{2n} - 1) / (e^{2n} + 1) form overflows in math.exp for large n.
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward

        return out

    def exp(self):
        """Return ``e ** self`` as a new node."""
        out = Value(math.exp(self.data), (self,), 'exp')

        def _backward():
            # d/dx e^x = e^x, which is exactly out.data
            self.grad += out.data * out.grad

        out._backward = _backward

        return out

    def backward(self):
        """Run back-propagation from this node through the whole graph.

        Builds a topological ordering of every node reachable through
        ``_prev``, sets this node's ``grad`` to 1.0 and then calls each node's
        ``_backward`` from the output towards the leaves. Gradients are
        accumulated, so reset ``grad`` on nodes that are reused across calls.
        """
        # Iterative post-order DFS: an explicit stack avoids RecursionError on
        # long operation chains (e.g. a sum over thousands of terms).
        topo = []
        visited = set()
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))

        self.grad = 1.0

        # topo lists operands before their consumers, so walk it in reverse.
        for node in reversed(topo):
            node._backward()

In [2]:
# Build a tiny expression graph: d = a*b + c.
a = Value(2.0)
b = Value(3.0)
c = Value(4.0)
d = a*b + c
# Operands of d: the a*b product node and c.
d._prev

{Value(data = 4.0), Value(data = 6.0)}

In [23]:
# Operation string recorded on d when it was created.
d._op

'+'

In [24]:
def print_graph(val):
    """Recursively print every non-leaf node reachable from ``val``.

    For each node that has operands, prints the node, its operand collection
    and its operation string, then descends into each operand. Leaf nodes
    (empty ``_prev``) print nothing.
    """
    operands = val._prev
    if not operands:
        return
    print(val, operands, val._op)
    for operand in operands:
        print_graph(operand)

# Walk the graph rooted at d, printing each node that has operands.
print_graph(d)

Value(data = 10.0) {Value(data = 4.0), Value(data = 6.0)} +
Value(data = 6.0) {Value(data = 3.0), Value(data = 2.0)} *


In [77]:
# neuron: forward pass of a single two-input neuron built from Value nodes
import math

# inputs
x1 = Value(2.0)
x2 = Value(0.0)
# weights
w1 = Value(-3.0)
w2 = Value(1.0)
# bias of the neuron (rebinds the name b used in the earlier graph demo)
b = Value(0.881)
# x1w1 + x2w2 + b
x1w1 = x1*w1
x2w2 = x2*w2
x1w1x2w2 = x1w1 + x2w2
n = x1w1x2w2 + b
# o = n.tanh()
# Same tanh, spelled out as (e^{2n} - 1) / (e^{2n} + 1) so that the
# exp / sub / div operators of Value are exercised instead of Value.tanh.
e = (2*n).exp()
o = (e-1) / (e+1)

#### Manual Back Propagation

In [10]:
# Seed d(o)/d(o) = 1, then push it one step back through the op that produced o.
# NOTE(review): the recorded outputs in this section carry execution counts
# 10-19, earlier than the In [77] cell that builds the graph — presumably they
# come from a run where o = n.tanh(); re-run top to bottom to confirm.
o.grad = 1
o._backward()

In [11]:
# Gradient accumulated on n so far.
n.grad

0.00014312714365083412

In [13]:
# n is an addition node, so its gradient is passed unchanged to both operands.
n._backward()
x1w1x2w2.grad

0.00014312714365083412

In [15]:
# Another addition node: the gradient flows unchanged into x1w1 and x2w2.
x1w1x2w2._backward()
x1w1.grad

0.00014312714365083412

In [19]:
# Multiplication node: each operand receives the other operand's data times
# the incoming gradient, so w1.grad should become x1.data * x1w1.grad.
# NOTE(review): the recorded 0.0 below does not match that — presumably w1 had
# been rebound to a fresh Value when this cell ran; verify on a clean run.
x1w1._backward()
w1.grad

0.0

#### Auto Back Propagation

In [79]:
# NOTE(review): this cell's execution count (In [79]) is later than that of the
# o.backward() cell below (In [78]), so the recorded value reflects the state
# after o.backward(); in top-to-bottom order this cell runs before it.
w1.grad

0.00028625428730212794

In [78]:
# Full automatic back-propagation from o.
# The _backward closures accumulate with +=, so if the manual steps above were
# run on this same graph their contributions are added to, not replaced.
o.backward()

In [72]:
# int * Value goes through Value.__rmul__.
2*a

Value(data = 4.0)

### Building Neural Network

In [113]:
import random

class Neuron:
    """A single tanh neuron with ``nin`` weights and one bias.

    Weights and bias are ``Value`` objects initialised uniformly in [-1, 1].
    """

    def __init__(self, nin):
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Return tanh(w . x + b) for the input sequence ``x``."""
        # Start from 0 and add the products left to right, then the bias —
        # the same order the built-in sum() would use.
        act = 0
        for weight, feature in zip(self.w, x):
            act = act + weight*feature
        act = act + self.b
        return act.tanh()

    def parameters(self):
        """Return the weights followed by the bias as a new list."""
        params = list(self.w)
        params.append(self.b)
        return params


class Layer:
    """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        self.neurons = [Neuron(nin) for _ in range(nout)]

    def __call__(self, x):
        """Evaluate every neuron on ``x``.

        Returns the bare output when the layer has exactly one neuron,
        otherwise the list of outputs.
        """
        outs = []
        for neuron in self.neurons:
            outs.append(neuron(x))
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        params = []
        for neuron in self.neurons:
            params.extend(neuron.parameters())
        return params


class MLP:
    """Multi-layer perceptron: a chain of ``Layer`` objects.

    ``nin`` is the input width and ``nout`` is a list with the width of each
    successive layer.
    """

    def __init__(self, nin, nout):
        sizes = [nin] + nout
        # Pair each layer's input width with its output width.
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sizes, sizes[1:])]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return activation

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params
        
        

# Smoke test: 3 inputs -> two layers of 4 neurons -> 1 output neuron.
# NOTE(review): no random seed is set, so the weights (and the value displayed
# below) differ on every run.
x = [2.0, 3.0, -1.0]
mlp = MLP(3, [4, 4, 1])
mlp(x)

# mlp.parameters()


Value(data = 0.7522054968986874)

In [114]:
# Toy training set: four 3-feature inputs ...
xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0]
]
# ... and one target per input, in the same order.
ys = [1.0, -1.0, -1.0, 1.0]

In [115]:
# Gradient-descent training: 20 steps with a fixed step size of 0.05.
for k in range(20):
    # forward pass: predictions and summed squared-error loss
    ypred = [mlp(x) for x in xs]
    loss = sum((yout-ygt)**2 for ygt, yout in zip(ys, ypred))
    print(f'loss on step {k} is {loss.data}')

    # backward pass: gradients accumulate with +=, so clear the old ones first
    for p in mlp.parameters():
        p.grad = 0.0
    loss.backward()

    # update: step each parameter against its gradient
    for p in mlp.parameters():
        p.data += -0.05 * p.grad

loss on step 0 is 2.6541134189807596
loss on step 1 is 0.9795115724683165
loss on step 2 is 0.6512208412708855
loss on step 3 is 0.47795152765428855
loss on step 4 is 0.37178866211972605
loss on step 5 is 0.30122078969614474
loss on step 6 is 0.25151957480179454
loss on step 7 is 0.21490871409074142
loss on step 8 is 0.1869696140437594
loss on step 9 is 0.16503374747862634
loss on step 10 is 0.1474062731871071
loss on step 11 is 0.13296472852090085
loss on step 12 is 0.12093910782359811
loss on step 13 is 0.11078516667123652
loss on step 14 is 0.10210823618864609
loss on step 15 is 0.09461568296049513
loss on step 16 is 0.08808627306139548
loss on step 17 is 0.08234986434643338
loss on step 18 is 0.07727360546090878
loss on step 19 is 0.07275234683557791


In [112]:
# Predictions from the last forward pass of the training loop.
# NOTE(review): the recorded output has execution count In [112], earlier than
# the training cell (In [115]) — presumably left over from a previous, longer
# run; re-run to refresh it.
ypred

[Value(data = 1.0), Value(data = -1.0), Value(data = -1.0), Value(data = 1.0)]

### Finally completed the video & the implementation.
Next steps:
1. complete the assignment
2. Study the source code, especially the demo, which implements different types of datasets with a decision boundary.
3. Move confidently to 'MakeMore'

### Assignment

In [121]:
# here is a mathematical expression that takes 3 inputs and produces one output
from math import sin, cos

def f(a, b, c):
  """Evaluate -a^3 + sin(3b) - 1/c + b^2.5 - a^0.5."""
  # Terms are computed, then combined, in the original left-to-right order so
  # floating-point rounding is unchanged.
  cubic = -a**3
  wave = sin(3*b)
  reciprocal = 1.0/c
  power = b**2.5
  root = a**0.5
  return cubic + wave - reciprocal + power - root
    
# expected answer: the gradient [df/da, df/db, df/dc] of f at (a, b, c) = (2, 3, 4)
ans = [-12.353553390593273, 10.25699027111255, 0.0625]


# now estimate the gradient numerically without any calculus, using
# the approximation we used in the video.
# you should not call the function df from the last cell
# -----------
def get_numerical_grad(a, b, c, h=0.000001, fn=None):
    """Estimate the gradient of a 3-argument function by forward differences.

    Each partial derivative is approximated as (fn(x + h) - fn(x)) / h.

    Args:
        a, b, c: point at which to estimate the gradient.
        h: step size; defaults to the 1e-6 originally hard-coded here.
        fn: function of (a, b, c) to differentiate; defaults to the
            notebook's ``f`` when omitted.

    Returns:
        [d/da, d/db, d/dc] as a list of three numbers.
    """
    if fn is None:
        fn = f
    base = fn(a, b, c)  # shared by all three differences, so evaluate once
    df_da = (fn(a+h, b, c) - base) / h
    df_db = (fn(a, b+h, c) - base) / h
    df_dc = (fn(a, b, c+h) - base) / h
    return [df_da, df_db, df_dc]
    
numerical_grad = get_numerical_grad(2, 3, 4)  # forward-difference estimate at (2, 3, 4)
# -----------

# Compare each component with the expected gradient to within 1e-5.
for dim in range(3):  
    ok = 'OK' if abs(numerical_grad[dim] - ans[dim]) < 1e-5 else 'WRONG!'  
    print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad[dim]}")



OK for dim 0: expected -12.353553390593273, yours returns -12.353559348809995
OK for dim 1: expected 10.25699027111255, yours returns 10.256991666679482
OK for dim 2: expected 0.0625, yours returns 0.062499984743169534


In [127]:
# there is an alternative formula that provides a much better numerical 
# approximation to the derivative of a function.
# learn about it here: https://en.wikipedia.org/wiki/Symmetric_derivative
# implement it. confirm that for the same step size h this version gives a
# better approximation.

# -----------
def get_numerical_grad2(a, b, c, h=0.001, fn=None):
    """Estimate the gradient of a 3-argument function by symmetric differences.

    Each partial derivative is approximated as
    (fn(x + h) - fn(x - h)) / (2 * h).

    Args:
        a, b, c: point at which to estimate the gradient.
        h: step size; defaults to the 0.001 originally hard-coded here. Pass
            the same ``h`` as the forward-difference version to compare the
            two formulas at an equal step size.
        fn: function of (a, b, c) to differentiate; defaults to the
            notebook's ``f`` when omitted.

    Returns:
        [d/da, d/db, d/dc] as a list of three numbers.
    """
    if fn is None:
        fn = f
    df_da = (fn(a+h, b, c) - fn(a-h, b, c)) / (2*h)
    df_db = (fn(a, b+h, c) - fn(a, b-h, c)) / (2*h)
    df_dc = (fn(a, b, c+h) - fn(a, b, c-h)) / (2*h)
    return [df_da, df_db, df_dc]
    

numerical_grad2 = get_numerical_grad2(2, 3, 4)  # symmetric-difference estimate at (2, 3, 4)
# -----------

# Compare each component with the expected gradient to within 1e-5.
for dim in range(3):
  ok = 'OK' if abs(numerical_grad2[dim] - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad2[dim]}")

OK for dim 0: expected -12.353553390593273, yours returns -12.353554401639766
OK for dim 1: expected 10.25699027111255, yours returns 10.256994551617105
OK for dim 2: expected 0.0625, yours returns 0.06250000390650712


#### Support for Softmax

In [160]:
# Value class starter code, with many functions taken out
from math import exp, log

class Value:
  """Scalar autograd node with the operations needed for softmax and
  negative log likelihood: +, -, *, /, **, exp and log.

  Attributes:
    data: the wrapped Python number.
    grad: accumulated derivative of the final output w.r.t. this node.
    _backward: closure that propagates this node's ``grad`` to its operands.
    _prev: set of operand ``Value`` objects this node was computed from.
    _op: short string naming the producing operation ('' for leaves).
    label: optional name for the node; only stored, never read here.
  """

  def __init__(self, data, _children=(), _op='', label=''):
    self.data = data
    self.grad = 0.0
    self._backward = lambda: None  # leaves have nothing to propagate
    self._prev = set(_children)
    self._op = _op
    self.label = label

  def __repr__(self):
    return f"Value(data={self.data})"

  def __add__(self, other): # exactly as in the video
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')

    def _backward():
      self.grad += 1.0 * out.grad
      other.grad += 1.0 * out.grad

    out._backward = _backward

    return out

  def __radd__(self, other):
    """Support ``number + Value`` (also what makes ``sum()`` work)."""
    return self + other

  def __neg__(self):
    return self * (-1)

  def __sub__(self, other):
    """Return ``self - other``, built from addition and negation."""
    return self + (-other)

  def __rsub__(self, other):
    """Support ``number - Value``."""
    return other + (-self)

  def __mul__(self, other):
    """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')

    def _backward():
      self.grad += other.data * out.grad
      other.grad += self.data * out.grad

    out._backward = _backward

    return out

  def __rmul__(self, other):
    """Support ``number * Value``."""
    return self * other

  def __pow__(self, other):
    """Return ``self ** other`` for a constant int/float exponent."""
    assert isinstance(other, (int, float))
    out = Value(self.data**other, (self, ), '**')

    def _backward():
      self.grad += other * self.data**(other-1) * out.grad

    out._backward = _backward

    return out

  def __truediv__(self, other):
    """Return ``self / other`` expressed as ``self * other**-1``."""
    return self * (other**-1)

  def __rtruediv__(self, other):
    """Support ``number / Value``."""
    return other * (self**-1)

  def exp(self):
    """Return ``e ** self`` as a new node."""
    out = Value(math.exp(self.data), (self,), 'exp')

    def _backward():
      # d/dx e^x = e^x, which is exactly out.data
      self.grad += out.data * out.grad

    out._backward = _backward

    return out

  def log(self):
    """Return the natural logarithm of ``self`` as a new node.

    ``math.log`` raises ValueError when ``self.data`` is not positive.
    """
    out = Value(math.log(self.data), (self,), 'log')

    def _backward():
      self.grad += (1/self.data) * out.grad

    out._backward = _backward

    return out

  def backward(self):
    """Run back-propagation from this node through the whole graph.

    Builds a topological ordering of every node reachable through ``_prev``,
    sets this node's ``grad`` to 1.0 and calls each node's ``_backward`` from
    the output towards the leaves. Gradients are accumulated, so reset
    ``grad`` on nodes that are reused across calls.
    """
    # Iterative post-order DFS: an explicit stack avoids RecursionError on
    # long operation chains (e.g. a sum over thousands of terms).
    topo = []
    visited = set()
    stack = [(self, False)]
    while stack:
      node, children_done = stack.pop()
      if children_done:
        topo.append(node)
        continue
      if node in visited:
        continue
      visited.add(node)
      stack.append((node, True))
      for child in node._prev:
        if child not in visited:
          stack.append((child, False))

    self.grad = 1.0
    # topo lists operands before their consumers, so walk it in reverse.
    for node in reversed(topo):
      node._backward()

In [161]:
# without referencing our code/video __too__ much, make this cell work
# you'll have to implement (in some cases re-implemented) a number of functions
# of the Value object, similar to what we've seen in the video.
# instead of the squared error loss this implements the negative log likelihood
# loss, which is very often used in classification.

# this is the softmax function
# https://en.wikipedia.org/wiki/Softmax_function
def softmax(logits):
  """Return the softmax of ``logits`` as a list.

  Every element must provide ``.exp()`` and support ``+`` (including
  ``0 + element``) and ``/``; each output is exp(logit) divided by the sum of
  all the exponentials.
  """
  exps = []
  total = 0  # same starting value the built-in sum() uses
  for logit in logits:
    e = logit.exp()
    exps.append(e)
    total = total + e
  return [e / total for e in exps]

# this is the negative log likelihood loss function, pervasive in classification
logits = [Value(0.0), Value(3.0), Value(-2.0), Value(1.0)]
probs = softmax(logits)
loss = -probs[3].log() # dim 3 acts as the label for this input example
loss.backward()
print(loss.data)

# Expected d(loss)/d(logit) for each of the four logits; rebinds the name ans
# used by the numerical-gradient cells above.
ans = [0.041772570515350445, 0.8390245074625319, 0.005653302662216329, -0.8864503806400986]
for dim in range(4):
  ok = 'OK' if abs(logits[dim].grad - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {logits[dim].grad}")


2.1755153626167147
OK for dim 0: expected 0.041772570515350445, yours returns 0.041772570515350445
OK for dim 1: expected 0.8390245074625319, yours returns 0.8390245074625319
OK for dim 2: expected 0.005653302662216329, yours returns 0.005653302662216329
OK for dim 3: expected -0.8864503806400986, yours returns -0.8864503806400986


In [173]:
# verify the gradient using the torch library
# torch should give you the exact same gradient
import torch

# Same four logits as double-precision torch scalars (rebinds the name logits).
logits = [torch.tensor(0.0).double(), torch.tensor(3.0).double(), torch.tensor(-2.0).double(), torch.tensor(1.0).double()]

# Ask autograd to track gradients for every logit.
for tensor in logits:
    tensor.requires_grad = True 

# The same softmax helper is reused here, this time on torch tensors.
probs = softmax(logits)
loss = -probs[3].log() # dim 3 acts as the label for this input example
loss.backward()
print(loss.data)

# Expected d(loss)/d(logit), identical to the values used for the Value version.
ans = [0.041772570515350445, 0.8390245074625319, 0.005653302662216329, -0.8864503806400986]
for dim in range(4):
  ok = 'OK' if abs(logits[dim].grad - ans[dim]) < 1e-5 else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {logits[dim].grad}")

tensor(2.1755, dtype=torch.float64)
OK for dim 0: expected 0.041772570515350445, yours returns 0.041772570515350445
OK for dim 1: expected 0.8390245074625319, yours returns 0.8390245074625319
OK for dim 2: expected 0.005653302662216329, yours returns 0.005653302662216329
OK for dim 3: expected -0.8864503806400986, yours returns -0.8864503806400988
