* THIS IS THE MICROGRAD NOTEBOOK, WHICH STARTS BY IMPLEMENTING THE VARIANTS OF TANH *

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt
import random
from typing import Any
%matplotlib inline

In [2]:
# let move to nn, we should start by building basic datastructures
class Value:
    """A scalar that records how it was computed so gradients can be back-propagated.

    Each arithmetic operation returns a new ``Value`` whose ``_prev`` holds the
    operand nodes and whose ``_backward`` closure applies the chain rule for that
    single operation. Calling :meth:`backward` on a result node fills in ``grad``
    on every node that contributed to it.

    Plain numbers may appear on either side of ``+``, ``-``, ``*`` and ``/``;
    they are wrapped in a ``Value`` automatically.
    """

    def __init__(self, data, _children=(), _op='', label='') -> None:
        self.data = data
        # d(output)/d(self); starts at 0 and is accumulated by the _backward closures
        self.grad = 0.0
        # chain-rule step for the op that produced this node; leaf nodes do nothing
        self._backward = lambda: None
        # operand nodes this value was computed from (a set, so `a + a` stores `a` once)
        self._prev = set(_children)
        # textual name of the producing operation, used when drawing the graph
        self._op = _op
        self.label = label

    def __repr__(self) -> str:
        return f"Value(data={self.data})"

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        # addition routes the incoming gradient unchanged to both operands;
        # '+=' so a node used several times accumulates every contribution
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):  # other + self
        return self + other

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        # local derivative of a product w.r.t. one operand is the other operand
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __rmul__(self, other):  # other * self
        return self * other

    # negation, subtraction and division are all expressed through add / mul / pow,
    # so they need no backward rule of their own
    def __neg__(self):  # -self
        return self * -1

    def __sub__(self, other):  # self - other
        return self + (-other)

    def __rsub__(self, other):  # other - self, with a plain number on the left
        return (-self) + other

    def __pow__(self, other):
        """Raise to a constant power; ``other`` must be an int or float, not a Value."""
        assert isinstance(other, (int, float)), "only int/float powers are supported"
        out = Value(self.data**other, (self,), f'**{other}')

        # power rule: d/dx x**k = k * x**(k - 1)
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward

        return out

    def __truediv__(self, other):  # self / other
        return self * other**-1

    def __rtruediv__(self, other):  # other / self, with a plain number on the left
        return other * self**-1

    def tanh(self):
        """Return tanh(self) as a new node."""
        # math.tanh saturates to +/-1.0 for large inputs, whereas the explicit
        # (e^{2x} - 1) / (e^{2x} + 1) form overflows inside math.exp
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')

        # d/dx tanh(x) = 1 - tanh(x)**2
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward

        return out

    def exp(self):
        """Return e**self as a new node."""
        x = self.data
        out = Value(math.exp(x), (self,), 'exp')

        # d/dx e**x = e**x, which is exactly out.data
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """Back-propagate from this node, accumulating ``grad`` on every ancestor.

        Sets ``self.grad`` to 1.0 and then runs each node's ``_backward`` in
        reverse topological order. Gradients of other nodes are accumulated,
        not reset, so zero them first when calling this repeatedly.
        """
        # Iterative post-order DFS: an explicit stack instead of recursion so a
        # long chain of operations cannot hit Python's recursion limit.
        topo = []
        visited = set()
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)  # every operand of `node` is already in topo
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))  # revisit once the operands are emitted
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()

In [None]:
# quick manual check of the exp, division and subtraction operators on Value
a = Value(2.0)
a.exp()  # e**2

b = Value(4.0)
a/b  # implemented as a * b**-1

b - a  # implemented as b + (-a)

In [None]:
# to implement division, the following equivalent forms are considered
a / b
a * (1/b)  # note: the plain number is on the left here, so this form needs Value to support reflected division (__rtruediv__)
a * (b**-1)

# so let's implement something more powerful: (b**-1), i.e. 'b' to the power '-1'
# NOTE: 'x' and 'k' are not assigned in any cell above; the next line is illustrative only
x**k # 'x' to the power of some constant 'k' (int or float); we must be able to differentiate x**k, and k = -1 is the division special case

In [3]:
# time to now visualize the operations graph (bit complicated)
# lets use graphviz
from graphviz import Digraph

def trace(root):
    """Collect every node reachable from ``root`` through ``_prev``.

    Returns a ``(nodes, edges)`` pair of sets, where each edge is an
    ``(operand, result)`` tuple pointing from a node to the node built from it.
    """
    seen, links = set(), set()

    def visit(node):
        # a node reachable along several paths is only expanded once
        if node in seen:
            return
        seen.add(node)
        for operand in node._prev:
            links.add((operand, node))
            visit(operand)

    visit(root)
    return seen, links

def draw_dot(root):
    """Build a left-to-right graphviz ``Digraph`` (SVG) of the graph ending at ``root``.

    Every node becomes a record box showing its label, data and grad. A node
    produced by an operation additionally gets a small op node feeding into it,
    and its operands are wired to that op node rather than to the box itself.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = Left to Right
    nodes, edges = trace(root)

    for node in nodes:
        node_id = str(id(node))
        record = "{ %s | data %.4f | grad %.4f}" % (node.label, node.data, node.grad)
        graph.node(name=node_id, label=record, shape='record')
        if not node._op:
            continue  # leaf value: no operation produced it
        # synthetic op node, named by the value's id followed by the op string
        op_id = node_id + node._op
        graph.node(name=op_id, label=node._op)
        graph.edge(op_id, node_id)

    # connect each operand to the op node of the value it feeds into
    for src, dst in edges:
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph


In [None]:
# let's build a single neuron by hand

# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# bias of the neuron; with these values n = -6 + 0 + b is about 0.8814 and tanh(n) is about 0.7071
b = Value(6.8813735870195432, label='b')

# x1*w1 + x2*w2 + b, built step by step so every intermediate node gets a label
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'

#draw_dot(n)

# now apply the activation function, tanh. tanh cannot be built from + and * alone;
# it is a hyperbolic function, so it needs exponentiation
# https://en.wikipedia.org/wiki/Hyperbolic_functions
# tanh x = sinh x / cosh x = (e^{x} - e^{-x}) / (e^{x} + e^{-x}) = (e^{2x} - 1) / (e^{2x} + 1)

o = n.tanh(); o.label='o'

# back-propagate from the output so every node above gets its grad
o.backward()

In [None]:
# let's inspect the result of the automatic backward pass: draw the graph with data and grad on every node
draw_dot(o)

In [None]:
# after implementing exp, pow, div and sub, it is time to break up the tanh above and build it from exp instead

# let's build a single neuron by hand

# inputs x1, x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# weights w1, w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# bias of the neuron
b = Value(6.8813735870195432, label='b')

# x1*w1 + x2*w2 + b, built step by step so every intermediate node gets a label
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n = x1w1x2w2 + b; n.label = 'n'

#draw_dot(n)

# now apply the activation function, tanh. tanh cannot be built from + and * alone;
# it is a hyperbolic function, so it needs exponentiation
# https://en.wikipedia.org/wiki/Hyperbolic_functions
# tanh x = sinh x / cosh x = (e^{x} - e^{-x}) / (e^{x} + e^{-x}) = (e^{2x} - 1) / (e^{2x} + 1)

# ------------
# tanh(n) written out as (e^{2n} - 1) / (e^{2n} + 1) using the Value ops
e = (2*n).exp()
o = (e - 1) / (e + 1)
# ------------

o.label='o'
o.backward()
draw_dot(o)

In [None]:
# lets now use pytorch to do the above
import torch

# torch.Tensor([[1, 2, 3], [4, 5, 6]])

# Build every tensor directly as float64. torch.Tensor([...]) creates a float32
# tensor, so the previous torch.Tensor([...]).double() rounded each literal to
# float32 before widening it, which made the result differ from the
# pure-Python Value version after about seven digits.
x1 = torch.tensor([2.0], dtype=torch.float64, requires_grad=True)
x2 = torch.tensor([0.0], dtype=torch.float64, requires_grad=True)
w1 = torch.tensor([-3.0], dtype=torch.float64, requires_grad=True)
w2 = torch.tensor([1.0], dtype=torch.float64, requires_grad=True)
b = torch.tensor([6.8813735870195432], dtype=torch.float64, requires_grad=True)

# same neuron as above: weighted sum plus bias, squashed by tanh
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)

print(o.data.item())
o.backward()

# gradients of the output with respect to each input and weight
print('-----')
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())

In [4]:
# time to build two layer perceptron

from typing import Any

# single neuron
class Neuron:
    """A single tanh neuron with ``nin`` weights and one bias.

    Weights and bias are ``Value`` objects drawn uniformly from [-1, 1].
    """

    def __init__(self, nin) -> None:
        # one weight per input (x1*w1, x2*w2, ...), created before the bias
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Forward pass: tanh(b + w . x)."""
        # start from the bias and add one w_i * x_i product at a time;
        # zip stops at the shorter of the weights and the inputs
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        return act.tanh()

    def parameters(self):
        """Return the trainable values of this neuron: all weights, then the bias."""
        params = list(self.w)
        params.append(self.b)
        return params

# lets define the layer now
class Layer:
    """A group of ``nout`` independent neurons that each take the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        neurons = []
        for _ in range(nout):
            neurons.append(Neuron(nin))
        self.neurons = neurons

    def __call__(self, x):
        """Feed ``x`` to every neuron; return a list of outputs, or the bare output if there is exactly one."""
        outs = list(map(lambda neuron: neuron(x), self.neurons))
        if len(outs) == 1:
            return outs[0]
        return outs

    def parameters(self):
        """Return the parameters of every neuron in the layer as one flat list."""
        return [p for neuron in self.neurons for p in neuron.parameters()]

# lets define a MLP - multi layer perceptron
class MLP:
    """Multi-layer perceptron: a chain of ``Layer`` objects applied one after another.

    ``nin`` is the number of inputs and ``nouts`` lists the width of every
    layer, e.g. ``MLP(3, [4, 4, 1])``. ``nouts`` may be any iterable of sizes
    (list, tuple, ...), not only a list.
    """

    def __init__(self, nin, nouts) -> None:
        # e.g. nin=3, nouts=[4, 4, 1] -> sz=[3, 4, 4, 1]; unpacking accepts any iterable,
        # whereas `[nin] + nouts` only worked when nouts was a list
        sz = [nin, *nouts]
        # consecutive size pairs are the (inputs, outputs) of each layer
        self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sz, sz[1:])]

    def __call__(self, x):
        """Forward pass: the output of each layer is the input of the next."""
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        """Return every parameter of every layer as one flat list."""
        return [p for layer in self.layers for p in layer.parameters()]

In [5]:
#x = [2.0, 3.0]
#n = Neuron(2) # 2 because there are 2 inputs
#n = Layer(2, 3) # 2 is the no. of inputs, 3 is the no. of neurons in the layer
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x) # the output differs on every run because the weights are initialized randomly

Value(data=0.6775668414781266)

In [6]:
n.parameters() # all weights and biases in the MLP
# len(n.parameters()) # 41 values in total: (3*4 + 4) + (4*4 + 4) + (4*1 + 1)

[Value(data=-0.40447442034189685),
 Value(data=0.3270661753589519),
 Value(data=-0.8589789507011809),
 Value(data=-0.21683436190365502),
 Value(data=-0.1373194731128462),
 Value(data=-0.13057972780855365),
 Value(data=-0.2859234903959169),
 Value(data=-0.18513984249606596),
 Value(data=0.29122764752528085),
 Value(data=0.926525469528384),
 Value(data=-0.006208995348336188),
 Value(data=0.4044028454736859),
 Value(data=-0.4298838237462337),
 Value(data=0.884838759277786),
 Value(data=-0.10500733254512351),
 Value(data=-0.7374267639432341),
 Value(data=0.47718070013562697),
 Value(data=0.3429809745901793),
 Value(data=0.7930730034321869),
 Value(data=0.6369096766213207),
 Value(data=-0.39720584514831647),
 Value(data=0.35099839520421106),
 Value(data=0.15553684131718293),
 Value(data=-0.8461602928926861),
 Value(data=0.5600956069398904),
 Value(data=-0.3873631052707234),
 Value(data=-0.054073187280274215),
 Value(data=-0.09009851903098798),
 Value(data=0.23036389612346886),
 Value(data=0

In [None]:
# let's draw the whole expression graph of one forward pass through the MLP
draw_dot(n(x))

In [7]:
# let's now build a tiny binary classifier with our micrograd
# four training examples with three features each
xs = [
    [2.0, 3.0, 1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]
ys = [1.0, -1.0, -1.0, 1.0] # desired targets, one per example in xs

In [8]:
# let's define the loss now: a single number that measures how well the nn is doing - here the sum of squared errors
# ygt is the ground truth and yout is the prediction; we get one loss per input. squaring makes every term non-negative
# the final loss is the sum of all the per-input losses (a sum, not a mean)
ypred = [n(x) for x in xs] # still using the MLP defined above, i.e. MLP(3, [4, 4, 1]); it is run 4 times here, i.e. 4 forward passes
loss = sum([(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]) # the 4 forward passes are extended by this math expression
loss

Value(data=1.7732768129779868)

In [22]:
# loss after nudging the params (run the cells below, then re-run this one to see the result) -> the loss should go down
ypred = [n(x) for x in xs] # still using the MLP defined above, i.e. MLP(3, [4, 4, 1]); it is run 4 times here, i.e. 4 forward passes
loss = sum([(yout - ygt)**2 for ygt, yout in zip(ys, ypred)]) # the 4 forward passes are extended by this math expression
loss

Value(data=0.07536987698966498)

In [23]:
# back-propagate from the loss so every parameter gets its grad; this is the first step towards reducing the loss
# NOTE: Value accumulates grads with '+=', so running this again without zeroing the grads first adds to the old values
loss.backward()

In [10]:
# something magical happened above; let's check the grad of the first weight of the first layer's first neuron
n.layers[0].neurons[0].w[0].grad


-0.4412833558154417

In [11]:
# current value of that same weight, before any update step
n.layers[0].neurons[0].w[0].data

-0.40447442034189685

In [None]:
# the loss is defined by the expression above; draw_dot draws its whole graph, i.e. all the forward passes plus the loss math
draw_dot(loss)

So what happened above is the following:
1. 4 forward passes of the MLP, one per input example
2. each pass produced one output, giving 4 predictions
3. all 4 outputs were combined in the loss expression, which is itself forward-passed
4. we end up with a single final value (L)

Now it is time to change the data of the weights and biases, using their grads, so that overall L is reduced. Changing the inputs doesn't make sense since the input data never changes!!!

In [21]:
# let's now iterate over all the params and nudge each one
# remember that the gradient points in the direction in which the loss increases
for p in n.parameters():
    # the gradient (the vector of all grads) points in the direction of increasing loss L

    # the change in L w.r.t. a node is given by that node's grad. If the grad is negative, reducing the node's data will increase L;
    # if the grad is positive, increasing the node's data will increase L. Therefore the step gets a minus sign, because we want to minimize L.

    # modify p.data slightly against the direction of the gradient; 0.1 is the step size, '-' because we want to minimize the loss
    p.data += -0.1 * p.grad

In [13]:
# after nudging the params above, the data of this weight has changed slightly
n.layers[0].neurons[0].w[0].data

-0.40006158678374243

In [28]:
# predictions from the most recent forward pass, to compare against the targets in ys
ypred

[Value(data=0.9999999964210621),
 Value(data=-0.9999907053746284),
 Value(data=-0.9999999503491005),
 Value(data=0.9999999960438181)]

In [27]:
# the manual steps from the cells above, repeated in a loop: forward pass, backward pass, nudge -> L goes down -> predictions get better
for k in range(10):
    # forward pass: one prediction per example, then the summed squared error
    ypred = list(map(n, xs))
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

    # zero grad: Value accumulates grads, so flush them before every backward pass
    for p in n.parameters():
        p.grad = 0.0

    # backward pass
    loss.backward()

    # update: step every parameter against its gradient
    for p in n.parameters():
        p.data -= 0.1 * p.grad

    print(k, loss)

0 Value(data=8.639263582609676e-11)
1 Value(data=8.639262678652186e-11)
2 Value(data=8.639261774696947e-11)
3 Value(data=8.63926087073963e-11)
4 Value(data=8.639259966784485e-11)
5 Value(data=8.639259062827185e-11)
6 Value(data=8.639258158869929e-11)
7 Value(data=8.639257254914928e-11)
8 Value(data=8.639256350957767e-11)
9 Value(data=8.63925544700286e-11)


In [None]:
# Following is what we did
# create the datastructure
# create the graph method
# create the MLP
# create the training loop