following this video first to understand backprop... https://youtu.be/VMj-3S1tku0?si=eP7Ai1CO0BHh-yeS 

also note: to get this working on vscode, needed to install virtual environments T_T 

In [47]:
import math
import numpy as np 
import matplotlib.pyplot as plt

(at first the videos covers gradients, the "speed of change" at any point), then moving onto the structure of "micrograd"

In [48]:
class Value:
    # this is the constructor function in python classes 
    def __init__(self, data, _children=(), _op='', label=''): 
        self.data = data 
        self.grad = 0  # initially, we assume it's zero.
        self._prev = set(_children)
        self._op = _op
        self.label = label
    
    # this defines the string REPRESENTATION of the object 
    def __repr__(self): 
        return f"Value(data={self.data})"
    
    # we need to enable adding as well of these Value objects
    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+') # creates a new value object
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*') 
        return out
    


In [49]:
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
a + b # doesn't work until... implemented, now a.__add__(b) under the hood
a * b # doesn't work until implementing __mul__

Value(data=-6.0)

In [50]:
# now, we want to be able to SHOW THE CONNECTIVE TISSUE - GRAPHS
# what values produce other values??

# adding children... we want to now include that as part of value.
c = Value(10.0, label='c')
d = a*b + c 
d.label = 'd'
d._prev
d._op

'+'

Now we want to be able to draw a graph of the nodes!

In [51]:
from graphviz import Digraph # Digraph stands for directed graph (ok makes sense)

# using graph traversal 
def trace(root):
    # we are indeed finding all the nodes and edges in a graph 
    nodes, edges = set(), set()

    def build(v):
        # unvisited nodes only duh 
        if v not in nodes:
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)

    build(root)
    return nodes, edges

def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right

    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n)) # id is a built in python function, unique (member addr)

        # we create "records" for each node
        dot.node(name = uid, label = "{data %.4f}" % (n.data, ), shape='record')
        if n._op:
            # when this node is created by an operation 
            dot.node(name = uid + n._op, label = n._op)
            # add "operation" nodes to it 
            dot.edge(uid + n._op, uid)
    
    for n1, n2 in edges: 
        # connect n1 and n2 -- the "nodes" with the name w an edge
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    
    return dot

In [52]:
draw_dot(d) # errors i think due to graphviz not properly being set up in my python evironment. oh well.
f = Value(-2.0, label ='f')
L = d * f ; L.label = 'L'

now we want to calculate backprop - calculate the gradient of all nodes -- with respect to L.

In [53]:
# L to L: 1. 

L.grad = 1

# L = d * f, dL/dd = f.


# so he is using the definition of derivatives as TAKING A FUNCTION TO INFINITY

# (f(x+h) - f(x))/ h --> right, (function input changes by h, divided by h)
# given that "d" is the "x" in our function;
# (d+h)*f - d*f) / h
# = ((d*f) + (h*f) - (d*f)) / h
# = h*f/h 
# = f.

d.grad = f.data

# then dL/df = d.

f.grad = d.data

In [71]:
# we can verify these numbers with small step sizes: 

def gradient_checker():
    h = 0.001 

    a = Value(2.0, label = 'a')
    b = Value(-3.0, label = 'b')
    c = Value(10.0, label = 'c')
    e = a*b; e.label = 'e'
    # e.data += h #2 
    # c.data += h # 2 
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    # f.data += h # this returns -3.99999
    # d.data += h # this returns 2.000
    L = d*f; L.label = 'L'
    L1 = L.data

    a = Value(2.0, label = 'a')
    b = Value(-3.0, label = 'b')
    c = Value(10.0, label = 'c')
    e = a*b; e.label = 'e'
    d = e + c; d.label = 'd'
    f = Value(-2.0, label='f')
    L = d*f; L.label = 'L'
    L2 = L.data
    
    print((L2-L1)/h)

gradient_checker()


2.000000000000668


In [64]:
#  and then as we now want to check dc / dL ... 
# first calculating dd / dc: 

# f(x+h) - f(x) / h
# d = c + e, 
# (c+e+h) - (c-e) / h = h / h = 1

# dd/de and dd/dc = 1. 

# put together -> CHAIN RULE : dz/dx = dz/dy * dy/dz MULTIPLY THE DERIVATIVES.
# i like this quote actually - "if a car travels twice as fast as a bike, and a bike 4x fast as a man, then the car travels 8x fast as a man (2*4) = 8"

# OMFG, then we do: 
# dL/dc = dL/dd * dd/dc = 2 * 1 = 2
# same for dL/de = dL/dd * dd/de = 2 * 1 = 2

