- Automatic differentiation
- chain-rule
- closed-form symbolic derivative
- finite differences method
- back propagation
- primal evaluation

In [2]:
from math import exp, sin, cos

simple function example

In [None]:
# Note: we restrict ourselves to a computational chain, i.e. a single input x is passed
# through a sequence of operations, each of which takes one input to one output.
def f(x):
    """Primal function: evaluate exp(sin(sin(x))) for a scalar x."""
    return exp(sin(sin(x)))

In [5]:
# evaluate the primal function at x = 2.0
f(2.0)

2.2013533791690376

now we take the closed-form symbolic derivative

In [7]:
# Apply the chain rule from the outermost function inwards: the derivative of exp is exp itself,
# then multiply by the derivative of each nested sin (cos evaluated at that layer's input).
def f_prime(x):
    """Closed-form derivative of exp(sin(sin(x))) with respect to x."""
    return exp(sin(sin(x))) * cos(sin(x)) * cos(x)

In [None]:
# evaluate the closed-form derivative at the same point, x = 2.0
f_prime(2.0)

-0.562752038662712

we can validate this derivative using finite differences approximations

In [9]:
# forward finite difference: evaluate the primal function at 2.0 and at 2.0 plus a small step (1e-8),
# then divide the change in the output by the step size
(f(2.0 + 1e-8) - f(2.0)) / 1e-8

-0.5627520671680486

In [None]:
# next we build back-prop rules for the primitives used in f, namely exp and sin:

# note: there are routines that compute the sine and cosine at the same time; below they are computed separately (sin in the primal pass, cos in the pullback)
def sin_backprop_rule(x):
    """Back-prop rule for sin.

    Returns a pair ``(y, sin_pullback)`` where ``y = sin(x)`` is the primal
    output and ``sin_pullback`` is a closure over ``x`` that maps a cotangent
    of the output to a cotangent of the input.
    """
    primal_out = sin(x)

    def sin_pullback(y_cotangent):
        # d/dx sin(x) = cos(x), so the output cotangent is scaled by cos(x)
        return y_cotangent * cos(x)

    return primal_out, sin_pullback

In [15]:
def exp_backprop_rule(x):
    """Back-prop rule for exp.

    Returns a pair ``(y, exp_pullback)`` where ``y = exp(x)`` is the primal
    output and ``exp_pullback`` maps a cotangent of the output to a cotangent
    of the input.
    """
    primal_out = exp(x)

    def exp_pullback(y_cotangent):
        """Scale the output cotangent by the captured primal output exp(x)."""
        # the derivative of exp is exp itself, so the stored output is reused
        return y_cotangent * primal_out

    return primal_out, exp_pullback

In [None]:
# (rule library as a dict) - associate a back-prop rule to each of the functions we want to use in our computational chain
primative_rules: dict = {
    # primitive operation -> rule returning (primal output, pullback closure)
    sin: sin_backprop_rule,
    exp: exp_backprop_rule,
}

In [None]:
# vector-jacobian-product
def vjp(chain: list, primal):
    """Evaluate a computational chain and build its vector-Jacobian product.

    ``chain`` is a sequence of primitive operations applied in order, starting
    from the value ``primal``. Each operation is looked up in
    ``primative_rules`` to obtain its back-prop rule.

    Returns a pair ``(value, pullback)``: the primal output of the whole chain
    and a function that takes a cotangent of that output and back-propagates
    it through the recorded pullbacks to a cotangent of the input.
    """
    # pullback closures recorded in the order the operations were applied
    recorded_pullbacks: list = []
    value = primal

    # forward (primal) pass: apply each rule and remember its pullback
    for op in chain:
        value, op_pullback = primative_rules[op](value)
        recorded_pullbacks.append(op_pullback)

    def pullback(cotangent):
        """Propagate ``cotangent`` back through the recorded pullbacks, last operation first."""
        # reverse pass
        for op_pullback in recorded_pullbacks[::-1]:
            cotangent = op_pullback(cotangent)
        return cotangent

    return value, pullback

In [None]:
# the chain lists the operations from the innermost to the outermost, i.e. exp(sin(sin(x))) becomes [sin, sin, exp];
# vjp returns the primal output together with the pullback function
out, back = vjp([sin, sin, exp], 2.0)

In [None]:
# the primal output of the chain; this is the same value as evaluating f(2.0) directly
out

2.2013533791690376

In [None]:
# A VJP computes the Jacobian left-multiplied by a (cotangent) vector. For this scalar chain the Jacobian is
# just the derivative, so passing a cotangent of 1.0 applies no scaling and returns the derivative as is.
back(1.0)

-0.562752038662712

In [24]:
# convenience function
def val_and_grad(chain, x):
    """Return ``(value, derivative)`` of the computational chain evaluated at ``x``.

    Runs ``vjp`` on the chain and calls the resulting pullback with a
    cotangent of 1.0 to obtain the derivative.
    """
    value, pullback = vjp(chain, x)
    return value, pullback(1.0)

In [25]:
# value and derivative of exp(sin(sin(x))) at x = 2.0 from a single call
val_and_grad([sin, sin, exp], 2.0)

(2.2013533791690376, -0.562752038662712)

In [26]:
# cross-check against the primal function and the closed-form derivative
f(2.0), f_prime(2.0)

(2.2013533791690376, -0.562752038662712)

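In comparison, AD provides derivative information accurate to machine precision: it reuses the same sequence of operations (the computational graph) that is used for the primal evaluation, rather than approximating the derivative with a finite step as finite differences do. This is why the AD result above matches the closed-form derivative exactly, while the finite-difference value only agrees to about seven digits.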