In [1]:
# Import TensorFlow
import tensorflow as tf

In [2]:
# 2x2 tensor of ones used as the input to differentiate against
x = tf.ones(shape=(2, 2))
x.numpy()

array([[1., 1.],
       [1., 1.]], dtype=float32)

In [3]:
# Record the forward pass on a GradientTape so it can be differentiated afterwards
with tf.GradientTape() as t:
    # Explicitly ask the tape to track x (created with tf.ones above)
    t.watch(x)
    y = tf.math.reduce_sum(x)
    z = y * y

# Both results remain available once the recording context has closed
print("y: ", y.numpy())
print("z: ", z.numpy())

y:  4.0
z:  16.0


In [4]:
# Gradient of z with respect to the original input tensor x (chain rule):
#   z = y^2 and y = sum(x)
#   dz/dx_ij = dz/dy * dy/dx_ij = 2y * 1 = 2 * sum(x) = 2 * 4 = 8.0 for every entry
dz_dx = t.gradient(target=z, sources=x)
print("dz/dx: ", dz_dx.numpy())

dz/dx:  [[8. 8.]
 [8. 8.]]


In [5]:
# Check that every entry of the gradient equals the expected value.
# Comparing the whole array at once covers all elements whatever the shape of x,
# instead of slicing the tensor at a hard-coded list of index pairs.
assert (dz_dx.numpy() == 8.0).all(), "expected every entry of dz/dx to be 8.0"

### Intermediate Gradient Computation

You can also request gradients of the output with respect to intermediate values computed inside a "recorded" tf.GradientTape context.

In [6]:
# Using GradientTape API for AutoDiff
# The same forward pass is recorded again on a fresh tape, rebinding t, y and z.
# A default (non-persistent) tape releases its resources once gradient() has been
# called, so the tape used for dz/dx earlier cannot serve the next gradient request.
with tf.GradientTape() as t:
    t.watch(x)
    y = tf.reduce_sum(x)
    print("y: ", y.numpy())
    z = tf.multiply(y, y)
    print("z: ", z.numpy())

y:  4.0
z:  16.0


In [7]:
# Gradient of z with respect to the intermediate tensor y:
#   z = y^2 and y = sum(x) = 4.0
#   dz/dy = 2y = 2 * 4.0 = 8.0
dz_dy = t.gradient(target=z, sources=y)
print("dz/dy: ", dz_dy.numpy())

dz/dy:  8.0


### Persistent Gradients

By default, the resources held by a GradientTape are released as soon as the GradientTape.gradient() method is called. To compute multiple gradients over the same computation, create a persistent gradient tape. A persistent tape allows multiple calls to the gradient() method, because its resources are only released when the tape object is garbage collected.

In [8]:
# Scalar input tensor for the persistent-tape example
x = tf.constant(value=3.0)
x.numpy()

3.0

In [9]:
# Record on a persistent GradientTape so gradient() can be called more than once
with tf.GradientTape(persistent=True) as t:
    t.watch(x)
    y = tf.multiply(x, x)
    z = tf.multiply(y, y)
    print("y: ", y.numpy())
    print("z: ", z.numpy())

y:  9.0
z:  81.0


In [10]:
# Gradient of z with respect to the original input tensor x (chain rule):
#   z = y^2 and y = x^2
#   dz/dx = dz/dy * dy/dx = 2y * 2x = 2x^2 * 2x = 4x^3 = 4 * 27 = 108.0
dz_dx = t.gradient(target=z, sources=x)
print("dz/dx: ", dz_dx.numpy())

dz/dx:  108.0


In [11]:
# Gradient of y with respect to the input tensor x, taken from the same persistent tape:
#   y = x^2  =>  dy/dx = 2x = 2 * 3.0 = 6.0
dy_dx = t.gradient(target=y, sources=x)
print("dy/dx: ", dy_dx.numpy())

dy/dx:  6.0


In [12]:
# Drop the reference to the tape
# A persistent tape keeps its resources until the tape object is garbage collected,
# so the name is deleted once no further gradients are needed from it.
del t

### Recording Control Flow

Because tapes record operations as they are executed, Python control flow (for example, `if` and `while` statements) is handled naturally.

In [13]:
# Sample Function
def sample(x, y):
    """Multiply 1.0 by ``x`` once for every index in ``range(y)`` that is 2, 3 or 4.

    Returns the float ``1.0`` unchanged when no index qualifies; otherwise
    returns the result of the chained ``tf.multiply`` calls.
    """
    result = 1.0
    for step in range(y):
        # Only indices strictly between 1 and 5 contribute a factor of x
        if not (1 < step < 5):
            continue
        result = tf.multiply(result, x)
    return result

In [14]:
# Sample Grad Function
def grad(x, y):
    """Return the gradient of ``sample(x, y)`` with respect to ``x``.

    The value returned by ``sample`` is printed as a side effect.
    """
    with tf.GradientTape() as tape:
        tape.watch(x)
        # Run the Python control flow of sample() while the tape is recording
        result = sample(x, y)
        print("output: ", result)
    # Differentiate the recorded output with respect to the watched input
    gradient = tape.gradient(result, x)
    return gradient

In [15]:
# Sample Input Tensor
# Scalar tensor built from the Python float 2.0; it is the x passed to grad() in the next cell
x = tf.convert_to_tensor(2.0)

In [16]:
# With x = 2.0, sample() multiplies by x only for loop indices 2, 3 and 4:
#   y = 6 or y = 5 -> output x^3 = 8.0, gradient 3x^2 = 12.0
#   y = 4          -> output x^2 = 4.0, gradient 2x   = 4.0
assert all(
    grad(x, loop_bound).numpy() == expected
    for loop_bound, expected in ((6, 12.0), (5, 12.0), (4, 4.0))
)

output:  tf.Tensor(8.0, shape=(), dtype=float32)
output:  tf.Tensor(8.0, shape=(), dtype=float32)
output:  tf.Tensor(4.0, shape=(), dtype=float32)


### Higher-order gradients

Operations inside the GradientTape context manager are recorded for automatic differentiation. If gradients are computed inside that context, the gradient computation itself is recorded too. As a result, the exact same API also works for higher-order gradients.

In [17]:
# Sample Input Tensor
# A tf.Variable is used here; unlike the earlier examples, the tapes in the next cell
# differentiate with respect to it without an explicit watch() call.
x = tf.Variable(1.0)

In [18]:
# Nested tapes: the inner tape t2 records y, while the outer tape t also records
# the t2.gradient() call made inside its context, so dy/dx can itself be differentiated.
with tf.GradientTape() as t:
    with tf.GradientTape() as t2:
        y = x * x * x
        print('y: ', y.numpy())
        
    # Compute Gradient of y w.r.t input x
    # y = x^3
    # dy/dx = 3 * x^2 = 3.0
    # (called inside the outer tape's context so that this computation is recorded by t)
    dy_dx = t2.gradient(y, x)
    print("dy/dx: ", dy_dx.numpy())
    
# Compute the second derivative: the gradient of dy/dx w.r.t. x
# dy/dx = 3 * x^2
# d2y/dx2 = 3 * 2x = 6x = 6.0
d2y_dx2 = t.gradient(dy_dx, x)
print("d2y/dx2: ", d2y_dx2.numpy())

y:  1.0
dy/dx:  3.0
d2y/dx2:  6.0
