<a href="https://colab.research.google.com/github/aaalexlit/tf-advanced-techniques-spec/blob/main/course_2_custom_distributed_training/Week1_Gradient_tape_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import tensorflow as tf
import numpy as np
import random

# Gradient tape

In [6]:
# Seed Python's RNG so the random starting values below (and therefore the
# printed fit) are repeatable after "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)

# Six samples of the line y = 2*x - 1 that the model should recover.
x_train = np.array([-1, 0, 1, 2, 3, 4], dtype=float)
y_train = np.array([-3, -1, 1, 3, 5, 7], dtype=float)

# Trainable variables: slope `w` and intercept `b`, each started at a
# pseudo-random value in [0, 1).
w = tf.Variable(random.random(), trainable=True)
b = tf.Variable(random.random(), trainable=True)

# Loss function
def simple_loss(real_y, pred_y):
  """Return the element-wise absolute error between targets and predictions."""
  residual = real_y - pred_y
  return tf.math.abs(residual)

# Step size: each gradient is scaled by this factor before being subtracted
# from its variable.
LEARNING_RATE = 1e-2

def fit_data(real_x, real_y):
  """Run one gradient-descent step on the module-level variables `w` and `b`.

  Args:
    real_x: Input values fed to the linear model `w * real_x + b`.
    real_y: Target values compared against the predictions by `simple_loss`.

  Returns:
    None. `w` and `b` are updated in place via `assign_sub`.
  """
  # A default (non-persistent) tape is enough: both gradients are requested in
  # a single `gradient` call, after which the tape releases its resources.
  with tf.GradientTape() as tape:
    # Make prediction
    pred_y = w * real_x + b
    # Calculate loss
    reg_loss = simple_loss(real_y, pred_y)

  # Calculate gradients for both variables at once. When `reg_loss` is not a
  # scalar, `gradient` differentiates the sum of its elements.
  w_gradient, b_gradient = tape.gradient(reg_loss, [w, b])

  # Update vars
  w.assign_sub(w_gradient * LEARNING_RATE)
  b.assign_sub(b_gradient * LEARNING_RATE)

# Repeat the full-batch update a fixed number of times.
NUM_STEPS = 500
for _ in range(NUM_STEPS):
  fit_data(x_train, y_train)

# Report the fitted slope and intercept.
print(f'y ~ {w.numpy()}x + {b.numpy()}')

y ~ 2.048419713973999x + -0.9715903401374817


## Simple gradient

In [7]:
# loss = w * w, so d(loss)/dw = 2 * w, which is 2 at w = 1.
w = tf.Variable([[1.0]])
with tf.GradientTape() as tape:
  loss = tf.multiply(w, w)

# Last expression of the cell: the gradient is shown as the cell output.
tape.gradient(loss, w)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.]], dtype=float32)>

In [8]:
x = tf.ones(shape=(2, 2))
print('x', x)
with tf.GradientTape() as t:
  # `x` is a plain tensor rather than a variable, so it is watched explicitly.
  t.watch(x)
  y = tf.math.reduce_sum(x)
  z = tf.math.square(y)

# z = (sum of x)^2, so dz/dx = 2 * sum(x) = 8 for every entry of the 2x2 input
dz_dx = t.gradient(z, x)
print('dz_dx', dz_dx)

x tf.Tensor(
[[1. 1.]
 [1. 1.]], shape=(2, 2), dtype=float32)
dz_dx tf.Tensor(
[[8. 8.]
 [8. 8.]], shape=(2, 2), dtype=float32)


## `persistent=True`

In [9]:
x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as t:
  t.watch(x)
  y = tf.multiply(x, x)
  z = tf.multiply(y, y)
# A persistent tape can be queried more than once.
dz_dx = t.gradient(z, x)  # z = x^4 -> 4 * x^3 = 108 at x = 3
print(dz_dx)
dy_dx = t.gradient(y, x)  # y = x^2 -> 2 * x = 6 at x = 3
print(dy_dx)
del t # Drop the reference to the tape

tf.Tensor(108.0, shape=(), dtype=float32)
tf.Tensor(6.0, shape=(), dtype=float32)


## Higher-order gradients (Nested gradient tapes)



In [10]:
x = tf.Variable(1.0)
with tf.GradientTape() as tape_2:
  with tf.GradientTape() as tape_1:
    y = x * x * x
  # Computed while `tape_2` is still recording, so the outer tape can
  # differentiate through this gradient computation.
  dy_dx = tape_1.gradient(y, x)
d2y_dx2 = tape_2.gradient(dy_dx, x)

# y = x^3 -> dy/dx = 3 * x^2 = 3 and d2y/dx2 = 6 * x = 6 at x = 1.
# Compare with a tolerance instead of exact float equality; a failure also
# reports the actual and expected values.
np.testing.assert_allclose(dy_dx.numpy(), 3.0)
np.testing.assert_allclose(d2y_dx2.numpy(), 6.0)

#### Where not to indent the first gradient calculation
If the first gradient calculation is OUTSIDE of the outer `with` block, the outer tape is no longer recording when it runs, so that computation is not tracked and the second gradient calculation returns `None`.

In [11]:
x = tf.Variable(1.0)

with tf.GradientTape() as tape_2:
  with tf.GradientTape() as tape_1:
    y = x * x * x

# Both `with` blocks have already exited here. `tape_1` recorded `y`, so the
# first derivative can still be computed...
dy_dx = tape_1.gradient(y, x)

# ...but `tape_2` had stopped recording before `dy_dx` was computed, so it has
# nothing connecting `dy_dx` to `x` and the result is `None`.
d2y_dx2 = tape_2.gradient(dy_dx, x)

print(dy_dx)
print(d2y_dx2)

tf.Tensor(3.0, shape=(), dtype=float32)
None
