**Necessary Imports**
--

In [1]:
import tensorflow as tf
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

2024-05-18 19:52:04.838109: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 19:52:04.838244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 19:52:04.997538: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
housing_data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(housing_data.data, housing_data.target)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train)

std_sc = StandardScaler()
X_train = std_sc.fit_transform(X_train)
X_valid = std_sc.fit_transform(X_valid)
X_test = std_sc.fit_transform(X_test)

print("X Train: ", X_train.shape)
print("X Valid: ", X_valid.shape)
print("X Test: ", X_test.shape)

X Train:  (11610, 8)
X Valid:  (3870, 8)
X Test:  (5160, 8)


**Building a Residual Network (ResNet)**
--

<center>
<img src="https://d2l.ai/_images/resnet-block.svg" alt="Image of ResNet">
</center>

The above image presents a common architecture utilising **`ResNets`**

In [3]:
# Defining the Custom Residual Layer
class ResidualBlock(tf.keras.layers.Layer):
    def __init__(self, n_layers, n_neurons, **kwargs):
        
        # Inheriting all the attributes from the parent class
        super().__init__(**kwargs)
        
        # Defining the Residual Layer
        self.hidden = [
            tf.keras.layers.Dense(n_neurons, activation="relu", kernel_initializer="he_normal") for layer in range(n_layers)
        ]
        
        '''
        The above list comprehension defines all the Dense layers in between the skip connection of the Residual block
        '''
        
    def call(self, inputs):
        Z = inputs
        
        # Looping to complete forward propagation of the hidden layers in the Residual Block
        for layer in self.hidden:
            Z = layer(Z)
            
        # Returns a concatenation of the results from forward prop through the dense layers with the skip connection
        return inputs + Z
    
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "hidden": self.hidden}

**Important**
- When Keras detects the hidden attribute it automatically tallies the parameters of all the hidden layers.
- Keras then adds all the tallied parameters to the trackable parameters of custom layer for transparency.

In [4]:
class ResidualRegressor(tf.keras.Model):
    def __init__(self, output_dim, **kwargs):
        
        # Inheriting all the attributes from the parent class
        super().__init__(**kwargs)
        
        # First Layer of the Residual Regressor Model
        self.hidden1 = tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal")
        
        # Residual Layers
        self.resblock1 = ResidualBlock(2, 30)
        self.resblock2 = ResidualBlock(2, 30)
        
        # Output Layer
        self.out = tf.keras.layers.Dense(output_dim)
        
    def call(self, inputs):
        
        # Propagation through the Input Layer
        Z = self.hidden1(inputs)
        
        # Propagation through the first Residual Blocks 4 times 
        for layer in range(4):
            Z = self.resblock1(Z)
            
        # Propagation through the second Residual Block
        Z = self.resblock2(Z)
        
        # Final Output
        return self.out(Z)
    
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "output_dim": self.out}

In [5]:
# Building the Model
residual_model = ResidualRegressor(1)
residual_model.summary()

# Compiling the model
residual_model.compile(
    loss="mse",
    optimizer="adam"
)

# Training the model
residual_model.fit(
    X_train, y_train, validation_data=(X_valid, y_valid), epochs=20
)

Epoch 1/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 2.2475 - val_loss: 0.7206
Epoch 2/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7112 - val_loss: 0.6658
Epoch 3/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5615 - val_loss: 0.6394
Epoch 4/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5326 - val_loss: 0.7940
Epoch 5/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7577 - val_loss: 0.8395
Epoch 6/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4657 - val_loss: 1.0456
Epoch 7/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4426 - val_loss: 1.0787
Epoch 8/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7416 - val_loss: 1.0922
Epoch 9/20
[1m363/363[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7b745c81dba0>

In [6]:
# Saving the model
residual_model.save("resnet_model.keras")

**Losses and Metrics based on model internals**
--

- When training more complex models we will need to account for losses due to specific behaviour encountered by specific layers of the model.
- This is done by monitoring the internal losses of the model in specific parts of the model.

In [7]:
class ReconstructionRegressor(tf.keras.models.Model):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        
        # Defining the hidden layers
        self.hidden = [
            tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal") for layer in range(5)
        ]
        
        # Defining the output layer
        self.out = tf.keras.layers.Dense(output_dim)
        
        # Reconstruction Mean
        self.reconstruction_mean = tf.keras.metrics.Mean(name="reconstruction_error")
        
    def build(self, batch_input_shape):
        n_inputs = batch_input_shape[-1]
        self.reconstruct = tf.keras.layers.Dense(n_inputs)
        
    def call(self, inputs, training=None):
        
        # Carrying Forward Propagation
        Z = inputs
        for layer in self.hidden:
            Z = layer(Z)
            
        # Passing the final o/p of forward prop through the reconstruction layer to calculate the reconstruction loss
        reconstruction = self.reconstruct(Z)
        reconstruction_loss = tf.reduce_mean(tf.square(reconstruction - inputs))
        
        # Adding the calculated loss to the loss of the model with a 5% weight
        self.add_loss(0.05 * reconstruction_loss)
        
        # Updating the extra metric
        if training:
            result = self.reconstruction_mean(reconstruction_loss)
            
        # Output
        return self.out(Z)

In [8]:
# Using the Reconstruction Model
rec_model = ReconstructionRegressor(1)

# Compiling the model
rec_model.compile(
    loss="mse",
    optimizer="nadam"
)

# Training the model
rec_model.fit(
    X_train, y_train, validation_data=(X_valid, y_valid), epochs=20
)

Epoch 1/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 2.1144 - reconstruction_error: 1.2771 - val_loss: 0.6131 - val_reconstruction_error: 0.0000e+00
Epoch 2/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5366 - reconstruction_error: 0.5980 - val_loss: 0.5336 - val_reconstruction_error: 0.0000e+00
Epoch 3/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4458 - reconstruction_error: 0.4101 - val_loss: 0.6151 - val_reconstruction_error: 0.0000e+00
Epoch 4/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4080 - reconstruction_error: 0.3361 - val_loss: 0.6715 - val_reconstruction_error: 0.0000e+00
Epoch 5/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3953 - reconstruction_error: 0.2379 - val_loss: 0.7399 - val_reconstruction_error: 0.0000e+00
Epoch 6/20
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b745c32b4f0>

**Computing Gradient using the TensorFlow AutoDiff**
--

In [9]:
# Working on Differentiating a Simple Function
def my_simple_function(w1, w2):
    return 3 * w1 ** 2 + 2 * w1 * w2

# Example Working
w1, w2 = 30, 60
print(my_simple_function(w1, w2))

6300


In [10]:
# Differentiating by tracking the difference in function by tweaking the variables
eps = 1e-6
w1, w2 = 30, 60

diff_wrt_w1 = my_simple_function(w1 + eps, w2) - my_simple_function(w1, w2)
diff_wrt_w2 = my_simple_function(w1, w2 + eps) - my_simple_function(w1, w2)
print("Differentiating wrt to W1: ", round(diff_wrt_w1 / eps, 8))
print("Differentiating wrt to W2: ", round(diff_wrt_w2 / eps, 8))

Differentiating wrt to W1:  300.00000334
Differentiating wrt to W2:  60.0000003


**Important**
- Though the approximations are correct we need to call the `my_simple_function()` atleast one time for each parameter to attain the differentiation for each function.
- This is not scalable for large and deep neural networks since there will be **tens of thousands of parameters**.

## Using the TensorFlow AutoDiff

- The tf.Variables are tracked by the GradientTape once defined.
- The gradient method from the TensorFlow AutoDiff goes through all the recorded computations once in reverse order.
- Thus the TensorFlow AutoDiff is very efficient.

In [11]:
w1, w2 = tf.Variable(30.), tf.Variable(60.)
with tf.GradientTape() as tape:
    z = my_simple_function(w1, w2)

gradients = tape.gradient(z, [w1, w2])
print(gradients)

[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]


**Important**
- Once the Tape is used it is automatically deleted to save memory, thus subsequent calls of the Tape variable leads to a runtime error

In [12]:
with tf.GradientTape() as tape:
    z = my_simple_function(w1, w2)

gradient1 = tape.gradient(z, [w1, w2])
print(gradient1)

try:
    gradient2 = tape.gradient(z, [w1, w2])
except RuntimeError:
    print("The tape variable doesnt exist anymore")

[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]
The tape variable doesnt exist anymore


**Important**
- We can make the tape persist untill we manually release the space allocated for the tape.
- This way the tape can be called subsequently.

In [13]:
with tf.GradientTape(persistent=True) as tape:
    z = my_simple_function(w1, w2)

gradient1 = tape.gradient(z, [w1, w2])
print(gradient1)

try:
    gradient2 = tape.gradient(z, [w1, w2])
    print(gradient2)
except RuntimeError:
    print("The tape variable doesnt exist anymore")
finally:
    print("Thats two subsequent calls of the tape variable")
    del tape
    print("The tape has now been released")
    
try:
    print("Trying a third access of the tape ...")
    gradient3 = tape.gradient(z, [w1, w2])
    print(gradient3)
except NameError:
    print("The tape variable doesnt exist anymore")

[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]
[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]
Thats two subsequent calls of the tape variable
The tape has now been released
Trying a third access of the tape ...
The tape variable doesnt exist anymore


**Important**
- By default the tape only track operations of variables.
- Thus operating on anything other than variables returns none.

In [14]:
c1, c2, = tf.constant(30.), tf.constant(60.)

with tf.GradientTape(persistent=True) as tape:
    z = my_simple_function(c1, c2)

gradients = tape.gradient(z, [c1, c2])
print(gradients)

[None, None]


**Important**
- The GradientTape can be forced to take gradient of other objects aswell by utilising the tf.watch()
- The tf.watch() forces tape to keep a track of all the tensors that arent even variables.
- They are then operated upon as variables.

In [15]:
c1, c2, = tf.constant(30.), tf.constant(60.)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(c1)
    tape.watch(c2)
    z = my_simple_function(c1, c2)

gradients = tape.gradient(z, [c1, c2])
print(gradients)

[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]


**Important**
- The gradients calculated by the GradientTape can also be stopped for different parts of the neural network during backprop where necessary.
- This is done using the `tf.stop_gradient()` in the function that for which gradients are being taken.

In [16]:
def my_stopping_fn(w1, w2):
    return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2)

w1, w2 = tf.Variable(30.), tf.Variable(60.)
with tf.GradientTape() as tape:
    z = my_stopping_fn(w1, w2)
    
gradients = tape.gradient(z, [w1, w2])
print(gradients)
print("Here the second gradient returns none as it has been stopped during backprop")

[<tf.Tensor: shape=(), dtype=float32, numpy=180.0>, None]
Here the second gradient returns none as it has been stopped during backprop


**Important**
- Given a list of losses the gradients are calculated for all the elements in the list together.
- To avoid this we utilise the `jacobian()` which calculates the derivative for each element of the list individually.
- Partial derivatives can take a step further and also calculates `hessians()`.
- Both the `jacobian()` and the `hessian()` are provided in the **`GradientTape of TensorFlow`**.

In [17]:
# Taking the sum of elements of list for calculating the gradient
with tf.GradientTape() as tape:
    z1 = my_simple_function(w1, w2 + .1)
    z2 = my_simple_function(w1, w2 + .3)
    z3 = my_simple_function(w1, w2 + .5)
    
gradients_without_summation = tape.gradient([z1, z2, z3], [w1, w2])
print("Gradients of the List (Automated Sum)\n", gradients_without_summation)

with tf.GradientTape() as tape:
    z1 = my_simple_function(w1, w2 + .1)
    z2 = my_simple_function(w1, w2 + .3)
    z3 = my_simple_function(w1, w2 + .5)
    z = z1 + z2 + z3
    
gradients_with_summation = tape.gradient(z, [w1, w2])
print("\nGradients of the list (Summed together)\n", gradients_with_summation)

Gradients of the List (Automated Sum)
 [<tf.Tensor: shape=(), dtype=float32, numpy=901.8>, <tf.Tensor: shape=(), dtype=float32, numpy=180.0>]

Gradients of the list (Summed together)
 [<tf.Tensor: shape=(), dtype=float32, numpy=901.8>, <tf.Tensor: shape=(), dtype=float32, numpy=180.0>]


In [18]:
# Computing Jacobians and Hessians
with tf.GradientTape(persistent=True) as hessian_tape:
    with tf.GradientTape() as jacobian_tape:
        z = my_simple_function(w1, w2)
    
    # Calculating the Jacobians
    jacobians = jacobian_tape.gradient(z, [w1, w2])
    print("The calculated Jacobians")
    print(jacobians)
    
# Calculating the Jacobians
hessians = [hessian_tape.gradient(jacobian, [w1, w2]) for jacobian in jacobians]
print("\nThe calculated Hessians")
print(hessians)

# Deleting the Gradient Tape
del hessian_tape

The calculated Jacobians
[<tf.Tensor: shape=(), dtype=float32, numpy=300.0>, <tf.Tensor: shape=(), dtype=float32, numpy=60.0>]

The calculated Hessians
[[<tf.Tensor: shape=(), dtype=float32, numpy=6.0>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>], [<tf.Tensor: shape=(), dtype=float32, numpy=2.0>, None]]


**Working with Custom Training Loops**
--

In [19]:
tf.keras.backend.clear_session()

# Simple Model
my_simple_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1)
])

# Random Sampling of the Training set to batch examples together
def random_batch(X, y, batch_size=32):
    
    # Acquiring indexs of the required batch size
    idx = np.random.randint(len(X), size=batch_size)
    
    # Returning the subsets in the dataset
    return X[idx], y[idx]
    
# A function to display the status of training taking place
def status_bar(step, total, loss, metrics=None):
    
    # Prints the metrics as long as they are not none along with the losses for each epoch
    metrics = " - ".join([f"{metric.name}: {metric.result():.4f}" for metric in [loss] + (metrics or [])])
    
    # New line if training has completed else space
    end = "" if step < total else "\n"
    
    # Result string
    print(f"\r{step}/{total} - {metrics}", end=end)

In [20]:
# Hyperparameters for the model
n_epochs = 20
batch_size = 32
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.mean_squared_error
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.MeanAbsoluteError()]

# Training Loop
# Looping for n_epochs
for epoch in range(1, n_epochs + 1):
    print(f"{epoch}/{n_epochs}")
    
    # Looping for n_steps per epoch
    for step in range(1, n_steps + 1):
        
        # Sampling the batches for each epoch
        X_batch, y_batch = random_batch(X_train, y_train)
        with tf.GradientTape() as tape:
            
            # Making predictions for the current batch
            y_pred = my_simple_model(X_batch, training=True)
            
            # Calculating the Squared Loss for the current batch
            squared_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            
            # Total loss alongside the regularisation losses
            loss = tf.add_n([squared_loss] + my_simple_model.losses)
            
        # Calculating the Gradients
        gradients = tape.gradient(loss, my_simple_model.trainable_variables)
        
        # Updating the weights of in backprop
        optimizer.apply_gradients(zip(gradients, my_simple_model.trainable_variables))
        
        # Accounting for Constraints
        for variable in my_simple_model.variables:
            if variable.constraint is not None:
                variable.assign(variable.constraint(variable))
                
        # Displaying the status
        mean_loss(loss)
        for metric in metrics:
            metric(y_batch, y_pred)
        status_bar(step, n_steps, mean_loss, metrics)
        
    # Resetting the states of the streamline metrics for the next epoch
    for metric in [mean_loss] + metrics:
        metric.reset_state

1/20
362/362 - mean: 1.6523 - mean_absolute_error: 0.9271
2/20
362/362 - mean: 1.1951 - mean_absolute_error: 0.7728
3/20
362/362 - mean: 0.9898 - mean_absolute_error: 0.6975
4/20
362/362 - mean: 0.8714 - mean_absolute_error: 0.6506
5/20
362/362 - mean: 0.7924 - mean_absolute_error: 0.6192
6/20
362/362 - mean: 0.7348 - mean_absolute_error: 0.5957
7/20
362/362 - mean: 0.6892 - mean_absolute_error: 0.5771
8/20
362/362 - mean: 0.6564 - mean_absolute_error: 0.5628
9/20
362/362 - mean: 0.6287 - mean_absolute_error: 0.5506
10/20
362/362 - mean: 0.6084 - mean_absolute_error: 0.5413
11/20
362/362 - mean: 0.5933 - mean_absolute_error: 0.5328
12/20
362/362 - mean: 0.5772 - mean_absolute_error: 0.5257
13/20
362/362 - mean: 0.5631 - mean_absolute_error: 0.5194
14/20
362/362 - mean: 0.5503 - mean_absolute_error: 0.5136
15/20
362/362 - mean: 0.5385 - mean_absolute_error: 0.5083
16/20
362/362 - mean: 0.5281 - mean_absolute_error: 0.5034
17/20
362/362 - mean: 0.5196 - mean_absolute_error: 0.4994
18/20


**Working with TensorFlow Functions and Graphs**
--

In [21]:
# Simple Python function
def simple_function(x, y):
    return 3 * x ** 2 + 2 * y + 1

print("Namespace of the python function:\n", simple_function)
print("\nResult for integers:\n", simple_function(10, 20))
print("\nResult for TensorFlow variables:\n", simple_function(tf.Variable(10), tf.Variable(20)))

# Converting the Python Function to a TensorFlow Function
tf_simple_function = tf.function(simple_function)

print("\nNamespace of the tensorflow function:\n", tf_simple_function)
print("\nResult for integers:\n", tf_simple_function(10, 20))
print("\nResult for TensorFlow variables:\n", tf_simple_function(tf.Variable(10), tf.Variable(20)))

Namespace of the python function:
 <function simple_function at 0x7b745c388a60>

Result for integers:
 341

Result for TensorFlow variables:
 tf.Tensor(341, shape=(), dtype=int32)

Namespace of the tensorflow function:
 <tensorflow.python.eager.polymorphic_function.polymorphic_function.Function object at 0x7b745c4fc160>

Result for integers:
 tf.Tensor(341, shape=(), dtype=int32)

Result for TensorFlow variables:
 tf.Tensor(341, shape=(), dtype=int32)


**Importance of TensorFlow Functions**
--

- TensorFlow functions work by creating computation graphs for each of the operations taking place within the function.
- They automatically simplify the operations and prune unused nodes to increase efficiency.
- Thus are often much faster than regular python functions as they support parallel execution by default.
- Therefore complex expressions, functions are execute much better when executed as TensorFlow functions.


**Properties of TensorFlow Functions**
--

- When passing tensors to TensorFlow function, they further try to reuse already created computation graphs where possible through polymorphism after breaking down the expressions.
- It any python datastructure is passed into a TensorFlow function a new computation graph is created for each call of the TensorFlow function.
- This leads to loss RAM and storage quickly, hence any TensorFlow function once used must be deallocated.

In [22]:
# Python functions can also be decorated into TensorFlow functions
@tf.function
def new_simple_function(x, y, z):
    return x ** 2 + y ** 2 - z ** 2

print("Name space of the function:\n", new_simple_function)
print("\nResult for Integers:\n", new_simple_function(2, 3, 5))
print("\nResult for TensorFlow Variables:\n", new_simple_function(tf.Variable(2), tf.Variable(3), tf.Variable(5)))

Name space of the function:
 <tensorflow.python.eager.polymorphic_function.polymorphic_function.Function object at 0x7b7454328be0>

Result for Integers:
 tf.Tensor(-12, shape=(), dtype=int32)

Result for TensorFlow Variables:
 tf.Tensor(-12, shape=(), dtype=int32)


**Important**
- The python function can still be accessed from the TensorFlow function using the python_function attribute

In [23]:
new_simple_function.python_function(2, 3, 5)

-12

**Fin ✨**
--

## I had a lot of fun working on this notebook
## Hope you had just as much fun reading
🙂🙃