### **Using `tf.GradientTape` to Train a Neural Network**

In [3]:
import tensorflow as tf
import keras
from keras import layers
import numpy as np
import time

import sys
# Rebind Python's sys.stderr to err.txt for the rest of the session. The file
# is opened in "w" mode, so an existing err.txt is truncated. The original
# stream object is not saved and the file handle is never closed here.
sys.stderr = open('err.txt', 'w')

2025-03-10 17:47:24.914439: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 17:47:24.923851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-10 17:47:24.936115: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-10 17:47:24.939593: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 17:47:24.948032: I tensorflow/core/platform/cpu_feature_guar

In [4]:
# Hyperparameters
batch_size, hidden_size = 64, 128
num_classes, epochs = 10, 5

# Functional-API network: 784 inputs -> two 64-unit ReLU layers -> one raw
# score per class (the output layer has no activation).
# NOTE: `model` is rebound to an MNISTModel instance further down this cell,
# so when the cell runs top to bottom this functional model is replaced.
inputs = layers.Input(name="digits", shape=(784,))
x1 = layers.Dense(units=64, activation="relu")(inputs)
x2 = layers.Dense(units=64, activation="relu")(x1)
outputs = layers.Dense(units=num_classes, name="predictions")(x2)
model = keras.Model(inputs=inputs, outputs=outputs)



# Model definition
class MNISTModel(tf.keras.Model):
    """Two-layer fully connected classifier that returns raw class scores (logits).

    Args:
        hidden_size: Number of units in the ReLU hidden layer.
        num_classes: Number of output units, one per class.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(hidden_size, activation="relu")
        # No softmax on the output layer: the notebook's loss is created with
        # from_logits=True and performs the normalisation itself. Applying
        # softmax here as well would normalise the scores twice and distort
        # the loss value and its gradients.
        self.dense2 = tf.keras.layers.Dense(num_classes)

    def call(self, inputs):
        """Map a batch of inputs to per-class logits."""
        x = self.dense1(inputs)
        return self.dense2(x)

# Build the subclassed model; this rebinds `model`.
model = MNISTModel(hidden_size=hidden_size, num_classes=num_classes)





In [5]:
# Optimizer: plain stochastic gradient descent.
optimizer = keras.optimizers.SGD(learning_rate=0.001)
# Loss: configured with from_logits=True, so it expects unnormalised scores.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Accuracy trackers: one fed by training batches, one by validation batches.
train_acc_metric = keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = keras.metrics.SparseCategoricalAccuracy()

# Load MNIST and reshape each split to rows of 784 values.
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784)
x_test = x_test.reshape(-1, 784)

# Hold out the last 10,000 training samples for validation.
x_train, x_val = x_train[:-10000], x_train[-10000:]
y_train, y_val = y_train[:-10000], y_train[-10000:]

# Training pipeline: shuffle with a 1024-element buffer, then batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=1024)
    .batch(batch_size)
)

# Validation pipeline: batched only, no shuffling.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size)

In [6]:


epochs = 10  # rebinds the `epochs` value assigned in the parameters cell
for epoch in range(epochs):
    print("\nStart of epoch %d" % epoch)
    start_time = time.time()

    # Training pass: one gradient step per mini-batch.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        # Run the forward pass under the tape so the loss can be differentiated.
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training=True)
            loss_value = loss_fn(y_batch_train, logits)
        weights = model.trainable_weights
        grads = tape.gradient(loss_value, weights)
        optimizer.apply_gradients(zip(grads, weights))

        # Accumulate training accuracy for this epoch.
        train_acc_metric.update_state(y_batch_train, logits)

        # Progress report on every 200th batch.
        if not step % 200:
            samples_seen = (step + 1) * batch_size
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % samples_seen)

    # Report the epoch's training accuracy, then clear the tracker so the
    # next epoch starts from zero.
    train_acc = train_acc_metric.result()
    print("Training acc over epoch: %.4f" % float(train_acc))
    train_acc_metric.reset_state()

    # Validation pass: forward only, no gradient updates.
    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val, training=False)
        val_acc_metric.update_state(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    val_acc_metric.reset_state()
    print("Validation acc: %.4f" % float(val_acc))
    print("Time taken: %.2fs" % (time.time() - start_time))


Start of epoch 0
Training loss (for one batch) at step 0: 0.7532
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.2035
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.3066
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.2454
Seen so far: 38464 samples
Training acc over epoch: 0.8824
Validation acc: 0.8918
Time taken: 5.89s

Start of epoch 1
Training loss (for one batch) at step 0: 0.3629
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.6975
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.4102
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.4883
Seen so far: 38464 samples
Training acc over epoch: 0.9005


2024-12-24 11:22:18.754636: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation acc: 0.9075
Time taken: 9.36s

Start of epoch 2
Training loss (for one batch) at step 0: 0.1266
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.2198
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.3629
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.3177
Seen so far: 38464 samples
Training acc over epoch: 0.9132
Validation acc: 0.9159
Time taken: 6.20s

Start of epoch 3
Training loss (for one batch) at step 0: 0.2741
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.1387
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.1461
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.3236
Seen so far: 38464 samples
Training acc over epoch: 0.9222
Validation acc: 0.9227
Time taken: 6.19s

Start of epoch 4
Training loss (for one batch) at step 0: 0.2109
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.3178
Seen so far: 12864 samples
Trai

2024-12-24 11:22:43.544765: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Validation acc: 0.9271
Time taken: 6.21s

Start of epoch 6
Training loss (for one batch) at step 0: 0.1451
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.1693
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.1772
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.2849
Seen so far: 38464 samples
Training acc over epoch: 0.9383
Validation acc: 0.9279
Time taken: 6.23s

Start of epoch 7
Training loss (for one batch) at step 0: 0.2048
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.0360
Seen so far: 12864 samples
Training loss (for one batch) at step 400: 0.0411
Seen so far: 25664 samples
Training loss (for one batch) at step 600: 0.2213
Seen so far: 38464 samples
Training acc over epoch: 0.9415
Validation acc: 0.9312
Time taken: 9.77s

Start of epoch 8
Training loss (for one batch) at step 0: 0.2941
Seen so far: 64 samples
Training loss (for one batch) at step 200: 0.1181
Seen so far: 12864 samples
Trai

In [5]:
# Print the version string of the imported TensorFlow package.
print(tf.__version__)

2.17.1
