In [1]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [2]:
# Clear any logs from previous runs
!rm -rf ./logs/

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
from datetime import datetime
from packaging import version

import tensorflow as tf
from tensorflow import keras
tf.debugging.experimental.enable_dump_debug_info('./logs/',
                                                 tensor_debug_mode="FULL_HEALTH", 
                                                 circular_buffer_size=-1)
from keras import backend as K
import numpy as np

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

INFO:tensorflow:Enabled dumping callback in thread MainThread (dump root: ./logs/, tensor debug mode: FULL_HEALTH)
TensorFlow version:  2.20.0


In [4]:
data_size = 1000
# 80% of the data is for training.
train_pct = 0.8

train_size = int(data_size * train_pct)

# Create some input data between -1 and 1 and randomize it.
x = np.linspace(-1, 1, data_size)
np.random.shuffle(x)

# Generate the output data.
# y = 0.5x + 2 + noise
y = 0.5 * x + 2 + np.random.normal(0, 0.05, (data_size, ))

# Split into test and train pairs.
x_train, y_train = x[:train_size], y[:train_size]
x_test, y_test = x[train_size:], y[train_size:]

In [11]:
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=logdir,
    histogram_freq=1,  
    write_images=True,  
    write_graph=True,   
    update_freq='epoch', 
    profile_batch='10,20',  
    embeddings_freq=1  
)

# Simple model
model = keras.models.Sequential([
    keras.layers.Dense(16, input_dim=1),
    keras.layers.Dense(1),
], name='simple_model')

# Deep model
model_deep = keras.models.Sequential([
    keras.layers.Dense(64, input_dim=1, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1)
], name='deep_model')

# Compile and train simple model
model.compile(
    loss='mse',
    optimizer=keras.optimizers.SGD(learning_rate=0.2),
    metrics=['mae']  # Add MAE for additional metric
)

print("Training simple model...")
training_history = model.fit(
    x_train,
    y_train,
    batch_size=train_size,
    verbose=1,
    epochs=20,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard_callback],
)
print(f"Simple model - Average test loss: {np.average(training_history.history['val_loss']):.4f}")

# Compile and train deep model with different optimizer
logdir_deep = "logs/scalars/deep_" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback_deep = keras.callbacks.TensorBoard(log_dir=logdir_deep, histogram_freq=1)

model_deep.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.01),  # Using Adam for deep model
    metrics=['mae']
)

print("\nTraining deep model...")
training_history_deep = model_deep.fit(
    x_train,
    y_train,
    batch_size=32,  # Smaller batch size for deep model
    verbose=1,
    epochs=50,  # More epochs for deep model
    validation_data=(x_test, y_test),
    callbacks=[tensorboard_callback_deep],
)
print(f"Deep model - Average test loss: {np.average(training_history_deep.history['val_loss']):.4f}")

# Compare both models
print("\n" + "="*50)
print("Model Comparison:")
print("="*50)
print(f"Simple Model - Final Val Loss: {training_history.history['val_loss'][-1]:.4f}")
print(f"Deep Model - Final Val Loss: {training_history_deep.history['val_loss'][-1]:.4f}")

# Optional: Save models for later use
model.save('simple_model.h5')
model_deep.save('deep_model.h5')
print("\nModels saved. Check TensorBoard for detailed visualization.")
print(f"Run: tensorboard --logdir logs/scalars/")

Training simple model...
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 4.2006 - mae: 2.0041 - val_loss: 0.1010 - val_mae: 0.2693
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 961ms/step - loss: 0.0969 - mae: 0.2596 - val_loss: 0.0248 - val_mae: 0.1358
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 944ms/step - loss: 0.0274 - mae: 0.1425 - val_loss: 0.0077 - val_mae: 0.0734
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 943ms/step - loss: 0.0070 - mae: 0.0689 - val_loss: 0.0030 - val_mae: 0.0434
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 943ms/step - loss: 0.0036 - mae: 0.0477 - val_loss: 0.0028 - val_mae: 0.0416
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 964ms/step - loss: 0.0028 - mae: 0.0421 - val_loss: 0.0024 - val_mae: 0.0389
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 957ms



Deep model - Average test loss: 0.0049

Model Comparison:
Simple Model - Final Val Loss: 0.0024
Deep Model - Final Val Loss: 0.0026

Models saved. Check TensorBoard for detailed visualization.
Run: tensorboard --logdir logs/scalars/


In [7]:
#http://localhost:6006

In [12]:
%tensorboard --logdir logs/ --port 6007

Reusing TensorBoard on port 6007 (pid 14944), started 0:27:17 ago. (Use '!kill 14944' to kill it.)

A brief overview of the visualizations created in this example and the dashboards (tabs in top navigation bar) where they can be found:

* Scalars show how the loss and metrics change with every epoch. You can use them to also track training speed, learning rate, and other scalar values. Scalars can be found in the Time Series or Scalars dashboards.
* Graphs help you visualize your model. In this case, the Keras graph of layers is shown which can help you ensure it is built correctly. Graphs can be found in the Graphs dashboard.
* Histograms and Distributions show the distribution of a Tensor over time. This can be useful to visualize weights and biases and verify that they are changing in an expected way. Histograms can be found in the Time Series or Histograms dashboards. Distributions can be found in the Distributions dashboard.

Breakdown of the Debugger Interface
The Debugger Dashboard on the Tensorboard consists of five main components:

* __Alerts:__ This top-left section contains a list of alert events detected by the debugger in the debug data from the instrumented TensorFlow program. Each alert indicates a certain anomaly that warrants attention. In our case, this section highlights 499 NaN/∞ events with a salient pink-red color. This confirms our suspicion that the model fails to learn because of the presence of NaNs and/or infinities in its internal tensor values.
* __Python Execution Timeline:__ This is the upper half of the top-middle section. It presents the full history of the eager execution of ops and graphs. Each box of the timeline is marked by the initial letter of the op or graph’s name. We can navigate this timeline by using the navigation buttons and the scrollbar above the timeline.
* __Graph Execution:__ Located at the top-right corner of the GUI, this section will be central to our debugging task. It contains a history of all the floating-type tensors computed inside graphs, i.e., the ones compiled by @tf-functions.
* __Stack Trace:__ The bottom-right section, shows the stack trace of the creation of every single operation on the graph.
* __Source Code:__ The bottom-left section, highlights the source code corresponding to each operation on the graph.
