- Implement CNN from Page 496 (Chapter14) for the Fashion MNIST dataset (Chapter 10, Page 318) 

In [1]:
# NVTOP
import numpy as np

In [2]:
import tensorflow as tf

# Testen, ob TensorFlow korrekt installiert a
print("TensorFlow version:", tf.__version__)
print("Available CPUs:", tf.config.list_physical_devices('CPU'))
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

2025-02-23 13:54:59.336371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740315299.350908   36860 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740315299.355715   36860 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-23 13:54:59.371309: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.18.0
Available CPUs: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Fixes memory issue
# Before this: compiled model reserves nearly all of the GDDR (7 of 8 GB GDDR) -> crashed
# This fixes memory usage to about 20 %
# Here memory is only reserved when needed (dynamic)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [4]:
# Load famous fashion MNIST dataset
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
# Already shuffled and split into training set (60k images) and test set (10k images)
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
# Hold out the last 5k images from training set for validation
X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]
X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]

In [5]:
# Keras images are represented by 28x28 rather than 1-D 784 (sci-kit learn)
X_train.shape

(55000, 28, 28)

In [6]:
# Keras pixel intensity integers 0-255, rather than float 0.0-255.0 (sci-kit learn)
X_train.dtype

dtype('uint8')

In [7]:
# Simplicity: Scale down to 0-1 range and transform to float by dividing by 255.0
X_train, X_valid, X_test = X_train / 255., X_valid / 255., X_test / 255.

In [8]:
# Add greyscale dimension to fit into expected model input
# Alternatives, np.reshape, np.expanddims, or Reshape layer in CNN
X_train, X_valid, X_test = X_train[:,:,:, np.newaxis], X_valid[:,:,:, np.newaxis], X_test[:,:,:, np.newaxis]

In [9]:
# Dimension now [28, 28, 1]
X_train.shape

(55000, 28, 28, 1)

In [10]:
# Connect output to classes defined in "https://keras.io/api/datasets/fashion_mnist/"
class_names = ["T-shirt/top", "Trousers", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

In [11]:
# y_train contains 4,9,0,2 class labels
class_names[y_train[0]]

'Ankle boot'

## Build the CNN
- Page 496

In [12]:
# partial introduced in Chapter 11
from functools import partial

# Acts like Conv2D but with different default arguments:
# small kernel size of 3
# same padding
# ReLu activation function and corresponding He initializer
DefaultConv2D = partial(tf.keras.layers.Conv2D, kernel_size=3, padding="same", activation="relu", kernel_initializer="he_normal")

model = tf.keras.Sequential([
    # Start with a large filter (7x7)
    # Default stride of 1 , because images small
    # Input shape according to dataset with a single (greyscale) color (maybe use Reshape layer)
    DefaultConv2D(filters=64, kernel_size=7, input_shape=[28,28,1]),
    # MaxPooling layer with default stride of 2, therefore each dimension divided by 2
    tf.keras.layers.MaxPool2D(),
    ### Repeat 2x : 2x CNN + MaxPooling
    # For larger images this could be repeated further
    # Note: 
    #   Filters double after MaxPooling as shape reduces by half, no fear: exploding parameters, memory usage, computational load
    #   Filters get larger towards the output layer: 64, 128, 256
    #   Low-level features are mostly low (small circles, horizontal lines)
    #   But many ways to combine them into higher-level features (e.g. face)
    # 1.
    DefaultConv2D(filters=128),
    DefaultConv2D(filters=128),
    tf.keras.layers.MaxPool2D(),
    # 2.
    DefaultConv2D(filters=256),
    DefaultConv2D(filters=256),
    tf.keras.layers.MaxPool2D(),
    # Fully connected layer
    # Flatten -> 2-D image to 1-D array
    tf.keras.layers.Flatten(),
    # Dense: each node is connected with all of the others
    tf.keras.layers.Dense(units=128, activation="relu", kernel_initializer="he_normal"),
    # Each training iteration a random subset of all neurons in one layer (except output layer) are dropped out (output 0)
    tf.keras.layers.Dropout(0.5),
    # Fewer units towards the output layer
    tf.keras.layers.Dense(units=64, activation="relu", kernel_initializer="he_normal"),
    # Dropout 50 % still
    tf.keras.layers.Dropout(0.5),
    # Softmax converts into propabilities
    tf.keras.layers.Dense(units=10, activation="softmax"),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1740315301.363679   36860 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6887 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:05:00.0, compute capability: 6.1


In [13]:
model.compile(
            # Sparse labels: i.e. for each instance, there is just a target class index from 0 to 9
            # One-hot vector like output would be just categorical_crossentropy
            # Binary or multilabel binary classification: sigmoid in output instead of softmax + binary_crossentropy 
            loss="sparse_categorical_crossentropy",
            optimizer="sgd", # Gradient decent?
            metrics=["accuracy"]) 

In [14]:
# Tracking 
run_num = "1"
graph_dir = f"logs/{run_num}/trace"
graph_name = f"trace_{run_num}"

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/{run_num}", histogram_freq=1, write_graph=True)

# Retrieve GPU Usage after each epoch
import subprocess
class GPUMonitorCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        result = subprocess.run(["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"], 
                                stdout=subprocess.PIPE)
        gpu_info = result.stdout.decode('utf-8').strip()
        print(f"Epoch {epoch + 1} GPU usage: {gpu_info}")

In [15]:
# Training set: actual data model trains on
# Test set: 
#   ...is independent of the training set but has a somewhat similar type of probability distribution of classes 
#   and is used as a benchmark to evaluate the model, used only after the training of the model is complete. 
# Validation set:
#   ...is used to fine-tune the hyperparameters of the model and is considered a part of the training of the model.
# Default batch size 32

tf.summary.trace_on(graph=True, profiler=True,profiler_outdir=graph_dir)
# Dann deinen Trainingscode ausführen
history = model.fit(X_train, y_train, batch_size=32, epochs=2, validation_data=(X_test, y_test), callbacks=[tensorboard_callback, GPUMonitorCallback()])
# Anschließend exportieren:
with tf.summary.create_file_writer(graph_dir).as_default():
    tf.summary.trace_export(name=graph_name, step=0)

2025-02-23 13:55:01.938488: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:103] Profiler session initializing.
2025-02-23 13:55:01.938506: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:118] Profiler session started.
2025-02-23 13:55:01.939090: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1006] Profiler found 1 GPUs


Epoch 1/2


I0000 00:00:1740315302.943993   36939 service.cc:148] XLA service 0x79da58066390 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1740315302.944029   36939 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2025-02-23 13:55:02.979527: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1740315303.068693   36939 cuda_dnn.cc:529] Loaded cuDNN version 90701
2025-02-23 13:55:03.360252: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[32,128,14,14]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,64,14,14]{3,2,1,0}, f32[128,64,3,3]{3,2,1,0}, f32[128]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_confi

[1m  27/1719[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 6ms/step - accuracy: 0.1094 - loss: 2.9137 

I0000 00:00:1740315305.376945   36939 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1711/1719[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.5127 - loss: 1.3727

2025-02-23 13:55:15.995848: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[24,128,14,14]{3,2,1,0}, u8[0]{0}) custom-call(f32[24,64,14,14]{3,2,1,0}, f32[128,64,3,3]{3,2,1,0}, f32[128]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]}
2025-02-23 13:55:16.058552: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[24,128,14,14]{3,2,1,0}, u8[0]{0}) custom-call(f32[24,128,14,14]{3,2,1,0}, f32[128,128,3,3]{3,2,1,0}, f32[128]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationF

[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5133 - loss: 1.3710

2025-02-23 13:55:18.230641: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[32,128,14,14]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,64,14,14]{3,2,1,0}, f32[128,64,3,3]{3,2,1,0}, f32[128]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kRelu","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]}
2025-02-23 13:55:18.296063: I external/local_xla/xla/service/gpu/autotuning/conv_algorithm_picker.cc:557] Omitted potentially buggy algorithm eng14{} for conv (f32[32,128,14,14]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,128,14,14]{3,2,1,0}, f32[128,128,3,3]{3,2,1,0}, f32[128]{0}), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationF

Epoch 1 GPU usage: 1831, 8192
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.5133 - loss: 1.3708 - val_accuracy: 0.7715 - val_loss: 0.5805
Epoch 2/2
[1m1716/1719[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.7639 - loss: 0.6562Epoch 2 GPU usage: 1759, 8192
[1m1719/1719[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.7639 - loss: 0.6561 - val_accuracy: 0.8427 - val_loss: 0.4420


2025-02-23 13:55:31.670162: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:68] Profiler session collecting data.
2025-02-23 13:55:31.916122: I external/local_xla/xla/backends/profiler/gpu/cupti_tracer.cc:1213] CUPTI activity buffer flushed
2025-02-23 13:55:33.252167: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:635]  GpuTracer has collected 513683 callback api events and 496610 activity events. 
2025-02-23 13:55:33.252209: I external/local_xla/xla/backends/profiler/gpu/cupti_collector.cc:638]  GpuTracer max callback_events: 2097152, max activity events: 2097152
2025-02-23 13:55:36.520494: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:130] Profiler session tear down.
2025-02-23 13:55:36.525918: I external/local_xla/xla/tsl/profiler/rpc/client/save_profile.cc:147] Collecting XSpace to repository: ./logs/trace/plugins/profile/2025_02_23_13_55_36/tower.xplane.pb




In [17]:
#%load_ext tensorboard
#%reload_ext tensorboard
#%tensorboard --logdir ./logs/