In [None]:
# based on https://www.tensorflow.org/hub/tutorials/tf2_image_retraining

In [2]:
from tensorflow.keras.applications import EfficientNetB0
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import tensorflow_hub as hub

import os
import sys

tf.__version__

2023-04-14 15:24:00.222508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'2.12.0'

In [4]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

In [5]:
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")


GPU is available


In [6]:
batch_size = 64
image_size = 224

model_name = "efficientnetv2-b0-21k"
model_handle = "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b0/classification/2"

print(f"Selected model: {model_name} : {model_handle}")

pixels = 224
IMAGE_SIZE = (pixels, pixels)
print(f"Input size {IMAGE_SIZE}")



Selected model: efficientnetv2-b0-21k : https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b0/classification/2
Input size (224, 224)


In [7]:

data_dir = "/tf/dataset/cavity_images"

train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    shuffle=True,
    validation_split=0.2,
    subset="training",
    label_mode='categorical',
    seed=123,
    image_size=(image_size, image_size),
    batch_size=batch_size,
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    shuffle=True,
    validation_split=0.2,
    subset="validation",
    label_mode='categorical',
    seed=123,
    image_size=(image_size, image_size),
    batch_size=batch_size,
)
class_names = train_ds.class_names
NUM_CLASSES = len(class_names)

Found 20067 files belonging to 10 classes.
Using 16054 files for training.


2023-04-14 15:24:09.030798: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-14 15:24:09.031039: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-14 15:24:09.031179: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Found 20067 files belonging to 10 classes.
Using 4013 files for validation.


In [None]:
do_fine_tuning = True

print("Building model with", model_handle)
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=IMAGE_SIZE + (3,)),
    hub.KerasLayer(model_handle, trainable=do_fine_tuning),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(len(class_names),
                          kernel_regularizer=tf.keras.regularizers.l2(0.0001))
])
model.build()
model.summary()

In [9]:
model.summary()
(None,)+IMAGE_SIZE+(3,)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 21843)             33900195  
                                                                 
 dropout (Dropout)           (None, 21843)             0         
                                                                 
 dense (Dense)               (None, 10)                218440    
                                                                 
Total params: 34,118,635
Trainable params: 34,058,027
Non-trainable params: 60,608
_________________________________________________________________


(None, 224, 224, 3)

In [10]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), 
  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
  metrics=['accuracy'])

In [11]:
steps_per_epoch = len(train_ds) // batch_size
validation_steps = len(test_ds) // batch_size

hist = model.fit(
    train_ds,
    epochs=5, steps_per_epoch=steps_per_epoch,
    validation_data=test_ds
    )

Epoch 1/5


2023-04-14 15:24:24.287353: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [16054]
	 [[{{node Placeholder/_0}}]]
2023-04-14 15:24:24.287649: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [16054]
	 [[{{node Placeholder/_0}}]]
2023-04-14 15:24:24.658876: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/head/StatefulPartitionedCall_grad/hea

2023-04-14 15:24:24.929859: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_18/StatefulPartitionedCall_grad/blocks_18/StatefulPartitionedCall' with dtype float and shape [?,?,?,192]
	 [[{{node gradients/blocks_18/StatefulPartitionedCall_grad/blocks_18/StatefulPartitionedCall}}]]
2023-04-14 15:24:24.929943: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_18/StatefulPartitionedCall_grad/blocks_18/StatefulPartitionedCall_1' with dtype float and shape [?,?,?,192]
	 [[{{node gradients/blocks_18/StatefulPartitionedCall_grad/blocks_18/StatefulPartitionedCall_1}}]]
2023-04-14 15:24:24.929987: 

2023-04-14 15:24:25.207466: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_15/StatefulPartitionedCall_grad/blocks_15/StatefulPartitionedCall' with dtype float and shape [?,?,?,192]
	 [[{{node gradients/blocks_15/StatefulPartitionedCall_grad/blocks_15/StatefulPartitionedCall}}]]
2023-04-14 15:24:25.207550: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_15/StatefulPartitionedCall_grad/blocks_15/StatefulPartitionedCall_1' with dtype float and shape [?,?,?,192]
	 [[{{node gradients/blocks_15/StatefulPartitionedCall_grad/blocks_15/StatefulPartitionedCall_1}}]]
2023-04-14 15:24:25.207594: 

2023-04-14 15:24:25.554035: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_13/StatefulPartitionedCall_grad/blocks_13/StatefulPartitionedCall_7' with dtype float and shape [?,?,?,672]
	 [[{{node gradients/blocks_13/StatefulPartitionedCall_grad/blocks_13/StatefulPartitionedCall_7}}]]
2023-04-14 15:24:25.554120: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_13/StatefulPartitionedCall_grad/blocks_13/StatefulPartitionedCall_9' with dtype float and shape [?,1,1,672]
	 [[{{node gradients/blocks_13/StatefulPartitionedCall_grad/blocks_13/StatefulPartitionedCall_9}}]]
2023-04-14 15:24:25.5541

2023-04-14 15:24:25.813764: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_10/StatefulPartitionedCall_grad/blocks_10/StatefulPartitionedCall' with dtype float and shape [?,?,?,112]
	 [[{{node gradients/blocks_10/StatefulPartitionedCall_grad/blocks_10/StatefulPartitionedCall}}]]
2023-04-14 15:24:25.813849: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_10/StatefulPartitionedCall_grad/blocks_10/StatefulPartitionedCall_1' with dtype float and shape [?,?,?,112]
	 [[{{node gradients/blocks_10/StatefulPartitionedCall_grad/blocks_10/StatefulPartitionedCall_1}}]]
2023-04-14 15:24:25.813895: 

2023-04-14 15:24:26.062708: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_7/StatefulPartitionedCall_grad/blocks_7/StatefulPartitionedCall' with dtype float and shape [?,?,?,96]
	 [[{{node gradients/blocks_7/StatefulPartitionedCall_grad/blocks_7/StatefulPartitionedCall}}]]
2023-04-14 15:24:26.062794: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_7/StatefulPartitionedCall_grad/blocks_7/StatefulPartitionedCall_1' with dtype float and shape [?,?,?,96]
	 [[{{node gradients/blocks_7/StatefulPartitionedCall_grad/blocks_7/StatefulPartitionedCall_1}}]]
2023-04-14 15:24:26.062840: I tensorfl

2023-04-14 15:24:26.271160: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_4/StatefulPartitionedCall_grad/blocks_4/StatefulPartitionedCall' with dtype float and shape [?,?,?,48]
	 [[{{node gradients/blocks_4/StatefulPartitionedCall_grad/blocks_4/StatefulPartitionedCall}}]]
2023-04-14 15:24:26.271245: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/blocks_4/StatefulPartitionedCall_grad/blocks_4/StatefulPartitionedCall_1' with dtype float and shape [?,?,?,48]
	 [[{{node gradients/blocks_4/StatefulPartitionedCall_grad/blocks_4/StatefulPartitionedCall_1}}]]
2023-04-14 15:24:26.271289: I tensorfl

2023-04-14 15:24:26.731026: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_2' with dtype float and shape [?,1280]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_2}}]]
2023-04-14 15:24:26.731126: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3' with dtype float and shape [?,1280]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3}}]]
2023-04-14 15:24:26.731171: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Ex

2023-04-14 15:24:27.081168: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_2' with dtype float and shape [?,1280]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_2}}]]
2023-04-14 15:24:27.081275: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3' with dtype float and shape [?,1280]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3}}]]
2023-04-14 15:24:27.081323: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Ex

2023-04-14 15:24:38.832388: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-04-14 15:24:40.555580: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 144.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-04-14 15:24:40.564210: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 299.71MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-04-14 15:24:40.566626: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 144.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gain

ResourceExhaustedError: Graph execution error:

OOM when allocating tensor with shape[64,1152,7,7] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node tpu_batch_normalization_1/FusedBatchNormV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_57082]

In [10]:
def plot_hist(hist):
    plt.plot(hist.history["accuracy"])
    plt.plot(hist.history["val_accuracy"])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()