In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_hub as hub

# Helper libraries
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

In [3]:
splits = ['train[:80%]', 'train[80%:90%]', 'train[90%:]']

(train_examples, validation_examples, test_examples), info = tfds.load('oxford_flowers102', with_info=True, as_supervised=True, split = splits, data_dir='data/')

num_examples = info.splits['train'].num_examples
print('Number of examples is {}'.format(num_examples))
num_classes = info.features['label'].num_classes
print('Number of classes is {}'.format(num_classes))

Number of examples is 1020
Number of classes is 102


2022-06-29 18:18:21.088399: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-29 18:18:23.091128: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14628 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:18:00.0, compute capability: 7.0
2022-06-29 18:18:23.091785: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14628 MB memory:  -> device: 1, name: Tesla V100-SXM2-16GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2022-06-29 18:18:23.092218: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localho

In [4]:
for image,label in train_examples.take(1):
    print(image.shape)
    print(label.shape)

(500, 667, 3)
()


In [5]:
# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [6]:
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 4


In [7]:
BUFFER_SIZE = num_examples
EPOCHS = 10
pixels = 224
MODULE_HANDLE = 'https://tfhub.dev/google/imagenet/resnet_v1_50/feature_vector/5'
#MODULE_HANDLE = 'data/resnet_50_feature_vector'
IMAGE_SIZE = (pixels, pixels)
print("Using {} with input size {}".format(MODULE_HANDLE, IMAGE_SIZE))

Using https://tfhub.dev/google/imagenet/resnet_v1_50/feature_vector/5 with input size (224, 224)


In [8]:
def format_image(image, label):
    image = tf.image.resize(image, IMAGE_SIZE) / 255.0
    return  image, label

In [9]:
def set_global_batch_size(batch_size_per_replica, strategy):
    '''
    Args:
        batch_size_per_replica (int) - batch size per replica
        strategy (tf.distribute.Strategy) - distribution strategy
    '''
    
    # set the global batch size
    global_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
    
    return global_batch_size

In [10]:
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = set_global_batch_size(BATCH_SIZE_PER_REPLICA, strategy)

print(GLOBAL_BATCH_SIZE)

256


In [11]:
train_batches = train_examples.shuffle(num_examples // 4).map(format_image).batch(BATCH_SIZE_PER_REPLICA).prefetch(1)
validation_batches = validation_examples.map(format_image).batch(BATCH_SIZE_PER_REPLICA).prefetch(1)
test_batches = test_examples.map(format_image).batch(1)

In [12]:
def distribute_datasets(strategy, train_batches, validation_batches, test_batches):
    
    train_dist_dataset = strategy.experimental_distribute_dataset(train_batches)
    val_dist_dataset = strategy.experimental_distribute_dataset(validation_batches)
    test_dist_dataset = strategy.experimental_distribute_dataset(test_batches)
    
    return train_dist_dataset, val_dist_dataset, test_dist_dataset

In [13]:
train_dist_dataset, val_dist_dataset, test_dist_dataset = distribute_datasets(strategy, train_batches, validation_batches, test_batches)

2022-06-29 18:18:23.702881: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:537] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.
2022-06-29 18:18:23.731686: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:537] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.
2022-06-29 18:18:23.754134: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:537] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


In [14]:
print(type(train_dist_dataset))
print(type(val_dist_dataset))
print(type(test_dist_dataset))

<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>
<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>
<class 'tensorflow.python.distribute.input_lib.DistributedDataset'>


In [15]:
# Take a look at a single batch from the train_dist_dataset
x = iter(train_dist_dataset).get_next()
    
print(f"x is a tuple that contains {len(x)} values ")
# print(f"x[0] contains the features, and has shape {x[0].shape}")
# print(f"  so it has {x[0].shape[0]} examples in the batch, each is an image that is {x[0].shape[1:]}")
# print(f"x[1] contains the labels, and has shape {x[1].shape}")

x is a tuple that contains 2 values 


In [16]:
class ResNetModel(tf.keras.Model):
    def __init__(self, classes):
        super(ResNetModel, self).__init__()
        self._feature_extractor = hub.KerasLayer(MODULE_HANDLE,
                                                 trainable=False) 
        self._classifier = tf.keras.layers.Dense(classes, activation='softmax')

    def call(self, inputs):
        x = self._feature_extractor(inputs)
        x = self._classifier(x)
        return x

In [17]:
# Create a checkpoint directory to store the checkpoints.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

In [18]:
with strategy.scope():
    # Set reduction to `NONE` so we can do the reduction afterwards and divide by
    # global batch size.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE)
    # or loss_fn = tf.keras.losses.sparse_categorical_crossentropy
    def compute_loss(labels, predictions):
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

    test_loss = tf.keras.metrics.Mean(name='test_loss')

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [19]:
with strategy.scope():
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='test_accuracy')

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


In [20]:
# model and optimizer must be created under `strategy.scope`.
with strategy.scope():
    model = ResNetModel(classes=num_classes)
    optimizer = tf.keras.optimizers.Adam()
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

In [21]:
def train_test_step_fns(strategy, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy):
    with strategy.scope():
        def train_step(inputs):
            images, labels = inputs

            with tf.GradientTape() as tape:
                predictions = model(images)
                loss = compute_loss(labels, predictions)

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            train_accuracy.update_state(labels, predictions)
            return loss 

        def test_step(inputs):
            images, labels = inputs
            
            predictions = model(images)
            t_loss = compute_loss(labels, predictions)

            test_loss.update_state(t_loss)
            test_accuracy.update_state(labels, predictions)
        
        return train_step, test_step

In [22]:
train_step, test_step = train_test_step_fns(strategy, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy)

In [23]:
def distributed_train_test_step_fns(strategy, train_step, test_step, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy):
    with strategy.scope():
        @tf.function
        def distributed_train_step(dataset_inputs):
            per_replica_losses = strategy.run(train_step, args=(dataset_inputs,))
            return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                                   axis=None)

        @tf.function
        def distributed_test_step(dataset_inputs):
            return strategy.run(test_step, args=(dataset_inputs,))
    
        return distributed_train_step, distributed_test_step

In [24]:
distributed_train_step, distributed_test_step = distributed_train_test_step_fns(strategy, train_step, test_step, model, compute_loss, optimizer, train_accuracy, loss_object, test_loss, test_accuracy)

In [30]:
with strategy.scope():
    for epoch in range(EPOCHS):
        # TRAIN LOOP
        total_loss = 0.0
        num_batches = 0
        for x in tqdm(train_dist_dataset):
            total_loss += distributed_train_step(x)
            num_batches += 1
        train_loss = total_loss / num_batches

        # TEST LOOP
        for x in test_dist_dataset:
            distributed_test_step(x)

        template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
                    "Test Accuracy: {}")
        print (template.format(epoch+1, train_loss,
                               train_accuracy.result()*100, test_loss.result(),
                               test_accuracy.result()*100))

        test_loss.reset_states()
        train_accuracy.reset_states()
        test_accuracy.reset_states()

13it [00:00, 15.18it/s]


Epoch 1, Loss: 0.004509424325078726, Accuracy: 100.0, Test Loss: 0.0012388842878863215, Test Accuracy: 66.66667175292969


13it [00:00, 19.01it/s]


Epoch 2, Loss: 0.004283434711396694, Accuracy: 100.0, Test Loss: 0.0012341475812718272, Test Accuracy: 66.66667175292969


13it [00:00, 18.09it/s]


Epoch 3, Loss: 0.004081496968865395, Accuracy: 100.0, Test Loss: 0.0012352012563496828, Test Accuracy: 66.66667175292969


13it [00:00, 16.24it/s]


Epoch 4, Loss: 0.003890641964972019, Accuracy: 100.0, Test Loss: 0.001234022667631507, Test Accuracy: 66.66667175292969


13it [00:00, 17.30it/s]


Epoch 5, Loss: 0.003720073262229562, Accuracy: 100.0, Test Loss: 0.0012301856186240911, Test Accuracy: 66.66667175292969


13it [00:00, 17.08it/s]


Epoch 6, Loss: 0.003556705778464675, Accuracy: 100.0, Test Loss: 0.0012285284465178847, Test Accuracy: 66.66667175292969


13it [00:00, 16.63it/s]


Epoch 7, Loss: 0.0034005704801529646, Accuracy: 100.0, Test Loss: 0.0012271060841158032, Test Accuracy: 66.66667175292969


13it [00:00, 17.28it/s]


Epoch 8, Loss: 0.0032610436901450157, Accuracy: 100.0, Test Loss: 0.0012247494887560606, Test Accuracy: 66.66667175292969


13it [00:00, 16.88it/s]


Epoch 9, Loss: 0.003130272263661027, Accuracy: 100.0, Test Loss: 0.0012225187383592129, Test Accuracy: 66.66667175292969


13it [00:00, 16.89it/s]


Epoch 10, Loss: 0.0030058512929826975, Accuracy: 100.0, Test Loss: 0.001219544094055891, Test Accuracy: 66.66667175292969


13it [00:00, 17.42it/s]


Epoch 11, Loss: 0.0028942408971488476, Accuracy: 100.0, Test Loss: 0.0012207817053422332, Test Accuracy: 66.66667175292969


13it [00:00, 16.53it/s]


Epoch 12, Loss: 0.002783317118883133, Accuracy: 100.0, Test Loss: 0.001217754208482802, Test Accuracy: 66.66667175292969


13it [00:00, 17.16it/s]


Epoch 13, Loss: 0.0026809496339410543, Accuracy: 100.0, Test Loss: 0.0012161785271018744, Test Accuracy: 66.66667175292969


13it [00:00, 17.28it/s]


Epoch 14, Loss: 0.0025860602036118507, Accuracy: 100.0, Test Loss: 0.00121533393394202, Test Accuracy: 66.66667175292969


13it [00:00, 17.92it/s]


Epoch 15, Loss: 0.00249398872256279, Accuracy: 100.0, Test Loss: 0.001213330077007413, Test Accuracy: 66.66667175292969


13it [00:00, 18.37it/s]


Epoch 16, Loss: 0.002408526837825775, Accuracy: 100.0, Test Loss: 0.0012121729087084532, Test Accuracy: 66.66667175292969


13it [00:00, 17.91it/s]


Epoch 17, Loss: 0.0023297180887311697, Accuracy: 100.0, Test Loss: 0.0012119205202907324, Test Accuracy: 66.66667175292969


13it [00:00, 17.53it/s]


Epoch 18, Loss: 0.00225058919750154, Accuracy: 100.0, Test Loss: 0.0012112020049244165, Test Accuracy: 66.66667175292969


13it [00:00, 18.02it/s]


Epoch 19, Loss: 0.002177456859499216, Accuracy: 100.0, Test Loss: 0.0012100862804800272, Test Accuracy: 66.66667175292969


13it [00:00, 17.58it/s]


Epoch 20, Loss: 0.0021076337434351444, Accuracy: 100.0, Test Loss: 0.00120741396676749, Test Accuracy: 66.66667175292969


In [31]:
model_save_path = "./tmp/mymodel/1/"
tf.saved_model.save(model, model_save_path)

INFO:tensorflow:Assets written to: ./tmp/mymodel/1/assets


INFO:tensorflow:Assets written to: ./tmp/mymodel/1/assets


In [32]:
import os
import zipfile

def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))

zipf = zipfile.ZipFile('./mymodel.zip', 'w', zipfile.ZIP_DEFLATED)
zipdir('./tmp/mymodel/1/', zipf)
zipf.close()