In [39]:
import os
import json
import tensorflow as tf
import tempfile

import numpy as np

In [17]:
# List the GPU devices that TensorFlow can see on this machine

physical_gpus = tf.config.list_physical_devices(device_type="GPU")
# Bare last expression so the notebook displays the list
physical_gpus


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Splitting a single GPU's RAM into multiple logical devices
#
# This is useful if you have a single GPU and want to test a multi-GPU algorithm.
#
# Logical devices have to be configured before TensorFlow initialises the GPU
# (i.e. on a fresh kernel, before any variable/op has been placed on it);
# otherwise TensorFlow raises RuntimeError. An invalid device raises ValueError.
# NOTE(review): this cell was originally marked "DOESN'T WORK - maybe because
# it's a Mac M2?" - the actual cause was never confirmed.
if not physical_gpus:
  # Indexing physical_gpus[0] would raise IndexError on a CPU-only machine
  print("No GPU available - skipping logical device configuration")
else:
  try:
    tf.config.set_logical_device_configuration(
      physical_gpus[0],
      # Two logical devices; memory_limit is expressed in MB per the TF docs
      [tf.config.LogicalDeviceConfiguration(memory_limit=2048),
       tf.config.LogicalDeviceConfiguration(memory_limit=2048)]
    )
  except (RuntimeError, ValueError) as err:
    # Show the reason instead of leaving a cell that crashes on Run All
    print(f"Could not split the GPU into logical devices: {err}")

In [28]:
# Device placement: where TensorFlow puts variables (CPU vs GPU)

# No device requested, float values: the recorded output shows GPU:0
a = tf.Variable([1.0, 2.0, 3.0])
print(a.device)

# No device requested, integer values: the recorded output shows CPU:0
b = tf.Variable([1, 2, 3])
print(b.device)

# A tf.device context pins the placement explicitly - here to the CPU
with tf.device("/cpu:0"):
  c = tf.Variable([1.0, 2.0, 3.0])
print(c.device)

# Requesting the GPU only works when a GPU kernel exists for the type.
# The recorded output shows this integer variable still ending up on the CPU,
# which the original author attributes to there being no GPU kernel for int32.
with tf.device("/gpu:0"):
  d = tf.Variable([1, 2, 3])
print(d.device)

/job:localhost/replica:0/task:0/device:GPU:0
/job:localhost/replica:0/task:0/device:CPU:0
/job:localhost/replica:0/task:0/device:CPU:0
/job:localhost/replica:0/task:0/device:CPU:0


In [35]:
# Data-parallel training of an MNIST classifier with a mirrored strategy

# Load MNIST; the first 5,000 training samples become the validation set
mnist = tf.keras.datasets.mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = mnist
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

# The model is built and compiled inside the strategy's scope
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential()
  # uint8 images are flattened, then rescaled by 1/255
  model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))
  model.add(tf.keras.layers.Rescaling(scale=1 / 255))
  model.add(tf.keras.layers.Dense(100, activation="relu"))
  model.add(tf.keras.layers.Dense(10, activation="softmax"))
  optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=1e-2)
  model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
  )

# The batch size should preferably be divisible by the number of replicas
batch_size = 100
# Kept as the last expression, so the returned History object is displayed
model.fit(
  X_train,
  y_train,
  batch_size=batch_size,
  epochs=10,
  validation_data=(X_valid, y_valid),
)



INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2e23ae550>

In [36]:
# Loading a saved model under a distribution strategy

# NOTE(review): "my_mnist_model/0001" is not written by any cell visible in
# this notebook - it is assumed to exist already from an earlier run.
with strategy.scope():
  model = tf.keras.models.load_model("my_mnist_model/0001")

# Predicted class of the first three test images: index of the largest score per row
model.predict([X_test[:3]]).argmax(axis=1)






array([7, 2, 1])

## Training models on a TensorFlow cluster

In [None]:
# TF_CONFIG must describe the cluster before the strategy is created
#
# NOT WORKING CODE - EXAMPLE ONLY! The host names below are placeholders.
cluster_spec = dict(
  worker=[
    "machine-a.example.com:2222",
    "machine-b.example.com:2222",
  ],
  ps=["machine-a.example.com:2221"],  # /job:ps/task:0
)

# This process declares itself to be the worker with index 0
os.environ["TF_CONFIG"] = json.dumps(
  dict(cluster=cluster_spec, task=dict(type="worker", index=0))
)


# Per the original author's note, the strategy has to be created at the very
# start of the program. NOTE: this rebinds the notebook-level `strategy` name.
strategy = tf.distribute.MultiWorkerMirroredStrategy()
# The resolver exposes the task type and id used below
resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

print(f"Starting task {resolver.task_type} #{resolver.task_id}")

# Same architecture as the single-machine model, built inside the new scope
with strategy.scope():
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))
  model.add(tf.keras.layers.Rescaling(scale=1 / 255))
  model.add(tf.keras.layers.Dense(100, activation="relu"))
  model.add(tf.keras.layers.Dense(10, activation="softmax"))
  optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=1e-2)
  model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
  )

# X_train / y_train / X_valid / y_valid come from the MNIST split made earlier
model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

# Every task calls model.save(); only the copy written by task 0 is kept.
# NOTE(review): presumably all workers must take part in the save for the
# strategy to stay in sync - see the TF multi-worker Keras tutorial.
if resolver.task_id == 0:
  # The chief saves the model to the real destination
  model.save("my_mnist_multiworker_model", save_format="tf")
else:
  # Other workers save into a throw-away directory. The context manager
  # removes it even if model.save() raises, so no temp directory is leaked.
  with tempfile.TemporaryDirectory() as tmpdir:
    model.save(tmpdir, save_format="tf")