In [1]:
import tensorflow as tf

In [2]:
# Suppose several processes need shared access to some common parameters.
# To keep things simple, the "parameters" here are a single scalar variable.
var = tf.Variable(0.0)

In [3]:
# As a first step, imagine that each process needs its own session.
# (Pretend session 1 is created in one process and session 2 in another.)
sess1, sess2 = tf.Session(), tf.Session()

# Run the variable initializer through each session separately.
sess1.run(tf.global_variables_initializer())
sess2.run(tf.global_variables_initializer())

In [4]:
# Each call to tf.Session() creates a separate execution engine, then connects
# the session handle to that engine. The execution engine is what actually
# stores variable values and runs operations.
# Normally, execution engines in different processes are unlinked, so changing
# var in one session (on one execution engine) won't affect var in the other.

# The value is formatted into the message rather than passed as a second
# argument to print: without `from __future__ import print_function`,
# Python 2 treats `print(a, b)` as printing the tuple (a, b).
print("Initial value of var in session 1: {}".format(sess1.run(var)))
print("Initial value of var in session 2: {}".format(sess2.run(var)))

sess1.run(var.assign_add(1.0))
print("Incremented var in session 1")

print("Value of var in session 1: {}".format(sess1.run(var)))
print("Value of var in session 2: {}".format(sess2.run(var)))

('Initial value of var in session 1:', 0.0)
('Initial value of var in session 2:', 0.0)
Incremented var in session 1
('Value of var in session 1:', 1.0)
('Value of var in session 2:', 0.0)


In [5]:
# To share variables between processes, the separate execution engines have to
# be linked together. Enter Distributed TensorFlow.
#
# With Distributed TensorFlow, each process runs a special execution engine:
# a TensorFlow server. Servers are linked together as part of a cluster.
# (Each server in the cluster is also known as a task.)
#
# The first step is to define what the cluster looks like. We start with the
# simplest possible cluster: two servers (two tasks), both on the same machine,
# one listening on port 2222 and the other on port 2223.
tasks = [
    "localhost:2222",
    "localhost:2223",
]

In [6]:
# Each task is associated with a job, which is a collection of related tasks.
# Both tasks are associated with a single job called "local".
jobs = dict(local=tasks)

In [7]:
# The job -> task-address mapping is all that is needed to define the cluster.
cluster = tf.train.ClusterSpec(cluster=jobs)

In [8]:
# We can now launch the servers, telling each one which entry of the cluster
# definition it corresponds to. Each server starts immediately, listening on
# the port given for it in the cluster definition.

# "This server corresponds to the first task (task_index=0)
# of the tasks associated with the 'local' job."
server1 = tf.train.Server(
    cluster,
    job_name="local",
    task_index=0
)

# Likewise for the second task (task_index=1) of the 'local' job.
server2 = tf.train.Server(
    cluster,
    job_name="local",
    task_index=1
)

In [9]:
# With the servers linked together in the same cluster, we get the main magic
# of Distributed TensorFlow: any variable with the same name is shared between
# all servers.
# The simplest example is to run the same graph on all servers, each graph
# holding just one variable, as before.

# Start from an empty default graph so that it contains only the named variable.
tf.reset_default_graph()
var = tf.Variable(0.0, name="var")

# Each session now connects to one of the cluster's servers.
sess1 = tf.Session(target=server1.target)
sess2 = tf.Session(target=server2.target)

In [10]:
# Modifications made to the variable through one server are now visible
# through the second server as well.

sess1.run(tf.global_variables_initializer())
sess2.run(tf.global_variables_initializer())

# The value is formatted into the message rather than passed as a second
# argument to print: without `from __future__ import print_function`,
# Python 2 treats `print(a, b)` as printing the tuple (a, b).
print("Initial value of var in session 1: {}".format(sess1.run(var)))
print("Initial value of var in session 2: {}".format(sess2.run(var)))

sess1.run(var.assign_add(1.0))
print("Incremented var in session 1")

print("Value of var in session 1: {}".format(sess1.run(var)))
print("Value of var in session 2: {}".format(sess2.run(var)))

('Initial value of var in session 1:', 0.0)
('Initial value of var in session 2:', 0.0)
Incremented var in session 1
('Value of var in session 1:', 1.0)
('Value of var in session 2:', 1.0)


In [11]:
#Placement
#A question that might be in our minds at this point is: which server does the variable actually get stored on? 
#And for operations, which server actually runs them?
#Empirically, it seems that by default, variables and operations get placed on the first task in the cluster.

def run_with_location_trace(sess, op):
    """Run ``op`` through ``sess`` and print where each graph node executed.

    Adapted from https://stackoverflow.com/a/41525764/7832197

    The op is run with full tracing enabled. For every device entry recorded
    in the resulting step stats, the device name is printed, followed by one
    indented line per node recorded for that device.

    Args:
        sess: Session used to run ``op``.
        op: The fetch passed to ``sess.run``.

    Returns:
        None. The value produced by ``sess.run`` is discarded.
    """
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    sess.run(op, options=run_options, run_metadata=run_metadata)
    for device in run_metadata.step_stats.dev_stats:
        print(device.device)
        for node in device.node_stats:
            # Build a single string instead of passing two arguments to print:
            # on Python 2 without print_function, print("  ", name) prints the
            # tuple ('  ', u'name'). Three spaces reproduces what Python 3
            # prints for print("  ", name) (two spaces plus the separator).
            print("   " + node.node_name)

In [12]:
# For example, evaluating var through the session connected to the first task
# keeps everything on that task:
run_with_location_trace(sess=sess1, op=var)

/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var')


In [13]:
# Same session, but this time tracing an in-place update of var.
run_with_location_trace(sess=sess1, op=var.assign_add(1.0))

/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
('  ', u'AssignAdd_1/value')
('  ', u'var')
('  ', u'AssignAdd_1')


In [14]:
# But if we try to do something to var using the session connected to the
# second task, the graph nodes still get run on the first task:
run_with_location_trace(sess=sess2, op=var)

/job:local/replica:0/task:1/device:CPU:0
('  ', u'RecvTensor')
/job:local/replica:0/task:1/device:CPU:0
('  ', u'_SOURCE')
/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var')


In [15]:
# To pin a variable or an operation to a specific task, we can use tf.device:

with tf.device("/job:local/task:0"):
    var1 = tf.Variable(0.0, name='var1')
with tf.device("/job:local/task:1"):
    var2 = tf.Variable(0.0, name='var2')

# Initialize only the two new variables. tf.global_variables_initializer()
# would also re-run the initializer of the existing shared `var`, resetting
# the value it accumulated in the earlier cells back to 0.0.
sess1.run(tf.variables_initializer([var1, var2]))

In [16]:
# var1 is pinned to the first task, so evaluating it through the session
# connected to that task runs there, as before.
run_with_location_trace(sess=sess1, op=var1)

/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var1')


In [17]:
# var2, however, runs on the second task. Even when it is evaluated through
# the session connected to the first task, it still runs on the second task.
run_with_location_trace(sess=sess1, op=var2)

/job:local/replica:0/task:0/device:CPU:0
('  ', u'RecvTensor')
/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
/job:local/replica:0/task:1/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var2')


In [18]:
# And vice versa for the session connected to the second task: var2 runs on
# the second task.
run_with_location_trace(sess=sess2, op=var2)

/job:local/replica:0/task:1/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var2')


In [19]:
# Evaluate var1 (pinned to the first task) through the session connected to
# the second task.
run_with_location_trace(sess=sess2, op=var1)

/job:local/replica:0/task:1/device:CPU:0
('  ', u'RecvTensor')
/job:local/replica:0/task:1/device:CPU:0
('  ', u'_SOURCE')
/job:local/replica:0/task:0/device:CPU:0
('  ', u'_SOURCE')
('  ', u'var1')
