In [2]:
#First, although variable values are shared throughout the cluster, the graph is not automatically shared.
#Let's create a fresh cluster with two servers, and set up the first server with an explicitly-created graph.

import tensorflow as tf

cluster = tf.train.ClusterSpec({"local": ["localhost:2224", "localhost:2225"]})
server1 = tf.train.Server(cluster, job_name="local", task_index=0)
server2 = tf.train.Server(cluster, job_name="local", task_index=1)

In [3]:
graph1 = tf.Graph()
with graph1.as_default():
    var1 = tf.Variable(0.0, name='var')
sess1 = tf.Session(target=server1.target, graph=graph1)
print(graph1.get_operations())

[<tf.Operation 'var/initial_value' type=Const>, <tf.Operation 'var' type=VariableV2>, <tf.Operation 'var/Assign' type=Assign>, <tf.Operation 'var/read' type=Identity>]


In [4]:
#If we then create a session connected to the second server, note that the graph does not automatically get mirrored.

graph2 = tf.Graph()
sess2 = tf.Session(target=server2.target, graph=graph2)
print(graph2.get_operations())

[]


In [5]:
#To access the shared variable, we must manually add a variable with the same name to the second graph.

with graph2.as_default():
    var2 = tf.Variable(0.0, name='var')

In [6]:
#Only then can we access it.

sess1.run(var1.assign(1.0))
sess2.run(var2)

1.0

In [8]:
#Does the graph have to be the same on all servers?
#So far, all our examples have run the same graph structure on both servers. This is known as in-graph replication.

#For example, let's say we have a cluster containing three servers. Server 1 holds shared parameters, while server 2 and server 3 are worker nodes, each with local variables. 
#With in-graph replication, each server's graphs would look like:

#The issue with in-graph replication is that every server has to have a copy of the entire graph, including the parts of the graph that might only be relevant for other servers. 
#This can lead to graphs growing very large.
#The alternative is between-graph replication. Here, each server runs a graph containing only the shared parameters, and whatever variables and operations are relevant to that individual server.
#Because it keeps graph sizes smaller, between-graph replication is the recommended approach.

#What happens if we try to run something on the cluster before all servers have connected?
#Let's create another two-task cluster.

cluster = tf.train.ClusterSpec({
    "local": ["localhost:2226", "localhost:2227"]
})


In [10]:
#This time, let's start each server in a separate process. 
#(This allows us to kill the servers, so that we can start them again for later experiments. 
#There's currently no way of killing servers other than killing the process which started them.)

from multiprocessing import Process
from time import sleep

def s1():
    server1 = tf.train.Server(cluster,
                              job_name="local",
                              task_index=0)
    sess1 = tf.Session(server1.target)
    print("server 1: running no-op...")
    sess1.run(tf.no_op())
    print("server 1: no-op run!")
    server1.join() # Block

def s2():
    for i in range(3):
        print("server 2: %d seconds left before connecting..."
              % (3 - i))
        sleep(1.0)
    server2 = tf.train.Server(cluster,
                              job_name="local",
                              task_index=1)
    print("server 2: connected!")
    server2.join() # Block

# daemon=True so that these processes will definitely be killed
# when the parent process restarts
p1 = Process(target=s1, daemon=True)
p2 = Process(target=s2, daemon=True)

TypeError: __init__() got an unexpected keyword argument 'daemon'

In [12]:
p1 = Process(target=s1, daemon=True)
p2 = Process(target=s2, daemon=True)

TypeError: __init__() got an unexpected keyword argument 'daemon'