# Create cluster

## 1. Create with job name localhost

In [None]:
import tensorflow as tf
# Constant evaluated below to show that a session against this server works.
c = tf.constant("Hello from server1!")

# Describe a cluster with one job, "local", made of three tasks listening on
# local ports 2222, 2223 and 2224. Addresses are written as plain "host:port"
# (no whitespace after the colon) so they do not rely on lenient parsing.
cluster = tf.train.ClusterSpec(
    {"local": ["localhost:2222", "localhost:2223", "localhost:2224"]})
# Start the server for this process. job_name and task_index select which
# entry of the cluster spec this process is; task_index=0 is the first
# address in the "local" list (localhost:2222).
server = tf.train.Server(cluster, job_name="local", task_index=0)

# Create a session from server.target so that it uses the cluster's
# resources. log_device_placement=True logs which task/device runs each op.
sess = tf.Session(
    server.target, config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))
# Block here so the server keeps serving the other tasks and clients.
server.join()

In [None]:
import tensorflow as tf

# Pin the constant to task 1 of the "local" job.
with tf.device("/job:local/task:1"):
    c = tf.constant("Hello from server2!")

# Same cluster configuration as the first program; every task in the cluster
# must be started with the same configuration. Addresses are written as plain
# "host:port" (no whitespace after the colon).
cluster = tf.train.ClusterSpec(
    {"local": ["localhost:2222", "localhost:2223", "localhost:2224"]})
# task_index=1 selects the second address, so this program serves on
# localhost:2223.
server = tf.train.Server(cluster, job_name="local", task_index=1)
# The rest is the same as in the first task: open a session on this server,
# log op placement, evaluate the constant, then keep the server alive.
sess = tf.Session(
    server.target, config=tf.ConfigProto(log_device_placement=True))
print(sess.run(c))
server.join()

In [None]:
import tensorflow as tf

# Same cluster configuration as the first program; every task in the cluster
# must be started with the same configuration. Addresses are written as plain
# "host:port" (no whitespace after the colon).
cluster = tf.train.ClusterSpec(
    {"local": ["localhost:2222", "localhost:2223", "localhost:2224"]})
# task_index=2 selects the third address, so this program serves on
# localhost:2224.
server = tf.train.Server(cluster, job_name="local", task_index=2)

# This task only serves requests; block so the server stays up.
server.join()

Three local servers will start running after each of the cells above is executed individually.

We can run the following script against any of the servers.

In [None]:
import tensorflow as tf

# A tiny graph with no explicit device constraints.
a = tf.constant(1.0)
b = a + 2
c = a * 3

# Connecting to "grpc://localhost:2224" sends the session through the third
# server (task 2). That does not force the ops to run on that task:
# TensorFlow decides op placement at the cluster level. The original notes
# observed a, b and c all running on task 0 -- check the
# log_device_placement output to confirm on your setup.
# The target is written as plain "host:port" (no whitespace after the colon).
with tf.Session("grpc://localhost:2224",
                config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(c.eval())

In [None]:
import tensorflow as tf

# Pin `a` to the CPU of task 1.
with tf.device("/job:local/task:1/cpu:0"):
    a = tf.constant(1.0)

# Pin `b` to the CPU of task 2.
with tf.device("/job:local/task:2/cpu:0"):
    b = a + 2

# `c` has no explicit device, so TensorFlow chooses its placement.
c = a * 3

# Connecting to "grpc://localhost:2223" sends the session through the second
# server (task 1), but ops still run on the devices requested above: `a` on
# task 1 and `b` on task 2. `c` is auto-assigned (the original notes reported
# task 1 -- verify with the log_device_placement output).
# The target is written as plain "host:port" (no whitespace after the colon).
with tf.Session("grpc://localhost:2223",
                config=tf.ConfigProto(log_device_placement=True)) as sess:
    print(c.eval())

Variables created with either `tf.Variable` or `tf.get_variable` are shared across the servers.

Running the following code with the arguments `grpc://localhost:2223 init` creates the variable; running it with the argument `grpc://localhost:2224` just reuses the variable that was created through the other server.


In [None]:
# simple_client.py
import tensorflow as tf
import sys

# Equivalent alternative using tf.Variable instead of tf.get_variable:
#   x = tf.Variable(0.0, name="x")
x = tf.get_variable("x", dtype=tf.float32, initializer=0.0)
increment_x = tf.assign(x, x + 1)

# First command-line argument: the session target (e.g. a grpc:// address).
target = sys.argv[1]
# The variable is only initialized when the remaining arguments are exactly
# ["init"]; otherwise its existing value is reused.
should_init = sys.argv[2:] == ["init"]
session_config = tf.ConfigProto(log_device_placement=True)

with tf.Session(target, config=session_config) as sess:
    if should_init:
        sess.run(x.initializer)
    # Add one to x, then print its current value.
    sess.run(increment_x)
    print(x.eval())

## 2. Create with job name ps and worker

The job names `ps` and `worker` also work on localhost. The following cells create three localhost servers: one `ps` task and two `worker` tasks.

In [None]:
import tensorflow as tf

# Cluster layout: one "ps" task on port 2222 and two "worker" tasks on
# ports 2223 and 2224, all on localhost.
cluster_spec = {
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223", "localhost:2224"],
}
# This process is task 0 of the "ps" job.
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0)

c = tf.constant("Hello from server1!")

# Variables created under the replica_device_setter device function are
# placed on the parameter server.
device_fn = tf.train.replica_device_setter(cluster=cluster_spec)
with tf.device(device_fn):
    v = tf.get_variable("v", shape=[20, 20])

# Session on this server, logging which device runs each op.
session_config = tf.ConfigProto(log_device_placement=True)
sess = tf.Session(server.target, config=session_config)
print(sess.run(c))
# Initialize the variable, then read it once.
sess.run(v.initializer)
sess.run(v)
# Block so the server keeps running.
server.join()

In [None]:
import tensorflow as tf

# Cluster layout: one "ps" task and two "worker" tasks on localhost.
ps_hosts = ["localhost:2222"]
worker_hosts = ["localhost:2223", "localhost:2224"]
cluster_spec = {"ps": ps_hosts, "worker": worker_hosts}

# This process is task 0 of the "worker" job (first worker address).
server = tf.train.Server(cluster_spec, job_name="worker", task_index=0)

# Block so the server keeps running.
server.join()

In [None]:
import tensorflow as tf

# Cluster layout: one "ps" task and two "worker" tasks on localhost.
ps_hosts = ["localhost:2222"]
worker_hosts = ["localhost:2223", "localhost:2224"]
cluster_spec = {"ps": ps_hosts, "worker": worker_hosts}

# This process is task 1 of the "worker" job (second worker address).
server = tf.train.Server(cluster_spec, job_name="worker", task_index=1)

# Block so the server keeps running.
server.join()