In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data

In [2]:
from lib import helper
import os

In [3]:
# Read MNIST from data/ with one-hot labels and keep the test split in
# module-level arrays for the final evaluation.
mnist = input_data.read_data_sets("data/", one_hot=True)
test_data, test_label = mnist.test.images, mnist.test.labels

Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz


In [4]:
# --- Network shape ---
num_inputs = 28 * 28   # flattened 28x28 image -> 784 features
num_outputs = 10
num_units = 512        # width of every hidden layer
num_layers = 8         # number of hidden layers

# --- Optimisation ---
batch_size = 64
num_epochs = 10000     # used as the number of training iterations per worker
learning_rate = 1e-4

# --- Regularisation ---
use_dropout = True
in_dropout_rate = 0.0   # dropout rate applied to the input
all_dropout_rate = 0.0  # dropout rate applied after each hidden layer

# --- Distribution ---
worker_num = 2          # replicas aggregated per synchronous update

In [5]:
# Dtype aliases referenced when building the graph.
myint, myfloat = tf.int32, tf.float32

In [6]:
# A quick, rough model (not tuned).
class DNN:
    """Fully connected classifier with a synchronous-replica training op.

    The graph is built from the notebook-level settings ``num_inputs``,
    ``num_outputs``, ``num_units``, ``num_layers``, ``learning_rate``,
    ``use_dropout``, ``in_dropout_rate``, ``all_dropout_rate`` and
    ``worker_num``.

    Attributes:
        global_step: Global step tensor incremented by ``train``.
        x: Float placeholder of shape ``[None, num_inputs]`` for the inputs.
        y: Float placeholder of shape ``[None, num_outputs]`` for the labels.
        is_training: Boolean placeholder that switches dropout on or off.
        loss_ce: Mean cross-entropy between ``y`` and the softmax output.
        optimizer: ``SyncReplicasOptimizer`` wrapping an Adam optimizer.
        train: Op applying the gradients of ``loss_ce``.
        accuracy: Fraction of examples whose arg-max prediction matches ``y``.
        merged_summary: Merge of the four summaries in the ``train`` scope.
        test_smr_acc: Accuracy summary in the ``test`` scope.
    """

    def __init__(self):
        self.global_step = tf.train.get_or_create_global_step()
        self.x = tf.placeholder(dtype=myfloat, shape=[None, num_inputs], name='x')
        self.y = tf.placeholder(dtype=myfloat, shape=[None, num_outputs], name='y')
        self.is_training = tf.placeholder(dtype=tf.bool, name='is_training')

        # Build on a local tensor and leave self.x bound to the placeholder.
        # Rebinding self.x to the dropout output would make every
        # feed_dict={dnn.x: ...} overwrite that output, silently skipping
        # input dropout.
        net = self.x
        if use_dropout:
            net = tf.layers.dropout(net, rate=in_dropout_rate, training=self.is_training)

        # Hidden stack; starting from `net` also keeps the graph valid when
        # num_layers is 0.
        for i in range(num_layers):
            net = tf.layers.dense(net,
                                  num_units,
                                  tf.nn.relu,
                                  kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                  name='l_{}'.format(i))
            if use_dropout:
                net = tf.layers.dropout(net,
                                        rate=all_dropout_rate,
                                        training=self.is_training,
                                        name='l_dropout_{}'.format(i))

        out = tf.layers.dense(net,
                              num_outputs,
                              tf.nn.softmax,
                              kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                              name='out_layer')

        # The small constant (10e-8 == 1e-7) keeps log() finite when a
        # softmax probability underflows to zero.
        self.loss_ce = tf.reduce_mean(-tf.reduce_sum(self.y * tf.log(out + 10e-8), 1))
        # Summed squared error; only reported as a summary, never optimised.
        loss_se = tf.reduce_sum(tf.square(self.y - out))

        self.optimizer = tf.train.AdamOptimizer(learning_rate)
        self.optimizer = tf.train.SyncReplicasOptimizer(self.optimizer,
                                                        replicas_to_aggregate=worker_num,
                                                        total_num_replicas=worker_num)
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss_ce, params)
        self.train = self.optimizer.apply_gradients(zip(gradients, params), global_step=self.global_step)

        # Sum of the per-variable gradient norms, reported as a summary.
        grad_norm = [tf.norm(g) for g in gradients]
        grad_norm_sum = tf.reduce_sum(grad_norm)

        y_label = tf.argmax(self.y, 1)
        out_label = tf.argmax(out, 1)

        correct = tf.equal(y_label, out_label)
        self.accuracy = tf.reduce_mean(tf.cast(correct, myfloat))

        with tf.name_scope('train'):
            smr_loss_ce = tf.summary.scalar('loss_cross_entropy', self.loss_ce)
            smr_loss_se = tf.summary.scalar('loss_squared_error', loss_se)
            smr_acc = tf.summary.scalar('accuracy', self.accuracy)
            smr_grad = tf.summary.scalar('gradient', grad_norm_sum)
            self.merged_summary = tf.summary.merge([smr_loss_ce, smr_loss_se, smr_acc, smr_grad])

        with tf.name_scope('test'):
            self.test_smr_acc = tf.summary.scalar('accuracy', self.accuracy)

In [7]:
from datetime import datetime
import time
# Base directory for logs, plus the time at which this cell was run.
logdir = logdir_base = 'logs'
now = datetime.now()

In [8]:
# Cluster layout: two worker tasks and two parameter-server tasks, all on
# localhost and told apart by port.
cluster = tf.train.ClusterSpec({
    'worker': ['localhost:3335', 'localhost:3336'],
    'ps': ['localhost:3331', 'localhost:3332'],
})

In [9]:
def parameter_server(task_index):
    """Host one task of the cluster's 'ps' job and wait on it.

    CUDA_VISIBLE_DEVICES is set to the empty string before the server is
    created, so this process is not given any GPU.

    Args:
        task_index: Index of this task within the 'ps' job of ``cluster``.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    ps_server = tf.train.Server(cluster, job_name='ps', task_index=task_index)
    ps_server.join()

In [10]:
def worker(task_index, logs_path, checkpoint_dir='logs/train_logs'):
    """Run one training worker of the cluster, then print its test accuracy.

    Builds a ``DNN`` under a replica device setter, trains it for
    ``num_epochs`` mini-batch iterations inside a ``MonitoredTrainingSession``
    and finally evaluates accuracy on the module-level test set.

    Args:
        task_index: Index of this task within the 'worker' job of ``cluster``.
            It is also written to CUDA_VISIBLE_DEVICES, and task 0 acts as
            the chief.
        logs_path: Directory under which summaries are written to a
            ``worker_<task_index>`` sub-directory.
        checkpoint_dir: Checkpoint directory handed to the monitored session.
            A checkpoint already present there is restored on start-up, so
            pass a fresh directory to train from scratch.
    """
    # os.path.join gives the same result as plain concatenation when
    # logs_path ends with a separator, and a correct one when it does not.
    logs_path = os.path.join(logs_path, 'worker_%d' % task_index)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(task_index)
    server = tf.train.Server(cluster, job_name='worker', task_index=task_index)
    is_chief = (task_index == 0)

    device_function = tf.train.replica_device_setter(worker_device='/job:worker/task:%d' % task_index,
                                                     cluster=cluster)
    with tf.device(device_function):
        dnn = DNN()

    global_step = dnn.global_step
    loss_op = dnn.loss_ce
    accuracy_op = dnn.accuracy
    train_op = dnn.train
    optimizer = dnn.optimizer

    summary_hook = tf.train.SummarySaverHook(save_secs=5, output_dir=logs_path, summary_op=tf.summary.merge_all())
    hooks = [summary_hook, tf.train.NanTensorHook(loss_op), optimizer.make_session_run_hook(is_chief)]

    config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(
            allow_growth=True
        ))

    # training
    with tf.train.MonitoredTrainingSession(master=server.target,
                                           config=config,
                                           is_chief=is_chief,
                                           checkpoint_dir=checkpoint_dir,
                                           hooks=hooks) as sess:
        start_all = time.time()
        # Each iteration is one mini-batch step; the log text keeps the
        # existing "epoch" wording.
        for i in range(num_epochs):
            # Calling run() after a stop request raises RuntimeError, so
            # leave the loop as soon as the session asks to stop.
            if sess.should_stop():
                break
            start = time.time()
            train_data, train_label = mnist.train.next_batch(batch_size)
            _, loss, train_acc, step_val = sess.run([train_op, loss_op, accuracy_op, global_step], feed_dict={
                dnn.x: train_data,
                dnn.y: train_label,
                dnn.is_training: True
            })
            end = time.time()

            if i % 100 == 0:
                print("{}/{} epoch, train_loss = {:.3f}, accuracy = {:.4f}, time/epoch = {:.3f}, global_step = {}".format(
                    i, num_epochs, loss, train_acc, end - start, step_val))
        end_all = time.time()

        # test accuracy (skipped if the session has already been asked to stop)
        if not sess.should_stop():
            test_acc = sess.run(accuracy_op, feed_dict={
                dnn.x: test_data,
                dnn.y: test_label,
                dnn.is_training: False
            })
            print('test accuracy: ', test_acc)
    print('elapsed: ', end_all - start_all)

In [11]:
from multiprocessing import Process
# One daemon process per cluster task: the two parameter servers...
ps1_proc, ps2_proc = [
    Process(target=parameter_server, args=(task, ), daemon=True)
    for task in (0, 1)
]

# ...and the two workers, which share a log root stamped to the minute.
logs_path = 'logs/test/' + time.strftime("%Y-%m-%d-%H-%M") + '/'
w1_proc, w2_proc = [
    Process(target=worker, args=(task, logs_path), daemon=True)
    for task in (0, 1)
]

In [12]:
# Launch both parameter-server processes.
ps1_proc.start()
ps2_proc.start()

In [13]:
# Launch worker task 0, which acts as the chief (is_chief is task_index == 0).
w1_proc.start()

INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from logs/train_logs/model.ckpt-0
INFO:tensorflow:Saving checkpoints for 0 into logs/train_logs/model.ckpt.
0/10000 epoch, train_loss = 13.840, accuracy = 0.0469, time/epoch = 0.744, global_step = 0
INFO:tensorflow:global_step/sec: 56.3579
100/10000 epoch, train_loss = 0.377, accuracy = 0.9062, time/epoch = 0.016, global_step = 105
INFO:tensorflow:global_step/sec: 60.3791
200/10000 epoch, train_loss = 0.171, accuracy = 0.9531, time/epoch = 0.017, global_step = 205
INFO:tensorflow:global_step/sec: 59.5495
300/10000 epoch, train_loss = 0.227, accuracy = 0.9375, time/epoch = 0.016, global_step = 305
INFO:tensorflow:global_step/sec: 58.9251
400/10000 epoch, train_loss = 0.091, accuracy = 0.9531, time/epoch = 0.017, global_step = 405
INFO:tensorflow:global_step/sec: 59.5052
500/10000 epoch, train_loss = 0.091, accuracy = 0.9844, time/

INFO:tensorflow:global_step/sec: 59.3378
5900/10000 epoch, train_loss = 0.064, accuracy = 0.9531, time/epoch = 0.016, global_step = 5905
INFO:tensorflow:global_step/sec: 59.2711
6000/10000 epoch, train_loss = 0.002, accuracy = 1.0000, time/epoch = 0.016, global_step = 6005
INFO:tensorflow:global_step/sec: 55.2099
6100/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.018, global_step = 6105
INFO:tensorflow:global_step/sec: 58.9013
6200/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.016, global_step = 6205
INFO:tensorflow:global_step/sec: 58.8364
6300/10000 epoch, train_loss = 0.000, accuracy = 1.0000, time/epoch = 0.017, global_step = 6305
INFO:tensorflow:global_step/sec: 58.8459
6400/10000 epoch, train_loss = 0.002, accuracy = 1.0000, time/epoch = 0.016, global_step = 6405
INFO:tensorflow:global_step/sec: 58.6424
6500/10000 epoch, train_loss = 0.003, accuracy = 1.0000, time/epoch = 0.018, global_step = 6505
INFO:tensorflow:global_step/sec: 59.0613


In [14]:
# Launch worker task 1.
w2_proc.start()

INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
0/10000 epoch, train_loss = 13.840, accuracy = 0.0469, time/epoch = 0.234, global_step = 0
100/10000 epoch, train_loss = 0.769, accuracy = 0.7812, time/epoch = 0.016, global_step = 93
200/10000 epoch, train_loss = 0.318, accuracy = 0.9219, time/epoch = 0.016, global_step = 193
300/10000 epoch, train_loss = 0.444, accuracy = 0.9062, time/epoch = 0.016, global_step = 293
400/10000 epoch, train_loss = 0.130, accuracy = 0.9688, time/epoch = 0.017, global_step = 393
500/10000 epoch, train_loss = 0.193, accuracy = 0.9531, time/epoch = 0.016, global_step = 493
600/10000 epoch, train_loss = 0.088, accuracy = 0.9688, time/epoch = 0.020, global_step = 593
700/10000 epoch, train_loss = 0.141, accuracy = 0.9375, time/epoch = 0.017, global_step = 693
800/10000 epoch, train_loss = 0.363, accuracy = 0.9062, time/epoch = 0.016, global_step = 793
900/10000 epoch, train_loss = 0.232, accuracy = 0.9375, time/epoch = 0.017, glob

8500/10000 epoch, train_loss = 0.000, accuracy = 1.0000, time/epoch = 0.017, global_step = 8493
8600/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.018, global_step = 8593
8700/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.016, global_step = 8693
8800/10000 epoch, train_loss = 0.109, accuracy = 0.9844, time/epoch = 0.018, global_step = 8793
8900/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.017, global_step = 8893
9000/10000 epoch, train_loss = 0.008, accuracy = 1.0000, time/epoch = 0.016, global_step = 8993
9100/10000 epoch, train_loss = 0.111, accuracy = 0.9844, time/epoch = 0.016, global_step = 9093
9200/10000 epoch, train_loss = 0.019, accuracy = 0.9844, time/epoch = 0.016, global_step = 9193
9300/10000 epoch, train_loss = 0.001, accuracy = 1.0000, time/epoch = 0.018, global_step = 9293
9400/10000 epoch, train_loss = 0.061, accuracy = 0.9844, time/epoch = 0.016, global_step = 9393
9500/10000 epoch, train_loss = 0.008, ac