In [24]:
import docker 
import pandas as pd
import tensorflow as tf
from gym_torcs_docker import TorcsDockerEnv, obs_to_state
from ddpg import DDPG
from a3c import A3C

docker_client = docker.from_env()

In [40]:
def testModelOnTrack(
        docker_client, sess, model, trackname, max_steps=1000,
        docker_port=3101):
    """Drives the model around the specified track for 1000 time steps"""

    env = TorcsDockerEnv(
        docker_client, 'test', port=docker_port)
    observation = env.reset(relaunch=True)
    state_t = obs_to_state(observation)

    results = {}

    for i in range(max_steps):
        action_t = model.predict(sess, state_t.reshape(1, state_t.shape[0]))
        observation, reward_t, done, _ = env.step(action_t[0])
        state_t = obs_to_state(observation)
        results[i] = reward_t
        if done:
            break

    env.end()

    return results

def testDDPG(docker_client, modeldir, test_tracks):
    """Loads the weights from the model dir and drives the agent around the provided test tracks"""
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    
    tf.reset_default_graph()

    model = DDPG(docker_client)

    saver = tf.train.Saver(max_to_keep=5)
    rewards = {}
    with tf.Session(config=config) as sess:
        ckpt = tf.train.get_checkpoint_state(modeldir)
        saver.restore(sess, ckpt.model_checkpoint_path)
        
        for track in test_tracks:
            reward = testModelOnTrack(
                docker_client, sess, model.actor, track, max_steps=1000,
                docker_port=3121)
            rewards[track] = reward
    return rewards

In [41]:
test_tracks = ['g-track-3', 'e-track-6', 'alpine-2']

path_ddpg_ref = '../models/ddpg_ref'
path_ddpg_1 = '../models/ddpg_1'
path_ddpg_2 = '../models/ddpg_2'

In [None]:
ddpg_ref = testDDPG(docker_client, path_ddpg_ref, test_tracks)
ddpg_1 = testDDPG(docker_client, path_ddpg_1, test_tracks)
ddpg_2 = testDDPG(docker_client, path_ddpg_2, test_tracks)

INFO:tensorflow:Restoring parameters from ../models/ddpg_ref/model-1050.cptk


[2017-04-25 01:09:43,501] Restoring parameters from ../models/ddpg_ref/model-1050.cptk


Waiting for server on 3121............
Waiting for server on 3121............
Client connected on 3121..............
-0.0518008636786
0.462160325397
1.11785491013
1.66566063166
2.00558318522
2.46476364583
2.68544453634
3.00793015649
3.25467007017
3.48848456451
3.62390686869
3.55512679125
3.67335994301
3.54946736122
3.51483001354
3.28886603451
3.12534135927
2.88665936327
2.55117902034
2.23389734054
1.90482252849
1.55112077925
1.19154577747
0.827728243918
0.43350507896
0.0513089230392
-0.327558419339
-0.70684920611
-1.0613514367
-1.41003149142
-1.72248503749
-2.05891296285
-2.39063411768
-2.68772014806
-2.92322908522
-3.13159963831
-3.38116760762
-3.48844272593
-3.73515456106
-3.83965057713
-3.87921706407
-3.98104878571
-4.04814462109
-4.10673381768
-4.16902851932
-4.25179890045
-4.22574613674
-4.21480686854
Waiting for server on 3121............
Waiting for server on 3121............
Client connected on 3121..............
-0.0518008636786
0.462160325397
1.11785491013
1.66566063166
2.005

In [39]:
ddpg_1

{'alpine-2': -1, 'e-track-6': -1, 'g-track-3': -1}

In [None]:
# %load networks.py
import tensorflow as tf

class Network(object):

    HIDDEN1_UNITS = 300
    HIDDEN2_UNITS = 600

    def __init__(self, state_size, action_size, trainer):
        self.state_size = state_size
        self.action_size = action_size
        self.trainer = trainer
        self.is_training = False


class ActorCriticBaseNetwork(Network):

    def __init__(self, state_size, action_size, trainer, tau):
        super(ActorCriticBaseNetwork, self).__init__(
            state_size, action_size, trainer)

        self.tau = tau
        self.weights = None
        self.target_weights = None
        self.cp_trgt_wgt_frm_wgt = None

    def _create_target_train(self):
        self.cp_trgt_wgt_frm_wgt = tf.group(
            *[v1.assign(self.tau*v2 + (1-v1))
              for v1, v2 in zip(self.target_weights, self.weights)])

    def target_train(self, sess):
        self.is_training = True
        sess.run(self.cp_trgt_wgt_frm_wgt)


class CriticNetwork(ActorCriticBaseNetwork):

    def __init__(self, state_size, action_size, trainer, tau):

        super(CriticNetwork, self).__init__(
            state_size, action_size, trainer, tau)

        self.net_scope = 'critic_network'
        self.target_net_scope = 'target_critic_network'
        # Now create the model
        self.critic, self.weights, self.state, self.action = \
            self._create_network(self.net_scope)
        self.target_critic, self.target_weights, self.target_state, \
            self.target_action = self._create_network(self.target_net_scope)
        self._create_target_train()
        # GRADIENTS for policy update
        self.action_grads = tf.gradients(self.critic, self.action)
        self.optimize, self.loss, self.expected_critic = self._create_train()

    def _create_network(self, scope):
        with tf.variable_scope(scope):

            state = tf.placeholder(
                shape=[None, self.state_size], dtype=tf.float32, name='state')
            action = tf.placeholder(
                shape=[None, self.action_size],
                dtype=tf.float32, name='action')

            s_layer1 = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=state, activation=tf.nn.relu,
                    units=CriticNetwork.HIDDEN1_UNITS),
                training=self.is_training, name='s_layer_1')

            s_layer2 = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=s_layer1,
                    units=CriticNetwork.HIDDEN2_UNITS),
                training=self.is_training, name='s_layer_2')

            a_layer = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=action,
                    units=CriticNetwork.HIDDEN2_UNITS),
                training=self.is_training, name='a_layer')

            c_layer = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=(s_layer2 + a_layer),
                    activation=tf.nn.relu,
                    units=CriticNetwork.HIDDEN2_UNITS),
                training=self.is_training, name='c_layer')

            critic = tf.layers.batch_normalization(
                tf.layers.dense(inputs=c_layer,
                                units=self.action_size),
                training=self.is_training, name='critic')

            weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=scope)

        return critic, weights, state, action

    def _create_train(self):
        expected_critic = tf.placeholder(shape=[None, self.action_size],
                                         dtype=tf.float32,
                                         name='expected_critic')

        loss = tf.reduce_mean(tf.square(expected_critic-self.critic),
                              name="loss")

        optimize = self.trainer.minimize(loss, name='optimize')

        return optimize, loss, expected_critic

    def target_predict(self, sess, states, actions):
        self.is_training = False
        return sess.run(
            self.target_critic,
            feed_dict={self.target_state: states,
                       self.target_action: actions})

    def gradients(self, sess, states, actions):
        self.is_training = False
        return sess.run(
            self.action_grads,
            feed_dict={self.state: states, self.action: actions})[0]

    def train(self, sess, expected_critic, states, actions):
        self.is_training = True
        loss, _ = sess.run(
            [self.loss, self.optimize],
            feed_dict={
                self.expected_critic: expected_critic, self.state: states,
                self.action: actions})

        return loss


class ActorNetwork(ActorCriticBaseNetwork):

    def __init__(self, state_size, action_size, trainer, tau):

        super(ActorNetwork, self).__init__(
            state_size, action_size, trainer, tau)

        self.net_scope = 'actor_network'
        self.target_net_scope = 'target_actor_network'
        # Now create the model
        self.action, self.weights, self.state = \
            self._create_network(self.net_scope)
        self.target_action, self.target_weights, self.target_state = \
            self._create_network(self.target_net_scope)
        self._create_target_train()
        self.optimize, self.action_gradient = self._create_train()

    def _create_network(self, scope):
        with tf.variable_scope(scope):
            state = tf.placeholder(tf.float32, [None, self.state_size],
                                   name='state')

            hidden0 = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=state, activation=tf.nn.relu,
                    units=ActorNetwork.HIDDEN1_UNITS),
                training=self.is_training, name='hidden_0')

            hidden1 = tf.layers.batch_normalization(
                tf.layers.dense(inputs=hidden0, activation=tf.nn.relu,
                                units=ActorNetwork.HIDDEN2_UNITS),
                training=self.is_training, name='hidden_1')

            steering = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=hidden1, units=1, activation=tf.nn.tanh),
                training=self.is_training, name='steering')

            acceleration = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=hidden1, units=1, activation=tf.nn.tanh),
                training=self.is_training, name='acceleration')

            action = tf.concat(
                [steering, acceleration], name='action', axis=1)

            weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

        return action, weights, state

    def _create_train(self):
        action_gradient = tf.placeholder(tf.float32, [None, self.action_size])
        params_grad = tf.gradients(self.action, self.weights,
                                   tf.negative(action_gradient))
        grads = zip(params_grad, self.weights)
        optimize = self.trainer.apply_gradients(grads)
        return optimize, action_gradient

    def predict(self, sess, states):
        self.is_training = False
        return sess.run(self.action, feed_dict={self.state: states})

    def target_predict(self, sess, states):
        self.is_training = False
        return sess.run(
            self.target_action,
            feed_dict={self.target_state: states})

    def train(self, sess, states, action_grads):
        self.training = True
        sess.run(
            self.optimize,
            feed_dict={
                self.state: states, self.action_gradient: action_grads})


class A3CNetwork(Network):

    def __init__(self, state_size, action_size, trainer, scope):
        super(A3CNetwork, self).__init__(
            state_size, action_size, trainer)
        self.scope = scope
        self.is_training = False
        self._create_network()
        if self.scope != 'global':
            self._create_train()

    @staticmethod
    def update_target_graph(from_scope, to_scope):
        from_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
        to_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

        op_holder = []
        for from_var, to_var in zip(from_vars, to_vars):
            op_holder.append(to_var.assign(from_var))

        return op_holder

    def _create_network(self):
        with tf.variable_scope(self.scope):
            # Input and visual encoding layers
            self.inputs = tf.placeholder(
                shape=[None, self.state_size], dtype=tf.float32)

            s_layer1 = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=self.inputs, activation=tf.nn.relu,
                    units=A3CNetwork.HIDDEN1_UNITS),
                training=self.is_training, name='s_layer_1')

            s_layer2 = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=s_layer1, activation=tf.nn.relu,
                    units=A3CNetwork.HIDDEN2_UNITS),
                training=self.is_training, name='s_layer_2')

            # Output layers for policy and value estimations
            self.policy_mu = tf.layers.batch_normalization(
                tf.layers.dense(
                    inputs=s_layer2, units=2, activation=tf.nn.tanh),
                training=self.is_training, name='policy_mu')

            self.policy_sd = tf.clip_by_value(
                tf.layers.batch_normalization(
                    tf.layers.dense(
                        inputs=s_layer2, units=2, activation=tf.nn.softplus),
                    training=self.is_training),
                [0.05]*self.action_size, [0.25]*self.action_size,
                name='policy_sd')

            self.value = tf.layers.batch_normalization(
                tf.layers.dense(inputs=s_layer2, units=1),
                training=self.is_training, name='value')

            self.normal_dist = tf.contrib.distributions.Normal(
                self.policy_mu, self.policy_sd, name='normal_dist')

            self.action = tf.clip_by_value(
                self.normal_dist.sample(1),
                [-1.0]*self.action_size, [1.0]*self.action_size,
                name='action')

    def _create_train(self):
        with tf.variable_scope(self.scope):
            self.actions = tf.placeholder(
                shape=[None, self.action_size], dtype=tf.float32,
                name='actions')
            self.target_v = tf.placeholder(
                shape=[None], dtype=tf.float32, name='target_v')
            self.advantages = tf.placeholder(
                shape=[None], dtype=tf.float32, name='advantages')

            log_prob = self.normal_dist.log_prob(self.actions)
            exp_v = tf.transpose(
                tf.multiply(tf.transpose(log_prob), self.advantages))
            entropy = self.normal_dist.entropy()
            exp_v = 0.01 * entropy + exp_v
            self.policy_loss = tf.reduce_sum(-exp_v)

            self.value_loss = 0.5 * tf.reduce_sum(
                tf.square(self.target_v - tf.reshape(self.value, [-1])))

            self.loss = 0.5*self.value_loss + self.policy_loss

            local_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

            self.gradients = tf.gradients(self.loss, local_vars)
            self.var_norms = tf.global_norm(local_vars)

            grads, self.grad_norms = tf.clip_by_global_norm(
                self.gradients, 40.0)

            global_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
            self.apply_grads = self.trainer.apply_gradients(
                zip(grads, global_vars))

    def predict(self, sess, state):
        action = sess.run(
            self.action,
            feed_dict={self.inputs: [state]})
        return action[0]


In [None]:
# %load ddpg.py
import os
import random
import numpy as np
import tensorflow as tf

from collections import deque
from networks import ActorNetwork, CriticNetwork
from gym_torcs_docker import TorcsDockerEnv, obs_to_state
from numpy.random import seed, randn


class ReplayBuffer(object):

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()

    def getBatch(self, batch_size):
        # Randomly sample batch_size examples
        if self.num_experiences < batch_size:
            return random.sample(self.buffer, self.num_experiences)
        else:
            return random.sample(self.buffer, batch_size)

    def size(self):
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        experience = (state, action, reward, new_state, done)
        if self.num_experiences < self.buffer_size:
            self.buffer.append(experience)
            self.num_experiences += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def count(self):
        # if buffer is full, return buffer size
        # otherwise, return experience counter
        return self.num_experiences

    def erase(self):
        self.buffer = deque()
        self.num_experiences = 0


class DDPG(object):

    def __init__(
            self, docker_client, name='worker', port=3101,
            model_path='../models/ddpg', log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 2

        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99
        self.tau = 0.001  # Target Network HyperParameters
        self.lra = 0.0001  # Learning rate for Actor
        self.lrc = 0.001  # Lerning rate for Critic
        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
                os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(
            self.state_size, self.action_size,
            tf.train.AdamOptimizer(self.lra), self.tau)

        self.critic = CriticNetwork(
            self.state_size, self.action_size,
            tf.train.AdamOptimizer(self.lrc), self.tau)

        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()
        self._create_summary()

    def _create_summary(self):
        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar(
                'loss', self.critic.loss, collections=['loss'])

            self.reward_ph = tf.placeholder(
                shape=[None, ], name='reward', dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_y_t',
                dtype=tf.float32)

            tf.summary.scalar(
                'reward', tf.reduce_mean(
                    self.reward_ph), collections=['reward'])
            tf.summary.scalar(
                'target_q_values', tf.reduce_mean(self.target_q_values_ph),
                collections=['reward'])
            tf.summary.scalar(
                'y_t', tf.reduce_mean(self.y_t_ph), collections=['reward'])

            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):

        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = (max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30))
        noise[1] = (max(epsilon, 0) * ou_func(a[1], 0.2, 1.00, 0.10))

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]

        return a_new

    def train(self, track_name='', check_stuck=True):

        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(
                self.docker_client, self.name, self.port, training=True)
        else:
            env = TorcsDockerEnv(
                self.docker_client, self.name, self.port,
                track_name=track_name)

        with tf.Session(config=self.config) as sess:
            sess.run(tf.global_variables_initializer())

            for i in range(self.episode_count):

                recent_rewards = np.ones(1000) * 1e9
                print("Episode : " + str(i) + " Replay Buffer "
                      + str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)
                total_reward = 0

                for j in range(self.max_steps):
                    loss = 0
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))

                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    recent_rewards[j % 1000] = reward_t

                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i/self.episode_count < 0.5):
                        break

                    self.buff.add(
                        state_t, action_t[0], reward_t, state_t1, done)
                    batch = self.buff.getBatch(self.batch_size)
                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (
                                rewards[k] + self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)
                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(
                        sess, states, actions_for_grad)
                    self.actor.train(sess, states, grads)
                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1

                    if j % 50:

                        loss_summary, reward_summary = sess.run(
                            [self.loss_summary_op,
                             self.reward_summary_op],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t})

                        self.summary_writer.add_summary(
                            loss_summary, all_steps)
                        self.summary_writer.add_summary(
                            reward_summary, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1
                    print(
                        "Episode", i, "Step", all_steps, "Action",
                        action_t, "Reward", reward_t, "Loss", loss)
                    if done:
                        break

                print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                      str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess, self.model_path+'/model-{:d}.cptk'.format(i))
        env.end()


if __name__ == "__main__":
    import docker

    docker_client = docker.from_env()

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_gtrack1', '../logs/ddpg_gtrack1')
    ddpg.train('g-track-1')

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_traintracks',
        '../logs/ddpg_traintracks')
    ddpg.train()

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_gtrack1_nostuck',
        '../logs/ddpg_gtrack1_nostuck')
    ddpg.train('g-track-1', False)

    ddpg.train()


In [5]:
# 
import os
import threading
import numpy as np
import tensorflow as tf
import scipy.signal

from time import sleep
from gym_torcs_docker import TorcsDockerEnv, obs_to_state
from networks import A3CNetwork


class Worker(object):

    def __init__(self, s_size, action_size, trainer, number, global_episodes,
                 docker_client, docker_port, modeldir, logdir):

        self.s_size = s_size
        self.action_size = action_size
        self.number = number
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.docker_client = docker_client
        self.modeldir = modeldir
        self.logdir = logdir

        self.name = 'worker_'+str(self.number)
        self.docker_port = docker_port

        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter(
            self.logdir + '/train_' + str(self.number))

        self.local_AC = A3CNetwork(
            self.s_size, self.action_size, self.trainer, self.name)
        self.update_local_ops = A3CNetwork.update_target_graph(
            'global', self.name)

    def train(self, rollout, sess, gamma, bootstrap_value):
        def discount(x, gamma):
            return scipy.signal.lfilter(
                [1], [1, -gamma], x[::-1], axis=0)[::-1]

        self.local_AC.is_training = True
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = np.stack(rollout[:, 1], 0)[0][0]
        rewards = rollout[:, 2]
        values = rollout[:, 5]
        self.rewards_plus = np.asarray(
            rewards.tolist() + [bootstrap_value])

        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = (
            rewards + gamma * self.value_plus[1:] - self.value_plus[:-1])
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.actions: actions,
                     self.local_AC.inputs: np.vstack(observations),
                     self.local_AC.advantages: advantages}
        value_loss, policy_loss, gradient_norm, value_norm, _ = sess.run(
            [self.local_AC.value_loss, self.local_AC.policy_loss,
             self.local_AC.grad_norms, self.local_AC.var_norms,
             self.local_AC.apply_grads],
            feed_dict=feed_dict)
        self.local_AC.is_training = False

        return (value_loss/len(rollout), policy_loss/len(rollout),
                gradient_norm, value_norm)

    def work(self, max_episode_length, gamma, sess, coord, saver):
        self.local_AC.is_training = False
        env = TorcsDockerEnv(
            self.docker_client, self.name, self.docker_port, training=True)

        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting {}".format(self.name))

        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0

                # reset docker every third episode
                local_episodes = 0
                if np.mod(local_episodes, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()
                state_t = obs_to_state(observation)
                done = False

                epsilon = 1

                while not done:

                    action_t, value_t = sess.run(
                        [self.local_AC.action, self.local_AC.value],
                        feed_dict={self.local_AC.inputs: [state_t]})

                    epsilon -= 1.0 / max_episode_length

                    observation, reward_t, done, _ = env.step(action_t[0][0])

                    if not done:
                        state_t1 = obs_to_state(observation)
                        episode_frames.append(state_t1)
                    else:
                        state_t1 = state_t

                    episode_buffer.append(
                        [state_t, action_t, reward_t, state_t1, done,
                         value_t[0, 0]])
                    episode_values.append(value_t[0, 0])

                    episode_reward += reward_t

                    state_t = state_t1
                    total_steps += 1
                    episode_step_count += 1

                    if total_steps % 20:
                        print(
                            "Worker", self.name,
                            "Episode", episode_count, "Step",
                            episode_step_count, "Total_Steps",
                            total_steps, "Action", action_t[0][0],
                            "Reward", reward_t)
                        summary = tf.Summary()
                        summary.value.add(
                            tag='summary/reward_1',
                            simple_value=float(reward_t))
                        self.summary_writer.add_summary(
                            summary, total_steps)

                    self.summary_writer.flush()

                    if (len(episode_buffer) == 30 and not done
                            and episode_step_count != max_episode_length-1):

                        value_t1 = sess.run(
                            self.local_AC.value,
                            feed_dict={self.local_AC.inputs: [state_t]})[0, 0]

                        (value_loss, policy_loss, gradient_norm,
                            variable_norm) = self.train(
                                episode_buffer, sess, gamma, value_t1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if (done or episode_step_count != max_episode_length):
                        break

                local_episodes += 1
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(
                    np.mean(episode_values))

                if len(episode_buffer) != 0:
                    (value_loss, policy_loss, gradient_norm,
                     variable_norm) = self.train(
                        episode_buffer, sess, gamma, 0.0)

                if episode_count != 0:
                    if (self.name == 'worker_0'):
                        saver.save(
                            sess,
                            os.path.join(self.modeldir,
                                         'model-{:d}.cptk'.format(
                                             episode_count)))

                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])

                    print(
                        "Worker", self.name, "Episode", episode_count,
                        "Reward", mean_reward, "value_Loss", value_loss,
                        "policy_loss", policy_loss)

                    summary = tf.Summary()
                    summary.value.add(
                        tag='Perf/Reward',
                        simple_value=float(mean_reward))
                    summary.value.add(
                        tag='Perf/Length',
                        simple_value=float(mean_length))
                    summary.value.add(
                        tag='Perf/Value',
                        simple_value=float(mean_value))
                    summary.value.add(
                        tag='Losses/Value Loss',
                        simple_value=float(value_loss))
                    summary.value.add(
                        tag='Losses/Policy Loss',
                        simple_value=float(policy_loss))
                    summary.value.add(
                        tag='Losses/Grad Norm',
                        simple_value=float(gradient_norm))
                    summary.value.add(
                        tag='Losses/Var Norm',
                        simple_value=float(variable_norm))

                    self.summary_writer.add_summary(
                        summary, episode_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
        env.end()


class A3C(object):

    def __init__(
            self, docker_client, docker_start_port=3101,
            modeldir='../models/a3c', logdir='../logs/a3c'):

        self.docker_client = docker_client

        self.docker_start_port = docker_start_port

        self.max_episode_length = 4000
        self.gamma = .99
        self.logdir = logdir
        self.modeldir = modeldir
        self.state_size = 29
        self.action_size = 2

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True

        tf.reset_default_graph()

        self.global_episodes = tf.Variable(
                0, dtype=tf.int32, name='global_episodes', trainable=False)

        if not os.path.exists(self.modeldir):
                os.makedirs(self.modeldir)

    def train(self, num_workers, load_model=False):
        with tf.device("/cpu:0"):

            trainer = tf.train.AdamOptimizer(learning_rate=1e-4)
            master_network = A3CNetwork(
                self.state_size, self.action_size, None, 'global')

            workers = []
            for i in range(num_workers):
                workers.append(
                    Worker(
                        self.state_size, self.action_size, trainer, i,
                        self.global_episodes, self.docker_client,
                        self.docker_start_port + i,
                        self.modeldir, self.logdir))

            saver = tf.train.Saver(max_to_keep=5)

        with tf.Session(config=self.config) as sess:

            coord = tf.train.Coordinator()

            if load_model:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.model_path)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(tf.global_variables_initializer())

            worker_threads = []
            for worker in workers:
                t = threading.Thread(
                    target=(
                        lambda: worker.work(
                            self.max_episode_length, self.gamma, sess, coord,
                            saver)))
                t.start()
                sleep(0.5)
                worker_threads.append(t)
            coord.join(worker_threads)


if __name__ == "__main__":
    import docker

    docker_client = docker.from_env()

    a3c = A3C(docker_client)
    a3c.train(1)


Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/docker/api/client.py", line 214, in _raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.5/dist-packages/requests/models.py", line 909, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 404 Client Error: Not Found for url: http+docker://localunixsocket/v1.26/containers/worker_0/json

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.5/threading.py", line 862, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-5-07433b9e6a66>", line 286, in <lambda>
    saver)))
  File "<ipython-input-5-07433b9e6a66>", line 77, in work
    self.docker_client, self.name, self.docker_port, training=True)
  File "/root/rl_torcs/sr

In [None]:
# %load ddpg.py
import os
import random
import numpy as np
import tensorflow as tf

from collections import deque
from networks import ActorNetwork, CriticNetwork
from gym_torcs_docker import TorcsDockerEnv, obs_to_state
from numpy.random import seed, randn


class ReplayBuffer(object):

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.num_experiences = 0
        self.buffer = deque()

    def getBatch(self, batch_size):
        # Randomly sample batch_size examples
        if self.num_experiences < batch_size:
            return random.sample(self.buffer, self.num_experiences)
        else:
            return random.sample(self.buffer, batch_size)

    def size(self):
        return self.buffer_size

    def add(self, state, action, reward, new_state, done):
        experience = (state, action, reward, new_state, done)
        if self.num_experiences < self.buffer_size:
            self.buffer.append(experience)
            self.num_experiences += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def count(self):
        # if buffer is full, return buffer size
        # otherwise, return experience counter
        return self.num_experiences

    def erase(self):
        self.buffer = deque()
        self.num_experiences = 0


class DDPG(object):

    def __init__(
            self, docker_client, name='worker', port=3101,
            model_path='../models/ddpg', log_path='../logs/ddpg'):

        self.state_size = 29
        self.action_size = 2

        self.docker_client = docker_client

        self.buffer_size = 100000
        self.batch_size = 32
        self.gamma = 0.99
        self.tau = 0.001  # Target Network HyperParameters
        self.lra = 0.0001  # Learning rate for Actor
        self.lrc = 0.001  # Lerning rate for Critic
        seed(6486)

        self.explore = 100000.
        self.episode_count = 2000
        self.max_steps = 10000
        self.epsilon = 1

        self.model_path = model_path
        self.port = port
        self.name = name

        if not os.path.exists(self.model_path):
                os.makedirs(self.model_path)

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        tf.reset_default_graph()

        self.summary_writer = tf.summary.FileWriter(log_path)

        self.actor = ActorNetwork(
            self.state_size, self.action_size,
            tf.train.AdamOptimizer(self.lra), self.tau)

        self.critic = CriticNetwork(
            self.state_size, self.action_size,
            tf.train.AdamOptimizer(self.lrc), self.tau)

        self.buff = ReplayBuffer(self.buffer_size)
        self.saver = tf.train.Saver()
        self._create_summary()

    def _create_summary(self):
        with tf.name_scope('summary'):
            self.loss_summary_op = tf.summary.scalar(
                'loss', self.critic.loss, collections=['loss'])

            self.reward_ph = tf.placeholder(
                shape=[None, ], name='reward', dtype=tf.float32)
            self.target_q_values_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_q_values',
                dtype=tf.float32)
            self.y_t_ph = tf.placeholder(
                shape=[None, self.action_size], name='target_y_t',
                dtype=tf.float32)

            tf.summary.scalar(
                'reward', tf.reduce_mean(
                    self.reward_ph), collections=['reward'])
            tf.summary.scalar(
                'target_q_values', tf.reduce_mean(self.target_q_values_ph),
                collections=['reward'])
            tf.summary.scalar(
                'y_t', tf.reduce_mean(self.y_t_ph), collections=['reward'])

            self.reward_summary_op = tf.summary.merge_all('reward')

    @staticmethod
    def addOUNoise(a, epsilon):

        def ou_func(x, mu, theta, sigma):
            return theta * (mu - x) + sigma * randn(1)

        a_new = np.zeros(np.shape(a))
        noise = np.zeros(np.shape(a))

        noise[0] = (max(epsilon, 0) * ou_func(a[0], 0.0, 0.60, 0.30))
        noise[1] = (max(epsilon, 0) * ou_func(a[1], 0.2, 1.00, 0.10))

        a_new[0] = a[0] + noise[0]
        a_new[1] = a[1] + noise[1]

        return a_new

    def train(self, track_name='', check_stuck=True):

        all_steps = 0

        if track_name == '':
            env = TorcsDockerEnv(
                self.docker_client, self.name, self.port, training=True)
        else:
            env = TorcsDockerEnv(
                self.docker_client, self.name, self.port,
                track_name=track_name)

        with tf.Session(config=self.config) as sess:
            sess.run(tf.global_variables_initializer())

            for i in range(self.episode_count):

                recent_rewards = np.ones(1000) * 1e9
                print("Episode : " + str(i) + " Replay Buffer "
                      + str(self.buff.count()))

                if np.mod(i, 3) == 0:
                    observation = env.reset(relaunch=True)
                else:
                    observation = env.reset()

                state_t = obs_to_state(observation)
                total_reward = 0

                for j in range(self.max_steps):
                    loss = 0
                    self.epsilon -= 1.0 / self.explore

                    action_t = self.actor.predict(
                        sess, state_t.reshape(1, state_t.shape[0]))

                    observation, reward_t, done, _ = env.step(
                        DDPG.addOUNoise(action_t[0], self.epsilon))
                    state_t1 = obs_to_state(observation)

                    recent_rewards[j % 1000] = reward_t

                    if (check_stuck and np.median(recent_rewards) < 1.0
                            and i/self.episode_count < 0.5):
                        break

                    self.buff.add(
                        state_t, action_t[0], reward_t, state_t1, done)
                    batch = self.buff.getBatch(self.batch_size)
                    states = np.asarray([e[0] for e in batch])
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_states = np.asarray([e[3] for e in batch])
                    dones = np.asarray([e[4] for e in batch])
                    y_t = np.asarray([e[1] for e in batch])

                    target_q_values = self.critic.target_predict(
                        sess, new_states,
                        self.actor.target_predict(sess, new_states))

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = (
                                rewards[k] + self.gamma * target_q_values[k])

                    loss += self.critic.train(sess, y_t, states, actions)
                    actions_for_grad = self.actor.predict(sess, states)
                    grads = self.critic.gradients(
                        sess, states, actions_for_grad)
                    self.actor.train(sess, states, grads)
                    self.actor.target_train(sess)
                    self.critic.target_train(sess)

                    all_steps += 1

                    if j % 50:

                        loss_summary, reward_summary = sess.run(
                            [self.loss_summary_op,
                             self.reward_summary_op],
                            feed_dict={
                                self.critic.expected_critic: y_t,
                                self.critic.state: states,
                                self.critic.action: actions,
                                self.reward_ph: rewards,
                                self.target_q_values_ph: target_q_values,
                                self.y_t_ph: y_t})

                        self.summary_writer.add_summary(
                            loss_summary, all_steps)
                        self.summary_writer.add_summary(
                            reward_summary, all_steps)
                        self.summary_writer.flush()

                    total_reward += reward_t
                    state_t = state_t1
                    print(
                        "Episode", i, "Step", all_steps, "Action",
                        action_t, "Reward", reward_t, "Loss", loss)
                    if done:
                        break

                print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
                      str(total_reward))
                print("Total Step: " + str(all_steps))
                print("")

                if np.mod(i, 50) == 0:
                    self.saver.save(
                        sess, self.model_path+'/model-{:d}.cptk'.format(i))
        env.end()


if __name__ == "__main__":
    import docker

    docker_client = docker.from_env()

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_gtrack1', '../logs/ddpg_gtrack1')
    ddpg.train('g-track-1')

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_traintracks',
        '../logs/ddpg_traintracks')
    ddpg.train()

    ddpg = DDPG(
        docker_client, 3101, '../models/ddpg_gtrack1_nostuck',
        '../logs/ddpg_gtrack1_nostuck')
    ddpg.train('g-track-1', False)

    ddpg.train()
