In [1]:
import gym
import numpy as np
import keras.backend as K
import tensorflow as tf
import keras
import random
from keras.models import Model
from keras.initializers import RandomUniform, normal
from keras.layers import Dense, Conv1D, Conv2D, Flatten, Input, MaxPool2D, concatenate, merge
from keras.models import Sequential
from keras.optimizers import Adam
from collections import deque
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


Importing the racing game as environment for the Reinforcement Learning Model.

In [2]:
########### Game Setup ######### 
env = gym.make('CarRacing-v0')

In [3]:
class Actor:
    '''
    Set basic parameters for the model
    '''
    def __init__(self, state_size, action_size, learning_rate, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.tau = tau
        # Actual model
        self.model = self._build_model()
        # Target net
        self.target_model = self._build_model()
        self.adam_optimizer = self.optimizer()
    '''
    Build a convolutional neural net with 3 output neurons
    '''
    def _build_model(self):
        
        state = Input((self.state_size))
        # Convolutions
        x = Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size)(state)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(128, kernel_size=9, activation='relu') (x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        # 256 Features with around 1/16 of the initial picture size
        x = Conv2D(256, kernel_size=9, activation='relu') (x)
        x = MaxPool2D(pool_size=(2, 2)) (x)
        
        # Connect convolution and dense layers
        # 2D -> 1D (Linearization)
        x = Flatten()(x)
        
        # 3 hidden layers
        x = Dense(512, activation='relu')(x)
        # Creates 512 x 512 weights
        x = Dense(512, activation='relu')(x)
        
        # Defining the output for each dimension seperately
        # TODO: Using normal initialization to keep initial outputs near zero
        steering = Dense(1,activation='tanh',kernel_initializer=RandomUniform())(x)   
        acceleration = Dense(1,activation='sigmoid',kernel_initializer=RandomUniform())(x)   
        brake = Dense(1,activation='sigmoid',kernel_initializer=RandomUniform())(x) 
        out = concatenate([steering,acceleration,brake],axis=-1)
        
        model = Model(input=state,output=out)        
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model
    def predict(self, state):
        """ Prediction of actor network
        """
        action = self.model.predict(np.expand_dims(state, axis=0))
        print(action)
        return action
    def target_predict(self, inp):
        """ Prediction of target network
        """
        return self.target_model.predict(inp)
    def transfer_weights(self):
        """ Transfer model weights to target model with a factor of Tau
        """
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.tau * W[i] + (1 - self.tau)* target_W[i]
        self.target_model.set_weights(target_W)
    def train(self, states, actions, grads):
        """ Actor Training
        """
        self.adam_optimizer([states, grads])
    def optimizer(self):
        """ Actor Optimizer
        """
        action_gdts = K.placeholder(shape=(None, self.action_size))
        params_grad = tf.gradients(self.model.output, self.model.trainable_weights, -action_gdts)
        grads = zip(params_grad, self.model.trainable_weights)
        return K.function([self.model.input, action_gdts], [tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)][1:])
    def save(self, path):
        self.model.save_weights(path + '_actor.h5')

    def load_weights(self, path):
        self.model.load_weights(path)

In [4]:
class Critic:
    '''
    Set basic parameters for the model
    '''
    def __init__(self, state_size, action_size, learning_rate, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.tau = tau
        # Actual model
        self.model = self._build_model()
        self.model.compile(Adam(self.learning_rate), 'mse')
        # Target net for stability
        self.target_model = self._build_model()        
        self.target_model.compile(Adam(self.learning_rate), 'mse')
        # Function to compute Q-value gradients (Actor Optimization)
        self.action_grads = K.function([self.model.input[0], self.model.input[1]], K.gradients(self.model.output, [self.model.input[1]]))
    '''
    Build a convolutional neural net with 3 output neurons
    '''
    def _build_model(self):
        state = Input((self.state_size))
        x = Conv2D(64, kernel_size=9, activation='relu', input_shape=self.state_size)(state)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(128, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Conv2D(256, kernel_size=9, activation='relu')(x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        
        # Actions
        action_shape = (self.action_size,)
        action_layer = Input(shape=action_shape)
        
        # TODO: In the original paper the actions are merged in the second hidden layer
        x = concatenate([Flatten()(x), action_layer])
        x = Dense(512, activation='relu')(x)
        x = Dense(512, activation='relu')(x)
        out = Dense(1, activation='linear', kernel_initializer=RandomUniform())(x)
        return Model([state, action_layer], out)
    
    def gradients(self, states, actions):
        """ Compute Q-value gradients w.r.t. states and policy-actions
        """
        return self.action_grads([states, actions])
    
    def target_predict(self, inp):
        """ Prediction of target network
        """
        return self.target_model.predict(inp)
    # Why does the Critic have no predict function
    
    def train_on_batch(self, states, actions, critic_target):
        """ Train the critic network on batch of sampled experience
            using the keras function train_on_batch
        """
        return self.model.train_on_batch([states, actions], critic_target)
    
    def transfer_weights(self):
        """ Transfer model weights to target model with a factor of Tau
        """
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.tau * W[i] + (1 - self.tau)* target_W[i]
        self.target_model.set_weights(target_W)
    def save(self, path):
        self.model.save_weights(path + '_critic.h5')

    def load_weights(self, path):
        self.model.load_weights(path)

In [5]:
############## Utils
import numpy

""" Original Code by @jaara: https://github.com/jaara/AI-blog/blob/master/SumTree.py
"""

class SumTree:
    write = 0

    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = numpy.zeros( 2*capacity - 1 )
        self.data = numpy.zeros( capacity, dtype=object )

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])

In [6]:
############## Utils
from collections import deque

class MemoryBuffer(object):
    """ Memory Buffer Helper class for Experience Replay
    using a double-ended queue or a Sum Tree (for PER)
    """
    def __init__(self, buffer_size, with_per = False):
        """ Initialization
        """
        if(with_per):
            # Prioritized Experience Replay
            self.alpha = 0.5
            self.epsilon = 0.01
            self.buffer = SumTree(buffer_size)
        else:
            # Standard Buffer
            self.buffer = deque()
        self.count = 0
        self.with_per = with_per
        self.buffer_size = buffer_size

    def memorize(self, state, action, reward, done, new_state, error=None):
        """ Save an experience to memory, optionally with its TD-Error
        """

        experience = (state, action, reward, done, new_state)
        if(self.with_per):
            priority = self.priority(error[0])
            self.buffer.add(priority, experience)
            self.count += 1
        else:
            # Check if buffer is already full
            if self.count < self.buffer_size:
                self.buffer.append(experience)
                self.count += 1
            else:
                self.buffer.popleft()
                self.buffer.append(experience)

    def priority(self, error):
        """ Compute an experience priority, as per Schaul et al.
        """
        return (error + self.epsilon) ** self.alpha

    def size(self):
        """ Current Buffer Occupation
        """
        return self.count

    def sample_batch(self, batch_size):
        """ Sample a batch, optionally with (PER)
        """
        batch = []

        # Sample using prorities
        if(self.with_per):
            T = self.buffer.total() // batch_size
            for i in range(batch_size):
                a, b = T * i, T * (i + 1)
                s = random.uniform(a, b)
                idx, error, data = self.buffer.get(s)
                batch.append((*data, idx))
            idx = np.array([i[5] for i in batch])
        # Sample randomly from Buffer
        elif self.count < batch_size:
            idx = None
            batch = random.sample(self.buffer, self.count)
        else:
            idx = None
            batch = random.sample(self.buffer, batch_size)

        # Return a batch of experience
        s_batch = np.array([i[0] for i in batch])
        a_batch = np.array([i[1] for i in batch])
        r_batch = np.array([i[2] for i in batch])
        d_batch = np.array([i[3] for i in batch])
        new_s_batch = np.array([i[4] for i in batch])
        return s_batch, a_batch, r_batch, d_batch, new_s_batch, idx

    def update(self, idx, new_error):
        """ Update priority for idx (PER)
        """
        self.buffer.update(idx, self.priority(new_error))

    def clear(self):
        """ Clear buffer / Sum Tree
        """
        if(self.with_per): self.buffer = SumTree(buffer_size)
        else: self.buffer = deque()
        self.count = 0

In [7]:
class DDPG:
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, state_size, action_size, batch_no):
        """ Initialization
        """
        # Environment and A2C parameters
        self.state_size = (batch_no,) + state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.00005
        # Create actor and critic networks
        self.actor = Actor(state_size, self.action_size, 0.1 * self.learning_rate, 0.001)
        self.critic = Critic(state_size, self.action_size, self.learning_rate, 0.001)
        self.buffer = MemoryBuffer(20000)

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.action_size)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def performance(self, env, nb_episodes):
        for e in range(nb_episodes):
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.action_size, theta=.15, mu=0., sigma=.3)

            while not done:
                env.render()
                a = self.policy_action(old_state)
                a = np.clip(a+noise.sample(), -1, 1)
                new_state, r, done, _ = env.step(a)
                cumul_reward += r
                time += 1
            print("Score: " + str(cumul_reward))
        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.learning_rate)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)

In [8]:
############## Training #################

ddpg = DDPG(env.observation_space.shape, 3, 10)

'''
for episode in range(2):
    score=0
    done=False
    state = env.reset()
    while not done:
        action = ddpg.policy_action(state)
        observation, reward, done, info = env.step(action)
        ddpg.memorize(observation, action, reward, done)
        env.render()
        score+=reward
        state = observation
    ddpg.train()
    print("episode {} score {}".format(episode, score))
'''
ddpg.load_weights('_LR_5e-05_actor.h5', '_LR_5e-05_critic.h5')
ddpg.performance(env,nb_episodes=100)
env.close()

Instructions for updating:
Colocations handled automatically by placer.




Track generation: 1248..1564 -> 316-tiles track
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.05939214]]
[[ 0.9949165   0.9728229   0.059

KeyboardInterrupt: 