<h2 style='color:red'> Work in progress - Unfortunately this code is not running at the moment </h2>

### This notebook presents the early results working on the bipedal walker environment with a deterministic policy gradient approach: actor-critic model

The code to implement the algorithm is inspired from:  

https://github.com/artem-oppermann/Deep-Reinforcement-Learning

In [3]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import math
import gym
import numpy as np
import copy
import pandas as pd
import seaborn as sns
import time
import tensorflow as tf
import imageio
import IPython
import sys
from IPython.display import Image

### Create the classes for the actor-critic algorithm

In [4]:
class Environment:
    
    def __init__(self):
        """
        This class implements the OpenAI Gym Bipedal Walker v3
        """
        self.env = gym.make('BipedalWalker-v3')
        self.state_size = len(self.env.observation_space.sample())
        self.action_size = len(self.env.action_space.sample())
        self.images = []
        
    def get_env(self):
        '''Getter function for the OpenAI Gym instance '''
        return self.env
    
    def reset(self):
        self.images = []
        initial_state = self.env.reset()
        return initial_state
    
    def get_state_size(self):
        '''Getter function for the state-size in the environment '''
        return self.state_size

    def get_action_size(self):
        '''Getter function for the state-size in the environment '''
        return self.action_size
    
    def render(self):
        '''Adds the every image of the rendering to a list'''
        img = self.env.render(mode='rgb_array')
        self.images.append(img)

    def make_gif(self, filename="render"):
        imageio.mimsave(filename + '.gif', [np.array(img) for i, img in enumerate(self.images) if i%2 == 0], fps=29)
        return Image(open(filename + '.gif','rb').read())
    
    def run_random_episode(self):
        current_state = self.reset()
        final_state = False
        iters = 0
        
        while not final_state and iters < 1000:
            self.render()
            action = self.env.action_space.sample()
            current_state, reward, final_state, info = self.env.step(action)
            iters += 1
        env.close()

        return self.make_gif("test")

In [5]:
class Actor:
    
    def __init__(self, scope, target_network, env, flags):
        
        """
        This class implements the actor for the deterministic policy gradients model.
        The actor class determines the action that the agent must take in a environment.
    
        :param scope: within this scope the parameters will be defined
        :param target_network: instance of the Actor(target-network class)
        :param env: instance of the openAI environment
        :param FLAGS: TensorFlow flags which contain thevalues for hyperparameters
    
        """
        
        self.TF_FLAGS = flags
        self.env = env
        
        if scope=='target':
            
            with tf.variable_scope(scope):
                
                self.state = tf.placeholder(tf.float32, shape=(None, self.env.get_state_size()), name='state')
                self.action = self.action_estimator()
                self.param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

        elif scope=='actor':
            
            with tf.variable_scope(scope):
                
                self.state = tf.placeholder(tf.float32, shape=(None, self.env.get_state_size()), name='state')
                self.target_network = target_network

                self.q_network_gradient = tf.placeholder(tf.float32, shape=(None,self.env.get_action_size()), name='q_network_gradients')
                self.action=self.action_estimator()
                
                self.param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
                
                with tf.name_scope('policy_gradients'):
                    self.unnormalized_gradients = tf.gradients(self.action, self.param, -self.q_network_gradient)
                    self.policy_gradient=list(map(lambda x: tf.div(x, 1), self.unnormalized_gradients))
                    
                with tf.name_scope('train_policy_network'):
                    self.train_opt=tf.train.AdamOptimizer(self.TF_FLAGS.learning_rate_Actor).apply_gradients(zip(self.policy_gradient,self.param))
                
                with tf.name_scope('update_actor_target'):     
                    self.update_opt=[tp.assign(tf.multiply(self.TF_FLAGS.tau,lp)+tf.multiply(1-self.TF_FLAGS.tau,tp)) for tp, lp in zip(self.target_network.param,self.param)]
                          
                with tf.name_scope('initialize_actor_target_network'):
                     self.init_target_op=[tp.assign(lp) for tp, lp in zip(self.target_network.param,self.param)]
                    
    def action_estimator(self):
        '''Build the neural network that estimates the action for a given state '''
        
        h1 = tf.layers.dense(
                             self.state, 8, tf.nn.relu, use_bias=None,
                             kernel_initializer=tf.random_normal_initializer(),
                             bias_initializer=tf.zeros_initializer()
                             )

        actions = tf.layers.dense(h1, self.env.get_action_size(), None, kernel_initializer=tf.random_normal_initializer())  
        
        min_action = self.env.get_env().action_space.low
        max_action = self.env.get_env().action_space.high

        scalled_actions = min_action + tf.nn.sigmoid(actions)*(max_action - min_action)
        
        return scalled_actions
    
    def set_session(self, session):
        '''Set the session '''
        self.session=session
    
    def init_target_network(self):
        '''Initialize the parameters of the target-network '''
        self.session.run(self.init_target_op)

    def update_target_parameter(self):
        '''Update the parameters of the target-network '''
        self.session.run(self.update_opt)

    def get_action(self, state):
        '''Get an action for a certain state '''
        return self.session.run(self.action, feed_dict={self.state: state})

    def train(self, state, q_gradient):
        '''Train the actor network '''
        feed_dict={
                    self.q_network_gradient: q_gradient,
                    self.state: state }
        self.session.run(self.train_opt,feed_dict)

In [6]:
class Critic:
    
    def __init__(self, scope, target_network, env, flags):
        """
        This class implements the Critic for the stochastic policy gradient model.
        The critic provides a state-value for the current state environment where 
        the agent operates.
        
        :param scope: within this scope the parameters will be defined
        :param target_network: instance of the Actor(target-network class)
        :param env: instance of the openAI environment
        :param FLAGS: TensorFlow flags which contain thevalues for hyperparameters
        
        """
        
        self.TF_FLAGS = flags
        self.env = env
        
        if scope=='target':
            
            with tf.variable_scope(scope):
                
                self.gamma = 0.99
                self.state = tf.placeholder(tf.float32, shape=(None,self.env.state_size), name='state')
                self.actions = tf.placeholder(tf.float32, shape=(None,self.env.get_action_size()), name='actions')
                self.q = self.action_value_estimator(scope='q_target_network')
                self.param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/q_target_network')

        else:
            
            with tf.variable_scope(scope):
                
                self.target_network=target_network

                self.state = tf.placeholder(tf.float32, shape=(None,self.env.get_state_size()), name='state')
                self.target = tf.placeholder(tf.float32, shape=(None,self.env.get_action_size()), name='target')
                self.actions = tf.placeholder(tf.float32, shape=(None,self.env.get_action_size()), name='actions')
                
                self.q = self.action_value_estimator(scope='q_network')

                self.param = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/q_network')
                
                with tf.name_scope('q_network_loss'):
                    loss=tf.losses.mean_squared_error(self.target, self.q)
            
                with tf.name_scope('q_network_gradient'):
                    self.gradients=tf.gradients(self.q, self.actions)
            
                with tf.name_scope('train_q_network'):
                    self.train_opt=tf.train.AdamOptimizer(self.TF_FLAGS.learning_rate_Critic).minimize(loss)
            
                with tf.name_scope('update_q_target'):    
                    self.update_opt=[tp.assign(tf.multiply(self.TF_FLAGS.tau,lp)+tf.multiply(1-self.TF_FLAGS.tau,tp)) for tp, lp in zip(self.target_network.param,self.param)]
                    
                with tf.name_scope('initialize_q_target_network'):
                     self.init_target_op=[tp.assign(lp) for tp, lp in zip(self.target_network.param,self.param)]
 
    def action_value_estimator(self, scope):    
        '''Build the neural network that estimates the action-values '''
        
        state_action = tf.concat([self.state, self.actions], axis=1)
        
        with tf.variable_scope(scope):
             
            h1 = tf.layers.dense(state_action, 8, tf.nn.relu,use_bias=None,
                                 kernel_initializer=tf.random_normal_initializer(),
                                 bias_initializer=tf.zeros_initializer()
                                 )
            
            q = tf.layers.dense(h1, self.env.get_action_size(), None,
                                kernel_initializer=tf.random_normal_initializer())                     
        return q
    
    
    def compute_gradients(self, state, actions):
        '''Compute the gradients of the action_value estimator neural network '''
        
        feed_dict={self.state: state, 
                   self.actions:actions
                   }
        
        q_gradient=self.session.run(self.gradients, feed_dict)
        q_gradient=np.array(q_gradient).reshape(1, -1)
        
        return q_gradient
    
    
    def calculate_Q(self, state, actions):
        '''Compute the action-value '''

        feed_dict={self.state: state,
                   self.actions:actions}
        
        q_next=self.session.run(self.q,feed_dict)
        
        return q_next
    
    
    def train(self, state, targets, action):
        '''Train the actor network '''
        
        feed_dict={
                    self.state: state, 
                    self.target: targets, 
                    self.actions: action
                   }
        self.session.run(self.train_opt,feed_dict)
    
    
    def set_session(self, session):
        '''Set the session '''
        self.session=session
    
    
    def init_target_network(self):
        '''Initialize the parameters of the target-network '''
        self.session.run(self.init_target_op)
             
       
    def update_target_parameter(self):
        '''Update the parameters of the target-network '''
        self.session.run(self.update_opt)

In [7]:
class Agent:
    
    def __init__(self, TF_FLAGS):
        ''' This class build the Agent that learns in the environment via the actor-critic algorithm. '''
        self.env = Environment()
        self.TF_FLAGS = TF_FLAGS
                
        self.actor_target = Actor(scope='target',target_network=None,env=self.env, flags=TF_FLAGS)
        self.actor = Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=TF_FLAGS)
        
        self.critic_target = Critic(scope='target',target_network=None,env=self.env, flags=TF_FLAGS)
        self.critic = Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=TF_FLAGS)
        
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
        
        self.critic.set_session(self.session)
        self.actor.set_session(self.session)
        self.actor_target.set_session(self.session)
        self.critic_target.set_session(self.session)
        
        self.critic.init_target_network()
        self.actor.init_target_network()
                
    def update_critic(self, state, action, reward, next_state, done):
        next_action = self.actor_target.get_action(next_state)
        q_next = self.critic.target_network.calculate_Q(next_state, next_action)
#       target = np.array([r+self.FLAGS.gamma*q if not done else r for r, q, done in zip(reward, q_next, done)])
        target = np.array(reward+self.TF_FLAGS.gamma*q_next)
        self.critic.train(state, target, action)
        self.critic.update_target_parameter()
    
    def update_actor(self, state, action, reward, next_state, done):
        current_action = self.actor.get_action(state)        
        q_gradient = self.critic.compute_gradients(state, current_action)
        self.actor.train(state, q_gradient)
        self.actor.update_target_parameter()
        
    def train_one_episode(self):
        ''' Play an episode in the OpenAI Gym '''
        # Get the initial state and reshape it
        state = self.env.reset()
        state = state.reshape(1,self.env.get_state_size())
        
        done=False
        total_reward=0
        iters = 0
        
        # Loop for the episode
        while not done and iters < 2000:
            
            # Sample an action from the actor distribution
            action = self.actor.get_action(state)
            prev_state = state
            
            # Obtain a <state, reward, done> tuple from the environment
            state, reward, done, _ = self.env.get_env().step(action.reshape(-1))
            total_reward = total_reward + reward
            
            state = state.reshape(1, self.env.get_state_size())
            prev_state = prev_state.reshape(1, self.env.get_state_size())
            action = action.reshape(1, self.env.get_action_size())
            
            self.update_critic(prev_state, action, reward, state, done)
            self.update_actor(prev_state, action, reward, state, done)
            
            iters += 1
            
        return total_reward
            
    def train(self, num_episodes=100):
        '''Run the environment for a particular number of episodes. '''
        total_rewards = np.empty(num_episodes+1)
        n_steps=1
        
        for n in range(0, num_episodes+1):

            total_reward = self.train_one_episode()
            total_rewards[n] = total_reward 
            
            if n%10==0:
                print("episodes: %i, avg_reward (last: %i episodes): %.2f" %(n, n_steps, total_rewards[max(0, n-n_steps):(n+1)].mean()))
                
    def record_episode(self):
        '''Runs and records one episode using the trained actor and critic'''
        # Get the initial state and reshape it
        state=self.env.reset()
        state=state.reshape(1,self.env.get_state_size())
        done=False
        iters = 0
        total_reward = 0

        # Loop for the episode
        while not done and iters < 2000:
            
            # Sample an action from the gauss distribution
            action = self.actor.get_action(state)

            # Obtain a <state, reward, done> tuple from the environment
            state, reward, done, _ = self.env.get_env().step(action.reshape(-1))
            state = state.reshape(1, self.env.get_state_size())
            total_reward += reward
            
            self.env.render()
            iters += 1
        
        return self.env.make_gif()

### Train the agent and plot results

In [9]:
# Set the parameters
sys.argv = sys.argv[:1]
tf.reset_default_graph()

def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()
    keys_list = [keys for keys in flags_dict]
    for keys in keys_list:
        FLAGS.__delattr__(keys)
        
try:
    del_all_flags(TF_FLAGS)
except:
    pass

tf.app.flags.DEFINE_float('learning_rate_Actor', 0.001, 'Learning rate for the policy estimator')
tf.app.flags.DEFINE_float('learning_rate_Critic', 0.001, 'Learning rate for the state-value estimator')
tf.app.flags.DEFINE_float('gamma', 0.99, 'Future discount factor')
tf.app.flags.DEFINE_float('tau', 1e-2, 'Update rate for the target networks parameter')

TF_FLAGS = tf.app.flags.FLAGS

agent = Agent(TF_FLAGS)

agent.train(num_episodes = 100)



episodes: 0, avg_reward (last: 1 episodes): -100.34
episodes: 10, avg_reward (last: 1 episodes): -150.46
episodes: 20, avg_reward (last: 1 episodes): -115.61
episodes: 30, avg_reward (last: 1 episodes): -106.33
episodes: 40, avg_reward (last: 1 episodes): -101.26
episodes: 50, avg_reward (last: 1 episodes): -101.91
episodes: 60, avg_reward (last: 1 episodes): -123.29
episodes: 70, avg_reward (last: 1 episodes): -124.36
episodes: 80, avg_reward (last: 1 episodes): -116.51
episodes: 90, avg_reward (last: 1 episodes): -143.95
episodes: 100, avg_reward (last: 1 episodes): -102.04


In [None]:
agent.record_episode()