In [1]:
import sys
sys.path.append("../src/")

In [2]:
from config import *

In [3]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense 

class Critic(tf.keras.Model):
    def __init__(self, name, hidden_0=CRITIC_HIDDEN_0, hidden_1=CRITIC_HIDDEN_1):
        super(Critic, self).__init__()
        self.hidden_0 = hidden_0
        self.hidden_1 = hidden_1
        self.net_name = name

        self.dense_0 = Dense(self.hidden_0, activation='relu')
        self.dense_1 = Dense(self.hidden_1, activation='relu')
        self.q_value = Dense(1, activation=None)

    def call(self, state, action):
        state_action_value = self.dense_0(tf.concat([state, action], axis=1))
        state_action_value = self.dense_1(state_action_value)

        q_value = self.q_value(state_action_value)

        return q_value

class CriticValue(tf.keras.Model):
    def __init__(self, name, hidden_0=CRITIC_HIDDEN_0, hidden_1=CRITIC_HIDDEN_1):
        super(CriticValue, self).__init__()
        self.hidden_0 = hidden_0
        self.hidden_1 = hidden_1
        self.net_name = name
        
        self.dense_0 = Dense(self.hidden_0, activation='relu')
        self.dense_1 = Dense(self.hidden_1, activation='relu')
        self.value = Dense(1, activation=None)

    def call(self, state):
        value = self.dense_0(state)
        value = self.dense_1(value)

        value = self.value(value)

        return value

class Actor(tf.keras.Model):
    def __init__(self, name, upper_bound, actions_dim, hidden_0=CRITIC_HIDDEN_0, hidden_1=CRITIC_HIDDEN_1, noise=NOISE):
        super(Actor, self).__init__()
        self.hidden_0 = hidden_0
        self.hidden_1 = hidden_1
        self.actions_dim = actions_dim
        self.net_name = name
        self.upper_bound = upper_bound
        self.noise = noise

        self.dense_0 = Dense(self.hidden_0, activation='relu')
        self.dense_1 = Dense(self.hidden_1, activation='relu')
        self.mean = Dense(self.actions_dim, activation=None)
        self.std = Dense(self.actions_dim, activation=None)

    def call(self, state):
        policy = self.dense_0(state)
        policy = self.dense_1(policy)

        mean = self.mean(policy)
        std = self.std(policy)

        std = tf.clip_by_value(std, self.noise, 1)

        return mean, std

    def get_action_log_probs(self, state, reparameterization_trick=True):
        mean, std = self.call(state)
        normal_distr = tfp.distributions.Normal(mean, std)

        if reparameterization_trick:
            actions = normal_distr.sample()
        else:
            actions = normal_distr.sample()

        action = tf.math.tanh(actions) * self.upper_bound
        log_probs = normal_distr.log_prob(actions) - tf.math.log(1-tf.math.pow(action,2)+self.noise)
        log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True)

        return action, log_probs

In [4]:
import os
import time
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam

In [5]:
from replay_buffer import *

In [6]:
class Agent:
    def __init__(self, env, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR, gamma=GAMMA, tau=TAU, reward_scale=REWARD_SCALE):
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(env)
        self.actions_dim = env.action_space.shape[0]
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.actor = Actor(actions_dim=self.actions_dim, name='actor', 
                                    upper_bound=env.action_space.high)
        self.critic_0 = Critic(name='critic_0')
        self.critic_1 = Critic(name='critic_1')
        self.critic_value = CriticValue(name='value')
        self.critic_target_value = CriticValue(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=self.actor_lr))
        self.critic_0.compile(optimizer=Adam(learning_rate=self.critic_lr))
        self.critic_1.compile(optimizer=Adam(learning_rate=self.critic_lr))
        self.critic_value.compile(optimizer=Adam(learning_rate=self.critic_lr))
        self.critic_target_value.compile(optimizer=Adam(learning_rate=self.critic_lr))

        self.reward_scale = reward_scale

        self.critic_target_value.set_weights(self.critic_value.weights)
        
    def update_target_networks(self, tau):
        critic_value_weights = self.critic_value.weights
        critic_target_value_weights = self.critic_target_value.weights
        for index in range(len(critic_value_weights)):
            critic_target_value_weights[index] = tau * critic_value_weights[index] + (1 - tau) * critic_target_value_weights[index]

        self.critic_target_value.set_weights(critic_target_value_weights)
        
    def add_to_replay_buffer(self, state, action, reward, new_state, done):
        self.replay_buffer.add_record(state, action, reward, new_state, done)
        
    def save(self):
        date_now = time.strftime("%Y%m%d%H%M")
        if not os.path.isdir(f"{self.path_save}/save_agent_{date_now}"):
            os.makedirs(f"{self.path_save}/save_agent_{date_now}")
        self.actor.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.actor.net_name}.h5")
        self.critic_0.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.critic_0.net_name}.h5")
        self.critic_1.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.critic_1.net_name}.h5")
        self.value_critic.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.value_critic.net_name}.h5")
        self.target_value_critic.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.target_value_critic.net_name}.h5")
        
        self.replay_buffer.save(f"{self.path_save}/save_agent_{date_now}")

    def load(self):
        self.actor.load_weights(f"{self.path_load}/{self.actor.net_name}.h5")
        self.critic_0.load_weights(f"{self.path_load}/{self.critic_0.net_name}.h5")
        self.critic_1.load_weights(f"{self.path_load}/{self.critic_1.net_name}.h5")
        self.value_critic.load_weights(f"{self.path_load}/{self.value_critic.net_name}.h5")
        self.target_value_critic.load_weights(f"{self.path_load}/{self.target_value_critic.net_name}.h5")
        
        self.replay_buffer.load(f"{self.path_load}")

    def get_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.get_action_log_probs(state, reparameterization_trick=False)

        return actions[0]

    def learn(self):
        if self.replay_buffer.check_buffer_size() == False:
            return

        state, action, reward, new_state, done = self.replay_buffer.get_minibatch()

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        new_states = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            value = tf.squeeze(self.critic_value(states), 1)
            target_value = tf.squeeze(self.critic_target_value(new_states), 1)

            policy_actions, log_probs = self.actor.get_action_log_probs(states, reparameterization_trick=False)
            log_probs = tf.squeeze(log_probs,1)
            q_value_0 = self.critic_0(states, policy_actions)
            q_value_1 = self.critic_1(states, policy_actions)
            q_value = tf.squeeze(tf.math.minimum(q_value_0, q_value_1), 1)

            value_target = q_value - log_probs
            value_critic_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_critic_gradient = tape.gradient(value_critic_loss, self.critic_value.trainable_variables)
        self.critic_value.optimizer.apply_gradients(zip(value_critic_gradient, self.critic_value.trainable_variables))


        with tf.GradientTape() as tape:
            new_policy_actions, log_probs = self.actor.get_action_log_probs(states, reparameterization_trick=True)
            log_probs = tf.squeeze(log_probs, 1)
            new_q_value_0 = self.critic_0(states, new_policy_actions)
            new_q_value_1 = self.critic_1(states, new_policy_actions)
            new_q_value = tf.squeeze(tf.math.minimum(new_q_value_0, new_q_value_1), 1)
        
            actor_loss = log_probs - new_q_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_gradient, self.actor.trainable_variables))
        

        with tf.GradientTape(persistent=True) as tape:
            q_pred = self.reward_scale * reward + self.gamma * target_value * (1-done)
            old_q_value_0 = tf.squeeze(self.critic_0(state, action), 1)
            old_q_value_1 = tf.squeeze(self.critic_1(state, action), 1)
            critic_0_loss = 0.5 * keras.losses.MSE(old_q_value_0, q_pred)
            critic_1_loss = 0.5 * keras.losses.MSE(old_q_value_1, q_pred)
    
        critic_0_network_gradient = tape.gradient(critic_0_loss, self.critic_0.trainable_variables)
        critic_1_network_gradient = tape.gradient(critic_1_loss, self.critic_1.trainable_variables)

        self.critic_0.optimizer.apply_gradients(zip(critic_0_network_gradient, self.critic_0.trainable_variables))
        self.critic_1.optimizer.apply_gradients(zip(critic_1_network_gradient, self.critic_1.trainable_variables))

        self.update_target_networks(tau=self.tau)
        
        self.replay_buffer.update_n_games()

In [7]:
import pybullet_envs
import gym

In [8]:
env = gym.make('LunarLanderContinuous-v2')
env = gym.make('InvertedPendulumBulletEnv-v0')
agent = Agent(env=env)
n_games = 250



In [9]:
best_score = env.reward_range[0]
score_history = []
load_checkpoint = False

if load_checkpoint:
    agent.load_models()
    env.render(mode='human')

for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.get_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.add_to_replay_buffer(observation, action, reward, observation_, done)
        if not load_checkpoint:
            agent.learn()
        observation = observation_
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
    print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)

episode  0 score 44.0 avg_score 44.0
episode  1 score 43.0 avg_score 43.5
episode  2 score 21.0 avg_score 36.0
episode  3 score 41.0 avg_score 37.2
episode  4 score 16.0 avg_score 33.0
episode  5 score 20.0 avg_score 30.8
episode  6 score 19.0 avg_score 29.1
episode  7 score 16.0 avg_score 27.5
episode  8 score 20.0 avg_score 26.7
episode  9 score 19.0 avg_score 25.9
episode  10 score 32.0 avg_score 26.5
episode  11 score 31.0 avg_score 26.8
episode  12 score 11.0 avg_score 25.6
episode  13 score 31.0 avg_score 26.0
episode  14 score 48.0 avg_score 27.5
episode  15 score 15.0 avg_score 26.7
episode  16 score 112.0 avg_score 31.7
episode  17 score 206.0 avg_score 41.4
episode  18 score 78.0 avg_score 43.3
episode  19 score 142.0 avg_score 48.2
episode  20 score 96.0 avg_score 50.5


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-80b073598af1>", line 19, in <module>
    agent.learn()
  File "<ipython-input-6-822884a28e2b>", line 82, in learn
    policy_actions, log_probs = self.actor.get_action_log_probs(states, reparameterization_trick=False)
  File "<ipython-input-3-3bff3b1f0ef2>", line 72, in get_action_log_probs
    mean, std = self.call(state)
  File "<ipython-input-3-3bff3b1f0ef2>", line 64, in call
    mean = self.mean(policy)
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 996, in __call__
    inputs = self._maybe_cast_inputs(inputs, input_list)
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_la

TypeError: object of type 'NoneType' has no len()