In [1]:
import sys
sys.path.append("../src")
from plugin_write_and_run import *

In [2]:
%%write_and_run ../src/agent.py
import sys
sys.path.append("../src")
import tensorflow as tf
import numpy as np
import random
from tqdm import tqdm
from config import *
from replay_buffer import *
from networks import *
from tensorflow.keras.optimizers import Adam

In [3]:
ENV_NAME

'BipedalWalkerHardcore-v3'

In [4]:
import gym
env = gym.make(ENV_NAME)



In [5]:
%%write_and_run -a ../src/agent.py

class Agent:
    def __init__(self, env, alpha=0.001, beta=0.002,
            gamma=0.99, max_size=1000000, tau=0.005, 
            fc1=400, fc2=300, noise=0.1):
        
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(env, max_size)
        self.actions_dim = env.action_space.shape[0]
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]
        
        self.actor = Actor(actions_dim=self.actions_dim, name='actor')
        self.critic = Critic(name='critic')
        self.target_actor = Actor(actions_dim=self.actions_dim, name='target_actor')
        self.target_critic = Critic(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i]*(1-tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i]*(1-tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.replay_buffer.add_record(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.actions_dim],
                    mean=0.0, stddev=self.noise)
        # note that if the environment has an action > 1, we have to multiply by
        # max action at some point
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    def learn(self):
        if self.replay_buffer.buffer_counter < self.replay_buffer.batch_size:
            return

        state, action, reward, new_state, done = \
                self.replay_buffer.get_minibatch()

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        new_states = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(new_states)
            critic_value_ = tf.squeeze(self.target_critic(
                                new_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = reward + self.gamma*critic_value_*(1-done)
            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(
            critic_network_gradient, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss, 
                                    self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(
            actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()

In [5]:
agent = Agent(env, alpha=0.005, beta=0.01, fc1=800, fc2=500)
n_games = 250

best_score = env.reward_range[0]
score_history = []
evaluate = False

for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation, evaluate)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
    #    if not load_checkpoint:
            #agent.save_models()
    print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)

episode  0 score -100.6 avg score -100.6
episode  1 score -215.8 avg score -158.2
episode  2 score -215.5 avg score -177.3
episode  3 score -215.3 avg score -186.8
episode  4 score -214.8 avg score -192.4
episode  5 score -110.5 avg score -178.7
episode  6 score -215.9 avg score -184.0
episode  7 score -107.7 avg score -174.5
episode  8 score -106.5 avg score -166.9
episode  9 score -216.4 avg score -171.9
episode  10 score -107.8 avg score -166.1
episode  11 score -106.5 avg score -161.1
episode  12 score -106.6 avg score -156.9
episode  13 score -107.8 avg score -153.4
episode  14 score -107.6 avg score -150.4


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-3c664a42949e>", line 17, in <module>
    agent.learn()
  File "<ipython-input-5-659fb7a7275e>", line 91, in learn
    critic_network_gradient = tape.gradient(critic_loss,
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/eager/backprop.py", line 1080, in gradient
    flat_grad = imperative_grad.imperative_grad(
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/eager/imperative_grad.py", line 71, in imperative_grad
    return pywrap_tfe.TFE_Py_TapeGradient(
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/tensorflow/python/eager/backprop.py", line 162, in _gradient_function
    return grad_fn(mock_o

TypeError: object of type 'NoneType' has no len()