In [1]:
import sys
sys.path.append("../src")
from plugin_write_and_run import *
from tqdm import tqdm

In [2]:
%%write_and_run ../src/agent.py
import sys
sys.path.append("../src")
import tensorflow as tf
from tensorflow.keras import optimizers as opt
import numpy as np
import random
import time
from config import *
from replay_buffer import *
from networks import *

In [3]:
ENV_NAME

'LunarLanderContinuous-v2'

In [4]:
import gym
env = gym.make(ENV_NAME)

In [5]:
%%write_and_run -a ../src/agent.py

class Agent:
    def __init__(self, env, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR, gamma=GAMMA, max_size=BUFFER_CAPACITY, tau=TAU, path_save=PATH_SAVE, path_load=PATH_LOAD):
        
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(env, max_size)
        self.actions_dim = env.action_space.shape[0]
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.path_save = path_save
        self.path_load = path_load
        
        self.actor = Actor(name='actor', actions_dim=self.actions_dim, upper_bound=self.upper_bound)
        self.critic = Critic(name='critic')
        self.target_actor = Actor(name='target_actor', actions_dim=self.actions_dim, upper_bound=self.upper_bound)
        self.target_critic = Critic(name='target_critic')

        self.actor.compile(optimizer=opt.Adam(learning_rate=actor_lr))
        self.critic.compile(optimizer=opt.Adam(learning_rate=critic_lr))
        self.target_actor.compile(optimizer=opt.Adam(learning_rate=actor_lr))
        self.target_critic.compile(optimizer=opt.Adam(learning_rate=critic_lr))

        actor_weights = self.actor.get_weights()
        critic_weights = self.critic.get_weights()
        
        self.target_actor.set_weights(actor_weights)
        self.target_critic.set_weights(critic_weights)
        
        self.noise = np.zeros(self.actions_dim)

    def update_target_networks(self, tau):
        actor_weights = self.actor.weights
        target_actor_weights = self.target_actor.weights
        for index in range(len(actor_weights)):
            target_actor_weights[index] = tau * actor_weights[index] + (1 - tau) * target_actor_weights[index]

        self.target_actor.set_weights(target_actor_weights)
        
        critic_weights = self.critic.weights
        target_critic_weights = self.target_critic.weights
    
        for index in range(len(critic_weights)):
            target_critic_weights[index] = tau * critic_weights[index] + (1 - tau) * target_critic_weights[index]

        self.target_critic.set_weights(target_critic_weights)
    
    def add_to_replay_buffer(self, state, action, reward, new_state, done):
        self.replay_buffer.add_record(state, action, reward, new_state, done)

    def save(self):
        date_now = time.strftime("%Y%m%d%H%M")
        if not os.path.isdir(f"{self.path_save}/save_agent_{date_now}"):
            os.makedirs(f"{self.path_save}/save_agent_{date_now}")
        self.actor.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.actor.net_name}.h5")
        self.target_actor.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.target_actor.net_name}.h5")
        self.critic.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.critic.net_name}.h5")
        self.target_critic.save_weights(f"{self.path_save}/save_agent_{date_now}/{self.target_critic.net_name}.h5")
        
        np.save(f"{self.path_save}/save_agent_{date_now}/noise.npy", self.noise)
        
        self.replay_buffer.save(f"{self.path_save}/save_agent_{date_now}")

    def load(self):
        self.actor.load_weights(f"{self.path_load}/{self.actor.net_name}.h5")
        self.target_actor.load_weights(f"{self.path_load}/{self.target_actor.net_name}.h5")
        self.critic.load_weights(f"{self.path_load}/{self.critic.net_name}.h5")
        self.target_critic.load_weights(f"{self.path_load}/{self.target_critic.net_name}.h5")
        
        self.noise = np.load(f"{self.path_load}/noise.npy")
        
        self.replay_buffer.load(f"{self.path_load}")
        
        
        
    def _ornstein_uhlenbeck_process(self, x, theta=THETA, mu=0, dt=DT, std=0.2):
        """
        Ornstein–Uhlenbeck process
        """
        return x + theta * (mu-x) * dt + std * np.sqrt(dt) * np.random.normal(size=self.actions_dim)

    def get_action(self, observation, noise, evaluation=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluation:
            self.noise = self._ornstein_uhlenbeck_process(noise)
            actions += self.noise

        actions = tf.clip_by_value(actions, self.lower_bound, self.upper_bound)

        return actions[0]

    def learn(self):
        if self.replay_buffer.check_buffer_size() == False:
            return None

        state, action, reward, new_state, done = self.replay_buffer.get_minibatch()

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        new_states = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        with tf.GradientTape() as tape:
            target_actions = self.target_actor(new_states)
            target_critic_values = tf.squeeze(self.target_critic(
                                new_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = reward + self.gamma * target_critic_values * (1-done)
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        critic_gradient = tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(
            critic_gradient, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            policy_actions = self.actor(states)
            actor_loss = -self.critic(states, policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_gradient = tape.gradient(actor_loss, 
                                    self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(
            actor_gradient, self.actor.trainable_variables))

        self.update_target_networks(self.tau)

In [6]:
agent = Agent(env)
n_games = 250

best_score = env.reward_range[0]
score_history = []
evaluation = False

if PATH_LOAD is not None:
    print("loading weights")
    observation = env.reset()
    action = agent.actor(observation[None, :])
    agent.target_actor(observation[None, :])
    agent.critic(observation[None, :], action)
    agent.target_critic(observation[None, :], action)
    agent.load()
    print(agent.replay_buffer.buffer_counter)
    print(agent.replay_buffer.n_episodes)
    print(agent.noise)

for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.get_action(observation, evaluation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.add_to_replay_buffer(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
    
    agent.replay_buffer.update_n_games()
    
    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if avg_score > best_score:
        best_score = avg_score
    if (i + 1) % 200 == 0:
        print("saving")
        agent.save()
    print('episode ', i, 'score %.1f' % score, 'avg score %.1f' % avg_score)

episode  0 score -235.4 avg score -235.4
episode  1 score -66.1 avg score -150.8
episode  2 score -405.9 avg score -235.8
episode  3 score -203.8 avg score -227.8
episode  4 score -472.0 avg score -276.6
episode  5 score -368.1 avg score -291.9
episode  6 score -411.2 avg score -308.9
episode  7 score -161.5 avg score -290.5
episode  8 score -160.4 avg score -276.0
episode  9 score -216.7 avg score -270.1
episode  10 score -3.1 avg score -245.8
episode  11 score -131.5 avg score -236.3
episode  12 score 19.0 avg score -216.7
episode  13 score -70.9 avg score -206.3
episode  14 score -10.7 avg score -193.2
episode  15 score -213.4 avg score -194.5
episode  16 score -36.7 avg score -185.2
episode  17 score -135.6 avg score -182.4
episode  18 score -233.4 avg score -185.1
episode  19 score -527.1 avg score -202.2
episode  20 score -185.6 avg score -201.4
episode  21 score -44.5 avg score -194.3
episode  22 score -311.3 avg score -199.4
episode  23 score -510.7 avg score -212.4
episode  24

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-30742d3ad809>", line 26, in <module>
    observation_, reward, done, info = env.step(action)
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/gym/wrappers/time_limit.py", line 16, in step
    observation, reward, done, info = self.env.step(action)
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/gym/envs/box2d/lunar_lander.py", line 302, in step
    self.world.Step(1.0/FPS, 6*30, 2*30)
  File "/Users/anton/.conda/envs/reinforcement_learning/lib/python3.8/site-packages/gym/envs/box2d/lunar_lander.py", line 68, in BeginContact
    def BeginContact(self, contact):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most re

TypeError: object of type 'NoneType' has no len()