In [1]:
import sys
sys.path.append("../src/")

In [2]:
from plugin_write_and_run import *

In [3]:
%%write_and_run ../src/super_agent.py
import numpy as np
import tensorflow as tf
from tensorflow.keras import optimizers as opt
import random
import time
import json
import os
import sys
sys.path.append("../src")
from config import *
from make_env import *
from replay_buffer import *
from agent import *

In [4]:
%%write_and_run -a ../src/super_agent.py

class SuperAgent:
    """Coordinates one ``Agent`` per environment agent around a shared replay buffer.

    Each agent's actor is evaluated on that agent's own observation, while each
    agent's critic is evaluated on the joint state and the concatenation of all
    agents' actions. This appears to follow the MADDPG training scheme.
    """

    def __init__(self, env, path_save=PATH_SAVE_MODEL, path_load=PATH_LOAD_FOLDER):
        """Create the shared replay buffer and one ``Agent`` per entry of ``env.agents``.

        Args:
            env: Environment exposing an ``agents`` sequence; it is also passed
                to ``ReplayBuffer`` and to every ``Agent``.
            path_save: Base directory under which ``save`` creates a
                timestamped sub-folder.
            path_load: Directory that ``load`` restores the agents and the
                replay buffer from.
        """
        self.path_save = path_save
        self.path_load = path_load
        self.replay_buffer = ReplayBuffer(env)
        self.n_agents = len(env.agents)
        self.agents = [Agent(env, agent_index) for agent_index in range(self.n_agents)]

    def get_actions(self, agents_states):
        """Return a list with one action per agent.

        Args:
            agents_states: Indexable collection holding one observation per
                agent, in the same order as ``self.agents``.

        Returns:
            list: ``self.agents[i].get_actions(agents_states[i])`` for every agent.
        """
        return [
            agent.get_actions(agents_states[index])
            for index, agent in enumerate(self.agents)
        ]

    def save(self):
        """Save every agent and the replay buffer into a new timestamped folder.

        The folder is ``<path_save>/save_agent_<YYYYmmddHHMM>``; saving twice
        within the same minute therefore targets the same folder.
        """
        date_now = time.strftime("%Y%m%d%H%M")
        full_path = f"{self.path_save}/save_agent_{date_now}"
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(full_path, exist_ok=True)

        for agent in self.agents:
            agent.save(full_path)

        self.replay_buffer.save(full_path)

    def load(self):
        """Restore every agent and the replay buffer from ``self.path_load``."""
        full_path = self.path_load
        for agent in self.agents:
            agent.load(full_path)

        self.replay_buffer.load(full_path)

    def train(self):
        """Run one actor/critic update for every agent from a sampled minibatch.

        Returns immediately (``None``) while the replay buffer reports that it
        does not yet hold enough records. Otherwise all gradients are computed
        first, and only then are the optimizers applied and the target
        networks updated, so every agent's gradients are taken against the
        same pre-update weights.
        """
        if not self.replay_buffer.check_buffer_size():
            return

        state, reward, next_state, done, actors_state, actors_next_state, actors_action = self.replay_buffer.get_minibatch()

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_state, dtype=tf.float32)
        # Explicit float32 "episode continues" mask, so the TD target does not
        # rely on implicit NumPy -> TensorFlow dtype coercion of `done`.
        not_done = tf.convert_to_tensor(1.0 - np.asarray(done, dtype=np.float32), dtype=tf.float32)

        actors_states = [tf.convert_to_tensor(s, dtype=tf.float32) for s in actors_state]
        actors_next_states = [tf.convert_to_tensor(s, dtype=tf.float32) for s in actors_next_state]
        actors_actions = [tf.convert_to_tensor(s, dtype=tf.float32) for s in actors_action]

        concat_actors_action = tf.concat(actors_actions, axis=1)

        # The TD targets are constants for both losses, so they are evaluated
        # outside the tape: nothing from the target networks gets recorded.
        target_actions = [
            agent.target_actor(actors_next_states[index])
            for index, agent in enumerate(self.agents)
        ]
        concat_target_actions = tf.concat(target_actions, axis=1)
        target_critic_values = [
            tf.squeeze(agent.target_critic(next_states, concat_target_actions), 1)
            for agent in self.agents
        ]
        targets = [
            rewards[:, index] + agent.gamma * target_critic_values[index] * not_done[:, index]
            for index, agent in enumerate(self.agents)
        ]

        # persistent=True because gradient() is called twice per agent below.
        with tf.GradientTape(persistent=True) as tape:
            policy_actions = [
                agent.actor(actors_states[index])
                for index, agent in enumerate(self.agents)
            ]
            concat_policy_actions = tf.concat(policy_actions, axis=1)

            critic_values = [
                tf.squeeze(agent.critic(states, concat_actors_action), 1)
                for agent in self.agents
            ]
            critic_losses = [
                tf.keras.losses.MSE(targets[index], critic_values[index])
                for index in range(self.n_agents)
            ]

            # Each actor is trained to maximise its own critic's value of the
            # joint action produced by the current policies.
            actor_losses = [
                tf.math.reduce_mean(-agent.critic(states, concat_policy_actions))
                for agent in self.agents
            ]

        critic_gradients = [
            tape.gradient(critic_losses[index], agent.critic.trainable_variables)
            for index, agent in enumerate(self.agents)
        ]
        actor_gradients = [
            tape.gradient(actor_losses[index], agent.actor.trainable_variables)
            for index, agent in enumerate(self.agents)
        ]
        # A persistent tape keeps its resources until it is garbage collected.
        del tape

        for index, agent in enumerate(self.agents):
            agent.critic.optimizer.apply_gradients(zip(critic_gradients[index], agent.critic.trainable_variables))
            agent.actor.optimizer.apply_gradients(zip(actor_gradients[index], agent.actor.trainable_variables))
            agent.update_target_networks(agent.tau)

In [5]:
# Build the environment selected by ENV_NAME. Both names arrive through the
# star imports in the write_and_run cell above (presumably from the `make_env`
# and `config` modules - confirm).
env = make_env(ENV_NAME)

In [6]:
# Step the environment once with an arbitrary 5-element action for each of the
# three agents, just to obtain sample per-agent observations.
# NOTE(review): no env.reset() is visible before this step - confirm that
# make_env returns an environment that is ready to be stepped.
agents_states, reward, done, info = env.step([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]])

In [7]:
# Instantiate the multi-agent wrapper with its default save/load paths.
sa = SuperAgent(env)

In [8]:
# Inspect the per-agent Agent objects that SuperAgent created (one per env agent).
sa.agents

[<agent.Agent at 0x7fcc9189ffd0>,
 <agent.Agent at 0x7fcc91aaef40>,
 <agent.Agent at 0x7fcc3bf7c0d0>]

In [9]:
# Show the per-agent observations returned by the earlier single env.step call;
# the observation vectors do not all have the same length.
agents_states

[array([ 0.01770053,  0.58493223,  0.04998703,  0.62523341, -0.22160163,
        -0.54285523, -0.3109281 ,  0.02469857]),
 array([ 0.23930217,  1.12778746,  0.23930217,  1.12778746,  0.27158866,
         1.16808863,  0.22160163,  0.54285523, -0.08932647,  0.5675538 ]),
 array([ 0.32862863,  0.56023366,  0.32862863,  0.56023366,  0.36091513,
         0.60053483,  0.3109281 , -0.02469857,  0.08932647, -0.5675538 ])]

In [10]:
# Smoke-test data: fill the replay buffer with 10000 records produced by
# stepping the environment with the same constant action for every agent.
# The joint `state` is the concatenation of the per-agent observations.
# NOTE(review): the same post-step observation is passed in both the 1st/2nd
# positions and the same `state` in the 4th/5th positions of add_record -
# presumably the (state, next_state) slots, so these are not real transitions.
# Confirm against ReplayBuffer.add_record before using this for actual training.
for i in range(10000):
    actors_state, reward, done, info = env.step([[0.1, 0.1, 0.2, 0.4, 0.5], [0.1, 0.1, 0.2, 0.4, 0.5], [0.1, 0.1, 0.2, 0.4, 0.5]])
    state = np.concatenate(actors_state)
    sa.replay_buffer.add_record(actors_state, actors_state, [[0.1, 0.1, 0.2, 0.4, 0.5], [0.1, 0.1, 0.2, 0.4, 0.5], [0.1, 0.1, 0.2, 0.4, 0.5]], state, state, reward, done)

In [11]:
# Draw one minibatch straight from the replay buffer to inspect its layout;
# the unpacking order is the same one SuperAgent.train uses.
# Note: this rebinds the notebook-level `state`, `reward`, `done` and
# `actors_state` names set by the fill loop above.
state, reward, next_state, done, actors_state, actors_next_state, actors_action = sa.replay_buffer.get_minibatch()

In [12]:
# Shape of the sampled joint next-state batch. The recorded output is (64, 28);
# 28 equals the summed per-agent observation lengths shown earlier (8 + 10 + 10).
next_state.shape

(64, 28)

In [13]:
# Run a single training update on the filled buffer as a smoke test
# (train() has no return value, so the cell produces no output).
sa.train()