In [1]:
## Import base classes
import numpy as np
import random
import sys, os

# For Interactive Plotting
from viz_util import interactive_plotting

# For Environment import 
import sys, os
sys.path.append(os.path.abspath('..'))
from environment.grid import GridEnv, Action
from enum import Enum

# For converting during training
import torch
import torch.nn as nn
import torch.nn.functional as F

[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 11.]
 [ 0.  0.  2.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. 22.  0.  0.  0.  0.]]
([[0.0, 0.75, 0.875, 0.375], [0.375, 0.5, 0.375, 0.875]], [-1, -1], False)
----
[(0, 6), (3, 4)]
[(7, 3), (3, 7)]


In [2]:
%load_ext autoreload
%autoreload 2
# For Agent import
from multi_ddpg import MADDPG

In [3]:
# --- Create Environment --- #
def create_environment(name):
    """Instantiate the environment registered under ``name`` and report its sizes.

    Args:
        name: Identifier of the environment to load. Only ``"taxi-grid"``
            (backed by ``GridEnv``) is currently supported.

    Returns:
        A tuple ``(env, state_space, action_size, num_agent)`` where
        ``state_space`` and ``num_agent`` are read straight from the
        environment and ``action_size`` is ``env.action_space.n``.

    Raises:
        ValueError: If ``name`` does not match a supported environment.
    """
    # The original condition was the bare string literal `"taxi-grid"`, which is
    # always truthy, so `name` was never actually checked. Compare explicitly and
    # fail loudly instead of silently loading the wrong environment.
    if name != "taxi-grid":
        raise ValueError(
            "Unknown environment name: {!r} (supported: 'taxi-grid')".format(name)
        )
    env = GridEnv()

    state_space, action_space, num_agent = env.state_space, env.action_space, env.num_agent

    print("--- Environment {} has been loaded. ---".format(name))
    print("State size: {} | Action size: {} | Number of Agents: {}".format(state_space, action_space, num_agent))
    # Callers get the number of discrete actions, not the space object itself.
    return env, state_space, action_space.n, num_agent

# --- Check example outputs --- #
def example_env_output(env):
    """Take one step in ``env`` with the fixed joint action ``[0, 1]`` and print the result.

    Args:
        env: Object whose ``step(actions)`` returns a 3-item sequence
            ``(next_states, rewards, done)``.

    Returns:
        None. The three step results are written to stdout, one per line.
    """
    step_result = env.step([0, 1])
    next_states, rewards, done = step_result

    labelled = (
        ("State: ", next_states),
        ("Reward: ", rewards),
        ("Done: ", done),
    )
    for label, value in labelled:
        print(label, value)

In [4]:
# state_size, action_size, num_agent, possible_actions = create_environment("taxi-grid")
env, state_size, action_size, num_agent = create_environment("taxi-grid")

--- Environment taxi-grid has been loaded. ---
State size: 8 | Action size: Discrete(4) | Number of Agents: 2


In [5]:
example_env_output(env)

State:  [[0.875, 0.375, 0.125, 0.25], [0.75, 0.875, 0.125, 0.125]]
Reward:  [-1, -1]
Done:  False


In [6]:
# Create the MADDPG agents (per-agent state size = state_size / num_agent)
agents = MADDPG(state_size=int(state_size/num_agent), action_size=action_size, random_seed = 1, num_agent=num_agent)


--- Agent 0 Params ---
Going to train on cpu
Learning Rate:: Actor: 0.0001 | Critic: 0.001
Replay Buffer:: Buffer Size: 100000 | Sampled Batch size: 128

--- Agent 1 Params ---
Going to train on cpu
Learning Rate:: Actor: 0.0001 | Critic: 0.001
Replay Buffer:: Buffer Size: 100000 | Sampled Batch size: 128


In [37]:
def DDPG_eval(agents, env, max_t=100):
    """Run one evaluation episode, sampling every agent's action from its policy output.

    Args:
        agents: Object whose ``act`` method takes a list with one float tensor
            per agent and returns one 1-D action-probability tensor per agent.
        env: Environment exposing ``reset()``, ``step(actions)`` (returning
            ``(next_states, rewards, dones)``) and ``render()``.
        max_t: Maximum number of environment steps before the episode is cut off.

    Returns:
        None. The episode length and the final cumulative score averaged over
        agents are printed to stdout.
    """
    states = env.reset()                                   # one observation per agent
    # Derive the agent count from the observations rather than from a notebook
    # global, so the function works on a fresh kernel and with any environment.
    cum_scores = np.zeros(len(states))
    scores = []                                            # per-step history of cumulative scores
    episode_length = 0                                     # steps taken before done / cut-off
    for _ in range(max_t):
        observations = [torch.from_numpy(np.array(state)).float() for state in states]
        action_probs = agents.act(observations)

        sampled_actions = []
        for agent_action_prob in action_probs:
            # detach()/cpu() keep the conversion valid even if the policy output
            # still tracks gradients or lives on another device.
            probs = agent_action_prob.detach().cpu().numpy()
            # The number of discrete actions is the length of the distribution.
            sampled_actions.append(np.random.choice(len(probs), p=probs))

        next_states, rewards, dones = env.step(sampled_actions)
        cum_scores += rewards                              # in-place update of the running totals
        # Store a snapshot: appending `cum_scores` itself would alias the same
        # array on every step, leaving the history filled with the final totals.
        scores.append(cum_scores.copy())
        states = next_states                               # roll over states to next time step
        episode_length += 1
        print(env.render(), end='\r')
        ##interactive_plotting(scores)
        if np.any(dones):                                  # exit loop if episode finished
            break
    print('Episode length is {}'.format(episode_length))
    print('Total score (averaged over agents) this episode: {}'.format(np.mean(cum_scores)))

In [38]:
DDPG_eval(agents, env)

[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 22.]
 [ 0.  0.  0.  0. 11.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.]
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 22.]
 [ 0.  0.  0.  0. 11.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.]
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 22.]
 [ 0.  0.  0.  0. 11.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 22.]
 [ 0.  0.  0.  0. 11.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.]
[[ 0.  0.  0.  0.  0