# Battle Royale Environment Trainer
This notebook trains Battle Royale agents using the MADDPG (Multi-Agent Deep Deterministic Policy Gradient) algorithm.

## Setup Environment Dependencies

In [1]:
import sys
from gym_unity.envs import UnityEnv

%matplotlib inline

# Report which interpreter this kernel is running on, so that environment
# mix-ups are easy to spot in the saved output.
for line in ("Python version:", sys.version, sys.executable):
    print(line)

# Abort early when the kernel is not running Python 3.
python_major = sys.version_info[0]
if python_major < 3:
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")

Python version:
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
/Users/adhipradhana/anaconda3/envs/unity-battle-royale/bin/python


## Start Environment

In [2]:
# Location of the Unity build to launch.
# Remember to put battle royale environment configuration within the config folder
env_name = "environment/battle-royale"

# Launch the build through the gym wrapper with visual observations disabled
# and multi-agent mode enabled.
# NOTE(review): worker_id presumably has to differ between concurrently
# running environment instances -- confirm before launching several at once.
env = UnityEnv(
    env_name,
    worker_id=2,
    use_visual=False,
    multiagent=True,
)

# print() already applies str() to its argument.
print(env)

INFO:mlagents_envs:Connected new brain:
PlayerBrain?team=0
INFO:gym_unity:4 agents within environment.


<UnityEnv instance>




## Examine Observation Space

In [3]:
# Look up the observation space exposed by the environment and report it.
observation = env.observation_space
observation_report = "Agent observation space type: {}".format(observation)
print(observation_report)

Agent observation space type: Box(42,)


## Examine Action Space

In [4]:
# Look up the action space exposed by the environment and report it.
action = env.action_space
action_report = "Agent action space type: {}".format(action)
print(action_report)

Agent action space type: Box(5,)


## Agents Training
This part shows agent training using the MADDPG algorithm.

### Setup Algorithm Dependencies

In [3]:
from datetime import datetime
import torch
import visdom
import numpy as np

from utils.MADDPG import MADDPG
from utils.RandomProcess import OUNoise

### Setup Algorithm Parameters

In [4]:
# --- Reproducibility ---------------------------------------------------
random_seed = 4966           # fed to torch and numpy in the MADDPG setup cell

# --- Problem dimensions, read from the running environment --------------
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
n_agents = env.number_agents

# --- Training schedule ---------------------------------------------------
n_episode = 10_000           # number of episodes the training loop runs
max_steps = 50_000           # upper bound on environment steps per episode
episodes_before_train = 100  # episode count at which "Training begins..." is printed
checkpoint_episode = 1_000   # the model is saved every this many episodes

# --- Replay buffer / batches (both handed to the MADDPG constructor) ----
buffer_capacity = 1_000_000
batch_size = 1_000

### Setup MADDPG

In [5]:
# Seed NumPy and PyTorch with the configured seed.
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Learner and the noise process that perturbs its actions during training.
maddpg = MADDPG(
    n_agents,
    n_states,
    n_actions,
    batch_size,
    buffer_capacity,
    episodes_before_train,
)
noise = OUNoise(env.action_space)

# Tensor constructor matching the device the learner reports using.
if maddpg.use_cuda:
    FloatTensor = torch.cuda.FloatTensor
else:
    FloatTensor = torch.FloatTensor

# Client for the visdom server that receives the live reward plot.
vis = visdom.Visdom(port=8097)

INFO:visdom:Visdom successfully connected to server


### MADDPG Training

In [8]:
# MADDPG training loop.
#
# For every episode: reset the environment, roll out up to `max_steps` steps
# with noise-perturbed actions, push each transition to the replay memory and
# call `maddpg.update_policy()` after every step. After the episode the total
# and per-agent rewards are sent to visdom, and the model is saved every
# `checkpoint_episode` episodes.

win = None                          # visdom window handle, created on the first episode
current_time = str(datetime.now())  # tags the plot title and the saved checkpoints
reward_record = []                  # total reward per episode, as plain Python floats

print("Exploration begins...")
for i_episode in range(n_episode):
    # Start a fresh episode: stack the per-agent observations into a single
    # float tensor and restart the exploration-noise process.
    obs = torch.from_numpy(np.stack(env.reset())).float()
    noise.reset()

    total_reward = 0.0              # summed over all agents and steps
    rr = np.zeros((n_agents,))      # per-agent reward accumulated over the episode
    for i_step in range(max_steps):
        obs = obs.type(FloatTensor)
        actions = maddpg.select_action(obs).data.cpu()
        # Perturb each agent's action with exploration noise before stepping.
        actions_list = [noise.get_action(action) for action in actions.tolist()]

        obs_, reward, done, _ = env.step(actions_list)

        reward = torch.FloatTensor(reward).type(FloatTensor)
        obs_ = torch.from_numpy(np.stack(obs_)).float()

        # The final allowed step stores None as its successor observation.
        # NOTE(review): an episode that ends early through `done` still stores
        # a non-None successor -- confirm whether the replay memory should
        # receive None in that case as well.
        next_obs = obs_ if i_step != max_steps - 1 else None

        # .item() keeps the running total a Python float instead of a (possibly
        # CUDA) tensor, so it can be logged, recorded and handed to NumPy below.
        total_reward += reward.sum().item()
        rr += reward.cpu().numpy()
        maddpg.memory.push(obs.data, actions, next_obs, reward)

        obs = next_obs

        c_loss, a_loss = maddpg.update_policy()

        # Stop the episode as soon as the first agent reports done.
        # NOTE(review): only done[0] is inspected -- confirm that all agents
        # finish together in this environment.
        if done[0]:
            break

    # The learner's episode counter drives the "Training begins..." message and
    # the checkpoint cadence below.
    maddpg.episode_done += 1
    print("Episode: {}, reward = {}".format(i_episode, total_reward))
    reward_record.append(total_reward)

    if maddpg.episode_done == maddpg.episodes_before_train:
        print("Training begins...")
        print("MADDPG on Battle Royale")

    # One plotted row per episode: the total reward followed by each agent's reward.
    episode_rewards = np.array([np.append(total_reward, rr)])
    if win is None:
        # First episode: create the plot window with its title and legend.
        win = vis.line(X=np.arange(i_episode, i_episode+1),
                       Y=episode_rewards,
                       opts=dict(
                           ylabel="Reward",
                           xlabel="Episode",
                           title="MADDPG on Battle Royale | " + \
                               "Agent: {} | ".format(n_agents) + \
                               "Time: {}\n".format(current_time),
                           # Each agent gets its own numbered legend entry.
                           legend=["Total"] +
                           ["Agent-{}".format(i) for i in range(n_agents)]))
    else:
        # Later episodes: append one point to every existing line.
        vis.line(X=np.array(
            [np.array(i_episode).repeat(n_agents+1)]),
                 Y=episode_rewards,
                 win=win,
                 update="append")

    # save model
    if (maddpg.episode_done % checkpoint_episode == 0):
        maddpg.save(current_time, maddpg.episode_done)

Exploration begins...
Episode: 0, reward = 0.0
Episode: 1, reward = 0.09999999403953552
Episode: 2, reward = 0.0
Episode: 3, reward = 0.10999999195337296
Episode: 4, reward = 0.0
Episode: 5, reward = 0.0
Episode: 6, reward = 0.0
Episode: 7, reward = 0.019999999552965164
Episode: 8, reward = 0.0
Episode: 9, reward = 0.0
Training begins...
MADDPG on Battle Royale
Episode: 10, reward = 0.0
Episode: 11, reward = 0.0
Episode: 12, reward = 0.0
Episode: 13, reward = 0.0
Episode: 14, reward = 0.0
Episode: 15, reward = 0.0
Episode: 16, reward = 0.0
Episode: 17, reward = 0.0
Episode: 18, reward = 0.11999998986721039
Episode: 19, reward = 0.0
Episode: 20, reward = 0.03999999910593033
Episode: 21, reward = 0.04999999701976776
Episode: 22, reward = 0.05999999865889549
Episode: 23, reward = 0.0
Episode: 24, reward = 0.0
Episode: 25, reward = 0.11999999731779099
Episode: 26, reward = 0.0
Episode: 27, reward = 0.0
Episode: 28, reward = 0.0
Episode: 29, reward = 0.019999999552965164
Episode: 30, reward

## Close Environment

In [7]:
# Close the Unity environment that was opened in the "Start Environment" cell.
env.close()

INFO:mlagents_envs:Environment shut down with return code 0.


## Loading Model

In [8]:
import os

# Checkpoint to restore; it is looked up in the `checkpoint` folder under the
# current working directory.
# NOTE(review): the file name is hard-coded to one specific run -- point it
# at the checkpoint you actually want to load.
checkpoint_file = 'Time_2020-03-03_14:30:03.326161_NAgent_4_Episode_15.pth'
path = os.path.join(os.getcwd(), 'checkpoint', checkpoint_file)

maddpg.load(path)