# Battle Royale Environment Trainer
This notebook is for training Battle Royale agents. MADDPG is used for training the agents.

## Setup Environment Dependencies

In [36]:
import sys
from gym_unity.envs import UnityEnv

print("Python version:")
print(sys.version)
print(sys.executable)

# check Python version
if (sys.version_info[0] < 3):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")

Python version:
3.7.6 (default, Jan  8 2020, 13:42:34) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
/Users/adhipradhana/anaconda3/envs/unity-battle-royale/bin/python


## Start Environment

In [53]:
# Environment name
# Remember to put battle royale environment configuration within the config folder
env_name = "environment/new/battle-royale"

env = UnityEnv(env_name, worker_id=1, use_visual=False, multiagent=True)

print(str(env))

INFO:mlagents_envs:Connected new brain:
PlayerBrain?team=0
INFO:gym_unity:3 agents within environment.


<UnityEnv instance>


## Testing Model

### Setup Algorithm Dependencies

In [54]:
import torch
import numpy as np
import random
import os

from utils.MADDPG import MADDPG

### Model Variables

In [55]:
random_seed = random.randint(0,1000000)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
n_agents = env.number_agents
n_episode = 100
max_steps = 250
buffer_capacity = 1000000
batch_size = 1000
episodes_before_train = 100

In [56]:
# setup seed
torch.manual_seed(random_seed)
np.random.seed(random_seed)

maddpg = MADDPG(n_agents, n_states, n_actions, batch_size, buffer_capacity, episodes_before_train)

FloatTensor = torch.cuda.FloatTensor if maddpg.use_cuda else torch.FloatTensor

## Loading Model

In [49]:
# path = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'normal', 'Time_2020-04-08_08.36.55.014046_NAgent_2', 'Time_2020-04-08_08.36.55.014046_NAgent_2_Episode_5000.pth')
# path = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'aggressive', 'Time_2020-04-08_08.38.06.929797_NAgent_2', 'Time_2020-04-08_08.38.06.929797_NAgent_2_Episode_5000.pth')
# path = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'passive', 'Time_2020-04-09_10.10.56.255609_NAgent_2', 'Time_2020-04-09_10.10.56.255609_NAgent_2_Episode_5000.pth')
# path = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'mix', 'Time_2020-04-09_10.09.14.858613_NAgent_3', 'Time_2020-04-09_10.09.14.858613_NAgent_3_Episode_10000.pth')

maddpg.load(path=path, map_location='cpu')

RuntimeError: Error(s) in loading state_dict for Critic:
	size mismatch for FC1.weight: copying a param with shape torch.Size([1024, 72]) from checkpoint, the shape in current model is torch.Size([1024, 108]).
	size mismatch for FC2.weight: copying a param with shape torch.Size([512, 1034]) from checkpoint, the shape in current model is torch.Size([512, 1039]).

In [None]:
# path_normal = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'normal', 'Time_2020-04-08_08.36.55.014046_NAgent_2', 'Time_2020-04-08_08.36.55.014046_NAgent_2_Episode_5000.pth')
# path_aggressive = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'aggressive', 'Time_2020-04-08_08.38.06.929797_NAgent_2', 'Time_2020-04-08_08.38.06.929797_NAgent_2_Episode_5000.pth')
# path_passive = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'passive', 'Time_2020-04-09_10.10.56.255609_NAgent_2', 'Time_2020-04-09_10.10.56.255609_NAgent_2_Episode_5000.pth')
# path_mix = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'mix', 'Time_2020-04-09_10.09.14.858613_NAgent_3', 'Time_2020-04-09_10.09.14.858613_NAgent_3_Episode_10000.pth')

maddpg.load_all_agent(path=path_normal, model_number=0, map_location='cpu')

In [57]:
path_normal = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'normal', 'Time_2020-04-08_08.36.55.014046_NAgent_2', 'Time_2020-04-08_08.36.55.014046_NAgent_2_Episode_5000.pth')
path_aggressive = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'aggressive', 'Time_2020-04-08_08.38.06.929797_NAgent_2', 'Time_2020-04-08_08.38.06.929797_NAgent_2_Episode_5000.pth')
path_passive = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'passive', 'Time_2020-04-09_10.10.56.255609_NAgent_2', 'Time_2020-04-09_10.10.56.255609_NAgent_2_Episode_5000.pth')
# path_mix = os.path.join(os.getcwd(), 'checkpoint', 'dynamic-new', 'mix', 'Time_2020-04-09_10.09.14.858613_NAgent_3', 'Time_2020-04-09_10.09.14.858613_NAgent_3_Episode_10000.pth')

maddpg.load_agent(path=path_normal, agent_number=0, model_number=1, map_location='cpu')
maddpg.load_agent(path=path_aggressive, agent_number=1, model_number=0, map_location='cpu')
maddpg.load_agent(path=path_passive, agent_number=2, model_number=1, map_location='cpu')

## Run Model

In [58]:
print("Testing model...")
for i_episode in range(n_episode):
    # reset environment
    obs = env.reset()
    obs = np.stack(obs)
    
    # convert observation to tensor
    if isinstance(obs, np.ndarray):
        obs = torch.from_numpy(obs).float()
    
    total_reward = 0.0
    for i_step in range(max_steps):
        obs = obs.type(FloatTensor)
        actions = maddpg.select_action(obs).data.cpu()
        actions_list = actions.tolist()
        
        obs_, reward, done, _ = env.step(actions_list)
        
        reward = torch.FloatTensor(reward).type(FloatTensor)
        obs_ = np.stack(obs_)
        obs_ = torch.from_numpy(obs_).float()
        if i_step != max_steps - 1:
            next_obs = obs_
        else:
            next_obs = None

        total_reward += reward.sum()     
        obs = next_obs

        # check if done
        if True in done:
            print(done)
            break

    maddpg.episode_done += 1
    print("Episode: {}, reward = {}".format(i_episode, total_reward))

Testing model...
[True, True, True]
Episode: 0, reward = 0.44500064849853516
Episode: 1, reward = -1.849999189376831
Episode: 2, reward = -1.9399996995925903
[True, True, True]
Episode: 3, reward = 0.7919999361038208
Episode: 4, reward = -1.0929996967315674
[True, True, True]
Episode: 5, reward = 0.39000141620635986


KeyboardInterrupt: 

## Close Environment

In [59]:
env.close()

INFO:mlagents_envs:Environment shut down with return code 0.
