# Iniciar ambiente

## Iniciar Local

In [1]:
import os
# Environment flag for the "local" start-up path of this notebook: False means
# the notebook is not running on Google Colab.
# NOTE(review): presumably later cells branch on this flag — confirm.
isColab = False

## (Sempre) Outras configurações

In [2]:
# # Competition environment
# !pip install --upgrade ceia-soccer-twos > /dev/null 2>&1
# # the Ray version compatible with the provided agent implementation is 1.4.0
# !pip install 'aioredis==1.3.1' > /dev/null 2>&1
# !pip install 'aiohttp==3.7.4' > /dev/null 2>&1
# !pip install 'ray==1.4.0' > /dev/null 2>&1
# !pip install 'ray[rllib]==1.4.0' > /dev/null 2>&1
# !pip install 'ray[tune]==1.4.0' > /dev/null 2>&1
# !pip install torch > /dev/null 2>&1
# !pip install lz4 > /dev/null 2>&1
# !pip install GPUtil > /dev/null 2>&1

# # Dependencies required to record videos
# !apt-get install -y xvfb x11-utils > /dev/null 2>&1
# !pip install 'pyvirtualdisplay==0.2.*' > /dev/null 2>&1
# !pip install tensorboard > /dev/null 2>&1


In [3]:
!pip show ray

Name: ray
Version: 1.4.0
Summary: Ray provides a simple, universal API for building distributed applications.
Home-page: https://github.com/ray-project/ray
Author: Ray Team
Author-email: ray-dev@googlegroups.com
License: Apache 2.0
Location: /home/bruno/anaconda3/envs/soccer-twos/lib/python3.8/site-packages
Requires: pyyaml, py-spy, gpustat, numpy, opencensus, filelock, pydantic, jsonschema, redis, aiohttp-cors, aioredis, colorama, click, grpcio, msgpack, prometheus-client, requests, aiohttp, protobuf
Required-by: 


# Soccer Twos

Como tarefa bônus, experimente com os algoritmos aprendidos no ambiente `soccer_twos`, que será utilizado na competição final deste curso*. Para facilitar, utilize a variação `team_vs_policy` como no laboratório anterior.

<img src="https://raw.githubusercontent.com/bryanoliveira/soccer-twos-env/master/images/screenshot.png" height="400">

> Visualização do ambiente

Este ambiente consiste em um jogo de futebol de carros 2x2, ou seja, o objetivo é marcar um gol no adversário o mais rápido possível. Na variação `team_vs_policy`, seu agente controla um jogador do time azul e joga contra um time aleatório. Mais informações sobre o ambiente podem ser encontradas [no repositório](https://github.com/bryanoliveira/soccer-twos-env) e [na documentação do Unity ml-agents](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md#soccer-twos).


**Sua tarefa é treinar um agente com a interface do Ray apresentada, experimentando com diferentes algoritmos e hiperparâmetros.**


<br>

*A variação utilizada na competição será a `multiagent_player`, mas agentes treinados para `team_vs_policy` podem ser facilmente adaptados. Na seção "Exportando seu agente treinado" o agente "MyDqnSoccerAgent" faz exatamente isso.

## Imports

In [4]:
import gym

import ray
from ray import tune
from ray.tune import ExperimentAnalysis
# from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.agents.callbacks import DefaultCallbacks
# from ray.rllib.utils.typing import PolicyID
# from ray.tune.registry import get_trainable_cls
# from ray.rllib.policy.policy import PolicySpec

import numpy as np
from typing import Any, Dict, List, Union, Optional
from collections import deque
# import pickle
from pprint import pprint

import soccer_twos
from soccer_twos import EnvType

import shutil

## Wrapper

In [5]:
import gym
from typing import Any, Dict, List, Union

from ray.rllib.env.multi_agent_env import MultiAgentEnv
import numpy as np
from collections import deque

# Step budgets used by the reward shaping and the metrics below:
# - MAX_STEPS scales the per-step "after the ball" penalty (1 / MAX_STEPS).
# - MATCH_STEPS is the match length used to extrapolate the goals scored so far
#   in an episode to an estimated number of goals per match.
MAX_STEPS = 1_000
MATCH_STEPS = 5_000


def get_scalar_projection(x, y):
    """Return the scalar projection of ``x`` onto ``y``: ``dot(x, y) / ||y||``.

    The result is signed: positive when ``x`` points along ``y``, negative when
    it points against it.

    Raises:
        AssertionError: if the norm of ``y`` is not greater than 1e-6, since the
            direction of ``y`` is then undefined.
    """
    y_norm = np.linalg.norm(y)
    assert y_norm > 0.000001
    return np.dot(x, y) / y_norm


# The following values were obtained experimentally by running pre-experiments.
# Several other quantities (goal positions, normalisation bounds, ...) are
# derived from them.
min_ball_position_x, max_ball_position_x = \
    -15.563264846801758, 15.682827949523926
min_ball_position_y, max_ball_position_y = -7.08929967880249, 7.223850250244141
min_player_position_x, max_player_position_x = \
    -17.26804542541504, 17.16301727294922
min_player_position_y, max_player_position_y = \
    -7.399587631225586, 7.406457424163818
# The minimum is negative (ball moving away from the goal). A stray second
# minus sign previously turned this lower bound into a positive number.
min_ball_to_goal_avg_velocity, max_ball_to_goal_avg_velocity = \
    -23.366606239568615, 23.749571761530724

max_ball_abs_velocity = 78.25721740722656
# Sentinel starting values for running maxima; nothing in the active code of
# this cell updates them.
max_goals_one_team = -9999999
max_goals_one_match = -9999999
max_steps = -999999

max_diff_reward = -np.inf

# Inferred: largest magnitude of the averaged ball-to-goal velocity, used to
# normalise the speed-based reward term.
max_ball_abs_avg_velocity = max(
    abs(min_ball_to_goal_avg_velocity), abs(max_ball_to_goal_avg_velocity))

# Weight of the ball-to-goal speed term relative to the environment reward.
SPEED_IMPORTANCE = 1.0 / 14.0
# When True, the speed term is clipped to [-SPEED_IMPORTANCE, SPEED_IMPORTANCE].
CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE = True

# Per-step penalty magnitude for a player positioned "after" the ball.
AFTER_BALL_STEP_PENALTY = 1 / MAX_STEPS  # 0.001

# NOTE: this hyperparameter must not be changed without re-measuring
# min_ball_to_goal_avg_velocity and max_ball_to_goal_avg_velocity, because
# those bounds were collected with this averaging window.
AVG_SPEED_TIMESTEPS_WINDOW = 1


def is_after_the_ball(player_id: int, player_pos: np.ndarray, ball_pos: np.ndarray):
    """Tell whether a player is "after" the ball along the x axis.

    Args:
        player_id: Player index. 0 and 1 form one team, 2 and 3 the other.
        player_pos: Player position; only the x component (index 0) is read.
        ball_pos: Ball position; only the x component (index 0) is read.

    Returns:
        For players 0/1, whether the player's x is greater than the ball's x.
        For players 2/3, whether the player's x is smaller than the ball's x.
        ``None`` for any other ``player_id``.
    """
    if player_id in (0, 1):
        return player_pos[0] > ball_pos[0]
    if player_id in (2, 3):
        return player_pos[0] < ball_pos[0]
    # Unknown player ids are not an error here; callers get None.
    return None


def get_center_of_goal_pos(player_id):
    """Return the centre of the goal targeted by ``player_id`` as ``[x, 0.0]``.

    Players 0/1 get the goal at the maximum measured ball x position, players
    2/3 the goal at the minimum measured ball x position. Any other id yields
    ``None``.
    """
    # The module-level bounds are only read, so no ``global`` declaration is needed.
    if player_id in (0, 1):
        goal_x = max_ball_position_x
    elif player_id in (2, 3):
        goal_x = min_ball_position_x
    else:
        return None
    return np.array([goal_x, 0.0])


def calculate_ball_to_goal_scalar_velocity(player_id: int, info: Dict, x_axis_only=True):
    """Return the signed speed of the ball towards the goal of ``player_id``.

    Args:
        player_id: Player index (0/1 or 2/3 select which goal is targeted).
        info: Per-player info dict; reads ``info["ball_info"]["velocity"]`` and,
            in the projection mode, ``info["ball_info"]["position"]``.
        x_axis_only: When True (default), use only the x component of the ball
            velocity: as-is for players 0/1, negated for players 2/3. When
            False, project the ball velocity onto the direction from the ball
            to the centre of the targeted goal.

    Returns:
        A scalar; positive means the ball is moving towards the targeted goal.
    """
    ball_velocity = info["ball_info"]["velocity"]
    if x_axis_only:
        if player_id in [0, 1]:
            return ball_velocity[0]
        if player_id in [2, 3]:
            return -ball_velocity[0]

    # Full 2-D mode (also reached when x_axis_only is set but the id is unknown):
    # project the velocity onto the ball -> goal-centre direction.
    direction_to_center_of_goal = (
        get_center_of_goal_pos(player_id) - info["ball_info"]["position"])
    return get_scalar_projection(ball_velocity, direction_to_center_of_goal)

# print('ball_velocity_to_center_of_goal', calculate_ball_to_goal_scalar_velocity(0, { "ball_info": { "position": np.array([3.0, 2.0]), "velocity": np.array([0.0, 0.0]) }}))


def calculate_distance(pt1: np.ndarray, pt2: np.ndarray):
    """Return the Euclidean distance between two 2-D points.

    Raises:
        AssertionError: if either argument does not have shape ``(2,)``.
    """
    assert pt1.shape == (2,)
    assert pt2.shape == (2,)
    offset = pt1 - pt2
    return np.linalg.norm(offset)


class CustomRewardWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Reward-shaping wrapper for the four-player soccer environment.

    On top of the reward returned by the wrapped environment, the player
    currently credited with the last ball touch receives a term proportional to
    the ball's speed towards the goal that player targets. The wrapper also
    keeps a per-episode scoreboard and publishes episode metrics under
    ``info[player_id]["ep_metrics"]``.

    All per-episode state is created in ``reset``, so ``reset`` must be called
    before the first ``step``.
    """

    def step(self, action: Union[Dict[int, List[Any]], List[Any]]):
        """Step the wrapped env, shape the rewards and attach episode metrics.

        Args:
            action: Mapping ``player_id -> action``. Non-dict actions are not
                supported.

        Returns:
            ``(obs, new_rewards, done, info)`` where ``new_rewards`` maps each
            player id to ``env_reward + ball_to_goal_speed_reward`` and each
            ``info[player_id]`` gains an ``"ep_metrics"`` dict.

        Raises:
            NotImplementedError: if ``action`` is not a dict.
        """
        obs, rewards, done, info = super().step(action)

        # Ball state is read from player 0's info entry.
        ball_pos = info[0]["ball_info"]["position"]
        ball_velocity = info[0]["ball_info"]["velocity"]
        player_positions = [
            info[player_id]["player_info"]["position"] for player_id in range(4)]

        # Refresh self.last_ball_toucher when the ball velocity changed enough
        # to count as a touch.
        if self._was_ball_effective_touched(self.prev_ball_velocity, ball_velocity):
            self._get_ball_toucher(ball_velocity, ball_pos, *player_positions)

        if not isinstance(action, dict):
            raise NotImplementedError('Necessário implementar!')

        # _calculate_reward has side effects (scoreboard update, speed window
        # append), so it must run exactly once per player per step. The total
        # reward is derived from the split components instead of calling it a
        # second time, which would count every goal twice.
        splitted_rets = {
            k: self._calculate_reward(
                rewards[k], k, info[k], splitted_returns=True)
            for k in info.keys()
        }
        new_rewards = {
            k: env_reward + speed_reward
            for k, (env_reward, speed_reward) in splitted_rets.items()
        }

        team_0_goals = self.scoreboard["team_0"]
        team_1_goals = self.scoreboard["team_1"]
        info = {
            i: {
                **info[i],
                "ep_metrics": {
                    "total_timesteps": self.n_step + 1,
                    "total_goals": team_0_goals + team_1_goals,
                    # Players 0/1 belong to team_0, players 2/3 to team_1.
                    "goals_opponent": team_1_goals if i in range(2) else team_0_goals,
                    "goals_in_favor": team_0_goals if i in range(2) else team_1_goals,
                    "team_0_goals": team_0_goals,
                    "team_1_goals": team_1_goals,
                    "episode_ended": done["__all__"],
                    "have_goals": team_0_goals + team_1_goals > 0,
                    "env_reward": splitted_rets[i][0],
                    "ball_to_goal_speed_reward": splitted_rets[i][1],
                }
            } for i in info.keys()
        }

        self.n_step += 1
        self.prev_ball_velocity = ball_velocity.copy()

        return obs, new_rewards, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env and re-initialise all per-episode state."""
        obs = super().reset(**kwargs)
        self.n_step = 0
        self.last_ball_speed_mean_per_player = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}
        # One sliding window of ball-to-goal speeds per player.
        self.ball_speed_deque_per_player = {
            player_id: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW)
            for player_id in range(4)
        }
        self.scoreboard = {"team_0": 0, "team_1": 0}
        self.await_press = False
        self.prev_ball_velocity = np.array([0.0, 0.0])
        # -1 means "no player is currently credited with the last touch".
        self.last_ball_toucher = -1
        return obs

    def _was_ball_effective_touched(self, prev_ball_velocity: np.ndarray, curr_ball_velocity: np.ndarray):
        """Tell whether the ball velocity changed enough to count as a touch.

        A touch (by a player or a wall) is assumed when the velocity change
        exceeds 20% of the previous speed. If the previous speed is practically
        zero, any increase in speed counts.

        Side effect: when the current ball speed is below 1.0 the credited last
        toucher is cleared (set to -1).

        Args:
            prev_ball_velocity (np.ndarray): Ball velocity at the previous step, shape (2,).
            curr_ball_velocity (np.ndarray): Ball velocity at the current step, shape (2,).
        """
        assert prev_ball_velocity.shape == (
            2,) and curr_ball_velocity.shape == (2,)
        relative_change_threshold = 0.2  # 20%
        prev_speed = np.linalg.norm(prev_ball_velocity)
        curr_speed = np.linalg.norm(curr_ball_velocity)

        if curr_speed < 1.0:
            self.last_ball_toucher = -1

        if prev_speed > 0.0000001:
            change = np.linalg.norm(curr_ball_velocity - prev_ball_velocity)
            return change / prev_speed > relative_change_threshold
        return curr_speed > prev_speed

    def _get_ball_toucher(self,
                          ball_velocity: np.ndarray,
                          ball_position: np.ndarray,
                          player_0_pos: np.ndarray,
                          player_1_pos: np.ndarray,
                          player_2_pos: np.ndarray,
                          player_3_pos: np.ndarray):
        """Update and return the id of the player credited with the last touch.

        While the ball is moving, the nearest of the four players and the four
        field walls is determined. If a player is nearest, that player becomes
        the credited toucher; if a wall is nearest, the previous value is kept.

        Returns:
            ``self.last_ball_toucher`` (a player index, or -1 when nobody is
            credited).
        """
        assert ball_position.shape == (2,) and \
            player_0_pos.shape == (2,) and \
            player_1_pos.shape == (2,) and \
            player_2_pos.shape == (2,) and \
            player_3_pos.shape == (2,)
        top_wall_y = max_ball_position_y
        bottom_wall_y = min_ball_position_y
        left_wall_x = min_ball_position_x
        right_wall_x = max_ball_position_x

        if np.linalg.norm(ball_velocity) > 0.000001:
            # Indices 0-3 are the players, 4-7 the walls.
            distances = np.array([
                calculate_distance(ball_position, player_0_pos),
                calculate_distance(ball_position, player_1_pos),
                calculate_distance(ball_position, player_2_pos),
                calculate_distance(ball_position, player_3_pos),
                np.abs(ball_position[1] - top_wall_y),
                np.abs(ball_position[1] - bottom_wall_y),
                np.abs(ball_position[0] - left_wall_x),
                np.abs(ball_position[0] - right_wall_x)
            ])

            nearest = np.argmin(distances)
            if nearest < 4:
                self.last_ball_toucher = nearest

        return self.last_ball_toucher

    def _calculate_reward(self, reward: float, player_id: int, info: Dict, splitted_returns=False) -> Union[float, tuple]:
        """Compute the shaped reward of one player for the current step.

        Side effects: updates the scoreboard when ``reward`` is non-zero and
        appends the current ball-to-goal speed to the player's speed window, so
        this must be called once per player per step.

        Args:
            reward: Reward returned by the wrapped environment for this player.
            player_id: Player index (0-3).
            info: This player's info dict from the wrapped environment.
            splitted_returns: When True, return the components separately.

        Returns:
            ``env_reward + ball_to_goal_speed_reward``, or the tuple
            ``(env_reward, ball_to_goal_speed_reward)`` when
            ``splitted_returns`` is True.
        """
        if reward != 0.0:
            self._update_scoreboard(player_id, reward)

        self._update_avg_ball_speed_to_goal(
            player_id, calculate_ball_to_goal_scalar_velocity(player_id, info))

        env_reward = reward

        # Normalised average speed towards the goal, scaled by its weight.
        ball_to_goal_speed_reward = SPEED_IMPORTANCE * \
            self.last_ball_speed_mean_per_player[player_id] / \
            max_ball_abs_avg_velocity
        if CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE:
            ball_to_goal_speed_reward = np.clip(
                ball_to_goal_speed_reward, -SPEED_IMPORTANCE, SPEED_IMPORTANCE)
        # Only the player credited with the last touch gets the speed term.
        ball_to_goal_speed_reward = (
            player_id == self.last_ball_toucher) * ball_to_goal_speed_reward

        if splitted_returns:
            return (env_reward, ball_to_goal_speed_reward)
        return env_reward + ball_to_goal_speed_reward

    def _update_avg_ball_speed_to_goal(self, player_id: int, ball_speed: float):
        """Push ``ball_speed`` into the player's window and store the window mean."""
        assert player_id in [0, 1, 2, 3]

        self.ball_speed_deque_per_player[player_id].append(ball_speed)
        avg = np.mean(self.ball_speed_deque_per_player[player_id])

        self.last_ball_speed_mean_per_player[player_id] = avg

    def _update_scoreboard(self, player_id, reward):
        """Register a goal on the scoreboard.

        A reward of exactly -1.0 for player 0 counts as a goal for team_1; a
        reward of exactly -1.0 for player 2 counts as a goal for team_0. Every
        other (player, reward) combination is ignored, so a goal is registered
        through a single player of the conceding team.
        """
        if player_id == 0 and reward == -1.0:
            self.scoreboard["team_1"] += 1
        elif player_id == 2 and reward == -1.0:
            self.scoreboard["team_0"] += 1


## Utils

In [6]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Thin gym wrapper that also inherits from RLlib's ``MultiAgentEnv``.

    It adds no behaviour of its own; it only gives the wrapped environment the
    ``MultiAgentEnv`` base class.
    """


def create_rllib_env(env_config: Optional[dict] = None):
    """
    Creates a RLLib environment and prepares it to be instantiated by Ray workers.
    Args:
        env_config: configuration for the environment. ``None`` (the default)
            means an empty configuration. The object passed in is not modified.
            You may specify the following keys:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
            - multiagent: set to False to get the bare environment back instead
              of the RLLibWrapper-wrapped one.
    Returns:
        The environment built by ``soccer_twos.make``, wrapped in
        ``RLLibWrapper`` unless ``multiagent`` is explicitly False.
    """
    if env_config is None:
        env_config = {}

    # Work on a copy so the caller's config is left untouched.
    make_kwargs = dict(env_config)
    if hasattr(env_config, "worker_index"):
        # Configs carrying worker/vector indices get a distinct worker_id per
        # (worker, sub-environment) pair.
        make_kwargs["worker_id"] = (
            env_config.worker_index * env_config.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )
    env = soccer_twos.make(**make_kwargs)
    if "multiagent" in env_config and not env_config["multiagent"]:
        # is multiagent by default, is only disabled if explicitly set to False
        return env
    return RLLibWrapper(env)


def create_custom_env(env_config: dict = {}):
    """Build the RLlib soccer environment and wrap it with ``CustomRewardWrapper``.

    Args:
        env_config: Forwarded unchanged to ``create_rllib_env``.
    """
    return CustomRewardWrapper(create_rllib_env(env_config))

## Callback

In [7]:
import random

import numpy as np
from gym.spaces import Box

from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelWeights


class RandomPolicy(Policy):
    """Non-learning policy whose actions are sampled at random from its action space."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # With "ignore_action_bounds" enabled in the config (default: False)
        # and a Box action space, actions are sampled from an unbounded Box of
        # the same shape and dtype. This is meant for testing action clipping
        # and how an env reacts to out-of-bounds actions.
        ignore_bounds = self.config.get("ignore_action_bounds", False)
        if ignore_bounds and isinstance(self.action_space, Box):
            self.action_space_for_sampling = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype)
        else:
            self.action_space_for_sampling = self.action_space

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        """Return one random action per observation, no state outputs, no extra info."""
        sampled_actions = [
            self.action_space_for_sampling.sample() for _ in obs_batch]
        return sampled_actions, [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(self,
                                actions,
                                obs_batch,
                                state_batches=None,
                                prev_action_batch=None,
                                prev_reward_batch=None):
        """Return an array with a single random value repeated once per observation."""
        value = random.random()
        return np.array([value] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to set."""
        pass

In [8]:
class Alternator:
    """Counter that alternates between 0 and 1 on successive ``step_value`` calls.

    ``value`` is -1 before the first step (and after ``reset``), so the first
    call to ``step_value`` returns 0.
    """

    def __init__(self) -> None:
        self.value = -1

    def reset(self):
        """Go back to the initial, not-yet-stepped state."""
        self.value = -1

    def step_value(self):
        """Advance to the next value in the 0, 1, 0, 1, ... sequence and return it."""
        self.value = (self.value + 1) % 2
        return self.value

In [9]:
# NOTE(review): presumably the win-rate threshold used by the self-play logic —
# confirm against its usage. The name misspells "THRESHOLD"; it is left as-is
# because other cells may reference it.
WIN_RATE_THEWSHOLD = .2

class SelfPlayCallback(DefaultCallbacks):
    def __init__(self):
        super().__init__()


    def on_episode_step(self,
                        *,
                        worker: "RolloutWorker",
                        base_env: BaseEnv,
                        episode: MultiAgentEpisode,
                        env_index: Optional[int] = None,
                        **kwargs) -> None:
        total_timesteps = episode.last_info_for(
            0)["ep_metrics"]["total_timesteps"]
        total_goals = float(episode.last_info_for(0)[
                            "ep_metrics"]["total_goals"])
        estimated_goals_in_match = total_goals * MATCH_STEPS / \
            float(total_timesteps) if total_goals > 0 else 0.0
        timesteps_to_goal = float(
            total_timesteps) if total_goals > 0 else 9999.0

        if not episode.user_data:
            episode.user_data = {
                0: {
                    "total_env_reward": 0.0,
                    "total_ball_to_goal_speed_reward": 0.0,
                    # "total_agent_position_to_ball_reward": 0.0,
                },
                1: {
                    "total_env_reward": 0.0,
                    "total_ball_to_goal_speed_reward": 0.0,
                    # "total_agent_position_to_ball_reward": 0.0,
                },
                2: {
                    "total_env_reward": 0.0,
                    "total_ball_to_goal_speed_reward": 0.0,
                    # "total_agent_position_to_ball_reward": 0.0,
                },
                3: {
                    "total_env_reward": 0.0,
                    "total_ball_to_goal_speed_reward": 0.0,
                    # "total_agent_position_to_ball_reward": 0.0,
                }
            }

        episode.user_data = {
            **episode.user_data,
            0: {
                "total_env_reward": episode.user_data[0]["total_env_reward"] + episode.last_info_for(0)["ep_metrics"]["env_reward"],
                "total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"] + episode.last_info_for(0)["ep_metrics"]["ball_to_goal_speed_reward"],
                # "total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"] + episode.last_info_for(0)["ep_metrics"]["agent_position_to_ball_reward"],
            },
            1: {
                "total_env_reward": episode.user_data[1]["total_env_reward"] + episode.last_info_for(1)["ep_metrics"]["env_reward"],
                "total_ball_to_goal_speed_reward": episode.user_data[1]["total_ball_to_goal_speed_reward"] + episode.last_info_for(1)["ep_metrics"]["ball_to_goal_speed_reward"],
                # "total_agent_position_to_ball_reward": episode.user_data[1]["total_agent_position_to_ball_reward"] + episode.last_info_for(1)["ep_metrics"]["agent_position_to_ball_reward"],
            },
            2: {
                "total_env_reward": episode.user_data[2]["total_env_reward"] + episode.last_info_for(2)["ep_metrics"]["env_reward"],
                "total_ball_to_goal_speed_reward": episode.user_data[2]["total_ball_to_goal_speed_reward"] + episode.last_info_for(2)["ep_metrics"]["ball_to_goal_speed_reward"],
                # "total_agent_position_to_ball_reward": episode.user_data[2]["total_agent_position_to_ball_reward"] + episode.last_info_for(2)["ep_metrics"]["agent_position_to_ball_reward"],
            },
            3: {
                "total_env_reward": episode.user_data[3]["total_env_reward"] + episode.last_info_for(3)["ep_metrics"]["env_reward"],
                "total_ball_to_goal_speed_reward": episode.user_data[3]["total_ball_to_goal_speed_reward"] + episode.last_info_for(3)["ep_metrics"]["ball_to_goal_speed_reward"],
                # "total_agent_position_to_ball_reward": episode.user_data[3]["total_agent_position_to_ball_reward"] + episode.last_info_for(3)["ep_metrics"]["agent_position_to_ball_reward"],
            }
        }

        episode.custom_metrics = {
            # "total_timesteps": total_timesteps,
            # "timesteps_to_goal": timesteps_to_goal,
            # "estimated_goals_in_match": estimated_goals_in_match,
            # "team_0_goals": episode.last_info_for(0)["ep_metrics"]["team_0_goals"],
            # "team_1_goals": episode.last_info_for(0)["ep_metrics"]["team_1_goals"],
            # "have_goals": episode.last_info_for(0)["ep_metrics"]["have_goals"],
            "agent_0_total_env_reward": episode.user_data[0]["total_env_reward"],
            "agent_0_total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"],
            # "agent_0_total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"],
        }

    # def on_episode_end(self,
    #                    *,
    #                    worker: "RolloutWorker",
    #                    base_env: BaseEnv,
    #                    policies: Dict[PolicyID, Policy],
    #                    episode: MultiAgentEpisode,
    #                    env_index: Optional[int] = None,
    #                    **kwargs) -> None:
    #     total_timesteps = episode.last_info_for(
    #         0)["ep_metrics"]["total_timesteps"]
    #     total_goals = float(episode.last_info_for(0)[
    #                         "ep_metrics"]["total_goals"])
    #     estimated_goals_in_match = total_goals * MATCH_STEPS / \
    #         float(total_timesteps) if total_goals > 0 else 0.0
    #     timesteps_to_goal = float(
    #         total_timesteps) if total_goals > 0 else 9999.0

    #     episode.user_data = {
    #         **episode.user_data,
    #         0: {
    #             "total_env_reward": episode.user_data[0]["total_env_reward"] + episode.last_info_for(0)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"] + episode.last_info_for(0)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             # "total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"] + episode.last_info_for(0)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         1: {
    #             "total_env_reward": episode.user_data[1]["total_env_reward"] + episode.last_info_for(1)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[1]["total_ball_to_goal_speed_reward"] + episode.last_info_for(1)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             # "total_agent_position_to_ball_reward": episode.user_data[1]["total_agent_position_to_ball_reward"] + episode.last_info_for(1)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         2: {
    #             "total_env_reward": episode.user_data[2]["total_env_reward"] + episode.last_info_for(2)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[2]["total_ball_to_goal_speed_reward"] + episode.last_info_for(2)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             # "total_agent_position_to_ball_reward": episode.user_data[2]["total_agent_position_to_ball_reward"] + episode.last_info_for(2)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         3: {
    #             "total_env_reward": episode.user_data[3]["total_env_reward"] + episode.last_info_for(3)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[3]["total_ball_to_goal_speed_reward"] + episode.last_info_for(3)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             # "total_agent_position_to_ball_reward": episode.user_data[3]["total_agent_position_to_ball_reward"] + episode.last_info_for(3)["ep_metrics"]["agent_position_to_ball_reward"],
    #         }
    #     }

    #     episode.custom_metrics = {
    #         # "total_timesteps": total_timesteps,
    #         # "timesteps_to_goal": timesteps_to_goal,
    #         # "estimated_goals_in_match": estimated_goals_in_match,
    #         # "team_0_goals": episode.last_info_for(0)["ep_metrics"]["team_0_goals"],
    #         # "team_1_goals": episode.last_info_for(0)["ep_metrics"]["team_1_goals"],
    #         # "have_goals": episode.last_info_for(0)["ep_metrics"]["have_goals"],
    #         "agent_0_total_env_reward": episode.user_data[0]["total_env_reward"],
    #         "agent_0_total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"],
    #         # "agent_0_total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"],
    #     }

    def __init__(self):
        super().__init__()
        # 0=RandomPolicy, 1=1st main policy snapshot,
        # 2=2nd main policy snapshot, etc..
        self.current_opponent = 0

    def on_train_result(self, *, trainer, result, **kwargs):
        # Get the win rate for the train batch.
        # Note that normally, one should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        # print("result", result)
        # print("result[hist_stats]", result["hist_stats"])
        main_rew = result["hist_stats"].pop("policy_main_reward")
        opponent_rew = result["hist_stats"].pop("policy_random_reward")
        # opponent_rew = list(result["hist_stats"].values())[0]
        # print('len(main_rew)', len(main_rew))
        # print("len(opponent_rew)", len(opponent_rew))
        assert len(main_rew) == len(opponent_rew)
        won = 0
        for r_main, r_opponent in zip(main_rew, opponent_rew):
            if r_main > r_opponent:
                won += 1
        win_rate = won / len(main_rew)
        result["win_rate"] = win_rate
        print(f"Iter={trainer.iteration} win-rate={win_rate} -> ", end="")
        # If win rate is good -> Snapshot current policy and play against
        # it next, keeping the snapshot fixed and only improving the "main"
        # policy.
        if win_rate > WIN_RATE_THEWSHOLD:
            self.current_opponent += 1
            new_pol_id = f"main_v{self.current_opponent}"
            print(f"adding new opponent to the mix ({new_pol_id}).")

            # Re-define the mapping function, such that "main" is forced
            # to play against any of the previously played policies
            # (excluding "random").
            alternator = Alternator()
            def new_policy_mapping_fn(agent_id, **kwargs):
                # agent_id = [0|1] -> policy depends on episode ID
                # This way, we make sure that both policies sometimes play
                # (start player) and sometimes agent1 (player to move 2nd).
                selected_pol = "main" if alternator.step_value() == agent_id \
                    else "main_v{}".format(np.random.choice(
                        list(range(1, self.current_opponent + 1))))
                print(f'policy_mapping_fn selected_pol: {selected_pol}\nself.current_opponent: {self.current_opponent}')
                return selected_pol

            # new_policy = trainer.add_policy(
            #     policy_id=new_pol_id,
            #     policy_cls=type(trainer.get_policy("main")),
            #     policy_mapping_fn=policy_mapping_fn,
            # )
           
            trainer.workers.local_worker().policy_config["multiagent"]["policy_mapping_fn"] = new_policy_mapping_fn
            trainer.workers.local_worker().policy_mapping_fn = new_policy_mapping_fn
            
            trainer.workers.local_worker().policy_map[new_pol_id] = trainer.get_policy("main")

            # for r in trainer.workers.remote_workers():
            #     # r.policy_config["multiagent"]["policy_mapping_fn"] = policy_mapping_fn
            #     # r.policy_mapping_fn = policy_mapping_fn
            #     r.policy_map[new_pol_id] = trainer.get_policy("main")
            #     # r.policy_map[new_pol_id].set_state(main_state)

            # Set the weights of the new policy to the main policy.
            # We'll keep training the main policy, whereas `new_pol_id` will
            # remain fixed.
            main_state = trainer.get_policy("main").get_state()
            # new_policy.set_state(main_state)
            trainer.workers.local_worker().policy_map[new_pol_id].set_state(main_state)
            # We need to sync the just copied local weights (from main policy)
            # to all the remote workers as well.
            # trainer.workers.sync_weights()
        else:
            print("not good enough; will keep learning ...")

        # +2 = main + random
        result["league_size"] = self.current_opponent + 2

## Stop

In [10]:
# Stopping criteria for the experiment (passed as `stop=` to tune.run).
# Other criteria that were tried: "time_total_s": 14400 (4h),
# "episodes_total": 10.
stop = dict(
    timesteps_total=15_000_000,  # 15M
    training_iteration=100,
)


## Config


In [11]:
NUM_ENVS_PER_WORKER = 8
# Name under which the environment creator is registered with Tune.
ENVIRONMENT_ID = "Soccer"

ENVIRONMENT_CONFIG = {
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "variation": EnvType.multiagent_player,
}


# A throw-away environment is created only to read the observation and action
# spaces needed by the multi-agent policy specs. It is closed in `finally` so
# that it is released even if reading the spaces fails.
temp_env = create_custom_env(ENVIRONMENT_CONFIG)
try:
    obs_space = temp_env.observation_space
    act_space = temp_env.action_space
finally:
    temp_env.close()

# Module-level toggle shared by every call to `policy_mapping_fn`.
alternator = Alternator()


def policy_mapping_fn(agent_id, **kwargs):
    """Map an agent id to either the "main" or the "random" policy.

    Each call advances the shared ``alternator``; the agent gets "main" when
    the alternator's new value equals ``agent_id`` and "random" otherwise.
    Extra keyword arguments are accepted and ignored. A debug line is printed
    on every call.
    """
    print('chamando policy_mapping_fn original')
    plays_main = alternator.step_value() == agent_id
    if plays_main:
        return "main"
    return "random"

# GPU budget: the driver takes a 1 / (num_workers + 1) share and whatever is
# left of `gpu_count` is divided evenly among the rollout workers.
gpu_count = 1
num_workers = 0
num_gpus_for_driver = 1 / (num_workers + 1)  # Driver GPU
if num_workers > 0:
    num_gpus_per_worker = (gpu_count - num_gpus_for_driver) / num_workers
else:
    # No rollout workers, so nothing to hand out.
    num_gpus_per_worker = 0

# Trainer configuration for the PPO self-play experiment.
config = dict(
    # system settings
    num_gpus=num_gpus_for_driver,
    num_workers=num_workers,
    num_envs_per_worker=NUM_ENVS_PER_WORKER,
    num_cpus_for_driver=8,
    num_cpus_per_worker=0,
    num_gpus_per_worker=num_gpus_per_worker,
    log_level="INFO",
    framework="torch",
    # RL setup
    multiagent=dict(
        policies=dict(
            # Policy spec tuples: (policy class, obs space, action space,
            # per-policy config overrides).
            main=(None, obs_space, act_space, {}),
            random=(RandomPolicy, obs_space, act_space, {}),
        ),
        policy_mapping_fn=policy_mapping_fn,
        # Only "main" is optimised; "random" is never trained.
        policies_to_train=["main"],
    ),
    env=ENVIRONMENT_ID,
    # Shallow copy of the shared environment settings. Optional extras that
    # were tried here: "render": True, "time_scale": 1.
    env_config=dict(ENVIRONMENT_CONFIG),
    callbacks=SelfPlayCallback,
)

[INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0


INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0


[INFO] Connected new brain: SoccerTwos?team=1


INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1


[INFO] Connected new brain: SoccerTwos?team=0


INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


## Run experiment

### Train PPO SelfPlay

In [12]:
def run_experiment():
    """Run the PPO experiment with Tune and report the best trial/checkpoint.

    Initialises Ray, registers the environment creator under
    ``ENVIRONMENT_ID`` and launches ``tune.run`` with the module-level
    ``config`` and ``stop`` settings. The best trial and the best checkpoint
    are both selected by the maximum "episode_reward_mean" and printed.

    Returns:
        Tuple ``(analysis, best_trial, best_checkpoint)``.
    """
    ray.init(ignore_reinit_error=True, include_dashboard=False, num_cpus=8)

    tune.registry.register_env(ENVIRONMENT_ID, create_custom_env)

    # Previously used experiment name: "PPO_multiagent_league".
    # To continue an earlier run, add e.g.
    #   restore="../../ray_results/PPO_selfplay_1/PPO_Soccer_ID/checkpoint_00X/checkpoint-X"
    # or resume=True.
    tune_options = dict(
        num_samples=1,
        name="Teste_14",
        config=config,
        stop=stop,
        checkpoint_freq=1,
        checkpoint_at_end=True,
        local_dir="../../ray_results",
    )
    analysis = tune.run("PPO", **tune_options)

    selection_metric = "episode_reward_mean"
    selection_mode = "max"

    # Best trial according to the selection metric.
    best_trial = analysis.get_best_trial(selection_metric, mode=selection_mode)
    print(best_trial)
    # Best checkpoint of that trial according to the same metric.
    best_checkpoint = analysis.get_best_checkpoint(
        trial=best_trial, metric=selection_metric, mode=selection_mode
    )
    print(best_checkpoint)
    print("Done training")
    return analysis, best_trial, best_checkpoint


run_experiment()


Trial name,status,loc
PPO_Soccer_ed9e4_00000,PENDING,


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m 2021-12-07 21:36:51,240	INFO torch_policy.py:148 -- TorchPolicy (worker=local) running on 1.0 GPU(s).
[2m[36m(pid=17689)[0m 2021-12-07 21:36:53,761	INFO rollout_worker.py:1199 -- Built policy map: {'main': <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x7fc4a2090dc0>, 'random': <__main__.RandomPolicy object at 0x7fc4a212e820>}
[2m[36m(pid=17689)[0m 2021-12-07 21:36:53,761	INFO rollout_worker.py:1200 -- Built preprocessor map: {'main': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fc4a2090b50>, 'random': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fc4a20a2520>}
[2m[36m(pid=17689)[0m 2021-12-07 21:36:53,761	INFO rollout_worker.py:583 -- Built filter map: {'main': <ray.rllib.utils.filter.NoFilter object at 0x7fc4a21485e0>, 'random': <ray.rllib.utils.filter.NoFilter object at 0x7fc4a2090b20>}
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and

[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0
[2m[36m(pid=17689)[0m 2021-12-07 21:36:58,931	INFO rollout_worker.py:723 -- Generating sample batch of size 4000


[2m[36m(pid=17689)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=17689)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=17689)[0m 2021-12-07 21:36:58,967	INFO sampler.py:590 -- Raw obs from env: { 0: { 0: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.187),
[2m[36m(pid=17689)[0m        1: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.194),
[2m[36m(pid=17689)[0m        2: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.193),
[2m[36m(pid=17689)[0m        3: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.192)},
[2m[36m(pid=17689)[0m   1: { 0: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.187),
[2m[36m(pid=17689)[0m        1: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.194),
[2m[36m(pid=17689)[0m        2: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.193),
[2m[36m(pid=17689)[0m        3: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.192)},
[2m[36m(pid=17689)[0m   2: { 0: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.187),
[2m[36m(pid=17689)[0m        1

[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m

[2m[36m(pid=17689)[0m 2021-12-07 21:37:13,152	INFO simple_list_collector.py:659 -- Trajectory fragment after postprocess_trajectory():
[2m[36m(pid=17689)[0m 
[2m[36m(pid=17689)[0m { 0: { 'action_dist_inputs': np.ndarray((255, 9), dtype=float32, min=-0.009, max=0.008, mean=-0.001),
[2m[36m(pid=17689)[0m        'action_logp': np.ndarray((255,), dtype=float32, min=-3.31, max=-3.285, mean=-3.295),
[2m[36m(pid=17689)[0m        'actions': np.ndarray((255, 3), dtype=int64, min=0.0, max=2.0, mean=0.987),
[2m[36m(pid=17689)[0m        'advantages': np.ndarray((255,), dtype=float32, min=0.177, max=0.77, mean=0.351),
[2m[36m(pid=17689)[0m        'agent_index': np.ndarray((255,), dtype=int64, min=0.0, max=0.0, mean=0.0),
[2m[36m(pid=17689)[0m        'dones': np.ndarray((255,), dtype=bool, min=0.0, max=1.0, mean=0.004),
[2m[36m(pid=17689)[0m        'eps_id': np.ndarray((255,), dtype=int64, min=488914350.0, max=488914350.0, mean=488914350.0),
[2m[36m(pid=17689)[0m       

[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original


[2m[36m(pid=17689)[0m 2021-12-07 21:37:26,634	INFO rollout_worker.py:761 -- Completed sample batch:
[2m[36m(pid=17689)[0m 
[2m[36m(pid=17689)[0m { 'count': 4000,
[2m[36m(pid=17689)[0m   'policy_batches': { 'main': { 'action_dist_inputs': np.ndarray((8000, 9), dtype=float32, min=-0.011, max=0.012, mean=-0.001),
[2m[36m(pid=17689)[0m                                 'action_logp': np.ndarray((8000,), dtype=float32, min=-3.31, max=-3.28, mean=-3.296),
[2m[36m(pid=17689)[0m                                 'actions': np.ndarray((8000, 3), dtype=int64, min=0.0, max=2.0, mean=1.004),
[2m[36m(pid=17689)[0m                                 'advantages': np.ndarray((8000,), dtype=float32, min=-1.0, max=0.77, mean=0.017),
[2m[36m(pid=17689)[0m                                 'agent_index': np.ndarray((8000,), dtype=int64, min=0.0, max=1.0, mean=0.5),
[2m[36m(pid=17689)[0m                                 'dones': np.ndarray((8000,), dtype=bool, min=0.0, max=1.0, mean=0.001

[2m[36m(pid=17689)[0m Iter=1 win-rate=0.5 -> adding new opponent to the mix (main_v1).Result for PPO_Soccer_ed9e4_00000:

  agent_timesteps_total: 16000
  custom_metrics:
    agent_0_total_ball_to_goal_speed_reward_max: 0.35569555610097214
    agent_0_total_ball_to_goal_speed_reward_mean: 0.17784777805048607
    agent_0_total_ball_to_goal_speed_reward_min: 0.0
    agent_0_total_env_reward_max: 0.0
    agent_0_total_env_reward_mean: 0.0
    agent_0_total_env_reward_min: 0.0
  date: 2021-12-07_21-37-49
  done: false
  episode_len_mean: 141.5
  episode_media: {}
  episode_reward_max: 0.5345264327538932
  episode_reward_mean: 0.13965102424657916
  episode_reward_min: -0.2552243842607349
  episodes_this_iter: 2
  episodes_total: 2
  experiment_id: f72239e32aca45bbbf67f8edfb25bfdc
  hostname: bruno-odyssey-mint
  info:
    learner:
      main:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.1999999999999999
          cur_lr: 5.000000000000002e-05
        

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_ed9e4_00000,RUNNING,192.168.0.104:17689,1,50.3171,4000,0.139651,0.534526,-0.255224,141.5


[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_ed9e4_00000,RUNNING,192.168.0.104:17689,2,106.22,8000,-0.136589,0.534526,-1.71854,798.889


[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m Iter=3 win-rate=0.5 -> adding new opponent to the mix (main_v3).
Result for PPO_Soccer_ed9e4_00000:
  agent_timesteps_total: 48000
  custom_metrics:
    agent_0_total_ball_to_goal_speed_reward_max: 0.35569555610097214
    agent_0_total_ball_to_goal_speed_reward_mean: 0.04601323519351992
    agent_0_total_ball_to_goal_speed_reward_min: -0.19013650200370572
    agent_0_total_env_reward_max: 0.0
    agent_0_total_env_reward_mean: 0.0
    agent_0_total_env_reward_min: 0.0
  date: 2021-12-07_21-39-42
  done: false
  episode_len_mean: 819.0
  episode_media: {}
  episode_reward_max: 0.5345264327538932
  episode_reward_mean: -0.11782667370758189
  episode_reward_min: -1.7185449999628042
  episodes_this_iter: 1
  episodes_total: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_ed9e4_00000,RUNNING,192.168.0.104:17689,3,163.698,12000,-0.117827,0.534526,-1.71854,819


[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_ed9e4_00000,RUNNING,192.168.0.104:17689,4,220.375,16000,-0.268839,0.534526,-1.71854,802.444


[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original
[2m[36m(pid=17689)[0m chamando policy_mapping_fn original


## Export agent

In [None]:
this_path = os.path.dirname(os.path.realpath("__file__"))
# print('this_path', this_path)


def export_agent(agent_file: str, TRIAL, agent_name="bajai_belzonte", makeZip=False):
    """Write a self-contained agent package under ``<this_path>/agents/<agent_name>``.

    The package holds ``agent.py`` (the given source), an ``__init__.py`` that
    re-exports ``MyRaySoccerAgent``, and a full copy of the trial directory
    placed at the same path it has relative to ``ray_results/``.

    Args:
        agent_file: Python source code written verbatim to ``agent.py``.
        TRIAL: Path of the trial directory to copy; must contain ``ray_results/``.
        agent_name: Name of the package directory created under ``agents``.
        makeZip: When true, also create ``<this_path>/agents/<agent_name>.zip``
            whose single top-level entry is the ``<agent_name>`` directory.

    Raises:
        ValueError: If ``TRIAL`` does not contain ``ray_results/``.
    """
    # Validate the trial path before touching the filesystem, so that a bad
    # path cannot wipe an existing export and then fail.
    trial_parts = TRIAL.split("ray_results/")
    if len(trial_parts) < 2:
        raise ValueError(
            f"TRIAL must contain 'ray_results/' to derive its relative path, got: {TRIAL!r}"
        )
    trial_rel_path = trial_parts[1]

    agents_dir = os.path.join(this_path, "agents")
    agent_path = os.path.join(agents_dir, agent_name)

    # Always start from an empty directory so files from an earlier export
    # cannot leak into this one.
    if os.path.isdir(agent_path):
        shutil.rmtree(agent_path)
    os.makedirs(agent_path)

    # Save the agent class.
    with open(os.path.join(agent_path, "agent.py"), "w") as f:
        f.write(agent_file)

    # Save an __init__ so the directory is an importable Python package.
    with open(os.path.join(agent_path, "__init__.py"), "w") as f:
        f.write("from .agent import MyRaySoccerAgent")

    # Copy the whole trial, including the experiment configuration files.
    print(f"TRIALLL {TRIAL}")
    shutil.copytree(TRIAL, os.path.join(agent_path, trial_rel_path))

    # Pack everything into a .zip file. The archive is written next to the
    # package directory (not inside it) so it never tries to include itself;
    # base_dir keeps "<agent_name>/" as the top-level entry of the archive.
    if makeZip:
        shutil.make_archive(agent_path, "zip", root_dir=agents_dir, base_dir=agent_name)


def get_agent_file_str(ALGORITHM, CHECKPOINT, POLICY_NAME="main"):
    """Render the source code of an ``agent.py`` module for an exported agent.

    Args:
        ALGORITHM: Value embedded as the generated module's ``ALGORITHM`` constant.
        CHECKPOINT: Checkpoint path; only the part after ``ray_results/`` is
            embedded, and the generated module joins it onto its own directory.
        POLICY_NAME: Value embedded as the generated module's ``POLICY_NAME``
            constant.

    Returns:
        The generated module source as a string.
    """
    # Computed up front so the template below only contains plain name
    # placeholders instead of a quoted expression nested inside the f-string.
    checkpoint_rel = CHECKPOINT.split("ray_results/")[1]

    agent_source = f"""
import pickle
import os
from typing import Dict

import gym
import numpy as np
import ray
from ray import tune
from ray.rllib.env.base_env import BaseEnv
from ray.tune.registry import get_trainable_cls

from soccer_twos import AgentInterface

ALGORITHM = "{ALGORITHM}"
CHECKPOINT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 
    "{checkpoint_rel}"
)
POLICY_NAME = "{POLICY_NAME}"


class MyRaySoccerAgent(AgentInterface):
    def __init__(self, env: gym.Env):
        super().__init__()
        ray.init(ignore_reinit_error=True)

        # Load configuration from checkpoint file.
        config_path = ""
        if CHECKPOINT_PATH:
            config_dir = os.path.dirname(CHECKPOINT_PATH)
            config_path = os.path.join(config_dir, "params.pkl")
            # Try parent directory.
            if not os.path.exists(config_path):
                config_path = os.path.join(config_dir, "../params.pkl")

        # Load the config from pickled.
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                config = pickle.load(f)
        else:
            # If no config in given checkpoint -> Error.
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory!"
            )

        # no need for parallelism on evaluation
        config["num_workers"] = 0
        config["num_gpus"] = 0

        # create a dummy env since it's required but we only care about the policy
        tune.registry.register_env("DummyEnv", lambda *_: BaseEnv())
        config["env"] = "DummyEnv"

        # create the Trainer from config
        cls = get_trainable_cls(ALGORITHM)
        agent = cls(env=config["env"], config=config)
        # load state from checkpoint
        agent.restore(CHECKPOINT_PATH)
        # get policy for evaluation
        self.policy = agent.get_policy(POLICY_NAME)

    def act(self, observation: Dict[int, np.ndarray]) -> Dict[int, np.ndarray]:
        actions = {{}}
        for player_id in observation:
            # compute_single_action returns a tuple of (action, action_info, ...)
            # as we only need the action, we discard the other elements
            actions[player_id], *_ = self.policy.compute_single_action(
                observation[player_id]
            )
        return actions

"""
    return agent_source


def getAnalysis(experiment: str):
    """Build an ``ExperimentAnalysis`` for the given experiment.

    Args:
        experiment: Passed unchanged to the ``ExperimentAnalysis`` constructor.

    Returns:
        The constructed ``ExperimentAnalysis`` object.
    """
    analysis = ExperimentAnalysis(experiment)
    return analysis


def export(
    experiment_dir="/home/bruno/Workspace/soccer-tows-player/src/ray_results/Teste_14",
    algorithm="PPO",
):
    """Export the most-trained trial of an experiment as an agent package.

    Selects the trial with the highest ``training_iteration``, then that
    trial's checkpoint with the highest ``training_iteration``, generates the
    agent source for it and hands both to ``export_agent``.

    Args:
        experiment_dir: Experiment results location passed to ``getAnalysis``.
            The default is the machine-specific path this notebook was
            originally run with; pass your own path when running elsewhere.
        algorithm: Algorithm name written into the generated agent source.

    Raises:
        ValueError: If no trial or no checkpoint is found for the experiment.
    """
    analysis = getAnalysis(experiment_dir)

    trial_dir = analysis.get_best_logdir("training_iteration", "max")
    if trial_dir is None:
        # Fail here with a clear message instead of letting None reach the
        # path handling further down.
        raise ValueError(f"No trial found in experiment: {experiment_dir!r}")

    checkpoint = analysis.get_best_checkpoint(
        trial_dir,
        "training_iteration",
        "max",
    )
    if checkpoint is None:
        raise ValueError(f"No checkpoint found for trial: {trial_dir!r}")

    print(trial_dir, checkpoint)
    agent_file = get_agent_file_str(algorithm, checkpoint)
    export_agent(agent_file, trial_dir)


# Run the export with its default settings as soon as this cell is executed.
export()
