# Iniciar ambiente

## Iniciar Local

In [1]:
import os
isColab = False

## (Sempre) Outras configurações

In [2]:
# Ambiente da competição
!pip install --upgrade ceia-soccer-twos > /dev/null 2>&1
# a versão do ray compatível com a implementação dos agentes disponibilizada é a 1.4.0
!pip install 'aioredis==1.3.1' > /dev/null 2>&1
!pip install 'aiohttp==3.7.4' > /dev/null 2>&1
!pip install 'ray==1.4.0' > /dev/null 2>&1
!pip install 'ray[rllib]==1.4.0' > /dev/null 2>&1
!pip install 'ray[tune]==1.4.0' > /dev/null 2>&1
!pip install torch > /dev/null 2>&1
!pip install lz4 > /dev/null 2>&1
!pip install GPUtil > /dev/null 2>&1

# Dependências necessárias para gravar os vídeos
!apt-get install - y xvfb x11-utils > /dev/null 2>&1
!pip install 'pyvirtualdisplay==0.2.*' > /dev/null 2>&1
!pip install tensorboard > /dev/null 2>&1


# Soccer Twos

Como tarefa bônus, experimente com os algoritmos aprendidos no ambiente `soccer_twos`, que será utilizado na competição final deste curso*. Para facilitar, utilize a variação `team_vs_policy` como no laboratório anterior.

<img src="https://raw.githubusercontent.com/bryanoliveira/soccer-twos-env/master/images/screenshot.png" height="400">

> Visualização do ambiente

Este ambiente consiste em um jogo de futebol de carros 2x2, ou seja, o objetivo é marcar um gol no adversário o mais rápido possível. Na variação `team_vs_policy`, seu agente controla um jogador do time azul e joga contra um time aleatório. Mais informações sobre o ambiente podem ser encontradas [no repositório](https://github.com/bryanoliveira/soccer-twos-env) e [na documentação do Unity ml-agents](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md#soccer-twos).


**Sua tarefa é treinar um agente com a interface do Ray apresentada, experimentando com diferentes algoritmos e hiperparâmetros.**


<br>

*A variação utilizada na competição será a `multiagent_player`, mas agentes treinados para `team_vs_policy` podem ser facilmente adaptados. Na seção "Exportando seu agente treinado" o agente "MyDqnSoccerAgent" faz exatamente isso.

## Imports

In [3]:
import gym

import ray
from ray import tune
from ray.tune import Analysis
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.utils.typing import PolicyID

import numpy as np
from typing import Any, Dict, List, Union, Optional
from collections import deque

import soccer_twos
from soccer_twos import EnvType

import shutil



## Wrapper

In [4]:
def get_scalar_projection(x, y):
    return np.dot(x, y) / np.linalg.norm(y)


# Os seguintes valores foram obtidos experimentalmente executando pré-experimentos
# A partir desses valores vamops derivar vários outros como posições ddos gols etc
min_ball_position_x, max_ball_position_x = - \
    15.563264846801758, 15.682827949523926
min_ball_position_y, max_ball_position_y = -7.08929967880249, 7.223850250244141
min_player_position_x, max_player_position_x = - \
    17.26804542541504, 17.16301727294922
min_player_position_y, max_player_position_y = - \
    7.399587631225586, 7.406457424163818
min_ball_to_goal_avg_velocity, max_ball_to_goal_avg_velocity = - \
    -23.366606239568615, 23.749571761530724
max_goals_one_team = -9999999
max_goals_one_match = -9999999
max_steps = -999999

max_diff_reward = -np.inf

# Infered
max_ball_abs_avg_velocity = max(
    abs(min_ball_to_goal_avg_velocity), abs(max_ball_to_goal_avg_velocity))


SPEED_IMPORTANCE = 1.0
CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE = True

# OBS.: Este hyperparâmetro não pode ser modificado sem fazer novos testes em
# min_ball_to_goal_avg_velocity e
# max_ball_to_goal_avg_velocity:
AVG_SPEED_TIMESTEPS_WINDOW = 50


def get_center_of_goal_pos(player_id):
    global min_ball_position_x, max_ball_position_x, \
        min_ball_position_y, max_ball_position_y, \
        min_player_position_x, max_player_position_x, \
        min_player_position_y, max_player_position_y
    if player_id in [0, 1]:
        return np.array([max_ball_position_x, 0.0])
    elif player_id in [2, 3]:
        return np.array([min_ball_position_x, 0.0])


def calculate_ball_to_goal_scalar_velocity(player_id: int, info: Dict):
    goal_pos = get_center_of_goal_pos(player_id)
    # print(f"goal_pos: {goal_pos}")
    ball_pos = info["ball_info"]["position"]
    # print(f"ball_pos: {ball_pos}")
    direction_to_center_of_goal = goal_pos - ball_pos
    # print(f"direction_to_center_of_goal: {direction_to_center_of_goal}")

    ball_velocity = info["ball_info"]["velocity"]
    # print(f"ball_velocity: {ball_velocity}")
    ball_velocity_to_center_of_goal = get_scalar_projection(
        ball_velocity, direction_to_center_of_goal)
    # print(f"ball_velocity_to_center_of_goal: {ball_velocity_to_center_of_goal}")
    return ball_velocity_to_center_of_goal

# print('ball_velocity_to_center_of_goal', calculate_ball_to_goal_scalar_velocity(0, { "ball_info": { "position": np.array([3.0, 2.0]), "velocity": np.array([0.0, 0.0]) }}))


class CustomRewardWrapper(gym.core.Wrapper, MultiAgentEnv):
    # def __init__(self, env):
    #     gym.Wrapper.__init__(self, env)

    def step(self, action: Union[Dict[int, List[Any]], List[Any]]):
        obs, rewards, done, info = super().step(action)

        # print(info)
        # if rewards[0] > 0.0:
        #     assert False

        if type(action) is dict:
            rewards = {k: self._calculate_reward(
                rewards[k], k, info[k]) for k in info.keys()}
        else:
            raise NotImplementedError('Necessário implementar!')

        info = {
            i: {
                **info[i],
                "ep_metrics": {
                    # "total_timesteps": np.array([0.0008], dtype=np.float32)
                    "total_timesteps": self.n_step + 1,
                    "total_goals": self.scoreboard["team_0"] + self.scoreboard["team_1"],
                    "goals_opponent": self.scoreboard["team_1"] if i in range(2) else self.scoreboard["team_0"],
                    "goals_in_favor": self.scoreboard["team_0"] if i in range(2) else self.scoreboard["team_1"],
                    "team_0_goals": self.scoreboard["team_0"],
                    "team_1_goals": self.scoreboard["team_1"],
                    "episode_ended": done["__all__"],
                    "have_goals": self.scoreboard["team_0"] + self.scoreboard["team_1"] > 0,
                }
            } for i in info.keys()
        }

        # global min_ball_position_x, max_ball_position_x, \
        #     min_ball_position_y, max_ball_position_y, \
        #     min_player_position_x, max_player_position_x, \
        #     min_player_position_y, max_player_position_y, \
        #     max_goals_one_team, max_goals_one_match
        # if done:
        #     print(f'min_ball_position_x: {min_ball_position_x}')
        #     print(f'max_ball_position_x: {max_ball_position_x}')
        #     print(f'min_ball_position_y: {min_ball_position_y}')
        #     print(f'max_ball_position_y: {max_ball_position_y}')
        #     print(f'min_player_position_x: {min_player_position_x}')
        #     print(f'max_player_position_x: {max_player_position_x}')
        #     print(f'min_player_position_y: {min_player_position_y}')
        #     print(f'max_player_position_y: {max_player_position_y}')
        #     print(f'min_ball_to_goal_avg_velocity: {min_ball_to_goal_avg_velocity}')
        #     print(f'max_ball_to_goal_avg_velocity: {max_ball_to_goal_avg_velocity}')
        #     print(f'max_goals_one_team: {max_goals_one_team}')
        #     print(f'max_goals_one_match: {max_goals_one_match}')
        #     print(self.scoreboard)
        #     print(f'Done... last n_step: {self.n_step}')
        #     if self.scoreboard["team_0"] > 0 or self.scoreboard["team_1"] > 0:
        #         input("Press Enter to continue...")

        # global max_steps
        # if done:
        #     if self.n_step + 1 > max_steps:
        #         max_steps = self.n_step + 1
        #     print('max_steps', max_steps)

        # global max_diff_reward
        # if done:
        #     print(f'max_diff_reward: {max_diff_reward}')
        #     print(f'min_ball_to_goal_avg_velocity: {min_ball_to_goal_avg_velocity}')
        #     print(f'max_ball_to_goal_avg_velocity: {max_ball_to_goal_avg_velocity}')

        self.n_step += 1
        return obs, rewards, done, info

    def reset(self, **kwargs):
        obs = super().reset(**kwargs)
        self.n_step = 0
        self.last_ball_speed_mean_per_player = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}
        self.ball_speed_deque_per_player = {0: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            1: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            2: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            3: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW)}
        self.scoreboard = {"team_0": 0, "team_1": 0}
        self.await_press = False
        # print(f'min_ball_to_goal_avg_velocity: {min_ball_to_goal_avg_velocity}')
        # print(f'max_ball_to_goal_avg_velocity: {max_ball_to_goal_avg_velocity}')
        return obs

    def _calculate_reward(self, reward: float, player_id: int, info: Dict) -> float:
        # print('calculating reward')
        if reward != 0.0:
            # print('Goal was made!', reward, info)
            self._update_scoreboard(player_id, reward)
        # global min_ball_position_x, max_ball_position_x, \
        #     min_ball_position_y, max_ball_position_y, \
        #     min_player_position_x, max_player_position_x, \
        #     min_player_position_y, max_player_position_y
        # print(f"info: {info}")
        # if info["ball_info"]["position"][0] < min_ball_position_x:
        #     min_ball_position_x = info["ball_info"]["position"][0]
        # if info["ball_info"]["position"][0] > max_ball_position_x:
        #     max_ball_position_x = info["ball_info"]["position"][0]
        # if info["ball_info"]["position"][1] < min_ball_position_y:
        #     min_ball_position_y = info["ball_info"]["position"][1]
        # if info["ball_info"]["position"][1] > max_ball_position_y:
        #     max_ball_position_y = info["ball_info"]["position"][1]
        # if info["player_info"]["position"][0] < min_player_position_x:
        #     min_player_position_x = info["player_info"]["position"][0]
        # if info["player_info"]["position"][0] > max_player_position_x:
        #     max_player_position_x = info["player_info"]["position"][0]
        # if info["player_info"]["position"][1] < min_player_position_y:
        #     min_player_position_y = info["player_info"]["position"][1]
        # if info["player_info"]["position"][1] > max_player_position_y:
        #     max_player_position_y = info["player_info"]["position"][1]

        self._update_avg_ball_speed_to_goal(
            player_id, calculate_ball_to_goal_scalar_velocity(player_id, info))
        # global max_diff_reward
        # if (np.abs(SPEED_IMPORTANCE * self.last_ball_speed_mean_per_player[player_id] / max_ball_abs_avg_velocity) > max_diff_reward):
        #     max_diff_reward = SPEED_IMPORTANCE * \
        #         self.last_ball_speed_mean_per_player[player_id] / \
        #         max_ball_abs_avg_velocity
        if CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE:
            # print(reward + np.clip(SPEED_IMPORTANCE * self.last_ball_speed_mean_per_player[player_id] / max_ball_abs_avg_velocity, -SPEED_IMPORTANCE, SPEED_IMPORTANCE))
            return reward + np.clip(SPEED_IMPORTANCE * self.last_ball_speed_mean_per_player[player_id] / max_ball_abs_avg_velocity, -SPEED_IMPORTANCE, SPEED_IMPORTANCE)
        return reward + SPEED_IMPORTANCE * self.last_ball_speed_mean_per_player[player_id] / max_ball_abs_avg_velocity

    def _update_avg_ball_speed_to_goal(self, player_id: int, ball_speed: float):
        assert player_id in [0, 1, 2, 3]
        global min_ball_to_goal_avg_velocity, max_ball_to_goal_avg_velocity

        # Getting min/max ball to goal speed forr normalization
        # print(f'player_id: {player_id}')
        # print(f'self.last_ball_speed_mean_per_player: {self.last_ball_speed_mean_per_player}')
        # print(f'self.n_step: {self.n_step}')
        # print(f'ball_speed: {ball_speed}')

        self.ball_speed_deque_per_player[player_id].append(ball_speed)
        avg = np.mean(self.ball_speed_deque_per_player[player_id])
        # if avg < min_ball_to_goal_avg_velocity:
        #     min_ball_to_goal_avg_velocity = avg
        # elif avg > max_ball_to_goal_avg_velocity:
        #     max_ball_to_goal_avg_velocity = avg

        self.last_ball_speed_mean_per_player[player_id] = avg

    def _update_scoreboard(self, player_id, reward):
        global max_goals_one_team, max_goals_one_match

        if player_id == 0 and reward == -1.0:
            self.scoreboard["team_1"] += 1
            # print(self.scoreboard)

            # if self.scoreboard["team_1"] > max_goals_one_team:
            #     max_goals_one_team = self.scoreboard["team_1"]
            # if self.scoreboard["team_0"] + self.scoreboard["team_1"] > max_goals_one_match:
            #     max_goals_one_match = self.scoreboard["team_0"] + \
            #         self.scoreboard["team_1"]
            # if max_goals_one_match > 0:
            #     if not self.await_press:
            #         input("Press Enter to continue...")
            #         self.await_press = True
            #     else:
            #         self.await_press = False
        elif player_id == 2 and reward == -1.0:
            self.scoreboard["team_0"] += 1
            # print(self.scoreboard)

            # if self.scoreboard["team_0"] > max_goals_one_team:
            #     max_goals_one_team = self.scoreboard["team_0"]
            # if self.scoreboard["team_0"] + self.scoreboard["team_1"] > max_goals_one_match:
            #     max_goals_one_match = self.scoreboard["team_0"] + \
            #         self.scoreboard["team_1"]
            # if max_goals_one_match > 0:
            #     if not self.await_press:
            #         input("Press Enter to continue...")
            #         self.await_press = True
            #     else:
            #         self.await_press = False


## Utils

In [5]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """
    A RLLib wrapper so our env can inherit from MultiAgentEnv.
    """

    pass


def create_rllib_env(env_config: dict = {}):
    """
    Creates a RLLib environment and prepares it to be instantiated by Ray workers.
    Args:
        env_config: configuration for the environment.
            You may specify the following keys:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
    """
    if hasattr(env_config, "worker_index"):
        env_config["worker_id"] = (
            env_config.worker_index * env_config.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )
    env = soccer_twos.make(**env_config)
    if "multiagent" in env_config and not env_config["multiagent"]:
        # is multiagent by default, is only disabled if explicitly set to False
        return env
    return RLLibWrapper(env)


def create_custom_env(env_config: dict = {}):
    env = create_rllib_env(env_config)
    return CustomRewardWrapper(env)

## Callback

In [6]:
MAX_STEPS = 1000
MATCH_STEPS = 4000

class Callback(DefaultCallbacks):
    def on_episode_end(self,
                       *,
                       worker: "RolloutWorker",
                       base_env: BaseEnv,
                       policies: Dict[PolicyID, Policy],
                       episode: MultiAgentEpisode,
                       env_index: Optional[int] = None,
                       **kwargs):
        total_timesteps = episode.last_info_for(
            0)["ep_metrics"]["total_timesteps"]
        total_goals = float(episode.last_info_for(0)["ep_metrics"]["total_goals"])
        estimated_goals_in_match = total_goals * MATCH_STEPS / \
            float(total_timesteps) if total_goals > 0 else 0.0
        timesteps_to_goal = float(total_timesteps) if total_goals > 0 else 9999.0
        episode.custom_metrics = {
            "total_timesteps": total_timesteps,
            "timesteps_to_goal": timesteps_to_goal,
            "estimated_goals_in_match": estimated_goals_in_match,
            "team_0_goals": episode.last_info_for(0)["ep_metrics"]["team_0_goals"],
            "team_1_goals": episode.last_info_for(0)["ep_metrics"]["team_1_goals"],
            "have_goals": episode.last_info_for(0)["ep_metrics"]["have_goals"]
        }


## Stop

In [7]:
stop = {
    "timesteps_total": 15000000,  # 15M
    # "time_total_s": 14400, # 4h
    "episodes_total": 2,
}


## Config


In [8]:
# NUM_ENVS_PER_WORKER = 1
NUM_ENVS_PER_WORKER = 4
ENVIRONMENT_ID = "Soccer"

ENVIRONMENT_CONFIG = {
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "variation": EnvType.multiagent_player,
}


temp_env = create_custom_env(ENVIRONMENT_CONFIG)
obs_space = temp_env.observation_space
act_space = temp_env.action_space
temp_env.close()


config = {
    # system settings
    "num_gpus": 1,
    # "num_workers": 3,
    "num_workers": 0,
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "num_cpus_for_driver": 8,
    "num_cpus_per_worker": 1,
    "num_gpus_per_worker": 1,
    "log_level": "INFO",
    "framework": "torch",
    # RL setup
    "multiagent": {
        "policies": {
            "default": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": tune.function(lambda _: "default"),
        "policies_to_train": ["default"],
    },
    "env": ENVIRONMENT_ID,
    "env_config": ENVIRONMENT_CONFIG,
    "callbacks": Callback,
}


[INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0


INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0


[INFO] Connected new brain: SoccerTwos?team=1


INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1


[INFO] Connected new brain: SoccerTwos?team=0


INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


## Run experiment

In [10]:
def run_experiment():
    ray.init(num_cpus=8, include_dashboard=False, ignore_reinit_error=True)

    tune.registry.register_env(ENVIRONMENT_ID, create_custom_env)

    analysis = tune.run(
        "PPO",
        num_samples=1,
        name="PPO_multiplayer_agent_test",
        config=config,
        stop=stop,
        checkpoint_freq=1,
        checkpoint_at_end=True,
        local_dir="../../ray_results",
        # restore="/src/ray_results/PPO_selfplay_1/PPO_Soccer_ID/checkpoint_00X/checkpoint-X",
        # resume=True
    )

    # Gets best trial based on max accuracy across all training iterations.
    best_trial = analysis.get_best_trial("episode_reward_mean", mode="max")
    print(best_trial)
    # Gets best checkpoint for trial based on accuracy.
    best_checkpoint = analysis.get_best_checkpoint(
        trial=best_trial, metric="episode_reward_mean", mode="max"
    )
    print(best_checkpoint)
    print("Done training")

run_experiment()


2021-11-28 19:33:17,802	INFO worker.py:736 -- Calling ray.init() again after it has already been called.


Trial name,status,loc
PPO_Soccer_2ede1_00000,PENDING,


[2m[36m(pid=152509)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0
[2m[36m(pid=152509)[0m 2021-11-28 19:33:21,268	INFO torch_policy.py:148 -- TorchPolicy (worker=local) running on 1 GPU(s).
[2m[36m(pid=152509)[0m 2021-11-28 19:33:23,765	INFO rollout_worker.py:1199 -- Built policy map: {'default': <ray.rllib.policy.policy_template.PPOTorchPolicy object at 0x7fd3b6757d60>}
[2m[36m(pid=152509)[0m 2021-11-28 19:33:23,765	INFO rollout_worker.py:1200 -- Built preprocessor map: {'default': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fd3b6757ac0>}
[2m[36m(pid=152509)[0m 2021-11-28 19:33:23,766	INFO rollout_worker.py:583 -- Built filter map: {'default': <ray.rllib.utils.filter.NoFilter object a

[2m[36m(pid=152509)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m [INFO] Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m [INFO] Connected new brain: SoccerTwos?team=0


[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected to Unity environment with package version 2.1.0-exp.1 and communication version 1.5.0
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=1
[2m[36m(pid=152509)[0m INFO:mlagents_envs.environment:Connected new brain: SoccerTwos?team=0
[2m[36m(pid=152509)[0m 2021-11-28 19:33:26,197	INFO rollout_worker.py:723 -- Generating sample batch of size 800
[2m[36m(pid=152509)[0m 2021-11-28 19:33:26,214	INFO sampler.py:590 -- Raw obs from env: { 0: { 0: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.187),
[2m[36m(pid=152509)[0m        1: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.194),
[2m[36m(pid=152509)[0m        2: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.193),
[2m[36m(pid=152509)[0m        3: np.ndarray((336,), dtype=float32, min=0.0, max=1.0, mean=0.192)},
[2m[36m(pid=152509)[0m   1: { 0: np.ndarray((336,), dtype=float32,

Result for PPO_Soccer_2ede1_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-11-28_19-34-38
  done: true
  episode_len_mean: 673.0
  episode_media: {}
  episode_reward_max: 0.1500322917936024
  episode_reward_mean: -1.2250118028366082
  episode_reward_min: -4.473576203702796
  episodes_this_iter: 5
  episodes_total: 5
  experiment_id: 2d66ce5f505743279a76cfb63b1cf96e
  hostname: bruno-odyssey-mint
  info:
    learner:
      default:
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.2
          cur_lr: 5.0e-05
          entropy: 3.2678589706420897
          entropy_coeff: 0.0
          kl: 0.028255948707461356
          policy_loss: -0.05023411716520786
          total_loss: 1.3303062720298766
          vf_explained_var: 0.09710343182086945
          vf_loss: 1.3748892035484315
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 1
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_2ede1_00000,RUNNING,192.168.0.103:152509,1,71.9286,4000,-1.22501,0.150032,-4.47358,673


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_Soccer_2ede1_00000,TERMINATED,,1,71.9286,4000,-1.22501,0.150032,-4.47358,673


2021-11-28 19:34:40,933	INFO tune.py:549 -- Total run time: 83.13 seconds (80.42 seconds for the tuning loop).


PPO_Soccer_2ede1_00000
/home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiplayer_agent_test/PPO_Soccer_2ede1_00000_0_2021-11-28_19-33-17/checkpoint_000001/checkpoint-1
Done training


## Export agent

In [11]:
this_path = os.path.dirname(os.path.realpath("__file__"))
print('this_path', this_path)

def export_agent(agent_file: str, TRIAL, agent_name="my_ray_soccer_agent", makeZip=False):
    agent_path = os.path.join(f'{this_path}/agents', agent_name)
    os.makedirs(agent_path, exist_ok=True)

    shutil.rmtree(agent_path)
    os.makedirs(agent_path)

    # salva a classe do agente
    with open(os.path.join(agent_path, "agent.py"), "w") as f:
        f.write(agent_file)

    # salva um __init__ para criar o módulo Python
    with open(os.path.join(agent_path, "__init__.py"), "w") as f:
        f.write("from .agent import MyRaySoccerAgent")

    # copia o trial inteiro, incluindo os arquivos de configuração do experimento
    print(f"TRIALLL {TRIAL}")
    shutil.copytree(TRIAL, os.path.join(agent_path, TRIAL.split("ray_results/")[1]), )

    # empacota tudo num arquivo .zip
    if makeZip:
        shutil.make_archive(os.path.join(agent_path, agent_name),
                            "zip", os.path.join(agent_path, agent_name))

def get_agent_file_str(ALGORITHM, CHECKPOINT, POLICY_NAME="default"):
    return f"""
import pickle
import os
from typing import Dict

import gym
import numpy as np
import ray
from ray import tune
from ray.rllib.env.base_env import BaseEnv
from ray.tune.registry import get_trainable_cls

from soccer_twos import AgentInterface

ALGORITHM = "{ALGORITHM}"
CHECKPOINT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 
    "{CHECKPOINT.split("ray_results/")[1]}"
)
POLICY_NAME = "{POLICY_NAME}"


class MyRaySoccerAgent(AgentInterface):
    def __init__(self, env: gym.Env):
        super().__init__()
        ray.init(ignore_reinit_error=True)

        # Load configuration from checkpoint file.
        config_path = ""
        if CHECKPOINT_PATH:
            config_dir = os.path.dirname(CHECKPOINT_PATH)
            config_path = os.path.join(config_dir, "params.pkl")
            # Try parent directory.
            if not os.path.exists(config_path):
                config_path = os.path.join(config_dir, "../params.pkl")

        # Load the config from pickled.
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                config = pickle.load(f)
        else:
            # If no config in given checkpoint -> Error.
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory!"
            )

        # no need for parallelism on evaluation
        config["num_workers"] = 0
        config["num_gpus"] = 0

        # create a dummy env since it's required but we only care about the policy
        tune.registry.register_env("DummyEnv", lambda *_: BaseEnv())
        config["env"] = "DummyEnv"

        # create the Trainer from config
        cls = get_trainable_cls(ALGORITHM)
        agent = cls(env=config["env"], config=config)
        # load state from checkpoint
        agent.restore(CHECKPOINT_PATH)
        # get policy for evaluation
        self.policy = agent.get_policy(POLICY_NAME)

    def act(self, observation: Dict[int, np.ndarray]) -> Dict[int, np.ndarray]:
        actions = {{}}
        for player_id in observation:
            # compute_single_action returns a tuple of (action, action_info, ...)
            # as we only need the action, we discard the other elements
            actions[player_id], *_ = self.policy.compute_single_action(
                observation[player_id]
            )
        return actions

"""

def getAnalysis(experiment: str):
    return Analysis(experiment)

def export():
    # PPO_Soccer_18d23_00000
    # /home/bruno/Workspace/soccer-tows-player/src/ray_results/Testing_env/PPO_Soccer_18d23_00000_0_2021-11-24_20-34-41/checkpoint_000500/checkpoint-500
    analysis = getAnalysis("/home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiplayer_agent_test")

    
    ALGORITHM = "PPO"
    TRIAL = analysis.get_best_logdir("training_iteration", "max")
    CHECKPOINT = analysis.get_best_checkpoint(
        TRIAL,
        "training_iteration",
        "max",
    )

    print(TRIAL, CHECKPOINT)
    agent_file = get_agent_file_str(ALGORITHM, CHECKPOINT)
    export_agent(agent_file, TRIAL)

export()

this_path /home/bruno/Workspace/soccer-tows-player/src/experiments/ppo_multiagent
/home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiplayer_agent_test/PPO_Soccer_2ede1_00000_0_2021-11-28_19-33-17 /home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiplayer_agent_test/PPO_Soccer_2ede1_00000_0_2021-11-28_19-33-17/checkpoint_000001/checkpoint-1
TRIALLL /home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiplayer_agent_test/PPO_Soccer_2ede1_00000_0_2021-11-28_19-33-17
