# Iniciar ambiente

## Iniciar Local

In [None]:
import os
# Set to True when running on Google Colab: the notebook then mounts Google
# Drive and uses the Drive-based result/agent directories instead of the
# local relative ones.
isColab = False

In [None]:
if isColab:
    # Colab only: mount Google Drive under /content/gdrive, which is the root
    # of the Drive-based paths used when isColab is True.
    from google.colab import drive
    drive.mount("/content/gdrive")

In [None]:
import multiprocessing

# Number of CPU cores reported by the OS; later passed to ray.init(num_cpus=...)
# and used to split CPUs between the driver and the rollout workers.
cores = multiprocessing.cpu_count()
# Bare expression: shows the value as the notebook cell output.
cores

## (Sempre) Outras configurações

In [None]:
# # # Ambiente da competição
# !pip install --upgrade ceia-soccer-twos > /dev/null 2>&1
# # # a versão do ray compatível com a implementação dos agentes disponibilizada é a 1.4.0
# !pip install 'aioredis==1.3.1' > /dev/null 2>&1
# !pip install 'aiohttp==3.7.4' > /dev/null 2>&1
# !pip install 'ray==1.4.0' > /dev/null 2>&1
# !pip install 'ray[rllib]==1.4.0' > /dev/null 2>&1
# !pip install 'ray[tune]==1.4.0' > /dev/null 2>&1
# !pip install torch > /dev/null 2>&1
# !pip install lz4 > /dev/null 2>&1
# !pip install GPUtil > /dev/null 2>&1

# # # Dependências necessárias para gravar os vídeos
# # !apt-get install -y xvfb x11-utils > /dev/null 2>&1
# # !pip install 'pyvirtualdisplay==0.2.*' > /dev/null 2>&1
# # !pip install tensorboard > /dev/null 2>&1


In [None]:
# Print the installed Ray package metadata (the install notes above pin ray 1.4.0).
!pip show ray

In [None]:
# Directory where Ray Tune stores experiment results: Google Drive on Colab,
# a relative folder otherwise.
RAY_RESULTS_PATH = "/content/gdrive/MyDrive/minicurso_rl/ray_results" if isColab else '../../ray_results'
# Same path with shell-escape backslashes removed, for use from Python code.
RAY_RESULTS_PYTHON_PATH = RAY_RESULTS_PATH.replace("\\", "")
# os.makedirs(..., exist_ok=True) creates missing parents like `mkdir -p`, but
# works outside IPython and on every platform, and has no check-then-create race.
os.makedirs(RAY_RESULTS_PYTHON_PATH, exist_ok=True)
print(RAY_RESULTS_PATH)

# Directory where exported agent packages are written.
AGENTS_PATH = "/content/gdrive/MyDrive/minicurso_rl/agents" if isColab else './agents'
AGENTS_PATH_PYTHON_PATH = AGENTS_PATH.replace("\\", "")
os.makedirs(AGENTS_PATH_PYTHON_PATH, exist_ok=True)
print(AGENTS_PATH)

# Soccer Twos

Como tarefa bônus, experimente com os algoritmos aprendidos no ambiente `soccer_twos`, que será utilizado na competição final deste curso*. Para facilitar, utilize a variação `team_vs_policy` como no laboratório anterior.

<img src="https://raw.githubusercontent.com/bryanoliveira/soccer-twos-env/master/images/screenshot.png" height="400">

> Visualização do ambiente

Este ambiente consiste em um jogo de futebol de carros 2x2, ou seja, o objetivo é marcar um gol no adversário o mais rápido possível. Na variação `team_vs_policy`, seu agente controla um jogador do time azul e joga contra um time aleatório. Mais informações sobre o ambiente podem ser encontradas [no repositório](https://github.com/bryanoliveira/soccer-twos-env) e [na documentação do Unity ml-agents](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md#soccer-twos).


**Sua tarefa é treinar um agente com a interface do Ray apresentada, experimentando com diferentes algoritmos e hiperparâmetros.**


<br>

*A variação utilizada na competição será a `multiagent_player`, mas agentes treinados para `team_vs_policy` podem ser facilmente adaptados. Na seção "Export agent", o agente "MyRaySoccerAgent" faz exatamente isso.

## Imports

In [None]:
import gym

import ray
from ray import tune
from ray.tune import Analysis
# from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.agents.callbacks import DefaultCallbacks
# from ray.rllib.utils.typing import PolicyID
# from ray.tune.registry import get_trainable_cls
# from ray.rllib.policy.policy import PolicySpec

import numpy as np
from typing import Any, Dict, List, Union, Optional
from collections import deque
# import pickle
import math
from pprint import pprint

import soccer_twos
from soccer_twos import EnvType

import shutil

## Wrapper

In [None]:
import gym
from typing import Any, Dict, List, Union

from ray.rllib.env.multi_agent_env import MultiAgentEnv
import numpy as np
from collections import deque

# Step budgets. MAX_STEPS is the denominator of AFTER_BALL_STEP_PENALTY
# (1 / MAX_STEPS). MATCH_STEPS is not referenced in the visible code;
# presumably the length of a full match in steps -- TODO confirm.
MAX_STEPS = 1_000
MATCH_STEPS = 4_000


def get_scalar_projection(x, y):
    """Return the signed length of the projection of ``x`` onto ``y``.

    ``y`` must not be (numerically) the zero vector; this is enforced with
    an assertion on its norm.
    """
    y_length = np.linalg.norm(y)
    assert y_length > 0.000001
    return np.dot(x, y) / y_length


# The following values were obtained empirically by running preliminary
# experiments. Several other quantities (goal positions, etc.) are derived
# from them.
min_ball_position_x, max_ball_position_x = (
    -15.563264846801758, 15.682827949523926)
min_ball_position_y, max_ball_position_y = -7.08929967880249, 7.223850250244141
min_player_position_x, max_player_position_x = (
    -17.26804542541504, 17.16301727294922)
min_player_position_y, max_player_position_y = (
    -7.399587631225586, 7.406457424163818)
# NOTE: the lower bound is negative. A doubled minus sign ("- -23.36...")
# would silently turn the minimum into a positive number.
min_ball_to_goal_avg_velocity, max_ball_to_goal_avg_velocity = (
    -23.366606239568615, 23.749571761530724)

max_ball_abs_velocity = 78.25721740722656
# Very low starting values; not updated anywhere in the visible code --
# presumably running maxima tracked during the preliminary experiments
# (TODO confirm).
max_goals_one_team = -9999999
max_goals_one_match = -9999999
max_steps = -999999

max_diff_reward = -np.inf

# Inferred: largest magnitude of the averaged ball-to-goal velocity, used to
# normalise the speed reward.
max_ball_abs_avg_velocity = max(
    abs(min_ball_to_goal_avg_velocity), abs(max_ball_to_goal_avg_velocity))


# Scale (and, when clipping is enabled, the absolute bound) of the
# ball-to-goal speed reward.
SPEED_IMPORTANCE = 1.0 / (14.0)
CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE = True

AFTER_BALL_STEP_PENALTY = 1 / MAX_STEPS  # 0.001

# NOTE: this hyperparameter must not be changed without re-measuring
# min_ball_to_goal_avg_velocity and max_ball_to_goal_avg_velocity.
AVG_SPEED_TIMESTEPS_WINDOW = 1


def is_after_the_ball(player_id: int, player_pos: np.array, ball_pos: np.array):
    """Tell whether a player's x coordinate is past the ball's.

    For players 0 and 1 "past" means a larger x than the ball; for players
    2 and 3 it means a smaller x. Any other ``player_id`` yields ``None``.
    """
    if player_id in (0, 1):
        return player_pos[0] > ball_pos[0]
    return player_pos[0] < ball_pos[0] if player_id in (2, 3) else None


def get_center_of_goal_pos(player_id):
    """Return the centre of the goal associated with ``player_id`` as ``[x, 0.0]``.

    Players 0 and 1 map to the goal at ``max_ball_position_x``; players 2
    and 3 map to the one at ``min_ball_position_x``. Any other id yields
    ``None``.
    """
    if player_id in (0, 1):
        goal_x = max_ball_position_x
    elif player_id in (2, 3):
        goal_x = min_ball_position_x
    else:
        return None
    return np.array([goal_x, 0.0])


def calculate_ball_to_goal_scalar_velocity(player_id: int, info: Dict, x_axis_only=True):
    """Return the ball's scalar velocity toward the goal of ``player_id``.

    Args:
        player_id: Agent id (0-3).
        info: Per-agent info dict; reads ``info["ball_info"]["velocity"]``
            and, for the full projection, ``info["ball_info"]["position"]``.
        x_axis_only: When True (default) only the x component of the ball
            velocity is used: as is for players 0/1, negated for players 2/3.

    Returns:
        The x-axis velocity (sign-adjusted), or otherwise the projection of
        the ball velocity onto the direction from the ball to the goal centre.
    """
    ball_velocity = info["ball_info"]["velocity"]
    if x_axis_only:
        if player_id in [0, 1]:
            return ball_velocity[0]
        if player_id in [2, 3]:
            return -ball_velocity[0]

    # Full 2-D case: project the velocity onto the ball -> goal-centre direction.
    to_goal_centre = get_center_of_goal_pos(player_id) - info["ball_info"]["position"]
    return get_scalar_projection(ball_velocity, to_goal_centre)


def calculate_distance(pt1: np.ndarray, pt2: np.ndarray):
    """Return the Euclidean distance between two 2-D points.

    Both arguments must have shape ``(2,)``; this is checked with an assertion.
    """
    point_shape = (2,)
    assert pt1.shape == point_shape and pt2.shape == point_shape
    offset = pt1 - pt2
    return np.linalg.norm(offset)


class CustomRewardWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Reward-shaping wrapper for the four-agent soccer environment.

    On top of the environment reward, each agent receives a bonus/penalty
    proportional to the ball's velocity toward its goal, but only while that
    agent is the one recorded as the last to touch the ball. The wrapper also
    keeps a per-episode scoreboard and publishes diagnostics under
    ``info[agent_id]["ep_metrics"]``.

    Per-episode state is created in :meth:`reset`, so ``reset`` must be called
    before the first ``step``.
    """

    def step(self, action: Union[Dict[int, List[Any]], List[Any]]):
        """Step the wrapped env and return shaped rewards plus episode metrics.

        Args:
            action: Mapping from agent id to that agent's action. Non-dict
                actions are not supported.

        Returns:
            ``(obs, rewards, done, info)`` where ``rewards`` holds the shaped
            reward per agent and each ``info[agent_id]`` gains an
            ``"ep_metrics"`` entry.

        Raises:
            NotImplementedError: If ``action`` is not a dict.
        """
        obs, rewards, done, info = super().step(action)

        ball_pos = info[0]["ball_info"]["position"]
        ball_velocity = info[0]["ball_info"]["velocity"]
        player0_pos = info[0]["player_info"]["position"]
        player1_pos = info[1]["player_info"]["position"]
        player2_pos = info[2]["player_info"]["position"]
        player3_pos = info[3]["player_info"]["position"]

        if self._was_ball_effective_touched(self.prev_ball_velocity, ball_velocity):
            self._get_ball_toucher(
                ball_velocity, ball_pos, player0_pos, player1_pos, player2_pos, player3_pos)

        if not isinstance(action, dict):
            raise NotImplementedError('Necessário implementar!')

        # _calculate_reward has side effects (scoreboard and speed history),
        # so it must run exactly once per agent per step. Calling it a second
        # time to obtain the split components would count every goal twice.
        splitted_rets = {k: self._calculate_reward(
            rewards[k], k, info[k], splitted_returns=True) for k in info.keys()}
        new_rewards = {k: env_part + speed_part
                       for k, (env_part, speed_part) in splitted_rets.items()}

        info = {
            i: {
                **info[i],
                "ep_metrics": {
                    "total_timesteps": self.n_step + 1,
                    "total_goals": self.scoreboard["team_0"] + self.scoreboard["team_1"],
                    "goals_opponent": self.scoreboard["team_1"] if i in range(2) else self.scoreboard["team_0"],
                    "goals_in_favor": self.scoreboard["team_0"] if i in range(2) else self.scoreboard["team_1"],
                    "team_0_goals": self.scoreboard["team_0"],
                    "team_1_goals": self.scoreboard["team_1"],
                    "episode_ended": done["__all__"],
                    "have_goals": self.scoreboard["team_0"] + self.scoreboard["team_1"] > 0,
                    "env_reward": splitted_rets[i][0],
                    "ball_to_goal_speed_reward": splitted_rets[i][1],
                }
            } for i in info.keys()
        }

        self.n_step += 1
        # Copy so later in-place changes by the env cannot alter the snapshot.
        self.prev_ball_velocity = ball_velocity.copy()

        return obs, new_rewards, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env and clear all per-episode bookkeeping."""
        obs = super().reset(**kwargs)
        self.n_step = 0
        self.last_ball_speed_mean_per_player = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0}
        self.ball_speed_deque_per_player = {0: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            1: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            2: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW),
                                            3: deque(maxlen=AVG_SPEED_TIMESTEPS_WINDOW)}
        self.scoreboard = {"team_0": 0, "team_1": 0}
        self.await_press = False
        self.prev_ball_velocity = np.array([0.0, 0.0])
        # -1 means "nobody": no player is currently credited with the ball.
        self.last_ball_toucher = -1

        return obs

    def _was_ball_effective_touched(self, prev_ball_velocity: np.ndarray, curr_ball_velocity: np.ndarray):
        """Decide whether the ball was touched (by a player or a wall).

        A touch is reported when the velocity changed by more than 20% of the
        previous speed. If the previous speed is (numerically) zero, a touch
        is reported when the current speed is larger than the previous one.

        Side effect: when the current ball speed is below 1.0 the recorded
        last toucher is cleared (set to -1).

        Args:
            prev_ball_velocity (np.ndarray): Ball velocity at the previous step, shape (2,).
            curr_ball_velocity (np.ndarray): Ball velocity at the current step, shape (2,).

        Returns:
            Truthy when a touch is detected.
        """
        assert prev_ball_velocity.shape == (
            2,) and curr_ball_velocity.shape == (2,)
        percentual_scalar_thresold = 0.2  # 20%
        diff = curr_ball_velocity - prev_ball_velocity

        if np.linalg.norm(curr_ball_velocity) < 1.0:
            self.last_ball_toucher = -1

        if np.linalg.norm(prev_ball_velocity) > 0.0000001:
            return np.linalg.norm(diff) / np.linalg.norm(prev_ball_velocity) > percentual_scalar_thresold
        return np.linalg.norm(curr_ball_velocity) > np.linalg.norm(prev_ball_velocity)

    def _get_ball_toucher(self,
                          ball_velocity: np.ndarray,
                          ball_position: np.ndarray,
                          player_0_pos: np.ndarray,
                          player_1_pos: np.ndarray,
                          player_2_pos: np.ndarray,
                          player_3_pos: np.ndarray):
        """Update and return the id of the player credited with the last touch.

        While the ball is moving, the nearest object among the four players
        and the four walls (taken as the measured min/max ball coordinates)
        is looked up. If a player is nearest, it becomes the last toucher;
        if a wall is nearest, the previous value is kept.

        Returns:
            The current ``self.last_ball_toucher`` (-1 when nobody is credited).
        """
        assert ball_position.shape == (2,) and \
            player_0_pos.shape == (2,) and \
            player_1_pos.shape == (2,) and \
            player_2_pos.shape == (2,) and \
            player_3_pos.shape == (2,)
        top_wall_y = max_ball_position_y
        bottom_wall_y = min_ball_position_y
        left_wall_x = min_ball_position_x
        right_wall_x = max_ball_position_x

        if np.linalg.norm(ball_velocity) > 0.000001:
            # Indices 0-3 are the players, 4-7 the walls.
            distances = np.array([
                calculate_distance(ball_position, player_0_pos),
                calculate_distance(ball_position, player_1_pos),
                calculate_distance(ball_position, player_2_pos),
                calculate_distance(ball_position, player_3_pos),
                np.abs(ball_position[1] - top_wall_y),
                np.abs(ball_position[1] - bottom_wall_y),
                np.abs(ball_position[0] - left_wall_x),
                np.abs(ball_position[0] - right_wall_x)
            ])

            nearest = np.argmin(distances)
            if nearest < 4:
                self.last_ball_toucher = nearest

        return self.last_ball_toucher

    def _calculate_reward(self, reward: float, player_id: int, info: Dict, splitted_returns=False) -> Union[float, tuple]:
        """Compute the shaped reward for one agent at the current step.

        Side effects: updates the scoreboard when ``reward`` is non-zero and
        pushes the current ball-to-goal speed into this player's history, so
        it must be called once per agent per step.

        Args:
            reward: Reward produced by the wrapped environment.
            player_id: Agent id (0-3).
            info: The agent's info dict from the wrapped environment.
            splitted_returns: When True, return the two components
                separately instead of their sum.

        Returns:
            ``env_reward + speed_reward``, or the tuple
            ``(env_reward, speed_reward)`` when ``splitted_returns`` is True.
        """
        if reward != 0.0:
            self._update_scoreboard(player_id, reward)

        self._update_avg_ball_speed_to_goal(
            player_id, calculate_ball_to_goal_scalar_velocity(player_id, info))

        env_reward = reward

        # Normalised averaged ball speed toward the goal, scaled by SPEED_IMPORTANCE.
        ball_to_goal_speed_reward = SPEED_IMPORTANCE * \
            self.last_ball_speed_mean_per_player[player_id] / max_ball_abs_avg_velocity
        if CLIP_SPEED_REWARD_BY_SPEED_IMPORTANCE:
            ball_to_goal_speed_reward = np.clip(
                ball_to_goal_speed_reward, -SPEED_IMPORTANCE, SPEED_IMPORTANCE)
        # Only the player credited with the last touch gets the speed term.
        ball_to_goal_speed_reward = (
            player_id == self.last_ball_toucher) * ball_to_goal_speed_reward

        if splitted_returns:
            return (env_reward, ball_to_goal_speed_reward)
        return env_reward + ball_to_goal_speed_reward

    def _update_avg_ball_speed_to_goal(self, player_id: int, ball_speed: float):
        """Append ``ball_speed`` to the player's window and refresh its mean."""
        assert player_id in [0, 1, 2, 3]

        self.ball_speed_deque_per_player[player_id].append(ball_speed)
        avg = np.mean(self.ball_speed_deque_per_player[player_id])

        self.last_ball_speed_mean_per_player[player_id] = avg

    def _update_scoreboard(self, player_id, reward):
        """Register a goal from the conceding side's reward signal.

        Only players 0 and 2 are inspected (one per team), so each goal is
        counted once. A reward of exactly -1.0 is treated as "this team
        conceded" -- assumes the wrapped env emits -1.0 on a conceded goal;
        TODO confirm.
        """
        if player_id == 0 and reward == -1.0:
            self.scoreboard["team_1"] += 1

        elif player_id == 2 and reward == -1.0:
            self.scoreboard["team_0"] += 1

## Utils

In [None]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Thin gym wrapper whose only purpose is to make the env a MultiAgentEnv.

    It adds no behaviour of its own; it exists so that RLlib recognises the
    wrapped environment as multi-agent.
    """


def create_rllib_env(env_config: dict = {}):
    """Build a soccer_twos environment suitable for Ray workers.

    Args:
        env_config: Environment configuration, forwarded as keyword
            arguments to ``soccer_twos.make``. Recognised keys include:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
            - multiagent: set to False to get the raw env without the RLlib wrapper.

    Returns:
        The raw environment when ``multiagent`` is explicitly False,
        otherwise the environment wrapped in ``RLLibWrapper``.
    """
    # A config object exposing ``worker_index`` also carries ``vector_index``;
    # combine both into a distinct ``worker_id`` for each env instance.
    came_from_worker = hasattr(env_config, "worker_index")
    if came_from_worker:
        env_config["worker_id"] = (
            env_config.worker_index * env_config.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )

    env = soccer_twos.make(**env_config)

    # Multi-agent is the default; it is only disabled when explicitly set to False.
    multiagent_disabled = "multiagent" in env_config and not env_config["multiagent"]
    return env if multiagent_disabled else RLLibWrapper(env)


def create_custom_env(env_config: dict = {}):
    """Create the RLlib soccer env and wrap it with ``CustomRewardWrapper``."""
    return CustomRewardWrapper(create_rllib_env(env_config))

## Callback

In [None]:
import random

import numpy as np
from gym.spaces import Box

from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelWeights


class RandomPolicy(Policy):
    """Hand-coded policy that returns random actions."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Optionally ignore the bounds of the action space when sampling
        # (config key "ignore_action_bounds", default: False). This is meant
        # for testing action-clipping and an env's reaction to out-of-bounds
        # actions. It only applies to Box action spaces.
        sample_unbounded = self.config.get("ignore_action_bounds", False) and \
            isinstance(self.action_space, Box)
        if sample_unbounded:
            self.action_space_for_sampling = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype)
        else:
            self.action_space_for_sampling = self.action_space

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        """Sample one random action per observation in ``obs_batch``.

        Returns:
            ``(actions, state_outs, extra_info)`` with empty state/extra parts.
        """
        # A numpy array would work here as well,
        # e.g.: np.array([random.choice([0, 1])] * len(obs_batch))
        sampled_actions = [
            self.action_space_for_sampling.sample() for _ in obs_batch]
        return sampled_actions, [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """No learning."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(self,
                                actions,
                                obs_batch,
                                state_batches=None,
                                prev_action_batch=None,
                                prev_reward_batch=None):
        """Return one random value repeated once per observation."""
        shared_value = random.random()
        return np.array([shared_value] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """No weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """No weights to set."""
        pass

In [None]:
class OpponentSelector:
    """Pick which policy id the opponent team should use."""

    def __init__(self):
        self.opponent_policy = 'random'

    def choose_new_policy(self, n_opponents):
        """Select and remember a new opponent policy id.

        With no snapshots available the opponent is ``'random'``; otherwise a
        snapshot ``main_v1 .. main_v<n_opponents>`` is drawn uniformly. A
        negative ``n_opponents`` leaves the previous choice untouched.

        Returns:
            The selected policy id.
        """
        if n_opponents > 0:
            snapshot_numbers = list(range(1, n_opponents + 1))
            self.opponent_policy = f'main_v{np.random.choice(snapshot_numbers)}'
        elif n_opponents == 0:
            self.opponent_policy = 'random'
        return self.opponent_policy
    

In [None]:
# Maximum number of frozen "main_v<N>" snapshots kept in the league.
MAX_POLICIES = 40


@ray.remote
class PolicyHandler:
    """Ray actor holding the self-play league state shared by all workers.

    It tracks how many opponent snapshots exist, which side the "main"
    policy currently plays on, and which opponent policy is in use.
    """

    def __init__(self) -> None:
        self.n_opponents = 0
        self.policies = ['main', 'random']
        self.opponnent_selector = OpponentSelector()
        self.main_left_side = True
        self.opponent_policy = 'random'
        # Number of the snapshot slot written most recently (0 = none yet).
        self.last_policy_n = 0

    def add_policy(self):
        """Add a new Policy to handler

        The number of available opponents grows by one up to MAX_POLICIES.
        The slot to (over)write cycles through 1..MAX_POLICIES, so once the
        league is full the oldest slots are reused.

        Returns:
            str: policy_id of the policy updated
        """
        self.n_opponents = min(self.n_opponents + 1, MAX_POLICIES)
        print(f'PolicyHandler add_policy: {self.n_opponents}')
        pol_id = f'main_v{self.n_opponents}'
        if pol_id not in self.policies:
            self.policies.append(pol_id)
        print(f'Policies: {self.policies}')
        self.last_policy_n %= MAX_POLICIES
        self.last_policy_n += 1
        return f'main_v{self.last_policy_n}'

    def _step_agent(self, agent_id):
        """Swap sides and draw a new opponent whenever agent 0 is mapped.

        NOTE(review): this relies on agent 0 being mapped once per episode and
        before the other agents -- confirm against RLlib's call order.
        """
        if agent_id == 0:
            self.main_left_side = not self.main_left_side
            self.opponent_policy = self.opponnent_selector.choose_new_policy(self.n_opponents)

    def _mapping_fn(self, agent_id):
        """Return the policy id that should control ``agent_id``.

        Agents 0/1 form one side and agents 2/3 the other. The side flagged
        by ``main_left_side`` is driven by "main", the other by the current
        opponent policy.

        Raises:
            ValueError: If ``agent_id`` is not one of 0, 1, 2, 3.
        """
        self._step_agent(agent_id)

        is_first_pair = agent_id in [0, 1]
        if not is_first_pair and agent_id not in [2, 3]:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(f'Unknown agent_id for policy mapping: {agent_id!r}')

        if is_first_pair == self.main_left_side:
            selected_pol = "main"
        else:
            selected_pol = self.opponent_policy

        print(f'_mapping_fn agent_id: {agent_id}, n_oppoonents: {self.n_opponents}')
        print(f'policy_mapping_fn selected_pol: {selected_pol}')
        return selected_pol


# Start the local Ray runtime with the detected core count; the dashboard is
# disabled and re-running the cell does not fail (ignore_reinit_error).
ray.init(num_cpus=cores, include_dashboard=False, ignore_reinit_error=True)
# Create the PolicyHandler as a *named* actor so other code can look it up
# with ray.get_actor('policy_handler').
ph_actor = PolicyHandler.options(name='policy_handler').remote()

In [None]:
# Win rate above which the current "main" policy is snapshotted into the league.
WIN_RATE_THEWSHOLD = .7


class SelfPlayCallback(DefaultCallbacks):
    """RLlib callbacks for league-based self-play.

    * ``on_episode_step`` accumulates the per-agent reward components that the
      env publishes under ``info["ep_metrics"]``.
    * ``on_train_result`` computes the main policy's win rate and, when it is
      high enough, freezes a copy of it as a new opponent.
    """

    def __init__(self):
        super().__init__()
        # 0=RandomPolicy, 1=1st main policy snapshot,
        # 2=2nd main policy snapshot, etc..
        self.current_opponent = 0

    def on_episode_step(self,
                        *,
                        worker: "RolloutWorker",
                        base_env: BaseEnv,
                        episode: MultiAgentEpisode,
                        env_index: Optional[int] = None,
                        **kwargs) -> None:
        """Accumulate each agent's reward components for the running episode.

        Totals are kept in ``episode.user_data[agent_id]``; agent 0's totals
        are additionally exposed through ``episode.custom_metrics``.
        """
        agent_ids = range(4)
        # user_data total key -> key of the per-step value inside "ep_metrics".
        tracked = {
            "total_env_reward": "env_reward",
            "total_ball_to_goal_speed_reward": "ball_to_goal_speed_reward",
        }

        if not episode.user_data:
            episode.user_data = {
                agent_id: dict.fromkeys(tracked, 0.0) for agent_id in agent_ids
            }

        updated_totals = {}
        for agent_id in agent_ids:
            step_metrics = episode.last_info_for(agent_id)["ep_metrics"]
            running = episode.user_data[agent_id]
            updated_totals[agent_id] = {
                total_key: running[total_key] + step_metrics[metric_key]
                for total_key, metric_key in tracked.items()
            }
        episode.user_data = {**episode.user_data, **updated_totals}

        episode.custom_metrics = {
            "agent_0_total_env_reward": episode.user_data[0]["total_env_reward"],
            "agent_0_total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"],
        }

    def on_train_result(self, *, trainer, result, **kwargs):
        """Compute the win rate and grow the league when it is good enough.

        Adds ``win_rate`` and ``league_size`` to ``result``. Note that the
        entries ``policy_main_reward``, ``episode_reward`` and
        ``episode_lengths`` are removed from ``result["hist_stats"]``.
        """
        # Get the win rate for the train batch.
        # Note that normally, one should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        main_rew = result["hist_stats"].pop("policy_main_reward")

        result["hist_stats"].pop('episode_reward')
        result["hist_stats"].pop('episode_lengths')

        # Everything left in hist_stats is treated as an opponent reward list.
        # NOTE(review): the lists are concatenated per policy, so the pairing
        # with main_rew below may not be episode-aligned when several
        # opponent policies were used -- confirm.
        opponent_rew = []
        for rew_list in result["hist_stats"].values():
            opponent_rew += rew_list

        assert len(main_rew) == len(opponent_rew)
        won = sum(
            1 for r_main, r_opponent in zip(main_rew, opponent_rew)
            if r_main > r_opponent
        )
        win_rate = won / len(main_rew)
        result["win_rate"] = win_rate
        print(f"Iter={trainer.iteration} win-rate={win_rate} -> ", end="")
        # If win rate is good -> Snapshot current policy and play against
        # it next, keeping the snapshot fixed and only improving the "main"
        # policy.
        if win_rate > WIN_RATE_THEWSHOLD:
            self.current_opponent %= MAX_POLICIES
            self.current_opponent += 1
            new_pol_id = ray.get(ph_actor.add_policy.remote())
            print(f"adding new opponent to the mix ({new_pol_id}).")

            # Set the weights of the new policy to the main policy.
            # We'll keep training the main policy, whereas `new_pol_id` will
            # remain fixed.
            main_state = trainer.get_policy("main").get_state()
            trainer.get_policy(new_pol_id).set_state(main_state)
            # We need to sync the just copied local weights (from main policy)
            # to all the remote workers as well.
            trainer.workers.sync_weights()
        else:
            print("not good enough; will keep learning ...")

        # +2 = main + random
        result["league_size"] = self.current_opponent + 2

## Stop

In [None]:
# Stopping criteria handed to tune.run(stop=...). Only the timestep budget is
# active; the commented entries are alternative criteria.
stop = {
    "timesteps_total": 15_000_000,  # 15M
    # "time_total_s": 14400, # 4h
    # "episodes_total": 10,
    # "training_iteration": 1,
}


## Config


In [None]:
# Number of environment copies each rollout worker runs.
NUM_ENVS_PER_WORKER = 4
# Name under which the custom environment is registered with Tune.
ENVIRONMENT_ID = "Soccer"

ENVIRONMENT_CONFIG = {
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "variation": EnvType.multiagent_player,
}


# Create a throw-away environment only to read its observation/action spaces.
temp_env = create_custom_env(ENVIRONMENT_CONFIG)
try:
    obs_space = temp_env.observation_space
    act_space = temp_env.action_space
finally:
    # Always release the environment, even if reading the spaces fails.
    temp_env.close()

def policy_mapping_fn(agent_id):
    """Ask the named 'policy_handler' actor which policy controls ``agent_id``.

    The lookup is done by name on every call and the remote result is
    awaited synchronously.
    """
    handler = ray.get_actor('policy_handler')
    pending_policy_id = handler._mapping_fn.remote(agent_id)
    return ray.get(pending_policy_id)

def create_policy_pool(max_policies=MAX_POLICIES, obs_space=obs_space, act_space=act_space):
    """Build the RLlib ``multiagent.policies`` dict for the self-play league.

    The pool holds the trainable "main" policy, the hand-coded "random"
    policy and ``max_policies`` snapshot slots named ``main_v1`` ..
    ``main_v<max_policies>``. Every entry is a
    ``(policy_class, obs_space, act_space, config)`` tuple; a ``None`` class
    is left for the trainer to resolve.
    """
    snapshot_slots = {
        f'main_v{slot}': (None, obs_space, act_space, {})
        for slot in range(1, max_policies + 1)
    }
    return {
        "main": (None, obs_space, act_space, {}),
        "random": (RandomPolicy, obs_space, act_space, {}),
        **snapshot_slots,
    }

# Hardware budget for the experiment.
gpu_count = 1
num_workers = 3
num_cpus_for_driver = 2

# The GPU is shared: the driver takes an equal share, the workers split the rest.
num_gpus_for_driver = 1 / (num_workers + 1) # Driver GPU
num_gpus_per_worker = (gpu_count - num_gpus_for_driver) / num_workers if num_workers > 0 else 0
# CPUs left after the driver are split evenly between workers (guarded against
# num_workers == 0, like the GPU line above). Shares above one CPU are rounded
# down to a whole number: the floor has to apply to the quotient, not to the
# integer difference, otherwise it is a no-op and a fractional share above one
# CPU is requested.
num_cpu_per_worker = (cores - num_cpus_for_driver) / num_workers if num_workers > 0 else 0
if num_cpu_per_worker > 1:
    num_cpu_per_worker = math.floor(num_cpu_per_worker)

# Trainer configuration passed to tune.run(config=...).
config = {
    # --- system settings: resources and logging ---
    "num_gpus": num_gpus_for_driver,
    "num_workers": num_workers,
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "num_cpus_for_driver": num_cpus_for_driver,
    "num_cpus_per_worker": num_cpu_per_worker,
    "num_gpus_per_worker": num_gpus_per_worker,
    "log_level": "INFO",
    "framework": "torch",
    # --- RL setup: only "main" is trained, the other policies stay fixed ---
    "multiagent": {
        "policies": create_policy_pool(),
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": ["main"],
    },
    "env": ENVIRONMENT_ID,
    # Shallow copy so the trainer gets its own dict.
    "env_config": dict(ENVIRONMENT_CONFIG),
    "callbacks": SelfPlayCallback,
}

## Run experiment

### Train PPO SelfPlay

In [16]:
def run_experiment():
    """Register the env, train PPO with Tune and report the best result.

    Returns:
        ``(analysis, best_trial, best_checkpoint)`` where "best" is judged by
        the maximum ``episode_reward_mean``.
    """
    tune.registry.register_env(ENVIRONMENT_ID, create_custom_env)

    run_options = dict(
        num_samples=1,
        name="PPO_multiagent_league_and_rewards_1.4",
        config=config,
        stop=stop,
        checkpoint_freq=50,
        checkpoint_at_end=True,
        local_dir=RAY_RESULTS_PATH,
        # restore="../../ray_results/PPO_selfplay_1/PPO_Soccer_ID/checkpoint_00X/checkpoint-X",
        # resume=True
    )
    analysis = tune.run("PPO", **run_options)

    selection_metric, selection_mode = "episode_reward_mean", "max"

    # Best trial according to the highest mean episode reward.
    best_trial = analysis.get_best_trial(selection_metric, mode=selection_mode)
    print(best_trial)
    # Best checkpoint of that trial, judged by the same metric.
    best_checkpoint = analysis.get_best_checkpoint(
        trial=best_trial, metric=selection_metric, mode=selection_mode
    )
    print(best_checkpoint)
    print("Done training")
    return analysis, best_trial, best_checkpoint

# Launch training. As the last expression of the cell, the returned
# (analysis, best_trial, best_checkpoint) tuple is shown as the cell output.
run_experiment()


## Export agent

In [None]:
# this_path = os.path.dirname(os.path.realpath("__file__"))

def export_agent(agent_file: str, TRIAL, agent_name="bajai_belzonte", makeZip=False):
    """Write a self-contained agent package under ``AGENTS_PATH/agent_name``.

    The package contains ``agent.py`` (the given source), an ``__init__.py``
    exposing ``MyRaySoccerAgent`` and a copy of the whole trial directory.

    Args:
        agent_file: Python source code of the agent module.
        TRIAL: Path of the trial directory to bundle. It must contain the
            segment ``"ray_results/"``; the part after it becomes the
            location of the copy inside the package.
        agent_name: Name of the package directory.
        makeZip: When True, also create ``AGENTS_PATH/<agent_name>.zip``
            holding the package directory.

    Raises:
        IndexError: If ``TRIAL`` does not contain ``"ray_results/"``.
    """
    agent_path = os.path.join(AGENTS_PATH, agent_name)

    # Always start from an empty package directory so files from a previous
    # export cannot leak into the new one.
    if os.path.isdir(agent_path):
        shutil.rmtree(agent_path)
    os.makedirs(agent_path)

    # Save the agent class.
    with open(os.path.join(agent_path, "agent.py"), "w") as f:
        f.write(agent_file)

    # Save an __init__ so the directory is a Python package.
    with open(os.path.join(agent_path, "__init__.py"), "w") as f:
        f.write("from .agent import MyRaySoccerAgent")

    # Copy the whole trial, including the experiment configuration files.
    print(f"TRIALLL {TRIAL}")
    shutil.copytree(TRIAL, os.path.join(
        agent_path, TRIAL.split("ray_results/")[1]), )

    # Pack everything into a .zip file.
    if makeZip:
        # Archive the package directory itself. The archive is written next
        # to the directory rather than inside it, so it cannot end up
        # containing itself.
        shutil.make_archive(os.path.join(AGENTS_PATH, agent_name),
                            "zip", root_dir=AGENTS_PATH, base_dir=agent_name)


def get_agent_file_str(ALGORITHM, CHECKPOINT, POLICY_NAME="main"):
    """Render the source code of the exported agent module.

    Args:
        ALGORITHM: Name of the RLlib trainable to restore (e.g. "PPO").
        CHECKPOINT: Checkpoint path; it must contain ``"ray_results/"`` and
            the part after it is embedded as a path relative to the
            generated file.
        POLICY_NAME: Id of the policy the generated agent acts with.

    Returns:
        The module source as a string.
    """
    relative_checkpoint = CHECKPOINT.split("ray_results/")[1]
    return f"""
import pickle
import os
from typing import Dict

import gym
import numpy as np
import ray
from ray import tune
from ray.rllib.env.base_env import BaseEnv
from ray.tune.registry import get_trainable_cls

from soccer_twos import AgentInterface

ALGORITHM = "{ALGORITHM}"
CHECKPOINT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 
    "{relative_checkpoint}"
)
POLICY_NAME = "{POLICY_NAME}"


class MyRaySoccerAgent(AgentInterface):
    def __init__(self, env: gym.Env):
        super().__init__()
        ray.init(ignore_reinit_error=True)

        # Load configuration from checkpoint file.
        config_path = ""
        if CHECKPOINT_PATH:
            config_dir = os.path.dirname(CHECKPOINT_PATH)
            config_path = os.path.join(config_dir, "params.pkl")
            # Try parent directory.
            if not os.path.exists(config_path):
                config_path = os.path.join(config_dir, "../params.pkl")

        # Load the config from pickled.
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                config = pickle.load(f)
        else:
            # If no config in given checkpoint -> Error.
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory!"
            )

        # no need for parallelism on evaluation
        config["num_workers"] = 0
        config["num_gpus"] = 0

        # create a dummy env since it's required but we only care about the policy
        tune.registry.register_env("DummyEnv", lambda *_: BaseEnv())
        config["env"] = "DummyEnv"

        # create the Trainer from config
        cls = get_trainable_cls(ALGORITHM)
        agent = cls(env=config["env"], config=config)
        # load state from checkpoint
        agent.restore(CHECKPOINT_PATH)
        # get policy for evaluation
        self.policy = agent.get_policy(POLICY_NAME)

    def act(self, observation: Dict[int, np.ndarray]) -> Dict[int, np.ndarray]:
        actions = {{}}
        for player_id in observation:
            # compute_single_action returns a tuple of (action, action_info, ...)
            # as we only need the action, we discard the other elements
            actions[player_id], *_ = self.policy.compute_single_action(
                observation[player_id]
            )
        return actions

"""


def getAnalysis(experiment: str):
    """Open the Tune ``Analysis`` for the experiment directory ``experiment``."""
    experiment_analysis = Analysis(experiment)
    return experiment_analysis


def export(experiment_dir="/home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiagent_league_and_rewards_1.4"):
    """Export the most-trained checkpoint of an experiment as an agent package.

    Args:
        experiment_dir: Directory of the Tune experiment to export. The
            default is the original author's local path; pass your own
            experiment directory when running elsewhere.
    """
    analysis = getAnalysis(experiment_dir)

    algorithm = "PPO"
    # Pick the trial, and then its checkpoint, with the highest training_iteration.
    trial_dir = analysis.get_best_logdir("training_iteration", "max")
    checkpoint = analysis.get_best_checkpoint(
        trial_dir,
        "training_iteration",
        "max",
    )

    print(trial_dir, checkpoint)
    agent_file = get_agent_file_str(algorithm, checkpoint)
    export_agent(agent_file, trial_dir)


# Export the trained agent using the default experiment directory.
export()
