# Iniciar ambiente

## Iniciar Local

In [None]:
import os
# Flag telling the rest of the notebook whether it runs on Google Colab.
# This is the local-start cell, so it is set to False.
# NOTE(review): no visible cell reads this flag - confirm it is still needed.
isColab = False

## (Sempre) Outras configurações

In [None]:
# # Ambiente da competição
# !pip install --upgrade ceia-soccer-twos --force-reinstall > /dev/null 2>&1
# # a versão do ray compatível com a implementação dos agentes disponibilizada é a 1.4.0
# !pip install 'aioredis' --upgrade > /dev/null 2>&1
# !pip install 'aiohttp' --upgrade > /dev/null 2>&1
# !pip install 'ray[default]' --upgrade > /dev/null 2>&1
# !pip install 'ray[rllib]' --upgrade > /dev/null 2>&1
# !pip install 'ray[tune]' --upgrade > /dev/null 2>&1
# !pip install torch --upgrade > /dev/null 2>&1
# !pip install lz4 --upgrade > /dev/null 2>&1
# !pip install GPUtil --upgrade > /dev/null 2>&1
# !pip install tensorboard --upgrade > /dev/null 2>&1

# # Dependências necessárias para gravar os vídeos
# # !apt-get install -y xvfb x11-utils > /dev/null 2>&1
# # !pip install 'pyvirtualdisplay==0.2.*' > /dev/null 2>&1

# Soccer Twos

Como tarefa bônus, experimente com os algoritmos aprendidos no ambiente `soccer_twos`, que será utilizado na competição final deste curso*. Para facilitar, utilize a variação `team_vs_policy` como no laboratório anterior.

<img src="https://raw.githubusercontent.com/bryanoliveira/soccer-twos-env/master/images/screenshot.png" height="400">

> Visualização do ambiente

Este ambiente consiste em um jogo de futebol de carros 2x2, ou seja, o objetivo é marcar um gol no adversário o mais rápido possível. Na variação `team_vs_policy`, seu agente controla um jogador do time azul e joga contra um time aleatório. Mais informações sobre o ambiente podem ser encontradas [no repositório](https://github.com/bryanoliveira/soccer-twos-env) e [na documentação do Unity ml-agents](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md#soccer-twos).


**Sua tarefa é treinar um agente com a interface do Ray apresentada, experimentando com diferentes algoritmos e hiperparâmetros.**


<br>

*A variação utilizada na competição será a `multiagent_player`, mas agentes treinados para `team_vs_policy` podem ser facilmente adaptados. Na seção "Export agent", o agente "MyRaySoccerAgent" faz exatamente isso.

## Imports

In [None]:
import gym

import ray
from ray import tune
from ray.tune import ExperimentAnalysis
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env import BaseEnv
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy import Policy
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.utils.typing import PolicyID
from ray.tune.registry import get_trainable_cls
from ray.rllib.policy.policy import PolicySpec

import numpy as np
from typing import Any, Dict, List, Union, Optional
from collections import deque
import pickle

import soccer_twos
from soccer_twos import EnvType

import shutil

## Utils

In [None]:
class RLLibWrapper(gym.core.Wrapper, MultiAgentEnv):
    """Adapter that makes a wrapped gym env an instance of RLlib's ``MultiAgentEnv``.

    The class adds no behaviour of its own; it only combines the two base
    classes so the wrapped environment is accepted wherever RLlib expects a
    ``MultiAgentEnv``.
    """


def create_rllib_env(env_config: Optional[dict] = None):
    """
    Creates a RLLib environment and prepares it to be instantiated by Ray workers.

    Args:
        env_config: configuration for the environment; every key is forwarded
            as a keyword argument to ``soccer_twos.make``. When omitted, an
            empty configuration is used.
            You may specify the following keys:
            - variation: one of soccer_twos.EnvType. Defaults to EnvType.multiagent_player.
            - opponent_policy: a Callable for your agent to train against. Defaults to a random policy.
            - multiagent: set to False to get the raw env back instead of the
              RLLib wrapper.
            - num_envs_per_worker: used together with the worker/vector index
              to derive ``worker_id`` (defaults to 1).

    Returns:
        The env wrapped in ``RLLibWrapper``, or the raw env when
        ``env_config["multiagent"]`` is explicitly falsy.
    """
    # A fresh dict per call: a literal `{}` default would be a single object
    # shared by every call that relies on the default.
    if env_config is None:
        env_config = {}

    # Config objects that expose `worker_index` / `vector_index` attributes
    # (a plain dict does not) get a distinct `worker_id` per
    # (worker, sub-env) pair.
    if hasattr(env_config, "worker_index"):
        env_config["worker_id"] = (
            env_config.worker_index * env_config.get("num_envs_per_worker", 1)
            + env_config.vector_index
        )

    env = soccer_twos.make(**env_config)

    # Multiagent is the default; it is only disabled if explicitly set to a
    # falsy value.
    if not env_config.get("multiagent", True):
        return env
    return RLLibWrapper(env)

## Callback

In [None]:
import random

import numpy as np
from gym.spaces import Box

from ray.rllib.policy.policy import Policy
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import ModelWeights


class RandomPolicy(Policy):
    """Non-learning policy that answers every observation with a sampled action."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # When "ignore_action_bounds" is enabled in the policy config (it is
        # off by default) and the action space is a Box, sample from an
        # unbounded Box of the same shape/dtype instead. This makes it
        # possible to test action clipping and how an env reacts to
        # out-of-bounds actions.
        unbounded = self.config.get("ignore_action_bounds", False) and \
            isinstance(self.action_space, Box)
        if unbounded:
            sampling_space = Box(
                -float("inf"),
                float("inf"),
                shape=self.action_space.shape,
                dtype=self.action_space.dtype)
        else:
            sampling_space = self.action_space
        self.action_space_for_sampling = sampling_space

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        **kwargs):
        """Return one sampled action per observation, no state outs, no extra info."""
        draw = self.action_space_for_sampling.sample
        # One independent sample per entry of the batch; the observations
        # themselves are never inspected.
        actions = [draw() for _ in obs_batch]
        return actions, [], {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """Ignore the batch: this policy has nothing to learn."""
        return {}

    @override(Policy)
    def compute_log_likelihoods(self,
                                actions,
                                obs_batch,
                                state_batches=None,
                                prev_action_batch=None,
                                prev_reward_batch=None):
        """Return an array holding one random value repeated for each observation."""
        # A single draw is made and shared by every entry of the batch.
        value = random.random()
        return np.array([value] * len(obs_batch))

    @override(Policy)
    def get_weights(self) -> ModelWeights:
        """Return an empty mapping: there are no weights to save."""
        return {}

    @override(Policy)
    def set_weights(self, weights: ModelWeights) -> None:
        """Accept and discard ``weights``: there are no weights to set."""
        pass

In [None]:
# Win rate above which the current "main" policy is frozen and added to the
# league as a new opponent.
WIN_RATE_THRESHOLD = .6
# Original (misspelled) name, kept so existing references keep working.
WIN_RATE_THEWSHOLD = WIN_RATE_THRESHOLD


class SelfPlayCallback(DefaultCallbacks):
    """League-style self-play: snapshot "main" as a new opponent once it wins enough.

    After every training iteration the win rate of the "main" policy is
    estimated from the per-policy reward histories in ``result["hist_stats"]``.
    When it exceeds ``WIN_RATE_THRESHOLD`` a frozen copy of "main" is added as
    policy ``main_v<N>`` and the policy mapping is replaced so that "main"
    plays against the snapshots from then on.
    """

    def __init__(self):
        super().__init__()
        # 0=RandomPolicy, 1=1st main policy snapshot,
        # 2=2nd main policy snapshot, etc..
        self.current_opponent = 0

    # Disabled per-step reward bookkeeping, kept for reference. It reads an
    # "ep_metrics" entry from the env info and uses `MATCH_STEPS`, which is
    # not defined in the visible code.
    # def on_episode_step(self,
    #                     *,
    #                     worker: "RolloutWorker",
    #                     base_env: BaseEnv,
    #                     episode: MultiAgentEpisode,
    #                     env_index: Optional[int] = None,
    #                     **kwargs) -> None:
    #     total_timesteps = episode.last_info_for(
    #         0)["ep_metrics"]["total_timesteps"]
    #     total_goals = float(episode.last_info_for(0)[
    #                         "ep_metrics"]["total_goals"])
    #     estimated_goals_in_match = total_goals * MATCH_STEPS / \
    #         float(total_timesteps) if total_goals > 0 else 0.0
    #     timesteps_to_goal = float(
    #         total_timesteps) if total_goals > 0 else 9999.0

    #     if not episode.user_data:
    #         episode.user_data = {
    #             0: {
    #                 "total_env_reward": 0.0,
    #                 "total_ball_to_goal_speed_reward": 0.0,
    #                 "total_agent_position_to_ball_reward": 0.0,
    #             },
    #             1: {
    #                 "total_env_reward": 0.0,
    #                 "total_ball_to_goal_speed_reward": 0.0,
    #                 "total_agent_position_to_ball_reward": 0.0,
    #             },
    #             2: {
    #                 "total_env_reward": 0.0,
    #                 "total_ball_to_goal_speed_reward": 0.0,
    #                 "total_agent_position_to_ball_reward": 0.0,
    #             },
    #             3: {
    #                 "total_env_reward": 0.0,
    #                 "total_ball_to_goal_speed_reward": 0.0,
    #                 "total_agent_position_to_ball_reward": 0.0,
    #             }
    #         }

    #     episode.user_data = {
    #         **episode.user_data,
    #         0: {
    #             "total_env_reward": episode.user_data[0]["total_env_reward"] + episode.last_info_for(0)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"] + episode.last_info_for(0)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             "total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"] + episode.last_info_for(0)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         1: {
    #             "total_env_reward": episode.user_data[1]["total_env_reward"] + episode.last_info_for(1)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[1]["total_ball_to_goal_speed_reward"] + episode.last_info_for(1)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             "total_agent_position_to_ball_reward": episode.user_data[1]["total_agent_position_to_ball_reward"] + episode.last_info_for(1)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         2: {
    #             "total_env_reward": episode.user_data[2]["total_env_reward"] + episode.last_info_for(2)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[2]["total_ball_to_goal_speed_reward"] + episode.last_info_for(2)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             "total_agent_position_to_ball_reward": episode.user_data[2]["total_agent_position_to_ball_reward"] + episode.last_info_for(2)["ep_metrics"]["agent_position_to_ball_reward"],
    #         },
    #         3: {
    #             "total_env_reward": episode.user_data[3]["total_env_reward"] + episode.last_info_for(3)["ep_metrics"]["env_reward"],
    #             "total_ball_to_goal_speed_reward": episode.user_data[3]["total_ball_to_goal_speed_reward"] + episode.last_info_for(3)["ep_metrics"]["ball_to_goal_speed_reward"],
    #             "total_agent_position_to_ball_reward": episode.user_data[3]["total_agent_position_to_ball_reward"] + episode.last_info_for(3)["ep_metrics"]["agent_position_to_ball_reward"],
    #         }
    #     }

    #     episode.custom_metrics = {
    #         # "total_timesteps": total_timesteps,
    #         # "timesteps_to_goal": timesteps_to_goal,
    #         # "estimated_goals_in_match": estimated_goals_in_match,
    #         # "team_0_goals": episode.last_info_for(0)["ep_metrics"]["team_0_goals"],
    #         # "team_1_goals": episode.last_info_for(0)["ep_metrics"]["team_1_goals"],
    #         # "have_goals": episode.last_info_for(0)["ep_metrics"]["have_goals"],
    #         "agent_0_total_env_reward": episode.user_data[0]["total_env_reward"],
    #         "agent_0_total_ball_to_goal_speed_reward": episode.user_data[0]["total_ball_to_goal_speed_reward"],
    #         "agent_0_total_agent_position_to_ball_reward": episode.user_data[0]["total_agent_position_to_ball_reward"],
    #     }

    def on_episode_end(self,
                       *,
                       worker: "RolloutWorker",
                       base_env: BaseEnv,
                       policies: Dict[PolicyID, Policy],
                       episode: MultiAgentEpisode,
                       env_index: Optional[int] = None,
                       **kwargs) -> None:
        """Debug trace: print the rollout worker that just finished an episode."""
        print('on_episode_end worker:')
        print(worker)

    def on_train_result(self, *, trainer, result, **kwargs):
        """Compute the win rate of "main" and grow the league when it is high enough.

        Args:
            trainer: the running trainer; used to read the iteration number,
                add the snapshot policy and sync weights to the workers.
            result: the training result dict. "policy_main_reward" is popped
                from ``result["hist_stats"]``, and "win_rate" and
                "league_size" are written into it.
        """
        # Get the win rate for the train batch.
        # Note that normally, one should set up a proper evaluation config,
        # such that evaluation always happens on the already updated policy,
        # instead of on the already used train_batch.
        print(result)
        hist_stats = result["hist_stats"]
        main_rew = hist_stats.pop("policy_main_reward", None)

        # No reward history for "main" (e.g. no episode finished during this
        # iteration): there is nothing to compare, so skip the league update
        # instead of failing on a missing key / division by zero.
        if not main_rew:
            print(f"Iter={trainer.iteration} win-rate=n/a -> "
                  "no finished episodes for 'main' yet; will keep learning ...")
            result["win_rate"] = float("nan")
            # +2 = main + random
            result["league_size"] = self.current_opponent + 2
            return

        # With "main" removed, the first remaining history is taken as the
        # opponent's rewards.
        # NOTE(review): this pairing assumes both histories hold exactly one
        # entry per episode and that the first remaining entry belongs to the
        # opponent policy - confirm this holds for the 4-agent soccer env and
        # once several opponent policies exist.
        opponent_rew = list(hist_stats.values())[0]
        assert len(main_rew) == len(opponent_rew)
        won = sum(
            r_main > r_opponent
            for r_main, r_opponent in zip(main_rew, opponent_rew)
        )
        win_rate = won / len(main_rew)
        result["win_rate"] = win_rate
        print(f"Iter={trainer.iteration} win-rate={win_rate} -> ", end="")
        # If win rate is good -> Snapshot current policy and play against
        # it next, keeping the snapshot fixed and only improving the "main"
        # policy.
        if win_rate > WIN_RATE_THRESHOLD:
            self.current_opponent += 1
            new_pol_id = f"main_v{self.current_opponent}"
            print(f"adding new opponent to the mix ({new_pol_id}).")

            # Re-define the mapping function, such that "main" is forced
            # to play against any of the previously played policies
            # (excluding "random").
            def policy_mapping_fn(agent_id, episode, worker, **kwargs):
                # agent_id = [0|1] -> policy depends on episode ID
                # This way, we make sure that both policies sometimes play
                # agent0 (start player) and sometimes agent1 (player to move
                # 2nd). Every other agent gets a randomly chosen snapshot.
                # NOTE(review): written for two agents; agent ids above 1
                # can never be mapped to "main" - confirm this is intended.
                return "main" if episode.episode_id % 2 == agent_id \
                    else "main_v{}".format(np.random.choice(
                        list(range(1, self.current_opponent + 1))))

            new_policy = trainer.add_policy(
                policy_id=new_pol_id,
                policy_cls=type(trainer.get_policy("main")),
                policy_mapping_fn=policy_mapping_fn,
            )

            # Set the weights of the new policy to the main policy.
            # We'll keep training the main policy, whereas `new_pol_id` will
            # remain fixed.
            main_state = trainer.get_policy("main").get_state()
            new_policy.set_state(main_state)
            # We need to sync the just copied local weights (from main policy)
            # to all the remote workers as well.
            trainer.workers.sync_weights()
        else:
            print("not good enough; will keep learning ...")

        # +2 = main + random
        result["league_size"] = self.current_opponent + 2

## Stop

In [None]:
# Stopping criteria handed to `tune.run`: training ends once a listed metric
# reaches its value. Only the episode count is active; the disabled
# alternatives are kept for reference.
stop = dict(
    # timesteps_total=15000000,  # 15M
    # time_total_s=14400,  # 4h
    episodes_total=1,
)


## Config


In [None]:
# Value shared by the env config and the trainer's "num_envs_per_worker"
# setting (the previous value of 1 is kept below for reference).
# NUM_ENVS_PER_WORKER = 1
NUM_ENVS_PER_WORKER = 4
# Name under which the env creator is registered with Tune and which the
# trainer config refers to through its "env" key.
ENVIRONMENT_ID = "Soccer"

# Settings handed to `create_rllib_env`, which forwards every key to
# `soccer_twos.make`.
ENVIRONMENT_CONFIG = {
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "variation": EnvType.multiagent_player,
}


# Build a throwaway env only to read its observation/action spaces (needed
# for the policy specs in `config`), then close it right away.
temp_env = create_rllib_env(ENVIRONMENT_CONFIG)
obs_space = temp_env.observation_space
act_space = temp_env.action_space
temp_env.close()


def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Choose the policy ("main" or "random") that controls ``agent_id``.

    The agent whose id equals the parity of ``episode.episode_id`` is driven
    by "main"; every other agent is driven by "random". "main" therefore
    alternates between agent 0 and agent 1 from one episode id to the next.

    NOTE(review): agent ids above 1 can never equal the parity and always get
    "random" - confirm this is intended for the 4-agent env.
    """
    main_agent_id = episode.episode_id % 2
    if agent_id == main_agent_id:
        return "main"
    return "random"

# Trainer configuration passed to `tune.run` in `run_experiment`.
config = {
    # system settings
    "num_gpus": 1,
    # "num_workers": 3,
    # 7 rollout workers + 1 driver, each with 1 CPU, add up to the 8 CPUs
    # requested by `ray.init` in `run_experiment`.
    "num_workers": 7,
    "num_envs_per_worker": NUM_ENVS_PER_WORKER,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
    "num_gpus_per_worker": 0,
    "log_level": "INFO",
    "framework": "torch",
    # RL setup
    "multiagent": {
        "policies": {
            # `None` as the policy class for "main", in contrast to the
            # explicit RandomPolicy class of the fixed "random" opponent.
            "main": (None, obs_space, act_space, {}),
            "random": (RandomPolicy, obs_space, act_space, {}),
        },
        "policy_mapping_fn": policy_mapping_fn,
        # Only "main" is listed for training; "random" is left out.
        "policies_to_train": ["main"],
    },
    "env": ENVIRONMENT_ID,
    # Shallow copy of ENVIRONMENT_CONFIG.
    "env_config": {**ENVIRONMENT_CONFIG},
    "callbacks": SelfPlayCallback,
}

## Run experiment

### Train PPO SelfPlay

In [None]:
def run_experiment():
    """Train PPO on the registered soccer env and report the best trial/checkpoint.

    Starts Ray with 8 CPUs, registers ``create_rllib_env`` under
    ``ENVIRONMENT_ID`` and launches a single-sample Tune run named
    "PPO_multiagent_league" using the module-level ``config`` and ``stop``.

    Returns:
        A tuple ``(analysis, best_trial, best_checkpoint)`` where the best
        trial and checkpoint are the ones with the highest
        "episode_reward_mean".
    """
    ray.init(num_cpus=8, include_dashboard=False, ignore_reinit_error=True)
    tune.registry.register_env(ENVIRONMENT_ID, create_rllib_env)

    tune_options = dict(
        num_samples=1,
        name="PPO_multiagent_league",
        # name="Measuring_rewards",
        config=config,
        stop=stop,
        checkpoint_freq=100,
        checkpoint_at_end=True,
        local_dir="../../ray_results",
        # restore="../../ray_results/PPO_selfplay_1/PPO_Soccer_ID/checkpoint_00X/checkpoint-X",
        # resume=True
    )
    experiment = tune.run("PPO", **tune_options)

    # Both selections below rank by the highest mean episode reward.
    selection_metric = "episode_reward_mean"

    # Best trial of the experiment according to that metric.
    top_trial = experiment.get_best_trial(selection_metric, mode="max")
    print(top_trial)

    # Best checkpoint of that trial according to the same metric.
    top_checkpoint = experiment.get_best_checkpoint(
        trial=top_trial, metric=selection_metric, mode="max"
    )
    print(top_checkpoint)

    print("Done training")
    return experiment, top_trial, top_checkpoint


run_experiment()


## Export agent

In [None]:
# Base directory for exported agents. Note that the *string* "__file__" is
# resolved (not the module attribute), i.e. a path relative to the current
# working directory, so the result is the directory the notebook runs from.
this_path = os.path.split(os.path.realpath("__file__"))[0]
print('this_path', this_path)


def export_agent(agent_file: str, TRIAL, agent_name="my_ray_soccer_agent", makeZip=False):
    """Write a self-contained agent package to ``<this_path>/agents/<agent_name>``.

    The package directory is recreated from scratch and receives:
    - ``agent.py`` with the source given in ``agent_file``;
    - ``__init__.py`` re-exporting ``MyRaySoccerAgent``;
    - a copy of the whole trial directory, placed under the part of its path
      that follows "ray_results/".

    Args:
        agent_file: Python source code of the agent module.
        TRIAL: path of the trial directory to bundle; it must contain
            "ray_results/" (an ``IndexError`` is raised otherwise).
        agent_name: name of the package directory (and of the zip file).
        makeZip: when True, also create ``<this_path>/agents/<agent_name>.zip``
            containing the ``<agent_name>/`` package directory.
    """
    agents_root = os.path.join(this_path, "agents")
    agent_path = os.path.join(agents_root, agent_name)

    # Start from an empty package directory so files from a previous export
    # are never shipped.
    if os.path.isdir(agent_path):
        shutil.rmtree(agent_path)
    os.makedirs(agent_path)

    # Save the agent class.
    with open(os.path.join(agent_path, "agent.py"), "w", encoding="utf-8") as f:
        f.write(agent_file)

    # Save an __init__ so the directory becomes a Python package.
    with open(os.path.join(agent_path, "__init__.py"), "w", encoding="utf-8") as f:
        f.write("from .agent import MyRaySoccerAgent")

    # Copy the whole trial, including the experiment's configuration files.
    print(f"TRIALLL {TRIAL}")
    shutil.copytree(
        TRIAL, os.path.join(agent_path, TRIAL.split("ray_results/")[1]))

    # Pack everything into a .zip file. The archive is written next to the
    # package directory (not inside it, so it cannot end up archiving itself)
    # and is rooted at the parent directory so that it contains the
    # `<agent_name>/` folder.
    if makeZip:
        shutil.make_archive(
            agent_path, "zip", root_dir=agents_root, base_dir=agent_name)


def get_agent_file_str(ALGORITHM, CHECKPOINT, POLICY_NAME="default"):
    """Render the source code of the exported ``agent.py`` module.

    Args:
        ALGORITHM: trainable name written into the module (e.g. "PPO").
        CHECKPOINT: checkpoint path; only the part after "ray_results/" is
            kept, and the generated module resolves it relative to its own
            directory. An ``IndexError`` is raised when the marker is absent.
        POLICY_NAME: id of the policy the generated agent loads.

    Returns:
        The module source as a string.
    """
    # Checkpoint location relative to the exported package directory.
    relative_checkpoint = CHECKPOINT.split("ray_results/")[1]
    return f"""
import pickle
import os
from typing import Dict

import gym
import numpy as np
import ray
from ray import tune
from ray.rllib.env.base_env import BaseEnv
from ray.tune.registry import get_trainable_cls

from soccer_twos import AgentInterface

ALGORITHM = "{ALGORITHM}"
CHECKPOINT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 
    "{relative_checkpoint}"
)
POLICY_NAME = "{POLICY_NAME}"


class MyRaySoccerAgent(AgentInterface):
    def __init__(self, env: gym.Env):
        super().__init__()
        ray.init(ignore_reinit_error=True)

        # Load configuration from checkpoint file.
        config_path = ""
        if CHECKPOINT_PATH:
            config_dir = os.path.dirname(CHECKPOINT_PATH)
            config_path = os.path.join(config_dir, "params.pkl")
            # Try parent directory.
            if not os.path.exists(config_path):
                config_path = os.path.join(config_dir, "../params.pkl")

        # Load the config from pickled.
        if os.path.exists(config_path):
            with open(config_path, "rb") as f:
                config = pickle.load(f)
        else:
            # If no config in given checkpoint -> Error.
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory!"
            )

        # no need for parallelism on evaluation
        config["num_workers"] = 0
        config["num_gpus"] = 0

        # create a dummy env since it's required but we only care about the policy
        tune.registry.register_env("DummyEnv", lambda *_: BaseEnv())
        config["env"] = "DummyEnv"

        # create the Trainer from config
        cls = get_trainable_cls(ALGORITHM)
        agent = cls(env=config["env"], config=config)
        # load state from checkpoint
        agent.restore(CHECKPOINT_PATH)
        # get policy for evaluation
        self.policy = agent.get_policy(POLICY_NAME)

    def act(self, observation: Dict[int, np.ndarray]) -> Dict[int, np.ndarray]:
        actions = {{}}
        for player_id in observation:
            # compute_single_action returns a tuple of (action, action_info, ...)
            # as we only need the action, we discard the other elements
            actions[player_id], *_ = self.policy.compute_single_action(
                observation[player_id]
            )
        return actions

"""


def getAnalysis(experiment: str):
    """Build an ``ExperimentAnalysis`` for the experiment at ``experiment``."""
    analysis = ExperimentAnalysis(experiment)
    return analysis


def export(
    experiment_path: str = "/home/bruno/Workspace/soccer-tows-player/src/ray_results/PPO_multiagent_player_custom_rewards",
    algorithm: str = "PPO",
    policy_name: str = "default",
    agent_name: str = "my_ray_soccer_agent",
    make_zip: bool = False,
):
    """Export the most-trained checkpoint of an experiment as an agent package.

    The trial with the highest "training_iteration" is selected, then its
    checkpoint with the highest "training_iteration"; the agent source is
    rendered with ``get_agent_file_str`` and written with ``export_agent``.

    Args:
        experiment_path: directory of the Tune experiment to read. The
            default is the author's local experiment directory.
        algorithm: trainable name written into the exported agent.
        policy_name: id of the policy the exported agent loads.
            NOTE(review): the training config in this notebook names its
            policy "main" - confirm "default" exists in the exported
            experiment.
        agent_name: name of the exported package directory.
        make_zip: when True, also produce a zip archive of the package.

    Raises:
        ValueError: when the experiment yields no trial or no checkpoint.
    """
    # Example of a trial / checkpoint location:
    # PPO_Soccer_18d23_00000
    # /home/bruno/Workspace/soccer-tows-player/src/ray_results/Testing_env/PPO_Soccer_18d23_00000_0_2021-11-24_20-34-41/checkpoint_000500/checkpoint-500
    analysis = getAnalysis(experiment_path)

    TRIAL = analysis.get_best_logdir("training_iteration", "max")
    # Fail with a clear message instead of an opaque error further down.
    if TRIAL is None:
        raise ValueError(f"No trial found in experiment: {experiment_path}")

    CHECKPOINT = analysis.get_best_checkpoint(
        TRIAL,
        "training_iteration",
        "max",
    )
    if CHECKPOINT is None:
        raise ValueError(f"No checkpoint found for trial: {TRIAL}")

    print(TRIAL, CHECKPOINT)
    agent_file = get_agent_file_str(
        algorithm, CHECKPOINT, POLICY_NAME=policy_name)
    export_agent(agent_file, TRIAL, agent_name=agent_name, makeZip=make_zip)


export()
