In [None]:

import gym
#from gym.spaces import Discrete, MultiDiscrete
import numpy as np
import random

from ray.rllib.env.multi_agent_env import MultiAgentEnv


class MultiMinerMarket(MultiAgentEnv):
    """One-shot mining contest between several miners (RLlib multi-agent env).

    Each agent ``agent_i`` chooses a hash rate ``h_i``. Its winning probability
    is ``h_i / sum(h)`` and its reward is ``p_i * R - marginal_cost[i] * h_i``.
    Every episode terminates after a single ``step``.

    Per-agent observation: ``[previous probability, previous hash]``.
    Per-agent action: a 1-element vector holding the chosen hash rate.

    Args:
        config: Optional mapping. Recognised (optional) keys:
            ``"R"`` - block reward (default ``100``);
            ``"marginal_cost"`` - per-agent cost of one unit of hash
            (default ``[1, 1, 1, 1, 1]``); its length sets the number of agents.
    """

    def __init__(self, config=None):
        config = config or {}
        # define reward
        self.R = config.get("R", 100)
        self.marginal_cost = list(config.get("marginal_cost", [1, 1, 1, 1, 1]))
        # number of agents
        self.num_agents = len(self.marginal_cost)
        # hash value chosen by each agent in the most recent step
        self.hashes = [0] * self.num_agents
        # winning probability of each agent in the most recent step
        self.probs = [0] * self.num_agents

        # observation_space: previous probability, previous hash
        self.observation_space = gym.spaces.Dict({
            self._agent_id(i): gym.spaces.Box(
                low=np.array([0, 0], dtype=np.float32),
                high=np.array([1, self.R / self.marginal_cost[i]], dtype=np.float32),
                dtype=np.float32,
            )
            for i in range(self.num_agents)
        })

        # action_space: hash rate in [0, R / marginal_cost]; above that bound
        # the cost exceeds the largest possible reward.
        self.action_space = gym.spaces.Dict({
            self._agent_id(i): gym.spaces.Box(
                low=np.array([0], dtype=np.float32),
                high=np.array([self.R / self.marginal_cost[i]], dtype=np.float32),
                dtype=np.float32,
            )
            for i in range(self.num_agents)
        })

        self.reset()

    @staticmethod
    def _agent_id(index):
        """Return the agent id string (``"agent_<index>"``) for an agent index."""
        return "agent_" + str(index)

    def reset(self):
        """Return the current observations.

        The stored probabilities and hashes are intentionally left untouched,
        so the first observation of an episode carries the values of the
        previous step (zeros before any step has been taken).
        """
        return self._get_obs()

    def step(self, action: dict):
        """Play one round of the mining contest.

        Args:
            action: Mapping ``"agent_<i>" -> [hash_rate]`` with one entry per agent.

        Returns:
            Tuple ``(obs, rewards, dones, infos)``; ``dones`` is ``True`` for
            every agent and for ``"__all__"`` because an episode is one step.
        """
        # collect the hash rate chosen by every agent
        hashes = [action[self._agent_id(i)][0] for i in range(self.num_agents)]
        self.hashes = hashes

        # Winning probabilities are proportional to the hash rates. Guard on the
        # sum (the actual divisor) so a zero total can never cause a division by
        # zero; without positive total hash power nobody wins.
        total = sum(hashes)
        if total > 0:
            prs = [h / total for h in hashes]
        else:
            prs = [0] * self.num_agents
        self.probs = prs

        # expected reward minus the cost of the hash power spent
        rewards = {
            self._agent_id(i): float(prs[i] * self.R - self.marginal_cost[i] * hashes[i])
            for i in range(self.num_agents)
        }

        obs = self._get_obs()

        # One entry per agent, however many there are.
        dones = {self._agent_id(i): True for i in range(self.num_agents)}
        # special `__all__` key indicates that the episode is done for all agents.
        dones["__all__"] = True

        return obs, rewards, dones, {}  # <- info dict (not needed here).

    def _get_obs(self):
        """Build ``{"agent_<i>": [probability, hash]}`` as float32 arrays.

        Arrays with the dtype of the observation space are returned (instead of
        plain lists) so that they pass the space's dtype/shape checks.
        """
        return {
            self._agent_id(i): np.array(
                [self.probs[i], self.hashes[i]], dtype=np.float32
            )
            for i in range(self.num_agents)
        }

    def render(self, mode=None):
        """Rendering is not implemented for this environment."""
        pass


In [None]:
# Instantiate the environment with its default settings; `game` is reused by
# the cells below (space inspection and policy definitions).
game = MultiMinerMarket()

In [None]:
# Smoke test: play one random round and print, for every agent,
# the sampled hash rate, the resulting win probability and the reward.
game = MultiMinerMarket()
game.reset()
agent_ids = ["agent_" + str(idx) for idx in range(game.num_agents)]
for count in range(1, 2):
    action = game.action_space.sample()
    hashes = [action[agent_id][0] for agent_id in agent_ids]
    obs, rew, done, info = game.step(action)
    for agent_id, hash_rate, prob in zip(agent_ids, hashes, game.probs):
        print(hash_rate, prob, rew[agent_id])
    game.reset()

In [None]:
# Inspect the observation space of the first agent.
game.reset()
# `a` is the Dict space keyed by agent id; it is reused in the next cell.
a = game.observation_space
print(a['agent_0'])

In [None]:
# A Box space cannot be indexed directly; read the bounds of its first
# component (the previous win probability) through `.low` / `.high` instead.
(a['agent_0'].low[0], a['agent_0'].high[0])

In [None]:
import numpy as np
import pprint
import ray

# Start a new instance of Ray (when running this notebook locally) or
# connect to an already running one.
# `ignore_reinit_error=True` keeps the cell re-runnable: calling `ray.init()`
# a second time in the same kernel would otherwise raise.
ray.init(ignore_reinit_error=True)


In [None]:
from ray.rllib.agents.ppo import PPOTrainer


# One independent policy per miner: (policy class, obs space, action space, overrides).
# `None` as the policy class lets the trainer pick its default policy.
policies = {
    f"policy_{idx}": (
        None,
        game.observation_space[f"agent_{idx}"],
        game.action_space[f"agent_{idx}"],
        {"gamma": 0.04},
    )
    for idx in range(game.num_agents)
}

def policy_mapping_fn(agent_id: str):
    """Map an agent id such as ``"agent_3"`` to its policy id ``"policy_3"``.

    The whole suffix after the last underscore is used, so ids with more than
    one digit (e.g. ``"agent_12"``) map to their own policy instead of being
    collapsed onto the last digit.

    Args:
        agent_id: Agent identifier of the form ``"agent_<index>"``.

    Returns:
        The policy identifier ``"policy_<index>"``.
    """
    return "policy_" + agent_id.rsplit("_", 1)[-1]
        
# Trainer configuration: the env class is passed directly (a name registered
# through `tune.register_env` would work as well), every policy uses a
# 512x512 fully connected network, and rollouts run on 4 workers.
config = dict(
    env=MultiMinerMarket,
    # framework="torch",
    model=dict(fcnet_hiddens=[512, 512]),
    num_workers=4,
    create_env_on_driver=True,
    multiagent=dict(
        policies=policies,
        policy_mapping_fn=policy_mapping_fn,
    ),
)




In [None]:
# Override the DefaultCallbacks with your own and implement any methods (hooks)
# that you need.
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.evaluation.episode import MultiAgentEpisode


class MyCallbacks(DefaultCallbacks):
    """Callbacks that record every miner's final hash value as a custom metric.

    Only ``on_episode_end`` does any work; the start and step hooks are
    deliberately empty.
    """

    def on_episode_start(self, *, worker, base_env, policies,
                         episode: MultiAgentEpisode, env_index, **kwargs):
        """No per-episode initialisation is required."""

    def on_episode_step(self, *, worker, base_env,
                        episode: MultiAgentEpisode, env_index, **kwargs):
        """Nothing is tracked on individual steps."""

    def on_episode_end(self, *, worker, base_env, policies,
                       episode: MultiAgentEpisode, env_index, **kwargs):
        """Store each agent's last observed hash in user data and custom metrics.

        Index 1 of an agent's observation is read, which is the hash entry of
        the ``[probability, hash]`` observation built by the environment.
        Values are written under the keys ``agent_<i>_hash`` for i = 0..4.
        """
        for idx in range(5):
            agent_id = "agent_" + str(idx)
            metric_key = agent_id + "_hash"
            last_hash = episode.last_observation_for(agent_id)[1]
            episode.user_data[metric_key] = last_hash
            episode.custom_metrics[metric_key] = last_hash
        

In [None]:
from ray import tune

# Shallow copy of the trainer config with the custom callbacks attached,
# so the original `config` dict stays untouched.
tune_config = {**config, "callbacks": MyCallbacks}

# Train PPO for 100 iterations, checkpointing every 5 iterations and once more
# at the end; results are written to the "5_miners_12" directory.
# To resume from an existing checkpoint, additionally pass
# `restore=<path to the checkpoint file>`.
tune.run(
    "PPO",
    config=tune_config,
    stop=dict(training_iteration=100),
    checkpoint_freq=5,
    checkpoint_at_end=True,
    local_dir="5_miners_12",
)