In [1]:
import environment.job_search_environment as job_search_env
import argparse
import os
import random
from supersuit import pad_observations_v0, pad_action_space_v0

import ray
from ray import tune
from ray.tune.registry import register_env




In [2]:
from gym.spaces import Discrete, Dict, Tuple, Box

from gym.spaces.utils import flatten, flatdim, flatten_space

In [3]:
# NOTE: This is working on the Zoo, but not on my local machine (M1 compatibility issues)
from ray.rllib.algorithms.ppo import PPOConfig

In [4]:
import numpy as np

In [5]:
import gym

In [6]:
from ray import tune
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.tune.registry import register_env
import torch
from torch import nn
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.torch_utils import FLOAT_MIN
from ray.rllib.models.preprocessors import Preprocessor, DictFlatteningPreprocessor

In [7]:
tf1, tf, tfv = try_import_tf()
torch, _ = try_import_torch()

In [8]:
ray.shutdown()

In [9]:
ray.init()

E1214 14:17:03.896116766 1074645 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E1214 14:17:03.928415111 1074645 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E1214 14:17:06.573928855 1074645 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E1214 14:17:06.609438915 1074645 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
2022-12-14 14:17:07,065	INFO worker.py:1528 -- Started a local Ray instance.


0,1
Python version:,3.10.8
Ray version:,2.1.0


In [10]:
"""
In order to deal with the Dictionary space, need to pass a custom model to RLlib.
See: https://medium.com/@nima.siboni/rllib-with-dictionary-state-baa06b64470f
"""
class CandidateModelV0(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name, **kwargs):
        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "observation" in orig_space.spaces
        )
        print("Orig space")
        print(orig_space)
        print("Obs space")
        print(obs_space)
        print("Flattened obs space")
        print(flatten_space(orig_space["observation"]))
        print("Act space")
        print(action_space)
        print("Num outputs")
        print(num_outputs)
        print("Model config") 
        print(model_config)
        
#         self.orig_space = orig_space
        
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name, **kwargs)
        nn.Module.__init__(self)
        
        self.internal_model = TorchFC(
            flatten_space(orig_space["observation"]),
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )
        # disable action masking --> will likely lead to invalid actions
        self.no_masking = False
        if "no_masking" in model_config["custom_model_config"]:
            self.no_masking = model_config["custom_model_config"]["no_masking"]
        
    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]
        
#         print("original_model")
        
#         print(input_dict["obs_flat"][:,self.num_outputs:])
#         print(input_dict["obs_flat"][:,self.num_outputs:].size())
        
        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs_flat"][:,self.num_outputs:]})
        
        
#         print("logits:\n", logits)
        
        
        # If action masking is disabled, directly return unmasked logits
        if self.no_masking:
            return logits, state

        # Convert action_mask into a [0.0 || -inf]-type mask.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        masked_logits = logits + inf_mask

#         print("masks:\n", inf_mask, "\n", masked_logits)
        
        # Return masked logits.
        return masked_logits, state

    def value_function(self):
        return self.internal_model.value_function()

In [11]:
def env_creator(args):
    env = job_search_env.env()
    return env

In [12]:
env_name = "job_search_env"
register_env(env_name, lambda config: ParallelPettingZooEnv(env_creator(config)))

In [13]:
ModelCatalog.register_custom_model("CandidateModelV0", CandidateModelV0)

In [14]:
# TODO: use policy_map to map different policies to candidate and employer agents

In [15]:
config = (
    PPOConfig()
    .environment(env=env_name, clip_actions=False)
    .debugging(log_level="ERROR")
    .framework(framework="torch")
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))

    .training(model={
                        "custom_model": CandidateModelV0,
                        "custom_model_config": {},
    })
).to_dict()

In [16]:
job_search_env.env().agents

['candidate_0',
 'candidate_1',
 'candidate_2',
 'candidate_3',
 'candidate_4',
 'employer_0',
 'employer_1',
 'employer_2',
 'employer_3',
 'employer_4']

In [17]:
config["observation_space"] = job_search_env.env().observation_space("candidate_0")
config["action_space"] = job_search_env.env().action_space("candidate_0")

In [18]:
config["observation_space"]

Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Discrete(2), Discrete(101)), employer_1:Tuple(Discrete(2), Discrete(101)), 

In [19]:
config["action_space"]

Tuple(Discrete(5), Discrete(5), Discrete(101), Discrete(11))

In [None]:
tune.run(
    "PPO",
    name="PPO",
    stop={"timesteps_total": 10000},
    checkpoint_freq=10,
    local_dir="~/ray_results/" + env_name,
    config=config,
    num_samples=5,
)

0,1
Current time:,2022-12-14 14:20:27
Running for:,00:03:17.65
Memory:,25.9/62.4 GiB

Trial name,status,loc,iter,total time (s),ts,reward,num_recreated_worker s,episode_reward_max,episode_reward_min
PPO_job_search_env_e8378_00000,RUNNING,128.36.108.32:1075355,2,156.546,8000,52.5058,0,165.494,0
PPO_job_search_env_e8378_00001,RUNNING,128.36.108.32:1075604,1,73.2469,4000,48.2991,0,219.334,0
PPO_job_search_env_e8378_00002,RUNNING,128.36.108.32:1075771,1,80.4032,4000,35.4808,0,124.133,0
PPO_job_search_env_e8378_00003,RUNNING,128.36.108.32:1076051,1,84.3503,4000,30.094,0,132.76,0
PPO_job_search_env_e8378_00004,RUNNING,128.36.108.32:1077139,1,86.6299,4000,36.2847,0,151.321,0


[2m[36m(pid=1075355)[0m 
[2m[36m(PPO pid=1075355)[0m 2022-12-14 14:17:19,670	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=1075355)[0m 2022-12-14 14:17:19,671	INFO algorithm.py:457 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1075452)[0m 
[2m[36m(pid=1075451)[0m 


[2m[36m(RolloutWorker pid=1075452)[0m Orig space
[2m[36m(RolloutWorker pid=1075452)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict

[2m[36m(PPO pid=1075355)[0m Orig space
[2m[36m(PPO pid=1075355)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Di

[2m[36m(RolloutWorker pid=1075451)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
[2m[36m(pid=1075604)[0m 
[2m[36m(PPO pid=1075604)[0m 2022-12-14 14:17:39,518	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=1075604)[0m 2022-12-14 14:17:39,519	INFO algorithm.py:457 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1075675)[0m 
[2m[36m(pid=1075676)[0m 


[2m[36m(RolloutWorker pid=1075675)[0m Orig space
[2m[36m(RolloutWorker pid=1075675)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict

[2m[36m(PPO pid=1075604)[0m Orig space
[2m[36m(PPO pid=1075604)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Di

[2m[36m(PPO pid=1075604)[0m 2022-12-14 14:17:49,553	INFO trainable.py:164 -- Trainable.setup took 10.039 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=1075675)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
[2m[36m(pid=1075771)[0m 
[2m[36m(PPO pid=1075771)[0m 2022-12-14 14:18:00,292	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=1075771)[0m 2022-12-14 14:18:00,294	INFO algorithm.py:457 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1075873)[0m 
[2m[36m(pid=1075874)[0m 


[2m[36m(RolloutWorker pid=1075873)[0m Orig space
[2m[36m(RolloutWorker pid=1075873)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict

[2m[36m(PPO pid=1075771)[0m Orig space
[2m[36m(PPO pid=1075771)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Di

[2m[36m(PPO pid=1075771)[0m 2022-12-14 14:18:10,477	INFO trainable.py:164 -- Trainable.setup took 10.190 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=1075873)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
[2m[36m(pid=1076051)[0m 
[2m[36m(PPO pid=1076051)[0m 2022-12-14 14:18:21,884	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=1076051)[0m 2022-12-14 14:18:21,886	INFO algorithm.py:457 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1076811)[0m 
[2m[36m(pid=1076812)[0m 


[2m[36m(RolloutWorker pid=1076811)[0m Orig space
[2m[36m(RolloutWorker pid=1076811)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict

[2m[36m(PPO pid=1076051)[0m Orig space
[2m[36m(PPO pid=1076051)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Di

[2m[36m(PPO pid=1076051)[0m 2022-12-14 14:18:32,426	INFO trainable.py:164 -- Trainable.setup took 10.548 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=1076811)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
[2m[36m(pid=1077139)[0m 
[2m[36m(PPO pid=1077139)[0m 2022-12-14 14:18:43,368	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPO pid=1077139)[0m 2022-12-14 14:18:43,370	INFO algorithm.py:457 -- Current log_level is ERROR. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1077678)[0m 
[2m[36m(pid=1077679)[0m 


[2m[36m(RolloutWorker pid=1077678)[0m Orig space
[2m[36m(RolloutWorker pid=1077678)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict

[2m[36m(PPO pid=1077139)[0m Orig space
[2m[36m(PPO pid=1077139)[0m Dict(action_mask:Box(0.0, 1.0, (122,), float32), observation:Dict(candidate_obs:Dict(accepted_offer:Dict(employer_0:Discrete(101), employer_1:Discrete(101), employer_2:Discrete(101), employer_3:Discrete(101), employer_4:Discrete(101)), counter_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), current_offers:Dict(employer_0:Tuple(Discrete(101), Discrete(11)), employer_1:Tuple(Discrete(101), Discrete(11)), employer_2:Tuple(Discrete(101), Discrete(11)), employer_3:Tuple(Discrete(101), Discrete(11)), employer_4:Tuple(Discrete(101), Discrete(11))), job_openings:Dict(employer_0:Discrete(2), employer_1:Discrete(2), employer_2:Discrete(2), employer_3:Discrete(2), employer_4:Discrete(2)), rejected_offers:Dict(employer_0:Tuple(Di

[2m[36m(PPO pid=1077139)[0m 2022-12-14 14:18:54,757	INFO trainable.py:164 -- Trainable.setup took 11.394 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(RolloutWorker pid=1077678)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_recreated_workers,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
PPO_job_search_env_e8378_00000,8000,"{'num_env_steps_sampled': 8000, 'num_env_steps_trained': 8000, 'num_agent_steps_sampled': 8000, 'num_agent_steps_trained': 8000}",{},2022-12-14_14-20-22,False,11,{},165.494,52.5058,0,36,72,5631ab7233694d82a908c3ea5b5dc085,rhino.zoo.cs.yale.edu,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 25.789734380347756, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 2.0523761076632367, 'policy_loss': -0.01728771798993631, 'vf_loss': 2.0664526755450874, 'vf_explained_var': 0.007890033978287891, 'kl': 0.01605578989999667, 'entropy': 3.41777140478934, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0}}, 'num_env_steps_sampled': 8000, 'num_env_steps_trained': 8000, 'num_agent_steps_sampled': 8000, 'num_agent_steps_trained': 8000}",2,128.36.108.32,8000,8000,8000,4000,8000,4000,0,2,0,4000,"{'cpu_util_percent': 61.439189189189186, 'ram_util_percent': 41.16283783783784}",1075355,{},{},{},"{'mean_raw_obs_processing_ms': 13.085766662635944, 'mean_inference_ms': 8.555934456707774, 'mean_action_processing_ms': 0.8080806697095734, 'mean_env_wait_ms': 1.2806151033363318, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 165.49433884519982, 'episode_reward_min': 0.0, 'episode_reward_mean': 52.50577701791463, 'episode_len_mean': 11.0, 'episode_media': {}, 'episodes_this_iter': 36, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [115.29290901495457, 0.0, 105.67879724703621, 75.57161263005324, 61.592381944610516, 20.30518086086061, 62.52538808418302, 33.65076098166124, 96.39665907096999, 0.0, 0.0, 28.206941992864518, 0.0, 61.33246949573479, 60.83880342588924, 93.00111253955174, 83.96699915487383, 76.68158184531302, 73.87532426702612, 0.0, 83.61857494203952, 0.0, 54.82398832432364, 35.606968705364025, 60.94938455068326, 0.0, 31.576104508678892, 56.854506410409705, 8.122072344344243, 112.8616431469907, 36.564554435194744, 155.9471810739832, 4.264087980780728, 64.17452411074996, 0.0, 0.7835261664684592, 51.712726986918284, 92.26540930425662, 66.02164665942195, 15.670523329369175, 14.181396156791536, 0.0, 0.0, 27.718183397365276, 54.147148962294956, 97.18945897267687, 11.664351817274424, 35.9047166333313, 165.49433884519982, 0.0, 136.78808397864725, 120.28957233820745, 0.0, 110.9950935992638, 0.0, 126.62769550696173, 118.5404096971712, 151.25580880590365, 80.32019121041088, 74.3909384978015, 9.822612056652146, 12.247569408138146, 35.87248618752041, 55.86610607220908, 42.190605131997806, 24.305301490450148, 32.53740243766023, 73.26786093960536, 70.58429263006532, 63.17167378934411, 54.30830119134941, 0.0], 'episode_lengths': [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 13.085766662635944, 'mean_inference_ms': 8.555934456707774, 'mean_action_processing_ms': 0.8080806697095734, 'mean_env_wait_ms': 1.2806151033363318, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",156.546,87.4234,156.546,"{'training_iteration_time_ms': 78262.57, 'load_time_ms': 0.813, 'load_throughput': 4917120.75, 'learn_time_ms': 73145.062, 'learn_throughput': 54.686, 'synch_weights_time_ms': 4.685}",1671045622,0,8000,2,e8378_00000,9.51092
PPO_job_search_env_e8378_00001,8000,"{'num_env_steps_sampled': 8000, 'num_env_steps_trained': 8000, 'num_agent_steps_sampled': 8000, 'num_agent_steps_trained': 8000}",{},2022-12-14_14-20-29,False,11,{},219.334,59.3363,0,36,72,0d1ad6ba05d643e2a4fe4844452bbeff,rhino.zoo.cs.yale.edu,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 21.88394390498438, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 2.244137389365063, 'policy_loss': -0.02175425861972154, 'vf_loss': 2.263128360368872, 'vf_explained_var': 0.0015155956309328797, 'kl': 0.01381642161428019, 'entropy': 3.378967484094763, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0}}, 'num_env_steps_sampled': 8000, 'num_env_steps_trained': 8000, 'num_agent_steps_sampled': 8000, 'num_agent_steps_trained': 8000}",2,128.36.108.32,8000,8000,8000,4000,8000,4000,0,2,0,4000,"{'cpu_util_percent': 62.77560975609756, 'ram_util_percent': 41.49268292682928}",1075604,{},{},{},"{'mean_raw_obs_processing_ms': 13.222800150273864, 'mean_inference_ms': 8.823496178198353, 'mean_action_processing_ms': 0.8247781698509217, 'mean_env_wait_ms': 1.3031853895752596, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 219.3340876960251, 'episode_reward_min': 0.0, 'episode_reward_mean': 59.33629585128698, 'episode_len_mean': 11.0, 'episode_media': {}, 'episodes_this_iter': 36, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [0.0, 0.0, 69.39803188720634, 10.66021995195182, 0.0, 70.61051439990719, 42.64087980780728, 68.16677648275592, 0.0, 33.579692848648236, 51.568713297423756, 41.957826214385975, 110.84496706039504, 0.0, 46.894041063137266, 19.954648526077094, 0.0, 42.31041298929678, 0.0, 57.65480787931382, 0.0, 219.3340876960251, 93.2033966592292, 0.0, 61.592381944610516, 143.0227845820814, 0.0, 77.11667977014142, 0.0, 60.77741210053516, 69.97229785544188, 66.4631789069017, 73.91690154212215, 97.84235730775688, 35.818339038558115, 73.46607818134234, 118.31245113673728, 99.12493753167807, 171.66664624898465, 60.44344712756682, 176.6310795341034, 92.32088898071291, 64.3178383228312, 77.3715218763624, 41.132187987230864, 10.152590430430305, 35.88322966945737, 94.02313997621506, 64.39847756490722, 82.04984743168158, 110.21150887122636, 35.45349039197884, 7.462153966366274, 39.12776121442029, 102.94897598395244, 0.0, 104.2424704512209, 57.58917323543173, 109.26858976883261, 43.8185802977372, 0.0, 41.132187987230864, 0.0, 25.719895757090107, 29.46783616995644, 57.565187740539834, 110.51371059596215, 125.01159986004261, 28.427253205204853, 62.90595793646769, 107.53361851059519, 147.21763753645462], 'episode_lengths': [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 13.222800150273864, 'mean_inference_ms': 8.823496178198353, 'mean_action_processing_ms': 0.8247781698509217, 'mean_env_wait_ms': 1.3031853895752596, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",159.597,86.3505,159.597,"{'training_iteration_time_ms': 79788.132, 'load_time_ms': 0.762, 'load_throughput': 5247799.812, 'learn_time_ms': 74637.575, 'learn_throughput': 53.592, 'synch_weights_time_ms': 4.848}",1671045629,0,8000,2,e8378_00001,10.054
PPO_job_search_env_e8378_00002,4000,"{'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",{},2022-12-14_14-19-30,False,11,{},124.133,35.4808,0,36,36,5842759c267742a0b4a41cdc65177642,rhino.zoo.cs.yale.edu,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 20.045516858177802, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 1.0203754210343925, 'policy_loss': -0.015016081184148789, 'vf_loss': 1.0325067707477968, 'vf_explained_var': -0.22447944873122758, 'kl': 0.014423643865088787, 'entropy': 3.2570295941445133, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",1,128.36.108.32,4000,4000,4000,4000,4000,4000,0,2,0,4000,"{'cpu_util_percent': 60.83391304347827, 'ram_util_percent': 38.753043478260864}",1075771,{},{},{},"{'mean_raw_obs_processing_ms': 14.020051529158406, 'mean_inference_ms': 10.222694173974181, 'mean_action_processing_ms': 0.8495221683635047, 'mean_env_wait_ms': 1.3960421974979231, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 124.13293123050298, 'episode_reward_min': 0.0, 'episode_reward_mean': 35.48075831205739, 'episode_len_mean': 11.0, 'episode_media': {}, 'episodes_this_iter': 36, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [0.0, 0.0, 0.0, 58.321759086372126, 47.75778538474415, 0.0, 0.0, 1.2892178324355943, 13.714880799147823, 118.64984128376261, 14.886997162900718, 79.8503775500951, 0.0, 9.0497272227107, 61.237847040690724, 58.740956700140366, 0.0, 0.0, 21.658859584917984, 0.0, 80.76289237798218, 38.66701631521845, 40.33298882667201, 18.274662774774548, 90.55887487291382, 17.237575662306092, 0.0, 0.0, 66.39471837043311, 69.10700788251808, 64.24914565041362, 0.0, 112.50152526510352, 69.92971035730996, 124.13293123050298, 0.0], 'episode_lengths': [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 14.020051529158406, 'mean_inference_ms': 10.222694173974181, 'mean_action_processing_ms': 0.8495221683635047, 'mean_env_wait_ms': 1.3960421974979231, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",80.4032,80.4032,80.4032,"{'training_iteration_time_ms': 80393.788, 'load_time_ms': 0.415, 'load_throughput': 9642078.161, 'learn_time_ms': 74954.919, 'learn_throughput': 53.365, 'synch_weights_time_ms': 3.367}",1671045570,0,4000,1,e8378_00002,10.2045
PPO_job_search_env_e8378_00003,4000,"{'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",{},2022-12-14_14-19-56,False,11,{},132.76,30.094,0,36,36,965a7a56024b4597be8f8e76dacd68ce,rhino.zoo.cs.yale.edu,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 20.767451837722973, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 1.1207759798053771, 'policy_loss': -0.01724407588061626, 'vf_loss': 1.1353573478073613, 'vf_explained_var': -0.051303563835800335, 'kl': 0.013313520703477674, 'entropy': 3.280034283156036, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",1,128.36.108.32,4000,4000,4000,4000,4000,4000,0,2,0,4000,"{'cpu_util_percent': 61.81916666666667, 'ram_util_percent': 40.751666666666665}",1076051,{},{},{},"{'mean_raw_obs_processing_ms': 13.739566897871482, 'mean_inference_ms': 10.503789085653883, 'mean_action_processing_ms': 0.8373966264487497, 'mean_env_wait_ms': 1.3157087772046747, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 132.7601070351783, 'episode_reward_min': 0.0, 'episode_reward_mean': 30.094030121143994, 'episode_len_mean': 11.0, 'episode_media': {}, 'episodes_this_iter': 36, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [46.701915979979404, 4.264087980780728, 0.0, 9.822612056652147, 0.7835261664684589, 46.228043821639076, 13.16323959667011, 0.0, 7.835261664684587, 0.0, 28.117913832199545, 132.7601070351783, 30.916211013372475, 50.71098055828146, 78.00453514739229, 23.858371768964574, 0.0, 44.062242468067524, 44.42593363876162, 21.5959399632869, 2.5784356648711886, 0.6446089162177971, 50.45837443923862, 58.441855866596036, 0.0, 6.875080424726641, 40.508835817416916, 0.0, 0.0, 0.0, 100.91674887847724, 116.62825275014725, 66.599724149819, 0.0, 50.08611279012283, 6.3961319711710924], 'episode_lengths': [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 13.739566897871482, 'mean_inference_ms': 10.503789085653883, 'mean_action_processing_ms': 0.8373966264487497, 'mean_env_wait_ms': 1.3157087772046747, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",84.3503,84.3503,84.3503,"{'training_iteration_time_ms': 84341.791, 'load_time_ms': 0.427, 'load_throughput': 9367513.121, 'learn_time_ms': 78554.916, 'learn_throughput': 50.92, 'synch_weights_time_ms': 3.471}",1671045596,0,4000,1,e8378_00003,10.5645
PPO_job_search_env_e8378_00004,4000,"{'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",{},2022-12-14_14-20-21,False,11,{},151.321,36.2847,0,36,36,a15f5b82de8b4569829dc4f23a0b4960,rhino.zoo.cs.yale.edu,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 20.175255126049443, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 1.2869643083182714, 'policy_loss': -0.018620791478503134, 'vf_loss': 1.3034242171656147, 'vf_explained_var': -0.12801709072564238, 'kl': 0.010804397656364968, 'entropy': 3.213193707312307, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",1,128.36.108.32,4000,4000,4000,4000,4000,4000,0,2,0,4000,"{'cpu_util_percent': 62.849999999999994, 'ram_util_percent': 41.46774193548387}",1077139,{},{},{},"{'mean_raw_obs_processing_ms': 13.436183407532042, 'mean_inference_ms': 9.995926671953344, 'mean_action_processing_ms': 0.8102815542648089, 'mean_env_wait_ms': 1.3158131594681621, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 151.32140342360083, 'episode_reward_min': 0.0, 'episode_reward_mean': 36.284660094365066, 'episode_len_mean': 11.0, 'episode_media': {}, 'episodes_this_iter': 36, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [49.12355052598061, 0.0, 0.0, 0.0, 75.86523199139044, 0.0, 60.87998313459926, 124.56214377722057, 0.0, 32.69134118598558, 6.139132535407591, 47.62956266644088, 30.65969618250578, 86.98062641430657, 57.093932579290595, 13.536787240573739, 0.0, 33.76522894474175, 81.34738909547924, 0.0, 0.0, 2.9848615865465096, 12.792263942342183, 151.32140342360083, 3.134104665873835, 46.02507661795072, 126.60606449409832, 89.96218438015744, 10.958351575702551, 23.964947982687157, 73.14579379520183, 0.0, 0.0, 44.772923798197645, 0.0, 20.30518086086061], 'episode_lengths': [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 13.436183407532042, 'mean_inference_ms': 9.995926671953344, 'mean_action_processing_ms': 0.8102815542648089, 'mean_env_wait_ms': 1.3158131594681621, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",86.6299,86.6299,86.6299,"{'training_iteration_time_ms': 86619.78, 'load_time_ms': 0.393, 'load_throughput': 10180349.515, 'learn_time_ms': 81286.106, 'learn_throughput': 49.209, 'synch_weights_time_ms': 3.48}",1671045621,0,4000,1,e8378_00004,11.4109


### Use gym.space instead of gymnasium.spaces to fix this AHHHHH
```
from gym.spaces import Discrete, Dict, Tuple

from gym.spaces.utils import flatten, flatdim
```

In [234]:
from gym.spaces import Discrete, Dict, Tuple, Box

from gym.spaces.utils import flatten, flatdim

In [243]:
space = Dict({"test": Discrete(10)})
space["test"]

Discrete(10)

In [235]:
np.zeros(flatdim(Tuple((Discrete(4), Discrete(2)))))

array([0., 0., 0., 0., 0., 0.])

In [236]:
Box(0.0, 1.0, shape=(10,)).sample()

array([0.66815686, 0.8099814 , 0.6728993 , 0.3833307 , 0.83226   ,
       0.01675155, 0.36507058, 0.9905923 , 0.5650419 , 0.37177613],
      dtype=float32)

In [237]:
Box(0.0, 1, shape=(10,)).sample()

array([0.8192585 , 0.21756119, 0.7595661 , 0.06519676, 0.5593857 ,
       0.6580883 , 0.89733326, 0.01662113, 0.39561164, 0.0882608 ],
      dtype=float32)

In [238]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [239]:
flatten(Tuple((Discrete(10), Discrete(5))), (0,0))

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [240]:
action_mask = torch.from_numpy(np.array([1, 0]))
action_mask 

tensor([1, 0])

In [241]:
inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
inf_mask

tensor([ 0.0000e+00, -3.4000e+38])

In [17]:
logits = torch.tensor([1,1])
logits + inf_mask

tensor([ 1.0000e+00, -3.4000e+38])

In [16]:
torch.tensor([0, 1]) + torch.tensor([1, 0])

tensor([1, 1])

# Run 1
Got the error: 
```
AssertionError: Observation spaces for all agents must be identical. Perhaps SuperSuit's pad_observations wrapper can help (useage: `supersuit.aec_wrappers.pad_observations(env)`
```

# Run 2

```
AssertionError: homogenization only supports Discrete and Box spaces
```

# Run 3

Same error
```
Traceback (most recent call last):
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/execution/ray_trial_executor.py", line 1050, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2291, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=1879526, ip=128.36.108.57, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 139, in __init__
    self.add_workers(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 490, in add_workers
    self.foreach_worker(lambda w: w.assert_healthy())
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 620, in foreach_worker
    remote_results = ray.get([w.apply.remote(func) for w in self.remote_workers()])
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=1879573, ip=128.36.108.57, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fa145f35d50>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 492, in __init__
    self.env = env_creator(copy.deepcopy(self.env_context))
  File "/tmp/ipykernel_1861406/2067092441.py", line 2, in <lambda>
  File "/tmp/ipykernel_1861406/841127696.py", line 3, in env_creator
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/supersuit/multiagent_wrappers/padding_wrappers.py", line 33, in pad_observations_v0
    homogenize_ops.check_homogenize_spaces(spaces)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/supersuit/utils/action_transforms/homogenize_ops.py", line 30, in check_homogenize_spaces
    assert False, "homogenization only supports Discrete and Box spaces"
AssertionError: homogenization only supports Discrete and Box spaces

During handling of the above exception, another exception occurred:

ray::PPO.__init__() (pid=1879526, ip=128.36.108.57, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 549, in setup
    raise e.args[0].args[2]
AssertionError: homogenization only supports Discrete and Box spaces
```

i.e. I cannot use the SuperSuit wrapper to fix the issue of observation spaces for all agents needing to be identical.

# Run 4

Fixed the issue by making all observation spaces and action spaces the same for all agents.

New issue:

```
Failure # 1 (occurred at 2022-12-12_17-37-17)
Traceback (most recent call last):
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/execution/ray_trial_executor.py", line 1050, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2291, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=499392, ip=128.36.232.21, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 139, in __init__
    self.add_workers(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 490, in add_workers
    self.foreach_worker(lambda w: w.assert_healthy())
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 620, in foreach_worker
    remote_results = ray.get([w.apply.remote(func) for w in self.remote_workers()])
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=499452, ip=128.36.232.21, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f8059b28df0>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 567, in __init__
    self.policy_dict = _determine_spaces_for_multi_agent_dict(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 2121, in _determine_spaces_for_multi_agent_dict
    raise ValueError(
ValueError: `observation_space` not provided in PolicySpec for default_policy and env does not have an observation space OR no spaces received from other workers' env(s) OR no `observation_space` specified in config!

During handling of the above exception, another exception occurred:

ray::PPO.__init__() (pid=499392, ip=128.36.232.21, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 549, in setup
    raise e.args[0].args[2]
ValueError: `observation_space` not provided in PolicySpec for default_policy and env does not have an observation space OR no spaces received from other workers' env(s) OR no `observation_space` specified in config!
```

# Run 5

```
Failure # 1 (occurred at 2022-12-12_20-29-44)
Traceback (most recent call last):
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/execution/ray_trial_executor.py", line 1050, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2291, in get
    raise value
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::PPO.__init__() (pid=2054378, ip=128.36.108.57, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 139, in __init__
    self.add_workers(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 490, in add_workers
    self.foreach_worker(lambda w: w.assert_healthy())
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/worker_set.py", line 620, in foreach_worker
    remote_results = ray.get([w.apply.remote(func) for w in self.remote_workers()])
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=2054502, ip=128.36.108.57, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f0abd339d50>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 625, in __init__
    self._build_policy_map(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1882, in _build_policy_map
    preprocessor = ModelCatalog.get_preprocessor_for_space(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/models/catalog.py", line 815, in get_preprocessor_for_space
    prep = cls(observation_space, options)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/models/preprocessors.py", line 42, in __init__
    self._size = int(np.product(self.shape))
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

During handling of the above exception, another exception occurred:

ray::PPO.__init__() (pid=2054378, ip=128.36.108.57, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 414, in __init__
    super().__init__(config=config, logger_creator=logger_creator, **kwargs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 161, in __init__
    self.setup(copy.deepcopy(self.config))
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 549, in setup
    raise e.args[0].args[2]
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'
```

Note: the `Dict` space does not have a shape <https://github.com/openai/gym/blob/master/gym/spaces/dict.py#L118>

Nope, the issue was I was using the wrong spaces from gymnasium.spaces, when instead RLlib assumes using gym.spaces spaces

# Run 6

```
2022-12-12 21:59:59,682	ERROR trial_runner.py:993 -- Trial PPO_job_search_30f4a_00000: Error processing event.
ray.exceptions.RayTaskError(ValueError): ray::PPO.train() (pid=1671497, ip=128.36.232.24, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 355, in train
    raise skipped from exception_cause(skipped)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 352, in train
    result = self.step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 772, in step
    results, train_iter_ctx = self._run_one_training_iteration()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2953, in _run_one_training_iteration
    num_recreated += self.try_recover_from_step_attempt(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2617, in try_recover_from_step_attempt
    raise error
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2948, in _run_one_training_iteration
    results = self.training_step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo.py", line 408, in training_step
    train_batch = synchronous_parallel_sample(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/execution/rollout_ops.py", line 100, in synchronous_parallel_sample
    sample_batches = ray.get(
ray.exceptions.RayTaskError(ValueError): ray::RolloutWorker.sample() (pid=1671528, ip=128.36.232.24, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f25d5759db0>)
ValueError: The two structures don't have the same nested structure.

First structure: type=dict str={'observation': {'candidate_obs': {'job_openings': {'employer_0': 1}, 'accepted_offer': {'employer_0': 0}, 'current_offers': {'employer_0': (0, 0)}, 'rejected_offers': {'employer_0': (0, 0)}, 'counter_offers': {'employer_0': (0, 0)}}, 'employer_obs': {'candidate_strengths': {'candidate_0': 0}, 'job_applicants': {'candidate_0': 0}, 'outstanding_offers': {'candidate_0': (0, 0)}, 'accepted_offers': {'candidate_0': 0}, 'declined_offers': {'candidate_0': (0, 0)}, 'counter_offers': {'candidate_0': (0, 0)}, 'rejected_offers': {'candidate_0': (0, 0)}, 'remaining_budget': 100}}, 'action_mask': array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])}

Second structure: type=OrderedDict str=OrderedDict([('candidate_obs', OrderedDict([('accepted_offer', OrderedDict([('employer_0', 56)])), ('counter_offers', OrderedDict([('employer_0', (99, 8))])), ('current_offers', OrderedDict([('employer_0', (28, 10))])), ('job_openings', OrderedDict([('employer_0', 1)])), ('rejected_offers', OrderedDict([('employer_0', (1, 49))]))])), ('employer_obs', OrderedDict([('accepted_offers', OrderedDict([('candidate_0', 1)])), ('candidate_strengths', OrderedDict([('candidate_0', 79)])), ('counter_offers', OrderedDict([('candidate_0', (66, 5))])), ('declined_offers', OrderedDict([('candidate_0', (1, 7))])), ('job_applicants', OrderedDict([('candidate_0', 1)])), ('outstanding_offers', OrderedDict([('candidate_0', (28, 0))])), ('rejected_offers', OrderedDict([('candidate_0', (0, 4))])), ('remaining_budget', 96)]))])

More specifically: Substructure "type=OrderedDict str=OrderedDict([('accepted_offer', OrderedDict([('employer_0', 56)])), ('counter_offers', OrderedDict([('employer_0', (99, 8))])), ('current_offers', OrderedDict([('employer_0', (28, 10))])), ('job_openings', OrderedDict([('employer_0', 1)])), ('rejected_offers', OrderedDict([('employer_0', (1, 49))]))])" is a sequence, while substructure "type=ndarray str=[0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]" is not

During handling of the above exception, another exception occurred:

ray::RolloutWorker.sample() (pid=1671528, ip=128.36.232.24, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f25d5759db0>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
    batches = [self.input_reader.next()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
    batches = [self.get_data()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
    item = next(self._env_runner)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 671, in _env_runner
    active_envs, to_eval, outputs = _process_observations(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 922, in _process_observations
    prep_obs = preprocessor.transform(raw_obs)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/models/preprocessors.py", line 283, in transform
    self.check_shape(observation)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/models/preprocessors.py", line 69, in check_shape
    observation = convert_element_to_space_type(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/utils/spaces/space_utils.py", line 359, in convert_element_to_space_type
    return tree.map_structure(map_, element, sampled_element, check_types=False)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/tree/__init__.py", line 428, in map_structure
    assert_same_structure(structures[0], other, check_types=check_types)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/tree/__init__.py", line 284, in assert_same_structure
    raise type(e)("%s\n"
ValueError: The two structures don't have the same nested structure.

First structure: type=dict str={'observation': {'candidate_obs': {'job_openings': {'employer_0': 1}, 'accepted_offer': {'employer_0': 0}, 'current_offers': {'employer_0': (0, 0)}, 'rejected_offers': {'employer_0': (0, 0)}, 'counter_offers': {'employer_0': (0, 0)}}, 'employer_obs': {'candidate_strengths': {'candidate_0': 0}, 'job_applicants': {'candidate_0': 0}, 'outstanding_offers': {'candidate_0': (0, 0)}, 'accepted_offers': {'candidate_0': 0}, 'declined_offers': {'candidate_0': (0, 0)}, 'counter_offers': {'candidate_0': (0, 0)}, 'rejected_offers': {'candidate_0': (0, 0)}, 'remaining_budget': 100}}, 'action_mask': array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])}

Second structure: type=OrderedDict str=OrderedDict([('candidate_obs', OrderedDict([('accepted_offer', OrderedDict([('employer_0', 56)])), ('counter_offers', OrderedDict([('employer_0', (99, 8))])), ('current_offers', OrderedDict([('employer_0', (28, 10))])), ('job_openings', OrderedDict([('employer_0', 1)])), ('rejected_offers', OrderedDict([('employer_0', (1, 49))]))])), ('employer_obs', OrderedDict([('accepted_offers', OrderedDict([('candidate_0', 1)])), ('candidate_strengths', OrderedDict([('candidate_0', 79)])), ('counter_offers', OrderedDict([('candidate_0', (66, 5))])), ('declined_offers', OrderedDict([('candidate_0', (1, 7))])), ('job_applicants', OrderedDict([('candidate_0', 1)])), ('outstanding_offers', OrderedDict([('candidate_0', (28, 0))])), ('rejected_offers', OrderedDict([('candidate_0', (0, 4))])), ('remaining_budget', 96)]))])

More specifically: Substructure "type=OrderedDict str=OrderedDict([('accepted_offer', OrderedDict([('employer_0', 56)])), ('counter_offers', OrderedDict([('employer_0', (99, 8))])), ('current_offers', OrderedDict([('employer_0', (28, 10))])), ('job_openings', OrderedDict([('employer_0', 1)])), ('rejected_offers', OrderedDict([('employer_0', (1, 49))]))])" is a sequence, while substructure "type=ndarray str=[0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]" is not
Entire first structure:
{'observation': {'candidate_obs': {'job_openings': {'employer_0': .}, 'accepted_offer': {'employer_0': .}, 'current_offers': {'employer_0': (., .)}, 'rejected_offers': {'employer_0': (., .)}, 'counter_offers': {'employer_0': (., .)}}, 'employer_obs': {'candidate_strengths': {'candidate_0': .}, 'job_applicants': {'candidate_0': .}, 'outstanding_offers': {'candidate_0': (., .)}, 'accepted_offers': {'candidate_0': .}, 'declined_offers': {'candidate_0': (., .)}, 'counter_offers': {'candidate_0': (., .)}, 'rejected_offers': {'candidate_0': (., .)}, 'remaining_budget': .}}, 'action_mask': .}
Entire second structure:
OrderedDict([('candidate_obs', OrderedDict([('accepted_offer', OrderedDict([('employer_0', .)])), ('counter_offers', OrderedDict([('employer_0', (., .))])), ('current_offers', OrderedDict([('employer_0', (., .))])), ('job_openings', OrderedDict([('employer_0', .)])), ('rejected_offers', OrderedDict([('employer_0', (., .))]))])), ('employer_obs', OrderedDict([('accepted_offers', OrderedDict([('candidate_0', .)])), ('candidate_strengths', OrderedDict([('candidate_0', .)])), ('counter_offers', OrderedDict([('candidate_0', (., .))])), ('declined_offers', OrderedDict([('candidate_0', (., .))])), ('job_applicants', OrderedDict([('candidate_0', .)])), ('outstanding_offers', OrderedDict([('candidate_0', (., .))])), ('rejected_offers', OrderedDict([('candidate_0', (., .))])), ('remaining_budget', .)]))])
```

# Run 7

Issues with observation/action_mask dictionary structure. Fixed by updating the observation space definition to also include the action mask.

# Run 8

```
Failure # 1 (occurred at 2022-12-13_13-53-45)
ray::PPO.train() (pid=2035053, ip=128.36.232.24, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 355, in train
    raise skipped from exception_cause(skipped)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 352, in train
    result = self.step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 772, in step
    results, train_iter_ctx = self._run_one_training_iteration()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2953, in _run_one_training_iteration
    num_recreated += self.try_recover_from_step_attempt(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2617, in try_recover_from_step_attempt
    raise error
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2948, in _run_one_training_iteration
    results = self.training_step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo.py", line 408, in training_step
    train_batch = synchronous_parallel_sample(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/execution/rollout_ops.py", line 100, in synchronous_parallel_sample
    sample_batches = ray.get(
ray.exceptions.RayTaskError(ValueError): ray::RolloutWorker.sample() (pid=2035175, ip=128.36.232.24, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fa1c5255d80>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
    batches = [self.input_reader.next()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
    batches = [self.get_data()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
    item = next(self._env_runner)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
    base_env.send_actions(actions_to_send)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/multi_agent_env.py", line 615, in send_actions
    raise e
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/multi_agent_env.py", line 608, in send_actions
    obs, rewards, dones, infos = env.step(agent_dict)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/wrappers/pettingzoo_env.py", line 184, in step
    obss, rews, dones, infos = self.par_env.step(action_dict)
ValueError: too many values to unpack (expected 4)
```

# Run 9

```
Failure # 1 (occurred at 2022-12-13_14-18-31)
ray::PPO.train() (pid=2045783, ip=128.36.232.24, repr=PPO)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 355, in train
    raise skipped from exception_cause(skipped)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 352, in train
    result = self.step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 772, in step
    results, train_iter_ctx = self._run_one_training_iteration()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2953, in _run_one_training_iteration
    num_recreated += self.try_recover_from_step_attempt(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2617, in try_recover_from_step_attempt
    raise error
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/algorithm.py", line 2948, in _run_one_training_iteration
    results = self.training_step()
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/algorithms/ppo/ppo.py", line 408, in training_step
    train_batch = synchronous_parallel_sample(
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/execution/rollout_ops.py", line 100, in synchronous_parallel_sample
    sample_batches = ray.get(
ray.exceptions.RayTaskError(KeyError): ray::RolloutWorker.sample() (pid=2045906, ip=128.36.232.24, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f68b792dde0>)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/rollout_worker.py", line 828, in sample
    batches = [self.input_reader.next()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
    batches = [self.get_data()]
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 285, in get_data
    item = next(self._env_runner)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/evaluation/sampler.py", line 721, in _env_runner
    base_env.send_actions(actions_to_send)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/multi_agent_env.py", line 615, in send_actions
    raise e
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/multi_agent_env.py", line 608, in send_actions
    obs, rewards, dones, infos = env.step(agent_dict)
  File "/home/accts/ahc49/.local/lib/python3.10/site-packages/ray/rllib/env/wrappers/pettingzoo_env.py", line 184, in step
    obss, rews, dones, infos = self.par_env.step(action_dict)
  File "/home/accts/ahc49/csec491/salary-negotation/environment/job_search_environment.py", line 394, in step
    action, target_index, new_offer_value, new_deadline = actions[agent]
KeyError: 'employer_0'
```

I didn't actually specify in the config to use the custom model sigh :(
