# Algorithm VF Functions?

In [50]:
import numpy as np
import sys
from glob import glob
from os import path

import ray
from ray.tune import Trainable, Tuner
from ray.tune.registry import register_trainable, validate_trainable
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.utils.test_utils import (
    add_rllib_example_script_args,
    run_rllib_example_script_experiment,
)
from ray.rllib.policy.policy import Policy
from ray.tune.registry import get_trainable_cls, register_env

from pettingzoo.sisl import waterworld_v4




# Build the RLlib example-script parser with this notebook's defaults, then parse
# an explicit empty argument list rather than whatever the kernel was launched with.
parser = add_rllib_example_script_args(
    default_reward=300,
    default_timesteps=1000000,
    default_iters=10,
)
args = parser.parse_args(args=[])

# Notebook-specific settings, applied to the parsed namespace in one place.
# `test_agents` is consumed only by later cells of this notebook.
vars(args).update(
    num_env_runners=10,
    env='waterworld',
    algo='PPO',
    num_agents=4,
    test_agents=4,
)

# Root directory for the saved runs of this env / algorithm / agent count.
# NOTE(review): this absolute path is machine-specific; adjust when running elsewhere.
checkpoint_path = f"/root/test/{args.env}/{args.algo}/{args.num_agents}_agent"

# Use the lexicographically first entry under the root. Fail with an explicit
# message instead of a bare IndexError when the directory is empty or missing.
run_dirs = sorted(glob(checkpoint_path+'/*'))
if not run_dirs:
    raise FileNotFoundError(f"No checkpoint directories found under {checkpoint_path}")
sup = run_dirs[0]

# Each entry under <run>/policies is restored as a Policy and keyed by its
# directory name (later cells look these up as "pursuer_<i>").
pols = glob(sup+"/policies/*")
if not pols:
    # An empty mapping would only surface later as a confusing KeyError.
    raise FileNotFoundError(f"No policy checkpoints found under {sup}/policies")
specs = {path.basename(p) : Policy.from_checkpoint(p) for p in pols}

def _make_waterworld_env(_):
    """Env creator: a parallel Waterworld env with `args.num_agents` pursuers, wrapped for RLlib."""
    return ParallelPettingZooEnv(waterworld_v4.parallel_env(n_pursuers=args.num_agents))


# Register the creator under a name that encodes the agent count.
register_env(f"{args.num_agents}_agent_env", _make_waterworld_env)

# One policy id per pursuer.
policies = set(f"pursuer_{i}" for i in range(args.num_agents))


# Start from PPO's default config and fill it in one section at a time.
resto_config = get_trainable_cls("PPO").get_default_config()

# Use the environment registered for the configured number of agents.
resto_config = resto_config.environment(f"{args.num_agents}_agent_env")

# Every agent id maps to the policy with the same name.
resto_config = resto_config.multi_agent(
    policy_mapping_fn=(lambda aid, *args, **kwargs: aid),
    policies=policies,
)

# One default RLModuleSpec per policy id.
resto_config = resto_config.rl_module(
    rl_module_spec=MultiRLModuleSpec(
        rl_module_specs=dict((p, RLModuleSpec()) for p in policies),
    ),
)

# evaluation_interval=1, as in the original configuration.
resto_config = resto_config.evaluation(evaluation_interval=1)

# Instantiate the algorithm that the following cells operate on.
resto_algo = resto_config.build()
""" Known-good weight transfer

for test_id in range(args.test_agents):
    train_id = np.random.randint(args.num_agents)
    resto_algo.get_policy(f"pursuer_{test_id}").set_weights(specs[f"pursuer_{train_id}"].get_weights())
"""

# Replace each freshly built policy with the checkpointed policy of the same id:
# remove the existing entry first, then add the restored Policy object under that id.
for test_id in range(args.test_agents):
    pid = f"pursuer_{test_id}"
    resto_algo.remove_policy(pid)
    resto_algo.add_policy(pid, policy=specs[pid])

# Evaluate the restored policies first, then run a single training iteration,
# printing the mean episode reward from each result dict.
eval_result = resto_algo.evaluate()
print(f"Iter 0 eval = {eval_result['env_runners']['episode_reward_mean']}")
train_result = resto_algo.train()
print(f"Iter 1 train = {train_result['env_runners']['episode_reward_mean']}")



Iter 0 eval = 148.19188696756157
Iter 1 train = 176.58240143762504


In [12]:
# Same two steps as the mean-reward report (evaluate, then one training
# iteration), but printing the maximum episode reward of each result.
for label, run in (("Iter 0 eval", resto_algo.evaluate), ("Iter 1 train", resto_algo.train)):
    print(f"{label} = {run()['env_runners']['episode_reward_max']}")

{'env_runners': {'episode_reward_max': 461.5181588385271,
  'episode_reward_min': -20.753103857376605,
  'episode_reward_mean': 217.74148432444377,
  'episode_len_mean': 500.0,
  'episode_media': {},
  'episodes_timesteps_total': 5000,
  'policy_reward_min': {'pursuer_0': -6.865795740206084,
   'pursuer_1': -35.87920937729004,
   'pursuer_2': -3.864316381727646,
   'pursuer_3': -120.14498397644783},
  'policy_reward_max': {'pursuer_0': 230.30065341508384,
   'pursuer_1': 177.46971310653652,
   'pursuer_2': 205.22379008417556,
   'pursuer_3': 75.05780568295731},
  'policy_reward_mean': {'pursuer_0': 107.08053636479556,
   'pursuer_1': 78.92683764095337,
   'pursuer_2': 71.6896900930275,
   'pursuer_3': -39.955579774332435},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [217.63442235367253,
    253.2640716211442,
    461.5181588385271,
    175.2852620876895,
    -20.753103857376605,
    13.641541068033922,
    172.43116994939476,
    285.90102996606805,
    296.8287696208130

In [5]:
from deepdiff import DeepDiff

# Compare pursuer_0's weights as reported by the algorithm with the weights read
# directly from its policy object; an empty diff ({}) means the two agree.
# (Earlier check left by the author: type(weights) == dict.)
algo_side = resto_algo.get_weights()['pursuer_0']
policy_side = resto_algo.get_policy('pursuer_0').get_weights()
print(DeepDiff(algo_side, policy_side))


{}


In [7]:
# Exploration aid: list every attribute name available on the algorithm object.
dir(resto_algo)

['CLASS_AND_CTOR_ARGS_FILE_NAME',
 'METADATA_FILE_NAME',
 'STATE_FILE_NAME',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_allow_unknown_configs',
 '_allow_unknown_subkeys',
 '_annotated',
 '_annotated_api_group',
 '_annotated_type',
 '_before_evaluate',
 '_check_component',
 '_checkpoint_info_to_algorithm_state',
 '_close_logfiles',
 '_compile_iteration_results_new_api_stack',
 '_compile_iteration_results_old_api_stack',
 '_counters',
 '_create_local_replay_buffer_if_necessary',
 '_create_logger',
 '_env_id',
 '_episode_history',
 '_episodes_to_be_collected',
 '_episodes_total',


In [9]:
# Display the algorithm's `local_replay_buffer` attribute.
# NOTE(review): the saved output reads `NoneType`, which looks like the result of
# wrapping this expression in type(...) — confirm the cell matches its output.
resto_algo.local_replay_buffer

NoneType

In [10]:
# Run buffered training with an argument of 1. The saved output is a list whose
# first result dict contains an 'evaluation' section.
# NOTE(review): presumably the argument is a time budget in seconds — confirm
# against the installed Ray version.
resto_algo.train_buffered(1)



[{'evaluation': {'env_runners': {'episode_reward_max': -52.373777901785495,
    'episode_reward_min': -338.1536294188786,
    'episode_reward_mean': -227.8391591015032,
    'episode_len_mean': 500.0,
    'episode_media': {},
    'episodes_timesteps_total': 5000,
    'policy_reward_min': {'pursuer_0': -80.39057154248555,
     'pursuer_1': -112.01512383383537,
     'pursuer_2': -92.73860323874007,
     'pursuer_3': -217.62897900094632},
    'policy_reward_max': {'pursuer_0': 59.426371364178536,
     'pursuer_1': 64.65349960660842,
     'pursuer_2': -10.637238889812515,
     'pursuer_3': -73.83915884197592},
    'policy_reward_mean': {'pursuer_0': -14.874647104117807,
     'pursuer_1': -11.869194607727556,
     'pursuer_2': -55.06799439135102,
     'pursuer_3': -146.02732299830666},
    'custom_metrics': {},
    'hist_stats': {'episode_reward': [-191.34267064270588,
      -149.20912915222485,
      -289.72358942023067,
      -338.1536294188786,
      -52.373777901785495,
      -301.522822

In [6]:
#resto_algo.get_policy('pursuer_0')
# The return value of this first call is discarded; only the cell's last expression is displayed.
resto_algo.get_weights()
# Destructive: removes 'pursuer_2' from the live algorithm object, so this
# mutation persists for every cell executed afterwards.
resto_algo.remove_policy('pursuer_2')
# Displayed result: the weights dict as it stands after the removal.
resto_algo.get_weights()

{'pursuer_3': {'_logits._model.0.weight': array([[ 2.3142239e-03, -2.4724182e-02, -1.9171336e-03, ...,
          -2.6517278e-05,  5.1097013e-03, -6.9465190e-03],
         [-1.0626209e-02,  1.9553106e-03, -1.4944297e-03, ...,
          -4.1192584e-03, -4.1518095e-03, -3.9532766e-02],
         [ 6.8616673e-02, -2.9739959e-02,  6.4301185e-02, ...,
          -6.1141219e-02,  4.1081619e-02,  1.7240876e-03],
         [ 7.1848422e-02, -1.5112576e-02,  4.9385548e-02, ...,
          -4.6515837e-03,  3.5398480e-02, -3.5645299e-02]], dtype=float32),
  '_logits._model.0.bias': array([ 0.00071499, -0.00054015, -0.0220495 , -0.02974158], dtype=float32),
  '_hidden_layers.0._model.0.weight': array([[-0.00517256, -0.00125211,  0.05103198, ...,  0.08859777,
           0.00102746, -0.06655066],
         [ 0.01659291,  0.04072387, -0.02137943, ..., -0.01457614,
          -0.14001028,  0.00451833],
         [-0.01810568, -0.12035925, -0.04207049, ...,  0.08195617,
          -0.04359509,  0.137205  ],
    

In [16]:
# Run one training iteration (its result is discarded), then display the
# algorithm's `reward_estimators` attribute — shown as {} in the saved output.
resto_algo.train()
resto_algo.reward_estimators



{}

In [17]:
# For each test agent, draw one of the trained policies at random and copy its
# weights into that agent's live policy. The draw is unseeded, so the pairing
# differs from run to run.
for test_id in range(args.test_agents):
    train_id = np.random.randint(args.num_agents)
    target_policy = resto_algo.get_policy(f"pursuer_{test_id}")
    donor_weights = specs[f"pursuer_{train_id}"].get_weights()
    target_policy.set_weights(donor_weights)

# Evaluate with the shuffled weights; the result dict is the cell's displayed value.
resto_algo.evaluate()

{'env_runners': {'episode_reward_max': 234.72584415847234,
  'episode_reward_min': 42.19071693515648,
  'episode_reward_mean': 153.27134966308262,
  'episode_len_mean': 500.0,
  'episode_media': {},
  'episodes_timesteps_total': 5000,
  'policy_reward_min': {'pursuer_0': -9.452450210476977,
   'pursuer_1': -2.7238402920415705,
   'pursuer_2': -14.521364073043907,
   'pursuer_3': -75.20296778646306},
  'policy_reward_max': {'pursuer_0': 134.02984716196303,
   'pursuer_1': 89.23919986410138,
   'pursuer_2': 123.82971170556901,
   'pursuer_3': 59.63598218668296},
  'policy_reward_mean': {'pursuer_0': 57.398083928333186,
   'pursuer_1': 41.57426397495998,
   'pursuer_2': 53.818113684033776,
   'pursuer_3': 0.4808880757557382},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [234.72584415847234,
    203.36865594139715,
    193.93940173401737,
    144.18766626772376,
    42.19071693515648,
    116.41693485886351,
    216.84900296562824,
    109.68114730424016,
    202.118466377012

In [45]:
# Earlier attempts, kept for reference:
#pi = specs["pursuer_0"]
#algo_2.add_policy('pursuer_4', specs["pursuer_0"])
#algo_2.add_policy('pursuer_4', Policy.from_checkpoint(pols[0]))

# Add the restored pursuer_0 Policy object to algo_2 under the new id
# 'pursuer_4', passing it through the `policy=` keyword.
# NOTE(review): `algo_2` is built in a cell that appears further down (In [40]);
# that cell must be run before this one.
algo_2.add_policy('pursuer_4', policy=specs["pursuer_0"])


PPOTorchPolicy

In [46]:
# Display algo_2's weights; the saved output begins with the 'pursuer_4' entry.
algo_2.get_weights()

{'pursuer_4': {'_logits._model.0.weight': array([[ 0.01304588,  0.01424849,  0.01597021, ..., -0.02167078,
           0.01854686, -0.00942758],
         [-0.00173817, -0.00119736,  0.01867135, ...,  0.00908531,
           0.01366587,  0.00868121],
         [ 0.06990474,  0.00417062,  0.00581203, ..., -0.05097717,
          -0.01261698, -0.00735434],
         [ 0.03756424,  0.03448802, -0.01689162, ...,  0.02340009,
          -0.07817476, -0.04342073]], dtype=float32),
  '_logits._model.0.bias': array([-0.00473963,  0.00148863, -0.02152874, -0.03009672], dtype=float32),
  '_hidden_layers.0._model.0.weight': array([[-9.67424139e-02, -1.76573601e-02,  1.54389506e-02, ...,
           1.33281732e-02, -1.96825132e-01, -1.05867006e-01],
         [-8.38706642e-02,  1.04095727e-01,  3.91054489e-02, ...,
           5.62631451e-02, -3.48104537e-02, -1.19530105e-05],
         [-2.70693768e-02,  3.97277214e-02,  1.43193137e-02, ...,
          -6.73908144e-02,  4.42212410e-02, -8.34644735e-02],
    

In [40]:
# Second algorithm instance, configured the same way as the first one:
# PPO defaults, filled in one section at a time.
config_2 = get_trainable_cls("PPO").get_default_config()

# Use the environment registered for the configured number of agents.
config_2 = config_2.environment(f"{args.num_agents}_agent_env")

# Every agent id maps to the policy with the same name.
config_2 = config_2.multi_agent(
    policy_mapping_fn=(lambda aid, *args, **kwargs: aid),
    policies=policies,
)

# One default RLModuleSpec per policy id.
# (Alternative left by the author: rl_module_specs={**specs})
config_2 = config_2.rl_module(
    rl_module_spec=MultiRLModuleSpec(
        rl_module_specs=dict((p, RLModuleSpec()) for p in policies),
    ),
)

# evaluation_interval=1, as in the original configuration.
config_2 = config_2.evaluation(evaluation_interval=1)

# Instantiate the algorithm used by the `algo_2` cells.
algo_2 = config_2.build()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
