In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.utils.annotations import override


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# yapf: disable
# __sphinx_doc_begin__
class RandomAgent(Trainer):
    """Baseline trainer that acts uniformly at random and performs no learning.

    Each training iteration plays ``config["rollouts_per_iteration"]`` full
    episodes with actions drawn from ``env.action_space.sample()`` and reports
    the mean undiscounted return together with the number of steps taken.
    """

    _name = "RandomAgent"
    _default_config = with_common_config({
        "rollouts_per_iteration": 10,
    })

    @override(Trainer)
    def _init(self, config, env_creator):
        # Build the single environment instance used for all rollouts.
        self.env = env_creator(config["env_config"])

    @override(Trainer)
    def _train(self):
        episode_returns = []
        total_steps = 0
        num_rollouts = self.config["rollouts_per_iteration"]

        for _ in range(num_rollouts):
            self.env.reset()
            episode_return = 0.0
            # At least one step is always taken; stop once the env reports done.
            while True:
                sampled_action = self.env.action_space.sample()
                _, step_reward, finished, _ = self.env.step(sampled_action)
                episode_return += step_reward
                total_steps += 1
                if finished:
                    break
            episode_returns.append(episode_return)

        result = {}
        result["episode_reward_mean"] = np.mean(episode_returns)
        result["timesteps_this_iter"] = total_steps
        return result

class VIAgent(Trainer):
    """Tabular value iteration followed by greedy evaluation rollouts.

    The trainer needs a model of the environment: it calls ``env.R(s, a)`` for
    the reward and ``env.P(s, a)`` for the next-state index of every
    state/action pair, and reads the sizes from ``env.observation_space.n``
    and ``env.action_space.n``.

    Config keys read by this class:
        tolerance: stop sweeping once the largest change of any state value
            over one full sweep is at most this number.
        discount_factor: the discount (gamma) applied to the next-state value.
        rollouts_per_iteration: evaluation episodes played per ``_train`` call.
        episode_length: number of environment steps per evaluation episode.

    #TODO Make it Generalized PI.
    """

    _name = "VIAgent"
    _default_config = with_common_config({
        "tolerance": 0.01,
        "discount_factor": 0.5,
        "rollouts_per_iteration": 10,
        "episode_length": 200,
        # "lr": 0.5
    })

    @override(Trainer)
    def _init(self, config, env_creator):
        """Create the environment and zero-initialise the value table."""
        self.env = env_creator(config["env_config"])
        num_states = self.env.observation_space.n
        self.V = np.zeros(num_states)
        # IMP: -1 marks "no action chosen yet"; it avoids initialising the
        # policy to a value that lies inside the action_space range.
        self.policy = np.full(num_states, -1, dtype=int)

    @override(Trainer)
    def _train(self):
        """Run value iteration to convergence, then evaluate the greedy policy.

        Returns:
            dict: ``episode_reward_mean`` (mean undiscounted return over the
            evaluation rollouts) and ``timesteps_this_iter`` (steps taken).
        """
        gamma = self.config["discount_factor"]
        tolerance = self.config["tolerance"]
        num_states = self.env.observation_space.n
        num_actions = self.env.action_space.n

        # Maybe keep a state variable so that we don't need to re-run the
        # sweeps on every train iteration??
        max_diff = np.inf
        total_iterations = 0
        while max_diff > tolerance:
            total_iterations += 1
            # Snapshot ONCE per sweep. Taking the copy inside the state loop
            # made max_diff reflect only the last state updated, so the loop
            # could terminate before the values had actually converged.
            self.V_old = self.V.copy()
            for s in range(num_states):
                # R and P are callables, hence the explicit loop over actions.
                # Values are updated in place, so later states in the same
                # sweep already see the new values of earlier states.
                action_values = [
                    self.env.R(s, a) + gamma * self.V[self.env.P(s, a)]
                    for a in range(num_actions)
                ]
                self.policy[s] = np.argmax(action_values)
                self.V[s] = np.max(action_values)
            # Largest absolute change of any state value over the whole sweep.
            max_diff = np.max(np.absolute(self.V_old - self.V))

        print("Total iterations:", total_iterations)

        rewards = []
        steps = 0
        for _ in range(self.config["rollouts_per_iteration"]):
            obs = self.env.reset()
            reward = 0.0
            # NOTE(review): the episode always runs for episode_length steps and
            # the `done` flag returned by step() is ignored, as in the original
            # code -- confirm that stepping past a terminal state is intended.
            for _step in range(self.config["episode_length"]):
                action = self.policy[obs]
                obs, r, done, info = self.env.step(action)

                reward += r
                steps += 1
            rewards.append(reward)
        return {
            "episode_reward_mean": np.mean(rewards),
            "timesteps_this_iter": steps,
        }


In [3]:
import ray
from ray import tune
from ray.rllib.utils.seed import seed as rllib_seed
import rl_toy
from rl_toy.envs import RLToyEnv
from ray.tune.registry import register_env
# Register the toy environment under the id used by the tune.run configs;
# the creator receives the "env_config" dict and forwards it to RLToyEnv.
register_env("RLToy-v0", lambda config: RLToyEnv(config))

# rllib_seed(0, 0, 0)
# ignore_reinit_error makes this cell safe to re-run in a live kernel: a plain
# ray.init() raises if Ray has already been initialised in this process.
ray.init(ignore_reinit_error=True)


2019-08-17 21:51:40,487	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-08-17_21-51-40_487449_24872/logs.
2019-08-17 21:51:40,608	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:28790 to respond...
2019-08-17 21:51:40,726	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:56305 to respond...
2019-08-17 21:51:40,729	INFO services.py:809 -- Starting Redis shard with 6.72 GB max memory.
2019-08-17 21:51:40,754	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-08-17_21-51-40_487449_24872/logs.
2019-08-17 21:51:40,757	INFO services.py:1475 -- Starting the Plasma object store with 10.08 GB memory using /dev/shm.


{'node_ip_address': '10.5.150.104',
 'redis_address': '10.5.150.104:28790',
 'object_store_address': '/tmp/ray/session_2019-08-17_21-51-40_487449_24872/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-08-17_21-51-40_487449_24872/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-08-17_21-51-40_487449_24872'}

In [19]:
# stats = {}
# aaaa = 3

# fout = open('rl_stats_temp.csv', 'a') #hardcoded
# fout.write('# basename, n_points, n_features, n_trees ')


def on_train_result(info):
    """Callback run after each ``trainer.train()``: log it and append a CSV row.

    Args:
        info (dict): Callback payload. This function reads ``info["trainer"]``
            and, from ``info["result"]``, the entries ``"episodes_this_iter"``,
            ``"episode_len_mean"`` and
            ``["config"]["env_config"]["state_space_size"]``.

    Side effects:
        * Prints the trainer, the episode count and the whole ``info`` dict.
        * Stores ``episode_len_mean`` in the module-level ``stats`` dict,
          creating that dict if it does not exist yet.
        * Appends one line to ``rl_stats_temp.csv`` in the working directory.
        * Sets ``info["result"]["callback_ok"] = True``.
    """
    result = info["result"]
    print("#############trainer.train() result: {} -> {} episodes".format(
        info["trainer"], result["episodes_this_iter"]), info)

    # `stats` used to be assumed to exist as a global, which raised NameError on
    # a fresh kernel; create it on demand instead.
    # NOTE(review): the captured logs show this callback printing from a worker
    # process (pid prefix), so the notebook's own `stats` is presumably not
    # updated by this -- confirm before relying on it.
    globals().setdefault('stats', {})['episode_len_mean'] = result['episode_len_mean']

    state_space_size = result["config"]["env_config"]["state_space_size"]
    # TODO: action_space_size and the remaining columns are not filled in yet;
    # the three empty str() calls below are placeholders for them.
    row = ('# basename, n_points, n_features, n_trees ' + str(state_space_size)
           + ' ' + str() + ' ' + str() + ' ' + str() + '\n')
    # The context manager closes the file even if the write raises.
    with open('rl_stats_temp.csv', 'a') as fout:  # hardcoded path
        fout.write(row)

    # you can mutate the result dict to add new fields to return
    result["callback_ok"] = True
    

# tune.run(
#     RandomAgent,
#     stop={
#         "timesteps_total": 20000,
#           },
#     config={
#       "rollouts_per_iteration": 10,
#       "env": "RLToy-v0",
#       "env_config": {
#         'state_space_type': 'discrete',
#         'action_space_type': 'discrete',
#         'state_space_size': 16,
#         'action_space_size': 16,
#         'generate_random_mdp': True,
#         'delay': 6,
#         'sequence_length': 1,
#         'reward_density': 0.25,
#         'terminal_state_density': 0.25
#         },
#     },
# )

# tune.run(
#     VIAgent,
#     stop={
#         "timesteps_total": 20000,
#           },
#     config={
#         "tolerance": 0.01,
#         "discount_factor": 0.99,
#         "rollouts_per_iteration": 10,
#       "env": "RLToy-v0",
#       "env_config": {
#         'state_space_type': 'discrete',
#         'action_space_type': 'discrete',
#         'state_space_size': 10,
#         'action_space_size': 10,
#         'generate_random_mdp': True,
#         'delay': 0,
#         'sequence_length': 1,
#         'reward_density': 0.25,
#         'terminal_state_density': 0.25
#         },
#     },
# )


# Stop the trial once this many environment steps have been sampled in total.
dqn_stop_criteria = {"timesteps_total": 20000}

# Parameters handed to the RLToy-v0 environment creator.
dqn_env_config = {
    'state_space_type': 'discrete',
    'action_space_type': 'discrete',
    'state_space_size': 16,
    'action_space_size': 16,
    'generate_random_mdp': True,
    'delay': 6,
    'sequence_length': 1,
    'reward_density': 0.25,
    'terminal_state_density': 0.25,
}

# Only the train-result hook is enabled; other hooks (on_episode_start,
# on_episode_step, on_episode_end, on_sample_end, on_postprocess_traj) can be
# added here in the same way.
dqn_callbacks = {
    "on_train_result": tune.function(on_train_result),
}

# DQN hyperparameters (key order kept as in the original inline literal).
dqn_config = {
    "adam_epsilon": 0.00015,
    "beta_annealing_fraction": 1.0,
    "buffer_size": 1000000,
    "double_q": False,
    "dueling": False,
    "env": "RLToy-v0",
    "env_config": dqn_env_config,
    "exploration_final_eps": 0.01,
    "exploration_fraction": 0.1,
    "final_prioritized_replay_beta": 1.0,
    "hiddens": [256],
    "learning_starts": 2000,
    "lr": 6.25e-05,  # "lr": grid_search([1e-2, 1e-4, 1e-6]),
    "n_step": 1,
    "noisy": False,
    "num_atoms": 1,
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.5,
    "sample_batch_size": 4,
    "schedule_max_timesteps": 20000,
    "target_network_update_freq": 80,
    "timesteps_per_iteration": 100,
    "train_batch_size": 32,
    "callbacks": dqn_callbacks,
}

tune.run("DQN", stop=dqn_stop_criteria, config=dqn_config)


2019-08-18 00:41:15,614	INFO trial_runner.py:176 -- Starting a new experiment.


== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/0 GPUs
Memory usage on this node: 17.0/33.6 GB

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 1/4 CPUs, 0/0 GPUs
Memory usage on this node: 17.0/33.6 GB
Result logdir: /home/rajanr/ray_results/DQN
Number of trials: 1 ({'RUNNING': 1})
RUNNING trials:
 - DQN_RLToy-v0_0:	RUNNING

[2m[36m(pid=515)[0m Instructions for updating:
[2m[36m(pid=515)[0m non-resource variables are not supported in the long term
[2m[36m(pid=515)[0m Inited terminal states to: [15 14 13 12] total 4
[2m[36m(pid=515)[0m specific_sequence that will be rewarded [1]
[2m[36m(pid=515)[0m specific_sequence that will be rewarded [5]
[2m[36m(pid=515)[0m specific_sequence that will be rewarded [0]
[2m[36m(pid=515)[0m Total no. of sequences reward: 3 Out of 12
[2m[36m(pid=515)[0m [[12 3 8 8 6 0 7 1 13 11 2 6 4 1 10 4]
[2m[36m(pid=515)[0m  [1 7 14 13 12 8 2 13 14 13 2 12 13 6 14 5]
[2m[36m(pid=515)[0m 

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-41-22
  done: false
  episode_len_mean: 3.475836431226766
  episode_reward_max: 5.0
  episode_reward_mean: 0.895910780669145
  episode_reward_min: 0.0
  episodes_this_iter: 269
  episodes_total: 269
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: .nan
    learner: {}
    max_exploration: 1.0
    min_exploration: 1.0
    num_steps_sampled: 936
    num_steps_trained: 0
    num_target_updates: 11
    opt_peak_throughput: 0.0
    opt_samples: .nan
    replay_time_ms: .nan
    sample_time_ms: 3.46
    update_time_ms: 0.001
  iterations_since_restore: 1
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_policy_estimator: {}
  pid: 515
  policy_reward_mean: {}
  sampler_perf:
    mean_env_wait_ms: 0.059969524436470294
    mean_inference_ms: 0.5707908719937089
    mean_processing_ms: 0.21816139669021387
  time_since_restore: 1.0066626071929932
  time_t

[2m[36m(pid=515)[0m 2019-08-18 00:41:23,250	INFO rollout_worker.py:575 -- Training on concatenated sample batches:
[2m[36m(pid=515)[0m 
[2m[36m(pid=515)[0m { 'count': 32,
[2m[36m(pid=515)[0m   'policy_batches': { 'default_policy': { 'data': { 'actions': np.ndarray((32,), dtype=int64, min=0.0, max=15.0, mean=7.625),
[2m[36m(pid=515)[0m                                                     'batch_indexes': np.ndarray((32,), dtype=int64, min=-1.0, max=-1.0, mean=-1.0),
[2m[36m(pid=515)[0m                                                     'dones': np.ndarray((32,), dtype=bool, min=0.0, max=1.0, mean=0.344),
[2m[36m(pid=515)[0m                                                     'new_obs': np.ndarray((32, 16), dtype=float32, min=0.0, max=1.0, mean=0.062),
[2m[36m(pid=515)[0m                                                     'obs': np.ndarray((32, 16), dtype=float32, min=0.0, max=1.0, mean=0.062),
[2m[36m(pid=515)[0m                                               

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 50 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 3.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 0.71, 'episode_len_mean': 7.8, 'episodes_this_iter': 50, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06167557679553973, 'mean_processing_ms': 0.21702204265660222, 'mean_inference_ms': 0.5702078057393867}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 33, 'num_steps_trained': 6400, 'num_steps_sampled': 2800, 'sample_time_ms': 4.616, 'replay_time_ms': 2.502, 'grad_time_ms': 2.419, 'update_time_ms': 0.003, 'opt_peak_throughput': 13227.331, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 0.5450195, 'min_q': -

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-41-27
  done: false
  episode_len_mean: 13.43
  episode_reward_max: 213.0
  episode_reward_mean: 3.56
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 720
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: 2.489
    learner:
      default_policy:
        cur_lr: 6.25000029685907e-05
        max_q: 5.943103790283203
        mean_q: 3.706404685974121
        mean_td_error: -0.7341693043708801
        min_q: 0.9697749614715576
        model: {}
    max_exploration: 0.010000000000000009
    min_exploration: 0.010000000000000009
    num_steps_sampled: 3628
    num_steps_trained: 13024
    num_target_updates: 43
    opt_peak_throughput: 12856.843
    opt_samples: 32.0
    replay_time_ms: 2.877
    sample_time_ms: 3.916
    update_time_ms: 0.003
  iterations_since_restore: 6
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_policy_est

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 213.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 5.88, 'episode_len_mean': 18.13, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.0617899345590867, 'mean_processing_ms': 0.2158308585847963, 'mean_inference_ms': 0.5708792901071646}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 54, 'num_steps_trained': 20512, 'num_steps_sampled': 4564, 'sample_time_ms': 3.556, 'replay_time_ms': 2.478, 'grad_time_ms': 2.429, 'update_time_ms': 0.002, 'opt_peak_throughput': 13174.229, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 7.3580704, 'min_q': 

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 437.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 10.51, 'episode_len_mean': 28.32, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061823860044204725, 'mean_processing_ms': 0.21505562243929277, 'mean_inference_ms': 0.5708799998714108}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 65, 'num_steps_trained': 27776, 'num_steps_sampled': 5472, 'sample_time_ms': 3.82, 'replay_time_ms': 2.595, 'grad_time_ms': 2.469, 'update_time_ms': 0.003, 'opt_peak_throughput': 12958.506, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 9.670429, 'min_q'

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 437.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 16.74, 'episode_len_mean': 40.96, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061862206756178394, 'mean_processing_ms': 0.2136594930965912, 'mean_inference_ms': 0.570670992686581}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 76, 'num_steps_trained': 35136, 'num_steps_sampled': 6392, 'sample_time_ms': 3.362, 'replay_time_ms': 2.471, 'grad_time_ms': 2.073, 'update_time_ms': 0.002, 'opt_peak_throughput': 15434.066, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 17.614458, 'min_q'

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 2 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 442.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 21.56, 'episode_len_mean': 50.83, 'episodes_this_iter': 2, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061893187852741456, 'mean_processing_ms': 0.212632621293418, 'mean_inference_ms': 0.5705420506837384}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 86, 'num_steps_trained': 42144, 'num_steps_sampled': 7268, 'sample_time_ms': 3.631, 'replay_time_ms': 2.803, 'grad_time_ms': 2.082, 'update_time_ms': 0.002, 'opt_peak_throughput': 15369.733, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 20.904346, 'min_q'

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-41-37
  done: false
  episode_len_mean: 60.52
  episode_reward_max: 449.0
  episode_reward_mean: 26.04
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 731
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: 2.418
    learner:
      default_policy:
        cur_lr: 6.25000029685907e-05
        max_q: 29.109338760375977
        mean_q: 24.061538696289062
        mean_td_error: 0.6820318102836609
        min_q: 0.45398035645484924
        model: {}
    max_exploration: 0.010000000000000009
    min_exploration: 0.010000000000000009
    num_steps_sampled: 8196
    num_steps_trained: 49568
    num_target_updates: 97
    opt_peak_throughput: 13236.201
    opt_samples: 32.0
    replay_time_ms: 2.649
    sample_time_ms: 3.753
    update_time_ms: 0.003
  iterations_since_restore: 16
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_policy

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 449.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 30.22, 'episode_len_mean': 69.69, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061917190705405495, 'mean_processing_ms': 0.21150380612167707, 'mean_inference_ms': 0.5702711379623148}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 108, 'num_steps_trained': 56896, 'num_steps_sampled': 9112, 'sample_time_ms': 3.53, 'replay_time_ms': 2.75, 'grad_time_ms': 2.211, 'update_time_ms': 0.002, 'opt_peak_throughput': 14473.255, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 27.60534, 'min_q'

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 2 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 449.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 34.04, 'episode_len_mean': 78.57, 'episodes_this_iter': 2, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06195099876704971, 'mean_processing_ms': 0.2091096380469297, 'mean_inference_ms': 0.5694939262916346}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 119, 'num_steps_trained': 64448, 'num_steps_sampled': 10056, 'sample_time_ms': 3.366, 'replay_time_ms': 2.645, 'grad_time_ms': 1.955, 'update_time_ms': 0.002, 'opt_peak_throughput': 16366.618, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 31.767694, 'min_

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 449.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 34.04, 'episode_len_mean': 78.57, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06195099876704969, 'mean_processing_ms': 0.2091096380469296, 'mean_inference_ms': 0.5694939262916345}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 130, 'num_steps_trained': 71936, 'num_steps_sampled': 10992, 'sample_time_ms': 3.385, 'replay_time_ms': 2.648, 'grad_time_ms': 2.49, 'update_time_ms': 0.003, 'opt_peak_throughput': 12852.042, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 36.5079, 'min_q':

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 449.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 34.04, 'episode_len_mean': 78.57, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06195099876704969, 'mean_processing_ms': 0.2091096380469296, 'mean_inference_ms': 0.5694939262916345}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 141, 'num_steps_trained': 79328, 'num_steps_sampled': 11916, 'sample_time_ms': 3.603, 'replay_time_ms': 2.735, 'grad_time_ms': 2.809, 'update_time_ms': 0.002, 'opt_peak_throughput': 11392.825, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 38.93161, 'min_q

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-41-47
  done: false
  episode_len_mean: 103.77
  episode_reward_max: 1211.0
  episode_reward_mean: 46.15
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 737
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: 2.298
    learner:
      default_policy:
        cur_lr: 6.25000029685907e-05
        max_q: 48.27271270751953
        mean_q: 43.93482208251953
        mean_td_error: -1.295918345451355
        min_q: 0.2549872100353241
        model: {}
    max_exploration: 0.010000000000000009
    min_exploration: 0.010000000000000009
    num_steps_sampled: 12780
    num_steps_trained: 86240
    num_target_updates: 152
    opt_peak_throughput: 13923.145
    opt_samples: 32.0
    replay_time_ms: 2.642
    sample_time_ms: 3.986
    update_time_ms: 0.003
  iterations_since_restore: 26
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_polic

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 50.34, 'episode_len_mean': 112.8, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06196631145034155, 'mean_processing_ms': 0.20780366111882106, 'mean_inference_ms': 0.569050887177934}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 162, 'num_steps_trained': 93056, 'num_steps_sampled': 13632, 'sample_time_ms': 3.451, 'replay_time_ms': 2.793, 'grad_time_ms': 2.067, 'update_time_ms': 0.002, 'opt_peak_throughput': 15483.921, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 45.086327, 'min

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 55.95, 'episode_len_mean': 125.22, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061992421845797875, 'mean_processing_ms': 0.20581015801649016, 'mean_inference_ms': 0.5683366393119538}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 173, 'num_steps_trained': 100704, 'num_steps_sampled': 14588, 'sample_time_ms': 3.429, 'replay_time_ms': 2.747, 'grad_time_ms': 2.029, 'update_time_ms': 0.003, 'opt_peak_throughput': 15772.507, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 51.933098, 

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 55.95, 'episode_len_mean': 125.22, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.061992421845797875, 'mean_processing_ms': 0.20581015801649014, 'mean_inference_ms': 0.5683366393119538}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 184, 'num_steps_trained': 108192, 'num_steps_sampled': 15524, 'sample_time_ms': 3.518, 'replay_time_ms': 2.781, 'grad_time_ms': 2.079, 'update_time_ms': 0.003, 'opt_peak_throughput': 15390.529, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 52.188484, 

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 62.53, 'episode_len_mean': 139.58, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06200759330388528, 'mean_processing_ms': 0.20375051878870937, 'mean_inference_ms': 0.567490249707434}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 195, 'num_steps_trained': 115328, 'num_steps_sampled': 16416, 'sample_time_ms': 4.867, 'replay_time_ms': 2.947, 'grad_time_ms': 2.716, 'update_time_ms': 0.003, 'opt_peak_throughput': 11784.132, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 58.38085, 'mi

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-41-57
  done: false
  episode_len_mean: 139.58
  episode_reward_max: 1211.0
  episode_reward_mean: 62.53
  episode_reward_min: 0.0
  episodes_this_iter: 0
  episodes_total: 744
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: 2.237
    learner:
      default_policy:
        cur_lr: 6.25000029685907e-05
        max_q: 61.372074127197266
        mean_q: 58.63371658325195
        mean_td_error: -0.23699043691158295
        min_q: 6.222259998321533
        model: {}
    max_exploration: 0.010000000000000009
    min_exploration: 0.010000000000000009
    num_steps_sampled: 17176
    num_steps_trained: 121408
    num_target_updates: 204
    opt_peak_throughput: 14303.437
    opt_samples: 32.0
    replay_time_ms: 3.376
    sample_time_ms: 3.666
    update_time_ms: 0.002
  iterations_since_restore: 36
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_po

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 70.12, 'episode_len_mean': 157.12, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.062033557148359, 'mean_processing_ms': 0.20237771370318175, 'mean_inference_ms': 0.5670882314107144}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 215, 'num_steps_trained': 128800, 'num_steps_sampled': 18100, 'sample_time_ms': 3.619, 'replay_time_ms': 2.939, 'grad_time_ms': 2.145, 'update_time_ms': 0.002, 'opt_peak_throughput': 14918.385, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 64.10534, 'min

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 0 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 72.7, 'episode_len_mean': 163.15, 'episodes_this_iter': 0, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.062044891471392895, 'mean_processing_ms': 0.20168397268716354, 'mean_inference_ms': 0.5668711526235305}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 226, 'num_steps_trained': 136416, 'num_steps_sampled': 19052, 'sample_time_ms': 3.481, 'replay_time_ms': 2.797, 'grad_time_ms': 2.147, 'update_time_ms': 0.002, 'opt_peak_throughput': 14903.642, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 58.53141, 'm

Result for DQN_RLToy-v0_0:
  callback_ok: true
  custom_metrics: {}
  date: 2019-08-18_00-42-03
  done: true
  episode_len_mean: 175.43
  episode_reward_max: 1211.0
  episode_reward_mean: 78.49
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 748
  experiment_id: 52abb92d04254715928ae3620e2b94a8
  hostname: mlstaff04
  info:
    grad_time_ms: 2.046
    learner:
      default_policy:
        cur_lr: 6.25000029685907e-05
        max_q: 69.28202819824219
        mean_q: 61.954063415527344
        mean_td_error: 0.09763866662979126
        min_q: -0.3660565912723541
        model: {}
    max_exploration: 0.010000000000000009
    min_exploration: 0.010000000000000009
    num_steps_sampled: 20008
    num_steps_trained: 144064
    num_target_updates: 238
    opt_peak_throughput: 15639.993
    opt_samples: 32.0
    replay_time_ms: 2.711
    sample_time_ms: 3.511
    update_time_ms: 0.003
  iterations_since_restore: 42
  node_ip: 10.5.150.104
  num_healthy_workers: 0
  off_po

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f51bd808828>

[2m[36m(pid=515)[0m #############trainer.train() result: <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978> -> 1 episodes {'trainer': <ray.rllib.agents.trainer_template.DQN object at 0x7f23b2207978>, 'result': {'episode_reward_max': 1211.0, 'episode_reward_min': 0.0, 'episode_reward_mean': 78.49, 'episode_len_mean': 175.43, 'episodes_this_iter': 1, 'policy_reward_mean': {}, 'custom_metrics': {}, 'sampler_perf': {'mean_env_wait_ms': 0.06205245643800219, 'mean_processing_ms': 0.20097573154931517, 'mean_inference_ms': 0.5666116418198061}, 'off_policy_estimator': {}, 'info': {'min_exploration': 0.010000000000000009, 'max_exploration': 0.010000000000000009, 'num_target_updates': 238, 'num_steps_trained': 144064, 'num_steps_sampled': 20008, 'sample_time_ms': 3.511, 'replay_time_ms': 2.711, 'grad_time_ms': 2.046, 'update_time_ms': 0.003, 'opt_peak_throughput': 15639.993, 'opt_samples': 32.0, 'learner': {'default_policy': {'cur_lr': 6.25000029685907e-05, 'mean_q': 61.954063, '

In [5]:
# Candidate values for each environment dimension to sweep over.
state_space_sizes = [1 << exponent for exponent in range(1, 6)]   # powers of two, 2..32
action_space_sizes = [1 << exponent for exponent in range(1, 6)]  # powers of two, 2..32
delays = [0] + [1 << exponent for exponent in range(5)]           # 0, then 1..16
sequence_lengths = list(range(1, 6))
reward_densities = np.linspace(0.0, 1.0, 5)
# make_reward_dense = [True, False]
terminal_state_densities = np.linspace(0.1, 1.0, 5)

print(
    state_space_sizes,
    action_space_sizes,
    delays,
    sequence_lengths,
    reward_densities,
    terminal_state_densities,
)


[2, 4, 8, 16, 32] [2, 4, 8, 16, 32] [0, 1, 2, 4, 8, 16] [1, 2, 3, 4, 5] [0.   0.25 0.5  0.75 1.  ] [0.1   0.325 0.55  0.775 1.   ]


In [17]:
# Append the column header to the stats CSV. The context manager closes the
# file even if the write raises.
with open('rl_stats_temp.csv', 'a') as fout:  # hardcoded path
    fout.write('# basename, n_points, n_features, n_trees ')
