Update to beta, fix linting
Sanyam Kapoor committed Jun 6, 2018
1 parent fc71850 commit f27ba55
Showing 14 changed files with 845 additions and 1,122 deletions.
591 changes: 134 additions & 457 deletions .pylintrc

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

[![Build Status](https://travis-ci.org/activatedgeek/torchrl.svg?branch=master)](https://travis-ci.org/activatedgeek/torchrl)
[![PyPI version](https://badge.fury.io/py/torchrl.svg)](https://pypi.org/project/torchrl/)
-![Project Status](https://img.shields.io/badge/status-alpha-orange.svg)
+![Project Status](https://img.shields.io/badge/status-beta-yellow.svg)


## Objectives
6 changes: 3 additions & 3 deletions setup.py
@@ -35,11 +35,11 @@
author='Sanyam Kapoor',
license='Apache License 2.0',
classifiers=[
'Programming Language :: Python :: 3.6',
-'Development Status :: 3 - Alpha',
+'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
'Intended Audience :: Science/Research',
-'License :: OSI Approved :: Apache Software License',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
+'License :: OSI Approved :: Apache Software License'
],
packages=find_packages(),
install_requires=install_requires,
270 changes: 144 additions & 126 deletions torchrl/episode_runner.py
@@ -13,162 +13,180 @@

Resulting torchrl/episode_runner.py (hunk starting at line 13) after this commit:

class EpisodeRunner:
  """
  EpisodeRunner is a utility wrapper to run episodes on a single
  Gym-like environment object. Each call to the `run()` method will
  run one episode for specified number of steps. Call `reset()` to
  reuse the same object again
  """
  def __init__(self, env: gym.Env,
               max_steps: int = DEFAULT_MAX_STEPS):
    """
    :param env: Environment with Gym-like API
    :param max_steps: Maximum number of steps per episode
        (useful for non-episodic environments)
    """
    self.env = env

    self.max_steps = max_steps

    self._obs = None
    self._done = False

    # Stats
    self._rollout_duration = 0.0

    self.reset()

  def reset(self):
    """
    Reset internal state for the `run` method to be reused
    """
    self._obs = self.env.reset()
    self._done = False

    self._rollout_duration = 0.0

  def is_done(self):
    """
    :return: True if the episode has ended, False otherwise
    """
    return self._done

  def act(self, learner):
    """
    This routine is called from the `run` routine every time an action
    is needed for the environment to step.
    """
    obs_tensor = torch.from_numpy(self._obs).unsqueeze(0).float()
    obs_tensor = Variable(obs_tensor, volatile=True)
    if learner.is_cuda:
      obs_tensor = obs_tensor.cuda()

    action = learner.act(obs_tensor)
    # `act` is a batch call but, for a single episode run
    # this is always one action
    return action[0][0]

  def collect(self, learner: BaseLearner, steps: int = None,
              render: bool = False, fps: int = 30, store: bool = False):
    """
    :param learner: An agent of type BaseLearner
    :param steps: Number of maximum steps in the current rollout
    :param render: Flag to render the environment, True or False
    :param fps: Rendering rate of the environment, frames per second
        if render is True
    :param store: Flag to store the history of the run
    :return: batch of transitions
    """
    assert not self._done, 'EpisodeRunner has ended. Call .reset() to reuse.'

    steps = steps or self.max_steps

    if render:
      self.env.render()
      time.sleep(1. / fps)

    is_discrete = self.env.action_space.__class__.__name__ == 'Discrete'

    obs_history, action_history, reward_history, \
        next_obs_history, done_history = \
        self.init_run_history(self.env.observation_space, self.env.action_space)

    rollout_start = time.time()

    while not self._done and steps:
      action = self.act(learner)
      next_obs, reward, self._done, _ = self.env.step(action)

      if store:
        obs_history = np.append(obs_history,
                                np.expand_dims(self._obs, axis=0), axis=0)
        action = np.array([[action]]) if is_discrete else \
            np.expand_dims(action, axis=0)
        action_history = np.append(action_history, action, axis=0)
        reward_history = np.append(reward_history, np.array([[reward]]), axis=0)
        next_obs_history = np.append(next_obs_history,
                                     np.expand_dims(next_obs, axis=0), axis=0)
        done_history = np.append(done_history, np.array([[int(self._done)]]),
                                 axis=0)

      self._obs = next_obs
      steps -= 1

      if render:
        self.env.render()
        time.sleep(1. / fps)

    self._rollout_duration = time.time() - rollout_start

    if store:
      return obs_history, action_history, reward_history, \
          next_obs_history, done_history

  def stop(self):
    self.env.close()

  def get_stats(self) -> dict:
    return {
        'duration': self._rollout_duration,
    }

  @staticmethod
  def init_run_history(observation_space: gym.Space, action_space: gym.Space):
    is_discrete = action_space.__class__.__name__ == 'Discrete'

    obs_history = np.empty((0, *observation_space.shape), dtype=np.float)
    action_history = np.empty((0, *((1,) if is_discrete else
                                    action_space.shape)),
                              dtype=np.int if is_discrete else np.float)
    reward_history = np.empty((0, 1), dtype=np.float)
    next_obs_history = np.empty_like(obs_history)
    done_history = np.empty((0, 1), dtype=np.int)

    return obs_history, action_history, reward_history, \
        next_obs_history, done_history

  @staticmethod
  def merge_histories(observation_space: gym.Space,
                      action_space: gym.Space, *sources: tuple) -> tuple:
    target = EpisodeRunner.init_run_history(observation_space, action_space)
    return tuple([np.concatenate((tgt, *src), axis=0)
                  for tgt, *src in zip(target, *sources)])


def make_runner(env_id: str, seed: int = None,
                max_steps: int = DEFAULT_MAX_STEPS):
  env = gym.make(env_id)
  if seed is not None:
    env.seed(seed)
  return EpisodeRunner(env, max_steps=max_steps)


class MultiEpisodeRunner(MultiProcWrapper):
  """
  This class is the parallel version of EpisodeRunner
  """

  def __init__(self, env_id: str, max_steps: int = DEFAULT_MAX_STEPS,
               n_runners=2, base_seed: int = 0, daemon=True, autostart=True):
    obj_fns = [
        functools.partial(make_runner, env_id,
                          None if base_seed is None else base_seed + rank,
                          max_steps=max_steps)
        for rank in range(1, n_runners + 1)
    ]
    super(MultiEpisodeRunner, self).__init__(
        obj_fns, daemon=daemon, autostart=autostart)

  def reset(self, env_id: int = None):
    self.exec_remote('reset', proc=env_id)

  def is_done(self):
    return self.exec_remote('is_done')

  def collect(self, *args, **kwargs):
    return self.exec_remote('collect', args=args, kwargs=kwargs)

  def get_stats(self):
    return self.exec_remote('get_stats')
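For context, a minimal usage sketch (not part of this commit): it assumes a Gym environment id such as CartPole-v0 and substitutes a hypothetical RandomLearner for a real torchrl learner, exposing only the batched act() call and is_cuda flag that EpisodeRunner.act relies on.

# Hypothetical usage sketch (illustrative only, not from the repository).
from torchrl.episode_runner import make_runner


class RandomLearner:
  """Stand-in for a BaseLearner: a batched `act` plus an `is_cuda` flag."""
  is_cuda = False

  def __init__(self, action_space):
    self.action_space = action_space

  def act(self, obs_tensor):
    # Return a batch of one action; EpisodeRunner.act indexes [0][0].
    return [[self.action_space.sample()]]


runner = make_runner('CartPole-v0', seed=1, max_steps=200)
learner = RandomLearner(runner.env.action_space)

obs, actions, rewards, next_obs, dones = runner.collect(learner, store=True)
print('transitions collected:', len(rewards), runner.get_stats())
runner.stop()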
8 changes: 4 additions & 4 deletions torchrl/learners/__init__.py
@@ -1,4 +1,4 @@
-from torchrl.learners.base_learner import BaseLearner
-from torchrl.learners.a2c_learner import BaseA2CLearner
-from torchrl.learners.ddpg_learner import BaseDDPGLearner
-from torchrl.learners.dqn_learner import BaseDQNLearner
+from .base_learner import BaseLearner
+from .a2c_learner import BaseA2CLearner
+from .ddpg_learner import BaseDDPGLearner
+from .dqn_learner import BaseDQNLearner
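The learners remain re-exported from the package root, so external imports are unaffected by the switch to relative imports; a quick illustrative check (assuming torchrl is installed):

# Illustrative only: the public import path is unchanged by this commit.
from torchrl.learners import (BaseLearner, BaseA2CLearner,
                              BaseDDPGLearner, BaseDQNLearner)

print(BaseLearner, BaseA2CLearner, BaseDDPGLearner, BaseDQNLearner)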
