In this assignment you will learn how to apply the REINFORCE algorithm within the OpenAI Gym environment. Make sure OpenAI gym is installed on your machine. Now let's import some relevant packages.

In [2]:
import gym
from gym import wrappers, logger
import matplotlib.pyplot as plt
import tqdm
import numpy as np
from chainer import Chain
import chainer.links as L
import chainer.functions as F
from chainer.optimizers import Adam
from chainer import Variable

%matplotlib inline

We will make use of the classic CartPole environment provided by OpenAI Gym. Figure out what the details of this environment are.

In [3]:
env_id = 'CartPole-v0'

# You can set the level to logger.DEBUG or logger.WARN if you want to change the amount of output.
logger.set_level(logger.INFO)

In [18]:
### TEST
env = gym.make('CartPole-v0')
for i_episode in range(2):
    print('\n>> episode', i_episode)
    observation = env.reset()
    for t in range(20):
#         env.render()
        print('observation',observation, end=' ')
        action = env.action_space.sample()
        print('action', action, end=' ')
        observation, reward, done, info = env.step(action)
        print('reward', reward, 'info', info)
        if done:
            print("\nEpisode finished after {} timesteps".format(t+1))
            break

INFO: Making new env: CartPole-v0

>> episode 0
observation [ 0.02180306  0.04762818 -0.00456282 -0.02453863] action 0 reward 1.0 info {}
observation [ 0.02275563 -0.14742804 -0.00505359  0.26670119] action 1 reward 1.0 info {}
observation [ 0.01980707  0.04776567  0.00028043 -0.02757139] action 1 reward 1.0 info {}
observation [ 2.07623790e-02  2.42883596e-01 -2.70994966e-04 -3.20165823e-01] action 0 reward 1.0 info {}
observation [ 0.02562005  0.04776551 -0.00667431 -0.02756837] action 1 reward 1.0 info {}
observation [ 0.02657536  0.24298253 -0.00722568 -0.32234962] action 1 reward 1.0 info {}
observation [ 0.03143501  0.43820663 -0.01367267 -0.61730245] action 1 reward 1.0 info {}
observation [ 0.04019914  0.63351689 -0.02601872 -0.9142601 ] action 1 reward 1.0 info {}
observation [ 0.05286948  0.82898092 -0.04430392 -1.21500555] action 0 reward 1.0 info {}
observation [ 0.0694491   0.63445765 -0.06860403 -0.93652806] action 1 reward 1.0 info {}
observation [ 0.08213825  0.83043436

Let's define a baseline agent which just emits random actions.

In [0]:
class RandomAgent(object):
    """The world's simplest agent!"""

    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, observation, reward, done):
        return self.action_space.sample()


Let's run the agent on the environment.

In [19]:
env = gym.make(env_id)
env.seed(0)
print(type(env.action_space))
agent = RandomAgent(env.action_space)

episode_count = 1000
done = False
reward = 0
    
R0 = np.zeros(episode_count)
for i in tqdm.trange(episode_count):

    ob = env.reset()

    while True:

        action = agent.act(ob, reward, done)
        ob, reward, done, _ = env.step(action)

        R0[i] += reward

        if done:
            break

# Close the env and write monitor result info to disk
env.close()
print(R0)

  result = entry_point.load(False)
 30%|███       | 304/1000 [00:00<00:00, 3036.22it/s]

INFO: Making new env: CartPole-v0
<class 'gym.spaces.discrete.Discrete'>


100%|██████████| 1000/1000 [00:00<00:00, 3474.36it/s]


[ 25.  39.  26.  14.  22.  27.  34.  33.  11.  13.  16.  36.  21.  16.
  59.  12.  35.  15.  14.  38.   8.  24.  26.  12.  13.  40.  36.  10.
  13.  19.  32.  40.  21.  12.  17.  58.  16.  13.  13.  15.  21.  29.
  29.  27.  16.   9.  28.  27.  10.  19.  14.  27.  11.  16.  50.  62.
  16.  24.  62.  13.  30.  17.  41.  55.  15.  13.  43.  20.  19.  19.
  29.  19.  24.  10.  20.  23.  22.  17.  20.  18.  15.  13.   9.  10.
  14.  15.  14.  16.  34.  12.  14.  14.  15.  14.  40.  41.  63.  16.
  22.  11.  23.  21.  23.  11.  42.  30.  23.   9.  20.  16.  16.  24.
  29.  10.  60.  28.  11.  14.  22.  21.  14.  21.  16.  15.  40.  14.
  44.  42.  17.  12.  19.  36.  14. 119.  12.  24.  16.  16.  48.  11.
  14.  13.  37.  17.  23.  22.  31.  13.  12.  21.  31.  30.  36.  28.
  13.  12.  25.  52.  18.  35.  12.  20.  18.  21.  33.  10.  39.  24.
  20.  33.  13.  30.  32.  15.  16.  30.  24.  15.  23.  19.  15.  15.
   9.  82.  17.  21.  46.  27.  19.  18.  50.  17.  41.  17.  18.  17.
  19. 

Let's create the REINFORCE agent. We assume that the policy is computed using an MLP with a softmax output.

In [0]:
class MLP(Chain):
    """Multilayer perceptron"""

    def __init__(self, n_output=1, n_hidden=5):
        super(MLP, self).__init__(l1=L.Linear(None, n_hidden), 
                                  l2=L.Linear(n_hidden, n_output))

    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))

1: A skeleton for the REINFORCEAgent is given. Implement the compute_loss and compute_score functions. 

In [0]:
class REINFORCEAgent(object):
    """Agent trained using REINFORCE"""

    def __init__(self, action_space, model, optimizer=Adam()):

        self.action_space = action_space

        self.model = model

        self.optimizer = optimizer
        self.optimizer.setup(self.model)

        # monitor score and reward
        self.rewards = []
        self.scores = []


    def act(self, observation, reward, done):

        # linear outputs reflecting the log action probabilities and the value
        policy = self.model(Variable(np.atleast_2d(np.asarray(observation, 'float32'))))

        # generate action according to policy
        p = F.softmax(policy).data

        # normalize p in case tiny floating precision problems occur
        row_sums = p.sum(axis=1)
        p /= row_sums[:, np.newaxis]

        action = np.asarray([np.random.choice(p.shape[1], None, True, p[0])])

        return action, policy


    def compute_loss(self):
        """
        Return loss for this episode based on computed scores and accumulated rewards
        """
    
        return Variable(np.array([0]))

    def compute_score(self, action, policy):
        """
        Computes score

        Args:
            action (int):
            policy:

        Returns:
            score
        """

        pass

Now we run the REINFORCE agent on the CartPole environment. Note that we update the agent after each episode for simplicity.

In [15]:
env = gym.make(env_id)
env.seed(0)

network = MLP(n_output=env.action_space.n, n_hidden=3)
agent = REINFORCEAgent(env.action_space, network, optimizer=Adam())

episode_count = 1000
done = False
reward = 0
    
R = np.zeros(episode_count)
for i in tqdm.trange(episode_count):

    ob = env.reset()

    loss = 0
    while True:

        action, policy = agent.act(ob, reward, done)

        ob, reward, done, _ = env.step(action[0])

        # get reward associated with taking the previous action in the previous state
        agent.rewards.append(reward)
        R[i] += reward

        # recompute score function: grad_theta log pi_theta (s_t, a_t) * v_t
        agent.scores.append(agent.compute_score(action, policy))

        # we learn at the end of each episode
        if done:
            
            loss += agent.compute_loss()
            
            agent.model.cleargrads()
            loss.backward()
            loss.unchain_backward()
            agent.optimizer.update()

            break

  result = entry_point.load(False)
  1%|          | 7/1000 [00:00<00:14, 67.69it/s]

INFO: Making new env: CartPole-v0


100%|██████████| 1000/1000 [00:12<00:00, 81.36it/s]


In [0]:
# You may want to run a video of the trained agent performing in the environment using the env.render() function.
#
# for i in range(3):
#
#     ob = env.reset()
#
#     while True:
#
#         action, policy = agent.act(ob, reward, done)
#
#         ob, reward, done, _ = env.step(action[0])
#
#         if done:
#             break
#       
#         env.render()

2: Plot the cumulative reward for both RandomAgent and REINFORCEAgent.