# Use a Fixed Deterministic Policy to Control Acrobot-v1

### Policy

Let the observation be $(x_0,y_0,x_1,y_1,\omega_0,\omega_1)$. The policy selects the action
$a=\begin{cases}
0,&\omega_1<-0.3
\\
0,&-0.3\le\omega_1\le0.3\text{ and }y_1+x_0y_1+x_1y_0>0
\\
2,&-0.3\le\omega_1\le0.3\text{ and }y_1+x_0y_1+x_1y_0\le0
\\
2,&\omega_1>0.3
\end{cases}$.

### Test

In [1]:
import numpy as np
import gym
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)
# Create the Acrobot-v1 environment and seed it with the same fixed value.
# NOTE(review): env.seed() belongs to the older Gym API; confirm that the
# installed Gym version still provides it.
env = gym.make('Acrobot-v1')
env.seed(0)

[0]

In [2]:
class Agent:
    """Hand-written, stateless policy that maps an observation to an action.

    The policy only ever returns action 0 or action 2.
    """

    def decide(self, observation):
        """Return the action (0 or 2) for a six-component observation.

        The observation is unpacked as (x0, y0, x1, y1, w0, w1). When the
        last component lies outside [-0.3, 0.3] its sign alone picks the
        action; otherwise the sign of ``y1 + x0 * y1 + x1 * y0`` does.
        """
        x0, y0, x1, y1, _, w1 = observation

        # Large velocity component: follow its sign.
        if w1 < -0.3:
            return 0
        if w1 > 0.3:
            return 2

        # Small velocity component: fall back on the position indicator.
        # A value of exactly zero maps to action 2.
        indicator = y1 + x0 * y1 + x1 * y0
        return 0 if indicator > 0. else 2


agent = Agent()

In [3]:
def play_once(env, agent):
    """Run one episode and return the sum of the rewards collected.

    ``env`` must provide ``reset()`` returning the first observation and
    ``step(action)`` returning ``(observation, reward, done, info)``.
    ``agent`` must provide ``decide(observation)`` returning an action.
    The episode ends as soon as ``step`` reports a truthy ``done``.
    """
    total_reward = 0.
    state = env.reset()
    finished = False
    while not finished:
        # The agent acts on the latest observation; the info item is unused.
        state, reward, finished, _ = env.step(agent.decide(state))
        total_reward += reward
    return total_reward

Test the policy over 100 episodes and report the average episode reward.

In [4]:
# Play 100 episodes with the fixed policy, collect each episode's total
# reward, and print the mean of those totals.
episode_rewards = [play_once(env, agent) for _ in range(100)]
print('average episode rewards = {:.2f}'.format(np.mean(episode_rewards)))

average episode rewards = -94.81


In [5]:
# Repeat the 100-episode evaluation 100 times and record each batch's mean
# reward, to see how much the 100-episode average varies between batches.
average_episode_rewards = []
for _ in range(100):
    # One batch: 100 episodes reduced to a single mean reward.
    episode_rewards = [play_once(env, agent) for _ in range(100)]
    average_episode_reward = np.mean(episode_rewards)    
    average_episode_rewards.append(average_episode_reward)
# Mean and standard deviation of the 100 batch means (np.std is called with
# its default arguments).
mean, std = np.mean(average_episode_rewards), np.std(average_episode_rewards)
print('average episode reward = {:.2f} +- {:.2f}'.format(mean, std))

average episode reward = -88.36 +- 3.31


In [6]:
# Close the environment now that all evaluation runs are finished.
env.close()