# Use a Fixed Deterministic Policy to Control Pendulum-v0

### Policy

Let the observation be $(x,y,\omega)=(\cos{\theta},\sin{\theta},\omega)$, where $\omega$ is the angular velocity. The angle $\theta\in(-\pi,\pi]$ is recovered as $\theta=\begin{cases}\arcsin{y},&x\ge0\\-\arcsin{y}+\pi,&x<0\text{ and }y\ge0\\-\arcsin{y}-\pi,&x<0\text{ and }y<0.\end{cases}$

If $\theta\ge{0}$, action $=\begin{cases}+2,&\theta<-0.3\omega\text{ or }0.03{(\omega-2.5)}^2+1<\theta<0.15{(\omega+3)}^2+2\\-2,&\text{otherwise}\end{cases}$. If $\theta<0$, the action is determined by symmetry: apply the rule above to $(-\theta,-\omega)$ and negate the resulting action.

### Test

In [1]:
import numpy as np
import gym
# Seed NumPy's global generator and the environment with the same fixed
# value before any episode is played.
np.random.seed(0)
env = gym.make('Pendulum-v0')
env.seed(0)

[0]

In [2]:
class Agent:
    """Hand-crafted deterministic controller for the pendulum observation.

    The switching rule is written for the half-plane ``y >= 0``. An
    observation with ``y < 0`` is mirrored into that half-plane first and
    the torque chosen there is negated before it is returned.
    """

    def decide(self, observation):
        """Return the torque for ``observation`` as a one-element array.

        ``observation`` unpacks to ``(x, y, angle_velocity)``. The result
        is either ``np.array([2.])`` or ``np.array([-2.])``.
        """
        cos_theta, sin_theta, spin = observation

        # Mirror the lower half-plane onto the upper one (now sin_theta >= 0).
        mirrored = sin_theta < 0.
        if mirrored:
            sin_theta *= -1.
            spin *= -1.

        # Recover the angle from its sine; a negative cosine selects the
        # second quadrant.
        theta = np.arcsin(sin_theta)
        if cos_theta < 0.:
            theta = np.pi - theta

        # Positive torque is applied either below the line theta = -0.3 * spin
        # or inside the band bounded by the two parabolas in spin.
        if theta < -0.3 * spin:
            push_positive = True
        else:
            push_positive = (0.03 * (spin - 2.5) ** 2. + 1. < theta
                             < 0.15 * (spin + 3.) ** 2. + 2.)

        torque = 2. if push_positive else -2.
        if mirrored:
            # Undo the mirroring by reversing the torque direction.
            torque = -torque
        return np.array([torque])

# Module-level policy instance; the policy keeps no state between calls.
agent = Agent()

In [3]:
def play_once(env, agent):
    """Play one episode and return the sum of the rewards received.

    ``env`` must provide ``reset()`` returning an observation and
    ``step(action)`` returning ``(observation, reward, done, info)``.
    ``agent`` must provide ``decide(observation)`` returning an action.
    At least one step is always taken; the episode ends as soon as a step
    reports ``done``.
    """
    total = 0.
    state = env.reset()
    finished = False
    while not finished:
        state, gained, finished, _ = env.step(agent.decide(state))
        total += gained
    return total

Test 100 episodes

In [4]:
# Play 100 episodes with the fixed policy and print the mean episode return
# to two decimal places.
episode_rewards = [play_once(env, agent) for _ in range(100)]
print('average episode rewards = {:.2f}'.format(np.mean(episode_rewards)))

average episode rewards = -145.09


In [5]:
# Repeat the 100-episode evaluation 100 times. Each pass contributes one
# mean return; the final line reports the mean and standard deviation of
# those 100 means.
average_episode_rewards = []
for _ in range(100):
    # One batch: 100 episodes played on the same environment and policy.
    episode_rewards = [play_once(env, agent) for _ in range(100)]
    average_episode_reward = np.mean(episode_rewards)    
    average_episode_rewards.append(average_episode_reward)
# np.std is called with its defaults here.
mean, std = np.mean(average_episode_rewards), np.std(average_episode_rewards)
print('average episode reward = {:.2f} +- {:.2f}'.format(mean, std))

average episode reward = -145.18 +- 8.51


In [6]:
# Close the environment now that all evaluation runs are finished.
env.close()