# Use a Fixed Deterministic Policy to Control LunarLander-v2

### Policy

The policy is adapted from `gym.envs.box2d.lunar_lander.demo_heuristic_lander`.

Let the observation be $(x,y,v_x,v_y,\theta,v_\theta,i_\text{left},i_\text{right})$, and define

$f_y=\begin{cases}5.5\left|x\right|-10y-10v_y-1,&i_\text{left}=0,i_\text{right}=0\\-10v_y-1,&\text{otherwise}\end{cases}$

$f_\theta=\begin{cases}\mathrm{clip}(5x+10v_x,-4,4)-10\theta-20v_\theta,&i_\text{left}=0,i_\text{right}=0\\0,&\text{otherwise},\end{cases}$

and then the action is given by the first matching case, checked from top to bottom (the code below works with $-f_\theta$, so its sign tests are mirrored):

$a=\begin{cases}
0,&\left|f_\theta\right|\le1\text{ and }f_y\le0
\\
2,&\left|f_\theta\right|<f_y
\\
1,&f_\theta>0
\\
3,&\text{otherwise}
.\end{cases}$


     
### Test

In [1]:
import numpy as np
import gym
# Fix the seed of NumPy's global random generator.
np.random.seed(0)
# Build the environment that the agent will be evaluated on.
env = gym.make('LunarLander-v2')
# Seed the environment too; the value it returns is shown as the cell output.
env.seed(0)

[0]

In [2]:
class Agent:
    """Stateless rule-based controller.

    Nothing is stored between calls, so the chosen action depends only on
    the observation passed to ``decide``.
    """

    def decide(self, observation):
        """Pick a discrete action for a single observation.

        Args:
            observation: 8-element sequence unpacked as
                (x, y, v_x, v_y, angle, v_angle, contact_left, contact_right).

        Returns:
            int: 0 (do nothing), 1 (left engine), 2 (main engine) or
            3 (right engine).
        """
        x, y, v_x, v_y, angle, v_angle, contact_left, contact_right = observation

        if not contact_left and not contact_right:
            # No leg touches the ground: both scores use the full state.
            f_y = 5.5 * abs(x) - 10. * y - 10. * v_y - 1.
            f_angle = -np.clip(5. * x + 10. * v_x, -4, 4) + 10. * angle + 20. * v_angle
        else:
            # At least one leg has contact: only the vertical speed is
            # considered and the angular score is zeroed.
            f_y = -10. * v_y - 1.
            f_angle = 0.

        angle_magnitude = abs(f_angle)
        if angle_magnitude <= 1 and f_y <= 0:
            return 0  # do nothing
        if angle_magnitude < f_y:
            return 2  # main engine
        # Only a side engine is left; the sign of f_angle picks which one.
        return 1 if f_angle < 0. else 3  # left engine / right engine


agent = Agent()

In [3]:
def play_once(env, agent):
    """Run one episode and return the sum of its rewards.

    Args:
        env: environment exposing ``reset()`` returning an observation and
            ``step(action)`` returning a 4-tuple
            ``(observation, reward, done, info)``.
        agent: object exposing ``decide(observation)`` returning an action.

    Returns:
        The accumulated reward, starting from ``0.``, over all steps up to
        and including the one that reports ``done``.
    """
    total_reward = 0.
    observation = env.reset()
    done = False
    while not done:
        # At least one step is always taken before ``done`` is inspected.
        observation, reward, done, _ = env.step(agent.decide(observation))
        total_reward += reward
    return total_reward

Test 100 episodes

In [4]:
# Play 100 episodes and report the mean of their total rewards.
n_episodes = 100
episode_rewards = [play_once(env, agent) for _ in range(n_episodes)]
mean_reward = np.mean(episode_rewards)
print('average episode rewards = {:.2f}'.format(mean_reward))

average episode rewards = 262.72


Test 1000 episodes

In [5]:
# Play 1000 episodes and report the mean of their total rewards.
n_episodes = 1000
episode_rewards = [play_once(env, agent) for _ in range(n_episodes)]
mean_reward = np.mean(episode_rewards)
print('average episode rewards = {:.2f}'.format(mean_reward))

average episode rewards = 262.56


In [6]:
# Close the environment now that all evaluation episodes have run.
env.close()