# Use a Fixed Deterministic Policy to Control BipedalWalker-v3

## Policy

The action $\mathbf{a}$ (as a row vector) is a linear function of the observation $\mathbf{o}$ (as a row vector), i.e. $\mathbf{a}=\mathbf{o}\mathbf{W}+\mathbf{b}$, where

$\mathbf{W}=\left(\begin{matrix}
 0.9 & -0.7 &  0.0 & -1.4 \\
 4.3 & -1.6 & -4.4 & -2.0 \\
 2.4 & -4.2 & -1.3 & -0.1 \\
-3.1 & -5.0 & -2.0 & -3.3 \\
-0.8 &  1.4 &  1.7 &  0.2 \\
-0.7 &  0.2 & -0.2 &  0.1 \\
-0.6 & -1.5 & -0.6 &  0.3 \\
-0.5 & -0.3 &  0.2 &  0.1 \\
 0.0 & -0.1 & -0.1 &  0.1 \\
 0.4 &  0.8 & -1.6 & -0.5 \\
-0.4 &  0.5 & -0.3 & -0.4 \\
 0.3 &  2.0 &  0.9 & -1.6 \\
 0.0 & -0.2 &  0.1 & -0.3 \\
 0.1 &  0.2 & -0.5 & -0.3 \\
 0.7 &  0.3 &  5.1 & -2.4 \\
-0.4 & -2.3 &  0.3 & -4.0 \\
 0.1 & -0.8 &  0.3 &  2.5 \\
 0.4 & -0.9 & -1.8 &  0.3 \\
-3.9 & -3.5 &  2.8 &  0.8 \\
 0.4 & -2.8 &  0.4 &  1.4 \\
-2.2 & -2.1 & -2.2 & -3.2 \\
-2.7 & -2.6 &  0.3 &  0.6 \\
 2.0 &  2.8 &  0.0 & -0.9 \\
-2.2 &  0.6 &  4.7 & -4.6
\end{matrix}\right)$

$\mathbf{b}=\left(3.2, 6.1, -4.0, 7.6\right)$

## Test

In [1]:
# Set up NumPy and the Gym environment used by the rest of the notebook.
import numpy as np
np.random.seed(0)  # fix NumPy's global random seed
import gym
env = gym.make('BipedalWalker-v3')
# NOTE(review): env.seed() appears to exist only in older Gym releases --
# confirm against the installed Gym version.
env.seed(0)  # seed the environment with the same fixed value
# Show the reward threshold stored in the environment spec
# (the recorded output of this cell is 300).
print(env.spec.reward_threshold)

300


In [2]:
class Agent:
    """Fixed deterministic policy: ``action = observation @ WEIGHTS + BIAS``.

    The parameters are constants, so they are built once at class-definition
    time instead of being re-created on every call to ``decide``, which runs
    once per environment step.
    """

    # Weight matrix W: 24 rows (one per observation component) by
    # 4 columns (one per action component).
    WEIGHTS = np.array([
        [ 0.9, -0.7,  0.0, -1.4],
        [ 4.3, -1.6, -4.4, -2.0],
        [ 2.4, -4.2, -1.3, -0.1],
        [-3.1, -5.0, -2.0, -3.3],
        [-0.8,  1.4,  1.7,  0.2],
        [-0.7,  0.2, -0.2,  0.1],
        [-0.6, -1.5, -0.6,  0.3],
        [-0.5, -0.3,  0.2,  0.1],
        [ 0.0, -0.1, -0.1,  0.1],
        [ 0.4,  0.8, -1.6, -0.5],
        [-0.4,  0.5, -0.3, -0.4],
        [ 0.3,  2.0,  0.9, -1.6],
        [ 0.0, -0.2,  0.1, -0.3],
        [ 0.1,  0.2, -0.5, -0.3],
        [ 0.7,  0.3,  5.1, -2.4],
        [-0.4, -2.3,  0.3, -4.0],
        [ 0.1, -0.8,  0.3,  2.5],
        [ 0.4, -0.9, -1.8,  0.3],
        [-3.9, -3.5,  2.8,  0.8],
        [ 0.4, -2.8,  0.4,  1.4],
        [-2.2, -2.1, -2.2, -3.2],
        [-2.7, -2.6,  0.3,  0.6],
        [ 2.0,  2.8,  0.0, -0.9],
        [-2.2,  0.6,  4.7, -4.6],
        ])

    # Bias vector b, added to every action.
    BIAS = np.array([3.2, 6.1, -4.0, 7.6])

    def decide(self, observation):
        """Return the action for ``observation``.

        Args:
            observation: array-like whose last axis has length 24, matching
                the number of rows of ``WEIGHTS``.

        Returns:
            ``np.matmul(observation, WEIGHTS) + BIAS``; for a 1-D
            observation this is a 1-D array of 4 values. The result is not
            clipped here.
        """
        return np.matmul(observation, self.WEIGHTS) + self.BIAS


agent = Agent()

In [3]:
def play_once(env, agent):
    """Run one episode of ``env`` under ``agent`` and return its total reward.

    The environment is reset, then ``agent.decide`` is queried with the latest
    observation and the chosen action is passed to ``env.step`` until the
    environment reports that the episode is done. ``env.step`` is expected to
    return a 4-tuple ``(observation, reward, done, info)``.
    """
    obs = env.reset()
    total = 0.
    finished = False
    # At least one step is always taken, because the flag starts out False.
    while not finished:
        obs, step_reward, finished, _info = env.step(agent.decide(obs))
        total += step_reward
    return total

Test 100 episodes

In [4]:
# Play 100 episodes with the fixed policy, keeping each episode's total reward.
episode_rewards = [play_once(env, agent) for _ in range(100)]
# Report the mean episode reward, formatted to two decimal places.
print('average episode rewards = {:.2f}'.format(np.mean(episode_rewards)))

average episode rewards = 311.94


In [5]:
# Close the environment now that the evaluation has finished.
env.close()