In [1]:
import gym
import numpy as np
from numpy.random import randn
from numpy.random import rand
import itertools
import random
import pprint as pp

## Observations
## Type: Box(24)

|Num|	  Observation |              Min	|     Max	|Mean|
| --- | --- | --- |--- | --- |
|0	|hull_angle	           |     0	  |  2*pi| 0.5|
|1	|hull_angularVelocity	 |  -inf	|    +inf|	-|
|2|	vel_x                    |   -1	    |+1	  |  -|
|3|	vel_y	                 |   -1	    |+1	|-|
|4|	hip_joint_1_angle	     |  -inf	    |+inf|	-|
|5	|hip_joint_1_speed	     |   -inf	|+inf	|-|
|6|	knee_joint_1_angle   	 |  -inf	    | +inf|	-|
|7|	knee_joint_1_speed	     |  -inf	     |+inf|	-|
|8	|leg_1_ground_contact_flag|	 0	     | 1	|-|
|9	|hip_joint_2_angle	      | -inf  	|+inf	|-|
|10	|hip_joint_2_speed	      |-inf   	|+inf	|-|
|11	|knee_joint_2_angle	      | -inf  	|+inf	|-|
|12	|knee_joint_2_speed	       |-inf	    |+inf|	-|
|13	|leg_2_ground_contact_flag	|  0	    |  1|	    -|
|14-23|	10 lidar readings	    |-inf	|+inf	|-|




## Actions :

|Num|	Name|	Min	|Max|
| --- | --- | --- |--- | 
|0	|Hip_1 (Torque / Velocity)	|-1	|+1|
|1|	Knee_1 (Torque / Velocity)	|-1	|+1|
|2	|Hip_2 (Torque / Velocity)	|-1	|+1|
|3|	Knee_2 (Torque / Velocity)	|-1	|+1|

In [2]:
# Build the environment once; later cells reuse the module-level `env`.
env_name = "BipedalWalker-v3"
env = gym.make(env_name)

# Show the shapes and bounds of the spaces the agent has to work with.
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(-inf, inf, (24,), float32)
Action space: Box(-1.0, 1.0, (4,), float32)




In [3]:
class HillClimbingAgent():
    """Stochastic hill-climbing agent with one linear scorer per action component.

    Each of the four action components has its own weight matrix of shape
    (state_dim, n_bins). The state is projected through the matrix, the
    highest-scoring bin is selected, and the emitted value is drawn uniformly
    from that bin's sub-interval of [-1, 1].

    After every episode `update_model` keeps the weights if the episode reward
    is at least as good as the best seen so far, then proposes new candidate
    weights by perturbing the best weights with zero-centred noise.
    """

    # Edges of the equal-width bins partitioning the action range;
    # bin i spans ACTION_BIN_EDGES[i] .. ACTION_BIN_EDGES[i + 1].
    ACTION_BIN_EDGES = (-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1)

    # Used when `env` does not expose an observation space with a shape.
    DEFAULT_STATE_DIM = 24

    def __init__(self, env):
        """Create the agent and initialise its weights.

        Args:
            env: Environment object. If it exposes `observation_space.shape`,
                the first dimension is used as the state size; otherwise the
                state size falls back to `DEFAULT_STATE_DIM`.
        """
        space = getattr(env, "observation_space", None)
        shape = getattr(space, "shape", None)
        self.state_dim = int(shape[0]) if shape else self.DEFAULT_STATE_DIM
        self.build_model()

    def build_model(self):
        """(Re)initialise weights, best-so-far bookkeeping and the noise scale."""
        n_bins = len(self.ACTION_BIN_EDGES) - 1
        # Tiny initial weights: one matrix per action component.
        self.w1 = 1e-4*np.random.rand(self.state_dim, n_bins)
        self.w2 = 1e-4*np.random.rand(self.state_dim, n_bins)
        self.w3 = 1e-4*np.random.rand(self.state_dim, n_bins)
        self.w4 = 1e-4*np.random.rand(self.state_dim, n_bins)
        self.best_reward = -np.inf
        self.best_w1 = np.copy(self.w1)
        self.best_w2 = np.copy(self.w2)
        self.best_w3 = np.copy(self.w3)
        self.best_w4 = np.copy(self.w4)
        self.noise_scale = 1e-2

    def get_action(self, state):
        """Return a list of four action values for `state`.

        Args:
            state: Array-like of length `state_dim`.

        Returns:
            list of four floats, one per weight matrix (w1..w4), each drawn
            from the bin whose score `state @ w` is largest.
        """
        # argmax picks the best-scoring bin; the value inside it is random.
        return [
            self.random_from_action_range(np.argmax(np.dot(state, weights)))
            for weights in (self.w1, self.w2, self.w3, self.w4)
        ]

    # stochastic hill climbing: take random from nearest best neighbor
    def random_from_action_range(self, index):
        """Draw a uniform random value from action bin `index`.

        Args:
            index: Bin number in 0..len(ACTION_BIN_EDGES) - 2.

        Returns:
            float between the lower and upper edge of the selected bin.

        Raises:
            ValueError: if `index` is outside the valid bin range (previously
                this silently returned None).
        """
        edges = self.ACTION_BIN_EDGES
        if not 0 <= index < len(edges) - 1:
            raise ValueError(
                "action bin index must be in [0, {}], got {}".format(len(edges) - 2, index)
            )
        return random.uniform(edges[index], edges[index + 1])

    def _perturbation(self, shape):
        """Return zero-centred noise of `shape`, bounded by +/- noise_scale."""
        # Symmetric noise lets candidates move weights both up and down;
        # noise drawn from [0, 1) could only ever increase them.
        return self.noise_scale * np.random.uniform(-1.0, 1.0, size=shape)

    def update_model(self, reward):
        """Accept or reject the current weights and propose new candidates.

        Args:
            reward: Total reward obtained with the current weights.

        If `reward` is at least the best seen, the current weights become the
        new best and the noise scale is halved (floor 1e-3); otherwise the
        noise scale is doubled (cap 2). New candidate weights are then the
        best weights plus fresh noise.
        """
        if reward >= self.best_reward:
            self.best_reward = reward
            self.best_w1 = np.copy(self.w1)
            self.best_w2 = np.copy(self.w2)
            self.best_w3 = np.copy(self.w3)
            self.best_w4 = np.copy(self.w4)
            # Improvement: search closer to the current best.
            self.noise_scale = max(self.noise_scale/2, 1e-3)
        else:
            # No improvement: widen the search radius.
            self.noise_scale = min(self.noise_scale*2, 2)

        self.w1 = self.best_w1 + self._perturbation(self.best_w1.shape)
        self.w2 = self.best_w2 + self._perturbation(self.best_w2.shape)
        self.w3 = self.best_w3 + self._perturbation(self.best_w3.shape)
        self.w4 = self.best_w4 + self._perturbation(self.best_w4.shape)

In [4]:
agent = HillClimbingAgent(env)
num_episodes = 100

# try/finally ensures env.close() runs even if the loop is interrupted
# (e.g. KeyboardInterrupt from stopping the cell) or an episode raises.
try:
    for ep in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        # Run one full episode with the current candidate weights.
        while not done:
            action = agent.get_action(state)
            state, reward, done, info = env.step(action)
            env.render()
            total_reward += reward

        # Hill-climbing step: keep or discard the candidate based on the episode return.
        agent.update_model(total_reward)
        print("Episode: {}, total_reward: {:.2f}".format(ep, total_reward))
finally:
    env.close()

Episode: 0, total_reward: -105.25
Episode: 1, total_reward: -105.93
Episode: 2, total_reward: -44.43
Episode: 3, total_reward: -136.03
Episode: 4, total_reward: -67.24
Episode: 5, total_reward: -61.52
Episode: 6, total_reward: -128.58
Episode: 7, total_reward: -50.22
Episode: 8, total_reward: -112.33
Episode: 9, total_reward: -106.33
Episode: 10, total_reward: -59.91
Episode: 11, total_reward: -126.66
Episode: 12, total_reward: -84.77
Episode: 13, total_reward: -118.34
Episode: 14, total_reward: -117.62
Episode: 15, total_reward: -114.53
Episode: 16, total_reward: -111.90
Episode: 17, total_reward: -94.60
Episode: 18, total_reward: -104.85
Episode: 19, total_reward: -107.85
Episode: 20, total_reward: -108.99
Episode: 21, total_reward: -95.77
Episode: 22, total_reward: -122.46
Episode: 23, total_reward: -107.07
Episode: 24, total_reward: -68.71
Episode: 25, total_reward: -105.36
Episode: 26, total_reward: -118.16
Episode: 27, total_reward: -36.02
Episode: 28, total_reward: -36.74
Episod

KeyboardInterrupt: 