In [4]:
import gym
import numpy as np
import math
import random

In [5]:
# Create the CartPole-v0 environment instance shared by the cells below.
env = gym.make('CartPole-v0')

In [6]:
# Size of the discrete action space (stored later as NUM_ACTIONS).
print(env.action_space.n)

2


In [7]:
# Inspect the observation space: a 4-component float32 Box.
print(env.observation_space)

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)


In [8]:
# Per-component lower and upper observation limits. Components 1 and 3 span
# the whole float32 range, i.e. they are effectively unbounded.
print(env.observation_space.low)
print(env.observation_space.high)


[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [9]:
NUM_BUCKETS = (1, 1, 6, 3)
# Number of discrete buckets per observation component, in the order
# [cart position, cart velocity, pole angle, pole angular velocity].
# A component with a single bucket always maps to index 0, so it is
# effectively ignored by the Q-table; the pole angle is split into 6 buckets
# and the pole angular velocity into 3.

In [10]:
# Number of actions; becomes the length of the trailing axis of the Q-table.
NUM_ACTIONS = env.action_space.n

In [11]:
# Pair each observation component's lower and upper limit as (low, high).
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
print(STATE_BOUNDS)

[(-4.8, 4.8), (-3.4028235e+38, 3.4028235e+38), (-0.41887903, 0.41887903), (-3.4028235e+38, 3.4028235e+38)]


In [13]:
# Components 1 and 3 are reported with float32-max limits, which are useless
# for bucketing, so replace them with finite working ranges: +/-0.5 for
# component 1 and +/-50 degrees (expressed in radians) for component 3.
# Values outside these ranges are clamped to the edge buckets by
# state_to_bucket().
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]
print(STATE_BOUNDS)

[(-4.8, 4.8), [-0.5, 0.5], (-0.41887903, 0.41887903), [-0.8726646259971648, 0.8726646259971648]]


In [16]:
# Zero-initialised Q-table: one axis per observation component (sized by
# NUM_BUCKETS) followed by a trailing axis with one entry per action.
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))
print(q_table.shape)

(1, 1, 6, 3, 2)


In [17]:
# Show the freshly initialised (all-zero) table.
print(q_table)


[[[[[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]]]]


In [18]:
# Floors applied by get_explore_rate() and get_learning_rate() so that
# neither schedule decays below these values.
EXPLORE_RATE_MIN = 0.01
LEARNING_RATE_MIN = 0.1


In [19]:
def get_explore_rate(t):
    """Return the exploration probability to use for episode index ``t``.

    The value follows ``1 - log10((t + 1) / 25)``, capped above at ``1`` and
    below at ``EXPLORE_RATE_MIN``.
    """
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(1, decayed)
    return max(EXPLORE_RATE_MIN, capped)

In [20]:
def get_learning_rate(t):
    """Return the Q-update step size to use for episode index ``t``.

    The value follows ``1 - log10((t + 1) / 25)``, capped above at ``0.5``
    and below at ``LEARNING_RATE_MIN``.
    """
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(0.5, decayed)
    return max(LEARNING_RATE_MIN, capped)

In [22]:
def select_action(state, explore_rate):
    """Choose an action epsilon-greedily for the discretised ``state``.

    A uniform random draw below ``explore_rate`` yields a random sample from
    the environment's action space; otherwise the index of the largest
    Q-value stored in ``q_table[state]`` is returned.
    """
    should_explore = random.random() < explore_rate
    if not should_explore:
        # Exploit: take the action with the highest current estimate.
        return np.argmax(q_table[state])
    return env.action_space.sample()

In [25]:
def state_to_bucket(state):
    """Convert a continuous observation into a tuple of bucket indices.

    Component ``i`` is clamped to ``STATE_BOUNDS[i]`` and otherwise mapped
    linearly onto ``0 .. NUM_BUCKETS[i] - 1``, rounded to the nearest index.
    The resulting tuple can be used directly as an index into ``q_table``.
    """
    def _bucket_for(dim, value):
        # Discretise one component; out-of-range values land in an edge bucket.
        limits = STATE_BOUNDS[dim]
        if value <= limits[0]:
            return 0
        if value >= limits[1]:
            return NUM_BUCKETS[dim] - 1

        span = limits[1] - limits[0]
        shift = (NUM_BUCKETS[dim] - 1) * limits[0] / span
        factor = (NUM_BUCKETS[dim] - 1) / span
        return int(round(factor * value - shift))

    return tuple(_bucket_for(dim, value) for dim, value in enumerate(state))

In [26]:
def simulate(num_episodes=1000, max_steps=250, solved_t=199, streak_to_end=120,
             discount_factor=0.99, render=True, verbose=True):
    """Train the tabular Q-learning agent on the module-level ``env``.

    Each episode resets the environment, then repeatedly picks an
    epsilon-greedy action, steps the environment and applies the Q-learning
    update to the module-level ``q_table`` in place. Training stops after
    ``num_episodes`` episodes or as soon as more than ``streak_to_end``
    consecutive episodes have each ended at a step index of at least
    ``solved_t``.

    All parameters default to the values that were previously hard-coded,
    so calling ``simulate()`` with no arguments trains exactly as before.

    Args:
        num_episodes: Maximum number of episodes to run.
        max_steps: Upper bound on the number of steps taken per episode.
        solved_t: Zero-based step index an episode must reach before ending
            for it to count towards the success streak.
        streak_to_end: Training stops once the streak exceeds this value.
        discount_factor: Discount applied to the next state's best Q-value.
        render: Whether to call ``env.render()`` before every step.
        verbose: Whether to print a status block after every step.

    Returns:
        None. The learned values are left in ``q_table``.
    """
    num_streaks = 0

    for episode in range(num_episodes):
        # Both schedules depend only on the episode index, so they are
        # evaluated once per episode rather than once per step.
        learning_rate = get_learning_rate(episode)
        explore_rate = get_explore_rate(episode)
        obs = env.reset()
        state_0 = state_to_bucket(obs)

        for t in range(max_steps):
            if render:
                env.render()
            action = select_action(state_0, explore_rate)
            obs, reward, done, _ = env.step(action)
            state = state_to_bucket(obs)
            best_q = np.amax(q_table[state])
            # Q-learning update: move Q(s, a) towards r + gamma * max_a' Q(s', a').
            # NOTE(review): the target bootstraps from the next state even when
            # `done` is True; kept as-is to preserve the training behaviour.
            q_table[state_0 + (action,)] += learning_rate * (
                reward + discount_factor * best_q - q_table[state_0 + (action,)]
            )
            state_0 = state

            if verbose:
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)
                print("")

            if done:
                # `t` is zero-based, so the episode lasted t + 1 steps; it is
                # an integer and is therefore formatted with %d.
                print("Episode %d finished after %d steps" % (episode, t + 1))

                if t >= solved_t:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

        if num_streaks > streak_to_end:
            break


In [27]:
# Run training; a status block is printed for every environment step.
simulate()


Episode = 0
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 1
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 2
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 0.500000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 4
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 0.997500
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 5
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 1.492513
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 6
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 1.492513
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episo