In [29]:
import numpy as np
import gym
from gym.spaces.box import Box
from gym.spaces.discrete import Discrete
from collections import defaultdict

In [30]:
Q = defaultdict(float)
gamma = 0.99 # discount factor
alpha = 0.5  # soft update param

env = gym.make('CartPole-v0')
actions = range(env.action_space.n)

# Q(s,a) <- (1 - alpha) * Q(s, a) + alpha * (r + rho * max_an(Q(sn, an)))

def updateQ(s, r, a, s_n, done):
    max_q_n = max([Q[s_n, a] for a in actions])
    Q[s, a] += alpha * (r + gamma * max_q_n * (1. - done) - Q[s, a])

In [31]:
class DiscretizedObservationWrapper(gym.ObservationWrapper):
    """This wrapper converts a Box observation into a single integer.
    """
    def __init__(self, env, n_bins=10, low=None, high=None):
        super().__init__(env)
        assert isinstance(env.observation_space, Box)

        low = self.observation_space.low if low is None else low
        high = self.observation_space.high if high is None else high

        self.n_bins = n_bins
        self.val_bins = [np.linspace(l, h, n_bins + 1) for l, h in
                         zip(low.flatten(), high.flatten())]
        self.observation_space = Discrete(n_bins ** low.flatten().shape[0])

    def _convert_to_one_number(self, digits):
        return sum([d * ((self.n_bins + 1) ** i) for i, d in enumerate(digits)])

    def observation(self, observation):
        digits = [np.digitize([x], bins)[0]
                  for x, bins in zip(observation.flatten(), self.val_bins)]
        return self._convert_to_one_number(digits)

In [32]:
env = DiscretizedObservationWrapper(
    env, 
    n_bins=8, 
    low=np.array([-2.4, -2.0, -0.42, -3.5]), 
    high=np.array([2.4, 2.0, 0.42, 3.5])
)

In [38]:
n_steps = 100000
epsilon = 0.1  # 10% chances to apply a random action

def act(ob):
    if np.random.random() < epsilon:
        # action_space.sample() is a convenient function to get a random action
        # that is compatible with this given action space.
        return env.action_space.sample()

    # Pick the action with highest q value.
    qvals = {a: Q[ob, a] for a in actions}
    max_q = max(qvals.values())
    # In case multiple actions have the same maximum q value.
    actions_with_max_q = [a for a, q in qvals.items() if q == max_q]
    return np.random.choice(actions_with_max_q)

ob = env.reset()
rewards = []
reward = 0.0

for step in range(n_steps):
    a = act(ob)
    ob_next, r, done, _ = env.step(a)
    updateQ(ob, r, a, ob_next, done)
    reward += r
    if done:
        rewards.append(reward)
        reward = 0.0
        ob = env.reset()
    else:
        ob = ob_next

In [39]:
Q

defaultdict(float,
            {(4010, 0): 80.48266660057284,
             (4010, 1): 78.77040355222542,
             (3290, 0): 78.31831898958434,
             (3290, 1): 55.60378553138088,
             (3281, 0): 77.88372339771992,
             (3281, 1): 76.71384010755162,
             (3280, 0): 78.52583779114286,
             (3280, 1): 66.96009363498439,
             (3289, 0): 85.5708252834113,
             (3289, 1): 76.24922772231085,
             (2479, 0): 46.79416222780124,
             (2479, 1): 11.264946827375521,
             (3209, 0): 74.77199983382403,
             (3209, 1): 54.4745166002334,
             (3200, 0): 74.02422656194013,
             (3200, 1): 65.0036574841906,
             (2399, 0): 0.0,
             (2399, 1): 0.0,
             (3371, 0): 63.74432201068791,
             (3371, 1): 72.74684574899896,
             (2570, 0): 76.71154131648856,
             (2570, 1): 52.73274701227445,
             (1850, 0): 24.272025025360623,
             (1850, 1