In [1]:
#Import libraries
import numpy as np
import gym
from gym import wrappers

In [2]:
# Parameters
# Number of bins per state dimension; the Q-table is n_states x n_states x actions.
n_states = 50
# Step size applied to the temporal-difference error in the Q-table update.
alpha = 0.03
# Factor multiplying the best next-state Q-value in the update target.
discount = 0.95
# Number of training episodes run by the training loop.
num_episodes = 600000
# Base exploration probability; the training loop divides it by (episode index + 1).
e = 0.1

In [6]:
# Functions

def discretize_state(env, state, num_bins=None):
    """Map a continuous two-component observation to a pair of bin indices.

    Each of the first two observation components is split into ``num_bins``
    equal-width bins spanning ``env.observation_space.low`` to
    ``env.observation_space.high``.

    Args:
        env: environment exposing ``observation_space.low`` and
            ``observation_space.high`` as indexable bounds.
        state: observation; ``state[0]`` and ``state[1]`` are discretised.
        num_bins: bins per dimension. Defaults to the module-level
            ``n_states`` when omitted.

    Returns:
        Tuple ``(position, velocity)`` of ints, each in ``[0, num_bins - 1]``.
    """
    if num_bins is None:
        num_bins = n_states
    state_low = env.observation_space.low
    state_high = env.observation_space.high
    state_dx = (state_high - state_low) / num_bins
    position = int((state[0] - state_low[0]) / state_dx[0])
    velocity = int((state[1] - state_low[1]) / state_dx[1])
    # An observation sitting exactly on the upper bound would otherwise map to
    # index ``num_bins``, one past the end of the Q-table; clamp both ends.
    last_bin = num_bins - 1
    position = min(max(position, 0), last_bin)
    velocity = min(max(velocity, 0), last_bin)
    return position, velocity

def get_random_policy(env):
    """Build a policy table holding one randomly drawn action per state bin."""
    action_count = env.action_space.n
    grid_shape = (n_states, n_states)
    return np.random.choice(action_count, size=grid_shape)

In [7]:
if __name__ == '__main__':
    # Train a tabular Q-learning agent on the discretised MountainCar state.
    env_name = 'MountainCar-v0'
    env = gym.make(env_name)

    print ('Q table learning with discrete states')
    # One Q-value per (position bin, velocity bin, action). The action axis is
    # sized from the environment instead of being hard-coded.
    q_table = np.zeros((n_states, n_states, env.action_space.n))
    for i in range(num_episodes):
        state = env.reset()
        total_reward = 0
        for j in range(10000):
            pos,vel = discretize_state(env, state)
            # Epsilon-greedy action choice; epsilon shrinks as e / (episode + 1).
            if np.random.rand(1) < e/(i+1):
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[pos][vel])
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            # update q table
            pos_next, vel_next = discretize_state(env, next_state)
            q_table[pos][vel][action] = q_table[pos][vel][action] + alpha * (reward + discount *  np.max(q_table[pos_next][vel_next]) - q_table[pos][vel][action])
            # Advance to the observed state. Without this every step would act
            # on, and update, the bin of the initial observation only.
            state = next_state
            if done:
                break
        if i % 100 == 0:
            print(i, ' - ', total_reward)
    # Greedy policy extracted from the learned table. The rendered evaluation
    # run lives in the final cell, after run_episode has been defined; calling
    # it here would raise NameError on a fresh kernel.
    policy = np.argmax(q_table, axis=2)
    

[2017-07-05 11:38:18,675] Making new env: MountainCar-v0


Q table learning with discrete states
(0, ' - ', -200.0)
(100, ' - ', -200.0)
(200, ' - ', -200.0)
(300, ' - ', -200.0)
(400, ' - ', -200.0)


KeyboardInterrupt: 

In [None]:
def run_episode(env, policy=None, render=False):
    """Run one episode and return the sum of rewards collected.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(state, reward, done, info)``, ``render()`` and
            ``action_space.sample()``.
        policy: optional table indexed as ``policy[pos][vel]`` with the bin
            indices produced by ``discretize_state``. When ``None`` actions
            are sampled from ``env.action_space``.
        render: when true, ``env.render()`` is called before every step.

    Returns:
        Total reward accumulated until the environment reports ``done``.
    """
    state = env.reset()
    total_reward = 0
    # Loop until the environment signals termination. The original relied on
    # itertools.count() without importing itertools and never used the counter.
    while True:
        if render:
            env.render()
        if policy is None:
            action = env.action_space.sample()
        else:
            pos,vel = discretize_state(env, state)
            action = policy[pos][vel]
        state, reward, done, _ = env.step(action)
        total_reward += reward
        # The former ``win_count`` tally was dropped: it was never initialised
        # (UnboundLocalError on a zero reward) and never returned.
        if done:
            break
    return total_reward


In [None]:
# Derive the greedy action for every (position, velocity) bin, then replay one
# rendered episode with that policy and report its total reward.
policy = q_table.argmax(axis=2)
tot_reward = run_episode(env, policy, render=True)
print("score = ", tot_reward)