# Q Learning for MountainCar

### Imports

In [1]:
import numpy as np

import gym
from gym import wrappers



import time

# Imports specifically so we can render outputs in Jupyter.
import matplotlib.pyplot as plt
%matplotlib inline
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

### Util functions (display)

In [2]:
def display_frames_as_gif(frames):
    """
    Show a sequence of frames inline as a looping animation with playback controls.

    frames: indexable sequence of images that ``plt.imshow`` can draw. The first
    frame creates the image; every animation step swaps in the next frame's data.
    """
    image = plt.imshow(frames[0])
    plt.axis('off')

    def show_frame(frame_idx):
        # Replace the pixel data of the existing image instead of redrawing the axes.
        image.set_data(frames[frame_idx])

    figure = plt.gcf()
    movie = animation.FuncAnimation(figure, show_frame, frames=len(frames), interval=50)
    widget = display_animation(movie, default_mode='loop')
    display(widget)

### Hyperparams

In [3]:
# Discretization and episode limits
n_states = 60      # Number of bins per observation dimension; more bins give a finer grid
iter_max = 10000   # Number of training episodes
t_max = 200        # Max actions per episode; the environment treats an unfinished 200-step episode as a failure

# Tunable learning parameters
initial_lr = 1.0   # Initial learning rate
min_lr = 0.003     # Minimum learning rate
gamma = 1.0        # Discount factor
eps = 0.02         # Probability of taking a random action

# Candidate values for the grid search:
# gamma from 0.0 to 1.0 in steps of 0.1, eps from 0.00 to 0.10 in steps of 0.01.
gamma_list = [k / 10.0 for k in range(11)]
eps_list = [k / 100.0 for k in range(11)]

### Create environment

In [4]:
# Identifier of the Gym environment this notebook trains on.
env_name = 'MountainCar-v0'
# Single environment instance; the cells below reset and step this same object.
env = gym.make(env_name)

[2017-12-03 14:50:49,466] Making new env: MountainCar-v0


### Observation to state (discretize)

Given an observation, it returns a discretized [position, speed] state pair.

In [5]:
def obs_to_state(env, obs, n_bins=None):
    """
    Map a continuous observation to a discretized (position, speed) index pair.

    Each observation dimension is split into ``n_bins`` equal-width bins between
    the lower and upper bounds reported by the environment.

    env: environment whose ``observation_space.low`` / ``.high`` give the bounds.
    obs: observation; ``obs[0]`` is the position and ``obs[1]`` is the speed.
    n_bins: number of bins per dimension; defaults to the notebook-level ``n_states``.

    Returns a tuple of two ints, each clamped to ``[0, n_bins - 1]``.
    """
    if n_bins is None:
        n_bins = n_states
    env_low = env.observation_space.low
    env_high = env.observation_space.high
    env_dx = (env_high - env_low) / n_bins
    last_bin = n_bins - 1
    # An observation sitting exactly on an upper bound divides out to n_bins,
    # which is one past the last valid index of an n_bins-sized table axis;
    # clamping folds it into the last bin (and guards the lower edge as well).
    position = min(max(int((obs[0] - env_low[0]) / env_dx[0]), 0), last_bin)
    speed = min(max(int((obs[1] - env_low[1]) / env_dx[1]), 0), last_bin)
    return position, speed


In [6]:
# Sanity check: reset the environment and discretize the initial observation.
obs = env.reset()
pos,speed = obs_to_state(env,obs)
# These two indices address the position and speed axes of the Q-table.
print('Position index on Q-Table: %d' %(pos))
print('Speed index on Q-Table: %d' %(speed))

Position index on Q-Table: 20
Speed index on Q-Table: 30


### Run a single episode

In [7]:
def run_episode(env, policy=None, render=False):
    """
    Play one episode of at most ``t_max`` steps and return its discounted reward.

    env: environment to reset and step.
    policy: table indexed as ``policy[pos][speed]`` giving the action for each
        discretized state; when None, actions are sampled from ``env.action_space``.
    render: when True, grab an RGB frame before every step, then close the
        viewer and display the collected frames as an animation.

    Returns the sum of ``gamma ** step * reward`` over the steps taken, using
    the notebook-level ``gamma``.
    """
    captured = []
    episode_return = 0
    obs = env.reset()
    for step in range(t_max):
        if render:
            captured.append(env.render(mode = 'rgb_array'))
        if policy is not None:
            # Look up the action for the current discretized state.
            pos, speed = obs_to_state(env, obs)
            action = policy[pos][speed]
        else:
            action = env.action_space.sample()
        obs, reward, done, _info = env.step(action)
        episode_return += gamma ** step * reward
        if done:
            break

    if render:
        env.render(close=True)
        display_frames_as_gif(captured)
    return episode_return


#### Random episode for example

In [8]:
# No policy is passed, so run_episode samples random actions; render=True also shows the animation.
total_reward = run_episode(env,render=True)
print('Total reward: %d' % total_reward)

Total reward: -200


### Grid search

Choosing the right hyperparameters is always important.

Unfortunately, it can be very time-consuming.

Here's a function that does a grid search over the _gamma_ and _eps_ parameters; feel free to use it. I'll skip it and use the default values.

In [9]:
def grid_train(env,num_epochs,gamma,eps):
    """
    Train a tabular Q-learning agent with the given ``gamma`` / ``eps`` and score it.

    env: environment to train on; observations are discretized with ``obs_to_state``.
    num_epochs: number of training episodes.
    gamma: discount factor used in the Q-learning update.
    eps: probability of taking a uniformly random action instead of sampling
        from the softmax over the current state's Q-values.

    Returns ``(best_score, eps, gamma)``, where ``best_score`` is the highest
    reward among 100 evaluation episodes of the greedy policy.
    """
    n_actions = env.action_space.n
    #Initialize Q-Table: [number_of_positions x number_of_speeds x number_of_actions]
    q_table = np.zeros((n_states, n_states, n_actions))
    for i in range(num_epochs):
        obs = env.reset()
        # Learning rate shrinks by a factor of 0.85 every 100 episodes, floored at min_lr.
        eta = max(min_lr, initial_lr * (0.85 ** (i//100)))
        # No step cap here: the loop only ends when the environment reports done.
        while True:
            pos, speed = obs_to_state(env, obs)
            if np.random.uniform(0, 1) < eps:
                action = np.random.choice(n_actions)
            else:
                logits = q_table[pos][speed]
                # Subtract the max before exponentiating. The resulting distribution
                # is unchanged, but strongly negative Q-values no longer underflow
                # to an all-zero vector, which would yield NaN probabilities.
                logits_exp = np.exp(logits - np.max(logits))
                probs = logits_exp / np.sum(logits_exp)
                action = np.random.choice(n_actions, p=probs)
            obs, reward, done, _ = env.step(action)
            pos_, speed_ = obs_to_state(env, obs)
            # Q-learning update: move the estimate toward reward + discounted best next value.
            td_target = reward + gamma * np.max(q_table[pos_][speed_])
            q_table[pos][speed][action] += eta * (td_target - q_table[pos][speed][action])
            if done:
                break
    solution_policy = np.argmax(q_table, axis=2)
    # run_episode discounts with the notebook-level gamma, not this function's argument.
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    best_score = np.max(solution_policy_scores)
    print('eps: %.2f - gamma: %.2f - best score: %.f' % (eps,gamma,best_score))

    return best_score,eps,gamma

In [10]:
def grid_search(eps_list,gamma_list,num_epochs):
    """
    Run ``grid_train`` on the notebook-level ``env`` for every (eps, gamma) pair.

    eps_list: exploration probabilities to try (outer loop).
    gamma_list: discount factors to try (inner loop).
    num_epochs: number of training episodes per combination.

    After the first combination finishes, prints a total-time estimate
    extrapolated from its duration. Returns the list of ``grid_train`` results
    in iteration order.
    """
    start = time.time()
    search_n = len(eps_list)*len(gamma_list)
    print('Starting grid search for:\n%s\n%s\n%d values' % (eps_list,gamma_list,search_n))
    search = []
    for eps_ in eps_list:
        for gamma_ in gamma_list:
            # The result list is still empty only before the very first run.
            is_first_run = not search
            search.append(grid_train(env,num_epochs,gamma_,eps_))
            if is_first_run:
                elapsed = time.time() - start
                print('Estimated time: %.1f minutes' % (elapsed*search_n/60))

    return search

In [None]:
# Optional: trains one agent per (eps, gamma) combination with iter_max episodes each.
search_result = grid_search(eps_list,gamma_list,iter_max)

### Train

In [11]:
def train_q_learning(env):
    """
    Train a tabular Q-learning agent with the notebook-level hyperparameters.

    Runs ``iter_max`` episodes of at most ``t_max`` steps each, printing the
    last episode's reward roughly ten times over the run, then evaluates the
    greedy policy on 100 episodes and prints the mean score.

    env: environment to train on; observations are discretized with ``obs_to_state``.

    Returns the greedy policy: an array indexed ``[position][speed]`` holding
    the action with the highest Q-value for each discretized state.
    """
    print ('Start Q-Learning training:')
    # At least 1, so the modulo below cannot divide by zero when iter_max < 10.
    display_freq = max(1, iter_max // 10)
    n_actions = env.action_space.n

    #Initialize Q-Table: [number_of_positions x number_of_speeds x number_of_actions]
    q_table = np.zeros((n_states, n_states, n_actions))

    for i in range(iter_max):
        obs = env.reset()
        total_reward = 0
        ## eta: learning rate shrinks by a factor of 0.85 every 100 episodes, floored at min_lr
        eta = max(min_lr, initial_lr * (0.85 ** (i//100)))

        for _step in range(t_max):
            pos, speed = obs_to_state(env, obs) #Discretized state used to index the Q-Table

            if np.random.uniform(0, 1) < eps: #Randomize sometimes
                action = np.random.choice(n_actions)
            else:
                #Sample an action from the softmax over this state's Q-values
                logits = q_table[pos][speed]
                # Subtract the max before exponentiating. The resulting distribution
                # is unchanged, but strongly negative Q-values no longer underflow
                # to an all-zero vector, which would yield NaN probabilities.
                logits_exp = np.exp(logits - np.max(logits))
                probs = logits_exp / np.sum(logits_exp)
                action = np.random.choice(n_actions, p=probs)

            obs, reward, done, _info = env.step(action)
            total_reward += reward

            #Update Q-Table: move the estimate toward reward + discounted best next value
            pos_, speed_ = obs_to_state(env, obs)
            td_target = reward + gamma * np.max(q_table[pos_][speed_])
            q_table[pos][speed][action] += eta * (td_target - q_table[pos][speed][action])

            if done:
                break
        if i % display_freq == 0: #Write out partial results
            print('At epoch: %d - Reward last episode: %d' %(i+1, total_reward))

    print('Training finished!')
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(100)]
    print("Average score of solution = ", np.mean(solution_policy_scores))

    return solution_policy

In [12]:
# Train on the shared environment and keep the resulting greedy policy table.
sol_policy = train_q_learning(env)

Start Q-Learning training:
At epoch: 1 - Reward last episode: -200
At epoch: 1001 - Reward last episode: -200
At epoch: 2001 - Reward last episode: -200
At epoch: 3001 - Reward last episode: -200
At epoch: 4001 - Reward last episode: -200
At epoch: 5001 - Reward last episode: -200
At epoch: 6001 - Reward last episode: -200
At epoch: 7001 - Reward last episode: -200
At epoch: 8001 - Reward last episode: -200
At epoch: 9001 - Reward last episode: -200
Training finished!
Average score of solution =  -200.0


### Test it!

Let's play the game using the policy derived from the Q-table that the Q-learning training process produced.

In [13]:
# Animate one episode played with the learned policy; the cell's output is that episode's reward.
run_episode(env, sol_policy, True)

-200.0

WOOHOO!