# Setup

In [1]:
import numpy as np
import gym
import random

In [2]:
# Create the Taxi environment, reset it to obtain a starting state, and draw the grid.
env = gym.make("Taxi-v2")
state = env.reset()  # the returned state is used later as a row index into the Q-table
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+



# Random

In [3]:
# Baseline: keep sampling random actions until the environment reports the episode is done,
# rendering the grid after every step and accumulating the reward.
total_reward, done = 0, False

while not done:
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    total_reward = total_reward + reward
    env.render()

print('Total reward:', total_reward)

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (West)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (West)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[43mB[0m: |
+---------+
  (West)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: |

# Goal 1 – Q-Learning

## Q-table structure

In [4]:
# The two sizes below become the column count (actions) and row count (states) of the Q-table.
print("Action space size: ", env.action_space.n)
print("State space size: ", env.observation_space.n)

Action space size:  6
State space size:  500


In [5]:
# One row per state and one column per action; every Q-value starts at zero.
qtable = np.zeros((env.observation_space.n, env.action_space.n))
qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Hyperparams

In [55]:
total_episodes = 50000  # number of training episodes
max_steps = 99  # upper bound on steps taken within a single episode

learning_rate = 0.7  # step size applied to the temporal-difference error in the Q-table update
gamma = 0.6  # discount factor applied to the best Q-value of the next state

# Exploration schedule: epsilon is the probability of taking a random action.
epsilon = 1.0  # value used for the first episode
max_epsilon = 1.0  # starting point of the exponential decay
min_epsilon = 0.01  # value the decay approaches as episodes increase
decay_rate = 0.01  # exponent coefficient of the per-episode decay

## Training

In [56]:
# Train a tabular Q-learning agent with an exponentially decaying exploration rate.
qtable = np.zeros((env.observation_space.n, env.action_space.n))

# Decay a working copy rather than the `epsilon` hyperparameter itself, so that
# re-running this cell starts from the configured value instead of the
# already-decayed one left behind by a previous run.
current_epsilon = epsilon

for episode in range(total_episodes):
    state = env.reset()
    done = False

    for step in range(max_steps):
        # explore or exploit
        if random.uniform(0,1)<current_epsilon:
            action = env.action_space.sample() # random
        else:
            action = np.argmax(qtable[state, :]) # action with the highest Q-value, for a given state

        # take action
        new_state, reward, done, info = env.step(action)

        # update Q-table: move Q(s, a) towards reward + gamma * max_a' Q(s', a')
        qtable[state, action] += learning_rate * (reward + gamma*np.max(qtable[new_state, :]) - qtable[state, action])

        state = new_state

        if done:
            break

    # less exploration as training progresses
    current_epsilon = min_epsilon + (max_epsilon-min_epsilon)*np.exp(-decay_rate*episode)

## Testing

In [63]:
def evaluate_qlearning(debug=True):
    """Evaluate the greedy policy of the notebook-global ``qtable``.

    Plays 1000 episodes on the global ``env``, always taking the action with
    the highest Q-value, with at most ``max_steps`` (global) steps per episode.

    Parameters
    ----------
    debug : bool
        When true, print a progress marker every 100 episodes.

    Returns
    -------
    float
        Mean total reward per episode (also printed).
    """
    episodes = 1000
    rewards = []

    for episode in range(episodes):
        state = env.reset()
        total_rewards = 0

        if debug and episode%100 == 0:
            print("-"*50)
            print("EPISODE ", episode)

        for step in range(max_steps):
            action = np.argmax(qtable[state,:]) # action with the highest Q-value, for a given state
            state, reward, done, info = env.step(action)
            total_rewards += reward

            if done:
                break

        rewards.append(total_rewards)

    # The environment is shared with the training cells that run after this
    # function, so it is deliberately left open here.
    score = sum(rewards)/episodes
    print("Score over time: ",  score)
    return score

In [58]:
# Score the Q-table trained with the decaying-epsilon schedule.
evaluate_qlearning()

--------------------------------------------------
EPISODE  0
--------------------------------------------------
EPISODE  100
--------------------------------------------------
EPISODE  200
--------------------------------------------------
EPISODE  300
--------------------------------------------------
EPISODE  400
--------------------------------------------------
EPISODE  500
--------------------------------------------------
EPISODE  600
--------------------------------------------------
EPISODE  700
--------------------------------------------------
EPISODE  800
--------------------------------------------------
EPISODE  900
Score over time:  8.428


# Goal 2 – Beat Basic Q-Learning

## Let's first try with a constant `epsilon`

In [119]:
total_episodes = 50000  # number of training episodes
max_steps = 99  # upper bound on steps taken within a single episode

learning_rate = 0.7  # step size applied to the temporal-difference error
gamma = 0.6  # discount factor applied to the best Q-value of the next state

epsilon = 0.5  # fixed probability of a random action; the training loop for this experiment never changes it

In [120]:
# Fresh Q-table for the constant-epsilon experiment.
qtable = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(total_episodes):
    state, done = env.reset(), False

    for step in range(max_steps):
        # epsilon-greedy choice: random action with probability epsilon, greedy action otherwise
        explore = random.uniform(0,1) < epsilon
        action = env.action_space.sample() if explore else np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        # move Q(s, a) towards the reward plus the discounted best value of the next state
        td_target = reward + gamma*np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state
        if done:
            break

In [122]:
# Score the Q-table trained with the constant epsilon of 0.5.
evaluate_qlearning()

Score over time:  8.54


8.54

### lower `epsilon`

In [66]:
total_episodes = 50000  # number of training episodes
max_steps = 99  # upper bound on steps taken within a single episode

learning_rate = 0.7  # step size applied to the temporal-difference error
gamma = 0.6  # discount factor applied to the best Q-value of the next state

epsilon = 0.1  # fixed probability of a random action; lower than the 0.5 tried before

In [67]:
# Start again from an all-zero Q-table for the lower constant epsilon.
qtable = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(total_episodes):
    state, done = env.reset(), False

    for step in range(max_steps):
        # with probability epsilon act randomly, otherwise pick the best-known action
        explore = random.uniform(0,1) < epsilon
        action = env.action_space.sample() if explore else np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        # Q-learning update expressed through an explicit TD target
        td_target = reward + gamma*np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = new_state
        if done:
            break

In [68]:
# Score the Q-table trained with the constant epsilon of 0.1.
evaluate_qlearning()

--------------------------------------------------
EPISODE  0
--------------------------------------------------
EPISODE  100
--------------------------------------------------
EPISODE  200
--------------------------------------------------
EPISODE  300
--------------------------------------------------
EPISODE  400
--------------------------------------------------
EPISODE  500
--------------------------------------------------
EPISODE  600
--------------------------------------------------
EPISODE  700
--------------------------------------------------
EPISODE  800
--------------------------------------------------
EPISODE  900
Score over time:  8.485


### higher `epsilon`

A higher `epsilon` would behave much like the random approach, so we can skip this.

## Grid Search with decaying `epsilon`

In [117]:
def train_qlearning(params=None):
    """Train a tabular Q-learning agent on the global ``env`` and return its Q-table.

    Parameters
    ----------
    params : dict, optional
        Hyperparameter overrides; keys that are not listed here are ignored.
        Recognised keys and their defaults:

        - ``total_episodes`` (50000): number of training episodes
        - ``max_steps`` (99): maximum steps per episode
        - ``learning_rate`` (0.7): step size of the Q-table update
        - ``gamma`` (0.6): discount factor
        - ``epsilon`` (1.0): initial probability of a random action
        - ``decay_epsilon`` (True): whether epsilon is updated after each episode
        - ``min_epsilon`` (0.01): value the decay approaches
        - ``decay_rate`` (0.01): exponent coefficient of the decay

    Returns
    -------
    numpy.ndarray
        Q-table of shape ``(env.observation_space.n, env.action_space.n)``.
    """
    if params is None:
        params = {}

    total_episodes = params.get('total_episodes', 50000)
    max_steps = params.get('max_steps', 99)

    learning_rate = params.get('learning_rate', 0.7)
    gamma = params.get('gamma', 0.6)

    epsilon = params.get('epsilon', 1.0)
    decay_epsilon = params.get('decay_epsilon', True)
    max_epsilon = 1.0
    min_epsilon = params.get('min_epsilon', 0.01)
    decay_rate = params.get('decay_rate', 0.01)

    qtable = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(total_episodes):
        state = env.reset()
        done = False

        for step in range(max_steps):
            # explore or exploit
            if random.uniform(0,1)<epsilon:
                action = env.action_space.sample() # random
            else:
                action = np.argmax(qtable[state, :]) # action with the highest Q-value, for a given state

            # take action
            new_state, reward, done, info = env.step(action)

            # update Q-table: move Q(s, a) towards reward + gamma * max_a' Q(s', a')
            qtable[state, action] += learning_rate * (reward + gamma*np.max(qtable[new_state, :]) - qtable[state, action])

            state = new_state

            if done:
                break

        # with decay disabled, epsilon keeps its initial value for the whole run
        if decay_epsilon:
            epsilon = min_epsilon + (max_epsilon-min_epsilon)*np.exp(-decay_rate*episode)

    return qtable

In [97]:
def evaluate_qlearning(qtable, debug=False, episodes=1000, max_steps=99):
    """Evaluate the greedy policy of ``qtable`` on the global ``env``.

    This definition replaces the earlier zero-argument ``evaluate_qlearning``
    from the "Testing" section: the Q-table is now passed in explicitly and
    the score is returned.

    Parameters
    ----------
    qtable : numpy.ndarray
        Q-values indexed as ``qtable[state, action]``.
    debug : bool
        When true, print a progress marker every 100 episodes.
    episodes : int
        Number of evaluation episodes to play (must be at least 1).
    max_steps : int
        Maximum steps per episode. It is a parameter so the function no longer
        relies on a ``max_steps`` variable left behind by another cell.

    Returns
    -------
    float
        Mean total reward per episode (also printed).

    Raises
    ------
    ValueError
        If ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError("episodes must be at least 1")

    rewards = []

    for episode in range(episodes):
        state = env.reset()
        total_rewards = 0

        if debug and episode%100 == 0:
            print("-"*50)
            print("EPISODE ", episode)

        for step in range(max_steps):
            action = np.argmax(qtable[state,:]) # action with the highest Q-value, for a given state
            state, reward, done, info = env.step(action)
            total_rewards += reward

            if done:
                break

        rewards.append(total_rewards)

    # The environment is shared with train_qlearning, which is called again
    # after evaluations, so it is deliberately left open here.
    score = sum(rewards)/episodes
    print("Score over time: ",  score)
    return score

In [98]:
# Candidate values per hyperparameter; the keys match the overrides read by train_qlearning.
# NOTE(review): the saved output of this cell lists a different grid (e.g. learning_rate 0.6,
# min_epsilon 0.0), so it appears to come from an earlier version of the cell - re-run to refresh.
grid_params = {
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9],
    'gamma': [0.1, 0.6, 0.9],
    'min_epsilon': [0.01, 0.1],
    'decay_rate': [0.1, 0.01]
}
grid_params

{'learning_rate': [0.1, 0.3, 0.5, 0.6, 0.7, 0.9],
 'gamma': [0.1, 0.5, 0.6, 0.9],
 'min_epsilon': [0.01, 0.1, 0.0],
 'decay_rate': [0.1, 0.01]}

In [99]:
from sklearn.model_selection import ParameterGrid

# Expand the dict of value lists into a list of parameter dicts, one per combination.
grid = list(ParameterGrid(grid_params))
# Placeholder with one slot per combination; the grid-search cell rebinds final_scores to its results.
final_scores = np.zeros(len(grid))

In [100]:
# Number of independent train/evaluate repetitions averaged per parameter set.
N_RUNS = 3

def evaluate_grid_option(args):
    """Train and evaluate one parameter set ``N_RUNS`` times and return the mean score.

    Parameters
    ----------
    args : tuple
        ``(index, params)`` pair, as produced by ``enumerate(grid)``; ``params``
        is the dict of hyperparameter overrides handed to ``train_qlearning``.

    Returns
    -------
    float
        Mean of the evaluation scores over the ``N_RUNS`` repetitions.
    """
    index, params = args
    print("Evaluating params: {}".format(params))
    # Pass a copy so the caller's grid entry is not modified by the extra 'quiet' key.
    run_params = dict(params, quiet=True)

    scores = []
    for _ in range(N_RUNS):
        qtable = train_qlearning(run_params)
        score = evaluate_qlearning(qtable)
        scores.append(score)

    avg_score = np.mean(scores)
    print("Finished evaluating set {} with score of {}.".format(index, avg_score))
    return avg_score

In [142]:
import time
import multiprocessing

# Evaluate every parameter set in parallel, one worker process per CPU core.
# NOTE(review): the workers call functions defined in this notebook; this presumably
# relies on a fork-based start method - confirm on platforms that spawn processes.
start_time = time.time()
print('Evaluating {} parameter sets.'.format(len(grid)))
# The context manager shuts the worker pool down once the blocking map() has returned.
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    final_scores = pool.map(evaluate_grid_option, list(enumerate(grid)))

# Scores are mean episode rewards, so the best parameter set is the one with the HIGHEST score.
best_index = int(np.argmax(final_scores))
print('Best parameter set was {} with score of {}'.format(grid[best_index], final_scores[best_index]))
print('Execution time: {} sec'.format(time.time() - start_time))

## The Grid Search is taking too much time – let's try some options based on studying the logs

Original hyperparameters we started with

In [104]:
# Baseline: an empty params dict makes train_qlearning fall back to all of its built-in defaults.
qtable = train_qlearning(params={})
evaluate_qlearning(qtable)

Score over time:  8.425


8.425

Let's try lower `learning_rate` and higher `gamma`

In [105]:
# learning_rate 0.1 and gamma 0.9 (train_qlearning defaults: 0.7 and 0.6); min_epsilon and decay_rate equal the defaults.
qtable = train_qlearning(params={'learning_rate': 0.1, 'gamma': 0.9, 'min_epsilon': 0.01, 'decay_rate': 0.01})
evaluate_qlearning(qtable)

Score over time:  8.415


8.415

`learning_rate` as 0.2

In [107]:
# gamma stays at 0.9 while learning_rate is set to 0.2; min_epsilon and decay_rate equal the defaults.
qtable = train_qlearning(params={'learning_rate': 0.2, 'gamma': 0.9, 'min_epsilon': 0.01, 'decay_rate': 0.01})
evaluate_qlearning(qtable)

Score over time:  8.488


8.488

`gamma` as 1.0

In [108]:
# gamma=1.0: the best next-state Q-value enters the update without any discount.
qtable = train_qlearning(params={'learning_rate': 0.2, 'gamma': 1.0, 'min_epsilon': 0.01, 'decay_rate': 0.01})
evaluate_qlearning(qtable)

Score over time:  8.461


8.461

Try with `decay_epsilon` as False 

In [112]:
# Constant exploration: decay_epsilon=False skips the per-episode update, so epsilon stays at 0.01 throughout.
qtable = train_qlearning(params={'learning_rate': 0.2, 'gamma': 0.9, 'epsilon': 0.01, 'decay_epsilon': False})
evaluate_qlearning(qtable)

Score over time:  8.382


8.382

Other strong options from the GridSearch

In [118]:
# Faster decay (0.1 vs. default 0.01), low gamma (0.1), learning_rate 0.6; min_epsilon 0.0 lets epsilon decay towards zero.
qtable = train_qlearning(params={'decay_rate': 0.1, 'gamma': 0.1, 'learning_rate': 0.6, 'min_epsilon': 0.0})
evaluate_qlearning(qtable)

Score over time:  8.494


8.494

In [123]:
# Default gamma (0.6) and learning_rate (0.7), but with decay_rate 0.1 and min_epsilon 0.0.
qtable = train_qlearning(params={'decay_rate': 0.1, 'gamma': 0.6, 'learning_rate': 0.7, 'min_epsilon': 0.0})
evaluate_qlearning(qtable)

Score over time:  8.466


8.466

In [124]:
# Default decay_rate (0.01) with gamma 0.1, learning_rate 0.6 and min_epsilon 0.0.
qtable = train_qlearning(params={'decay_rate': 0.01, 'gamma': 0.1, 'learning_rate': 0.6, 'min_epsilon': 0.0})
evaluate_qlearning(qtable)

Score over time:  8.355


8.355

In [125]:
# Default decay_rate and gamma, learning_rate 0.6, and a higher exploration floor (min_epsilon 0.1 vs. default 0.01).
qtable = train_qlearning(params={'decay_rate': 0.01, 'gamma': 0.6, 'learning_rate': 0.6, 'min_epsilon': 0.1})
evaluate_qlearning(qtable)

Score over time:  8.469


8.469

In [135]:
# High gamma (0.9) with a small learning_rate (0.06) and exploration floor min_epsilon 0.1.
qtable = train_qlearning(params={'decay_rate': 0.01, 'gamma': 0.9, 'learning_rate': 0.06, 'min_epsilon': 0.1})
evaluate_qlearning(qtable)

Score over time:  8.488


8.488

In [132]:
# Same settings with learning_rate lowered to 0.05 (gamma 0.9, min_epsilon 0.1, default decay_rate).
qtable = train_qlearning(params={'decay_rate': 0.01, 'gamma': 0.9, 'learning_rate': 0.05, 'min_epsilon': 0.1})
evaluate_qlearning(qtable)

Score over time:  8.575


8.575

## Best combination seen so far

In [143]:
# Train and evaluate the chosen configuration several times from scratch and average the scores.
params={'decay_rate': 0.01, 'gamma': 0.9, 'learning_rate': 0.05, 'min_epsilon': 0.1}

# NOTE: this rebinds the global N_RUNS that evaluate_grid_option also reads,
# so a grid search run after this cell would use 5 repetitions per set.
N_RUNS = 5

scores = []
for _ in range(N_RUNS):
    qtable = train_qlearning(params)  # each run starts from a fresh all-zero Q-table
    score = evaluate_qlearning(qtable)
    scores.append(score)

avg_score = np.mean(scores)
print("Avg Score over %d runs: %f" % (N_RUNS, avg_score))

Score over time:  8.503
Score over time:  8.625
Score over time:  8.537
Score over time:  8.483
Score over time:  8.482
Avg Score over 5 runs: 8.526000
