In [0]:
from IPython.display import clear_output
import numpy as np
import random
import time
import gym

In [2]:
# Build the "Taxi-v2" environment through gym's registry, then draw its
# current grid into the cell output.
env = gym.make("Taxi-v2") # Environment instance shared by the cells below
env.render() # Text rendering of the grid

+---------+
|R: |[43m [0m: :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [3]:
# Read the sizes of the discrete action and observation spaces in one go;
# they fix the dimensions of the Q-table built later.
action_size, state_size = env.action_space.n, env.observation_space.n

print("Action size ", action_size)
print("State size ", state_size)

Action size  6
State size  500


In [5]:
# Q-table: one row per state, one column per action, every estimate starts at 0.
qtable = np.zeros(shape=(state_size, action_size), dtype=float)
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [0]:
# --- Training budget ---
# Upper bound on the number of training episodes
episodes = 30000
# Upper bound on the number of steps taken inside a single episode
max_steps = 1000

# --- Q-learning update ---
# Initial learning rate (step size of each Q-value update)
lr = 0.3
# Amount subtracted from the learning rate at the start of every episode
decay_fac = 1e-5
# Discount factor - later rewards impact less
gamma = 0.9

In [7]:
# Tabular Q-learning driven by a purely random behaviour policy: every action
# is sampled from the action space, while each update bootstraps from the
# best Q-value of the next state.
#
# Reads:  env, qtable, episodes, max_steps, decay_fac, gamma
# Writes: qtable (in place) and lr (decayed in place, so re-running this cell
#         without re-running the hyperparameter cell continues from the
#         already-decayed learning rate).
for episode in range(episodes):

    state = env.reset()  # Start a fresh episode
    done = False         # Set by env.step once the episode has ended
    lr -= decay_fac      # Linear learning-rate decay, applied once per episode

    if lr <= 0:  # A non-positive learning rate cannot improve the table
        break

    for step in range(max_steps):

        # Sample a random action from the action space
        action = env.action_space.sample()

        # Take the action -> observe new state and reward
        new_state, reward, done, info = env.step(action)

        if done:
            # Final transition of the episode: there is no future reward to
            # bootstrap from, so the target is the immediate reward alone.
            # NOTE(review): steps 199-201 are excluded, presumably because an
            # episode ending there was cut off by the environment's step
            # limit rather than finished - confirm against the env's limit.
            # `or` is required here: the previous bitwise `|` bound tighter
            # than the comparisons and turned the test into the chained
            # comparison `step < (199 | step) > 201`.
            if step < 199 or step > 201:
                qtable[state, action] += lr * (reward - qtable[state, action])
            break

        # Non-terminal transition: bootstrap from the best next-state value
        best_next_value = np.max(qtable[new_state, :])
        qtable[state, action] += lr * (
            reward + gamma * best_next_value - qtable[state, action]
        )

        # moving states
        state = new_state

    # Turn the 0-based loop index into a 1-based episode count for reporting;
    # the for statement rebinds `episode` on the next iteration.
    episode += 1

    if episode % 3000 == 0:
        print('episode = ', episode)
        print('learning rate = ', lr)
        print('-----------')

episode =  3000
learning rate =  0.26999999999997
-----------
episode =  6000
learning rate =  0.23999999999993998
-----------
episode =  9000
learning rate =  0.20999999999990998
-----------
episode =  12000
learning rate =  0.17999999999987998
-----------
episode =  15000
learning rate =  0.14999999999984998
-----------
episode =  18000
learning rate =  0.11999999999982693
-----------
episode =  21000
learning rate =  0.08999999999983856
-----------
episode =  24000
learning rate =  0.059999999999848445
-----------
episode =  27000
learning rate =  0.029999999999839697
-----------


In [18]:
# New environment
state = env.reset()
env.render()
done = False
total_reward = 0

while(done == False):
    
    action = np.argmax(qtable[state,:]) # Choose best action (Q-table)
    state, reward, done, info = env.step(action) # Take action
    total_reward += reward  # Summing rewards
    
    # Display it
    time.sleep(0.5)
    #clear_output(wait=True)
    env.render()
    print('Episode Reward = ', total_reward)

+---------+
|R: | : :[35mG[0m|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
Episode Reward =  -1
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
Episode Reward =  -2
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
Episode Reward =  -3
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
Episode Reward =  -4
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
Episode Reward =  -5
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
Episode Reward =  -6
+---------+
|R: | : :[35mG[0m|
| : : : : |