In [1]:
import numpy as np
import gym
import random

In [2]:
# Build the Taxi environment and draw its grid once as a quick visual check.
env_id = 'Taxi-v2'
env = gym.make(env_id)
env.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [27]:
# Q-table: one row per environment state, one column per action,
# with every entry starting at zero.
qtable = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)

In [28]:
# --- Training schedule ---
total_episodes = 50000   # number of training episodes
max_steps = 100          # step cap for a single episode

# --- Q-learning update ---
learning_rate = 0.7      # step size applied to each Q-value correction
gamma = 0.618            # discount factor on future reward

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor for the exploration probability
decay_rate = 0.01        # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration probability

In [29]:
def do_explore(epsilon):
    """Decide whether the agent should explore on this step.

    Draws one number with ``random.uniform(0, 1)`` and compares it to
    ``epsilon``.

    Args:
        epsilon: Exploration threshold; the draw must be strictly below it.

    Returns:
        ``True`` when the draw is below ``epsilon`` (explore), otherwise
        ``False`` (exploit).
    """
    draw = random.uniform(0, 1)
    # bool() keeps the result a plain Python bool even if epsilon is a NumPy scalar.
    return bool(draw < epsilon)

# Q-learning training: run epsilon-greedy episodes and update qtable in place.
for episode in range(total_episodes):
    state = env.reset()
    for step in range(max_steps):
        # Epsilon-greedy choice: a random action when exploring, otherwise the
        # highest-valued action currently recorded for this state.
        if do_explore(epsilon):
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state])

        nstate, reward, done, info = env.step(action)

        # Temporal-difference update: move Q(state, action) toward
        # reward + gamma * max_a Q(nstate, a) by a fraction learning_rate.
        td_target = reward + gamma * np.max(qtable[nstate])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        state = nstate
        if done:
            break

    # Exponentially decay the exploration rate with the episode index;
    # the formula never drops below min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

print(qtable)
print(epsilon)

[[  0.           0.           0.           0.           0.
    0.        ]
 [ -2.32039605  -2.13657437  -2.32039617  -2.13656783  -1.83910189
  -11.13628468]
 [ -1.83940239  -1.35778875  -1.8391055   -1.3585264   -0.57891593
  -10.3577832 ]
 ...
 [ -0.57911942   0.6813658   -0.57893176  -1.35853483  -9.5770359
   -9.5770359 ]
 [ -2.48400579  -2.45983197  -2.44272006  -2.13656674 -10.59435889
   -9.40282   ]
 [ -1.54476574  -1.29472     -1.21282     11.35999982 -10.4580196
  -10.403512  ]]
0.01


In [40]:
# Evaluate the learned table: act greedily (no exploration, no Q updates)
# for test_num episodes and report the mean total reward per episode.
test_num = 10
rewards = []
for _ in range(test_num):
    state = env.reset()
    t_rewards = 0
    for step in range(max_steps):
        action = np.argmax(qtable[state])
        nstate, reward, done, info = env.step(action)
        t_rewards += reward
        state = nstate
        if done:
            env.render()
            # Stop as soon as the episode ends instead of continuing to step
            # a finished environment for the remaining max_steps iterations.
            break
    # Record every episode, including ones that ran out of steps without
    # finishing, so unfinished episodes are not silently counted as reward 0.
    rewards.append(t_rewards)

print(sum(rewards) / len(rewards))

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
