In [37]:
!pip install gym==0.14.0



In [0]:
from collections import deque
import gym
import numpy as np

np.random.seed(42)

In [0]:
EPISODES = 3000
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 1.0
DECAY = 0.999
ALPHA_DECAY = 0.9

In [0]:
def e_greedy(s, aspace):
  p = np.random.random()
  best_action = np.argmax(Q[s])

  # explore with prob EPSILON
  if p < EPSILON:
    a = aspace.sample()
  else:
    a = best_action

  return a

In [135]:
env = gym.make("Taxi-v2").env
env.seed(42)

[42]

In [136]:
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [137]:
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [138]:
d = deque([0] * 300, maxlen=300)
v = deque([np.inf] * 10, maxlen=10)

Q = np.random.random((env.observation_space.n, env.action_space.n))
print(env.observation_space.n, env.action_space.n)

for i in range(EPISODES):
  s = env.reset()
  taxi_row, taxi_col, pass_idx, dest_idx = env.decode(s)
  print(taxi_row, taxi_col, pass_idx, dest_idx)

  while True:
    # Choose A from S using policy derived from Q (e.g., e-greedy)
    a = e_greedy(s, env.action_space)
    # print(s, a)

    # Take action A, observe R, S'
    s1, r, done, info = env.step(a)

    prev_sum = np.sum(Q)

    # Update Q(S,A)
    Q[s, a] = Q[s, a] + ALPHA * (r + GAMMA * np.max(Q[s1]) - Q[s, a])

    # S <- S'
    s = s1

    if done:
      d.appendleft(r)
      v.appendleft(np.abs(np.sum(Q)-prev_sum))
      print("{} : Average reward for latest 300 episodes: {}".format(i, np.sum(d)/300))
      print("Q difference: ", np.abs(np.sum(Q)-prev_sum))
      break

  # if i > 10:
  EPSILON *= DECAY

  # ALPHA *= ALPHA_DECAY

  # if EPSILON < 0.1:
  #   EPSILON = 0.1

  if (np.sum(d)/300 >= 20) and (np.sum(v) < 1e-5):
    break

# print(Q[462,4])
# print(Q[398,3])
# print(Q[253,0])
# print(Q[377,1])
# print(Q[83,5])


500 6
1 4 1 2
0 : Average reward for latest 300 episodes: 0.06666666666666667
Q difference:  2.0764675474736123
2 4 0 2
1 : Average reward for latest 300 episodes: 0.13333333333333333
Q difference:  1.868820792726332
4 3 2 1
2 : Average reward for latest 300 episodes: 0.2
Q difference:  2.0636899744553716
1 1 3 1
3 : Average reward for latest 300 episodes: 0.26666666666666666
Q difference:  1.8573209770098629
2 1 2 1
4 : Average reward for latest 300 episodes: 0.3333333333333333
Q difference:  1.6715888793087288
4 0 1 3
5 : Average reward for latest 300 episodes: 0.4
Q difference:  2.047870120530206
0 0 2 3
6 : Average reward for latest 300 episodes: 0.4666666666666667
Q difference:  1.8430831084770034
4 3 3 2
7 : Average reward for latest 300 episodes: 0.5333333333333333
Q difference:  1.6819387134537465
0 1 0 1
8 : Average reward for latest 300 episodes: 0.6
Q difference:  1.5044299913779469
4 0 1 2
9 : Average reward for latest 300 episodes: 0.6666666666666666
Q difference:  1.51374

In [139]:
''' Actions:
    There are 6 discrete deterministic actions:
    - 0: move south
    - 1: move north
    - 2: move east 
    - 3: move west 
    - 4: pickup passenger
    - 5: dropoff passenger
'''
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

print(Q[state])


State: 328
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (Dropoff)
[ -4.25993667  -2.45701764  -4.21798609  -4.08158048 -12.98232886
 -13.12937036]


In [115]:
state = env.encode(4, 4, 3, 1) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

print(Q[state])

State: 493
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m:[43m [0m|
+---------+
  (Dropoff)
[ 3.14896815  1.83241986  3.14897611  4.61130459 -5.85136656 -5.85236938]


In [116]:
env.s = 83
env.render()

print(Q[83])

+---------+
|[34;1mR[0m: | : :[43mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Dropoff)
[ -2.96350266  -3.66715656  -3.66715664  -2.96350319 -12.66715508
 -12.66715502]


In [132]:
env.s = 408
env.render()

print(Q[408])



+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (Dropoff)
[ 6.41141874  4.77025011  6.41142182  6.41139765  8.23495753 -2.58858243]


In [140]:
# Alpha: 0.1, EPS: 1.0
# • Q(462,4) = −11.374402515 • Q(398,3) = 4.348907 • Q(253,0) = −0.5856821173 • Q(377,1) = 9.683 • Q(83,5) = −12.8232660372

print(Q[462,4])
print(Q[398,3])
print(Q[253,0])
print(Q[377,1])
print(Q[83,5])

-13.02431765897712
-1.4335702972923636
-3.2571847786731087
5.842245500927924
-13.244991091694633


In [131]:
# Q[408,0]
# Q[257,0]
# Q[123,4]
# Q[222,2]
# Q[392,5]
# Q[421,2]
# Q[133,0]
# Q[66,2]
# Q[368,3]
Q[38,0]

8.20192630456854

In [0]:
'''
EPS : 0.1, no decay, alpha: 0.1
1211 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

EPS: 0.5, 0.9 decay, alpha: 0.1 - BAD
1188 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

EPS: 0.3, no decay, alpha: 0.1
1151 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

EPS: 0.15, no decay, alpha: 0.1
1176 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

EPS: 0.08, no decay, alpha: 0.1
1195 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

EPS: 0.1, no decay, alpha: 0.3
359 : Average reward for latest 300 episodes: 20.0
Q difference:  0.0

'''