In [1]:
import sys
import gym
import numpy as np
import random
from IPython.display import clear_output
from IPython.display import Markdown, display
sys.tracebacklimit = 0
def printmd(string):
  display(Markdown(string))
# Init Taxi-V2 Env
env = gym.make("Taxi-v3").env
# Init arbitrary values
q_table = np.zeros([env.observation_space.n, env.action_space.n])
# Hyperparameters+
alpha = 0.7 # Momentum 0.2, Current 0.8 Greedy, 0.2 is to reduce volatility and flip flop
gamma = 0.2 # Learning Rate 0.1 Greediness is 10%
epsilon = 0.4 # explore 10% exploit 90%
all_epochs = []
all_penalties = []
training_memory = []
for i in range(1, 50000):
  state = env.reset()
  # Init Vars
  epochs, penalties, reward, = 0, 0, 0
  done = False
  #training
  while not done:
    if random.uniform(0, 1) < epsilon:
      # Check the action space
      action = env.action_space.sample() # for explore
    else:
      # Check the learned values
      action = np.argmax(q_table[state]) # for exploit
      next_state, reward, done, info = env.step(action) #gym generate, the environment already setup for you
      old_value = q_table[state, action]
      next_max = np.max(q_table[next_state]) #take highest from q table for exploit
      # Update the new value
      new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
      q_table[state, action] = new_value
      # penalty for performance evaluation
    if reward == -10:
      penalties += 1
    state = next_state
    epochs += 1
  if i % 100 == 0:
    training_memory.append(q_table.copy())
    clear_output(wait=True)
    print("Episode:", i)
    print("Saved q_table during training:", i)
print("Training finished.")
print(q_table)

Episode: 49900
Saved q_table during training: 49900
Training finished.
[[-1.25       -1.25       -1.24999923 -1.25       -7.43749413 -7.1747276 ]
 [-1.24999423 -1.24999439 -1.24999441 -1.24999439 -1.24998925 -7.        ]
 [-1.2482698  -1.24824397 -1.24841796 -1.24824397 -1.24328    -7.        ]
 ...
 [-1.24092961 -1.23833243 -1.24092961 -1.24398207 -7.         -7.        ]
 [-1.24966003 -1.24968186 -1.24966003 -1.24968737 -7.         -7.        ]
 [-1.008      -1.008      -1.008       1.05       -7.         -7.        ]]


In [2]:
# At state 499 i will definitely move west
state = 499
print(training_memory[0][state])
print(training_memory[20][state])
print(training_memory[50][state])
print(training_memory[200][state])


[-1.008 -1.008 -1.008  1.05  -7.    -7.   ]
[-1.008 -1.008 -1.008  1.05  -7.    -7.   ]
[-1.008 -1.008 -1.008  1.05  -7.    -7.   ]
[-1.008 -1.008 -1.008  1.05  -7.    -7.   ]


  and should_run_async(code)


In [3]:
# At state 77 i will definitely move east
state = 77
print(training_memory[0][state])
print(training_memory[20][state])
print(training_memory[50][state])
print(training_memory[200][state])

[-1.008      -1.008       2.96519943 -0.7        -7.         -7.        ]
[-1.008     -1.008      2.9500003 -0.7       -7.        -7.       ]
[-1.008 -1.008  2.95  -0.7   -7.    -7.   ]
[-1.008 -1.008  2.95  -0.7   -7.    -7.   ]


In [4]:
# To show that at state 393, how the move evolved
from IPython.display import Markdown, display
def printmd(string):
  display(Markdown(string))
action_dict = {0: "move south"
,1: "move north"
,2: "move east"
,3: "move west"
,4: "pickup passenger"
,5: "dropoff passenger"
}

ENV_STATE = env.reset()
print(env.render(mode='ansi'))
state_memory = [i[ENV_STATE] for i in training_memory]
printmd("For state **{}**".format(ENV_STATE))
for step, i in enumerate(state_memory):
  if step % 200==0:
    choice = np.argmax(i)
    printmd("for episode in {}, q table action is {} and it will ... **{}**".format(step*100, choice, action_dict[choice]))
    print(i)
    print()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


For state **207**

for episode in 0, q table action is 1 and it will ... **move north**

[-1.24985165 -1.24982745 -1.24989789 -1.24985041 -7.         -7.        ]



for episode in 20000, q table action is 2 and it will ... **move east**

[-1.24999994 -1.24999994 -1.24999991 -1.24999996 -7.         -7.        ]



for episode in 40000, q table action is 2 and it will ... **move east**

[-1.24999994 -1.24999994 -1.24999991 -1.24999996 -7.         -7.        ]



In [5]:
import time
def print_frames(frames):
  for i, frame in enumerate(frames):
    clear_output(wait=True)
    print(frame['frame'])
    print(f"Episode: {frame['episode']}")
    print(f"Timestep: {i + 1}")
    print(f"State: {frame['state']}")
    print(f"Action: {frame['action']}")
    print(f"Reward: {frame['reward']}")
    time.sleep(0.8)
total_epochs, total_penalties = 0, 0
episodes = 10 # Try 10 rounds
frames = []
for ep in range(episodes):
  state = env.reset()
  epochs, penalties, reward = 0, 0, 0
  done = False
  while not done:
    action = np.argmax(q_table[state]) # deterministic (exploit), not stochastic (explore), only explore in training
    env
    state, reward, done, info = env.step(action) #gym
    if reward == -10:
      penalties += 1
    # Put each rendered frame into dict for animation, gym generated
    frames.append({
        'frame': env.render(mode='ansi'),
        'episode': ep,
        'state': state,
        'action': action,
        'reward': reward})
    epochs += 1
  total_penalties += penalties
  total_epochs += epochs
print_frames(frames)
print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Episode: 9
Timestep: 137
State: 85
Action: 5
Reward: 20
Results after 10 episodes:
Average timesteps per episode: 13.7
Average penalties per episode: 0.0
