In [1]:
import gym
import numpy as np
import random
from IPython.display import clear_output
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

# https://towardsdatascience.com/reinforcement-learning-with-python-8ef0242a2fa2
# Init Taxi-V2 Env
env = gym.make("Taxi-v3").env

# Init arbitary values
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.7 # Momemtum 0.2, Current 0.8 Greedy, 0.2 is to reduce volatality and flip flop
gamma = 0.2 # Learning Rate 0.1 Greedyness is 10%
epsilon = 0.4 # explore 10% exploit 90%


all_epochs = []
all_penalties = []
training_memory = []

for i in range(1, 50000):
    state = env.reset()

    # Init Vars
    epochs, penalties, reward, = 0, 0, 0
    done = False

    #training
    while not done:
        if random.uniform(0, 1) < epsilon: 
            # Check the action space
            action = env.action_space.sample() # for explore
        else:
            # Check the learned values
            action = np.argmax(q_table[state]) # for exploit

        next_state, reward, done, info = env.step(action) #gym generate, the environment already setup for you

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state]) #take highest from q table for exploit

        # Update the new value
        new_value = (1 - alpha) * old_value + alpha * \
            (reward + gamma * next_max)
        q_table[state, action] = new_value        
        
        # penalty for performance evaluation
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
    

    if i % 100 == 0:
        training_memory.append(q_table.copy())
        clear_output(wait=True)
        print("Episode:", i)
        print("Saved q_table during training:", i)

print("Training finished.")
print(q_table)

Episode: 49900
Saved q_table during training: 49900
Training finished.
[[  0.           0.           0.           0.           0.
    0.        ]
 [ -1.24999956  -1.24999782  -1.24999956  -1.24999782  -1.24998912
  -10.24999782]
 [ -1.249728    -1.24864     -1.249728    -1.24864     -1.2432
  -10.24864   ]
 ...
 [ -1.2432      -1.216       -1.2432      -1.24864    -10.2432
  -10.2432    ]
 [ -1.24998912  -1.2499456   -1.24998912  -1.2499456  -10.24998912
  -10.24998912]
 [ -0.4         -1.08        -0.4          3.          -9.4
   -9.4       ]]


** There are four designated locations in the grid world indicated by R(ed), B(lue), G(reen), and Y(ellow). When the episode starts, the taxi starts off at a random square and the passenger is at a random location. The taxi drive to the passenger's location, pick up the passenger, drive to the passenger's destination (another one of the four specified locations), and then drop off the passenger. Once the passenger is dropped off, the episode ends. There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is the taxi), and 4 destination locations. Actions: There are 6 discrete deterministic actions: **

    0: move south
    1: move north
    2: move east
    3: move west
    4: pickup passenger
    5: dropoff passenger
Rewards: There is a reward of -1 for each action and an additional reward of +20 for delievering the passenger. There is a reward of -10 for executing actions "pickup" and "dropoff" illegally. Rendering:

    blue: passenger
    magenta: destination
    yellow: empty taxi
    green: full taxi
    other letters: locations


state space is represented by:
    (taxi_row, taxi_col, passenger_location, destination)

In [2]:
# At state 499 i will definitely move west
state = 499
print(training_memory[0][state])
print(training_memory[20][state])
print(training_memory[50][state])
print(training_memory[200][state])

[-1.008    -1.177806 -0.91      1.05     -7.       -7.      ]
[-0.5824     -1.08079223 -0.40415428  3.         -9.38056    -9.39998632]
[-0.40001197 -1.08       -0.40000002  3.         -9.39999872 -9.4       ]
[-0.4  -1.08 -0.4   3.   -9.4  -9.4 ]


In [3]:
# At state 77 i will definitely move east
state = 77
print(training_memory[0][state])
print(training_memory[20][state])
print(training_memory[50][state])
print(training_memory[200][state])

[-1.03049441 -1.15752    -0.91       -1.008      -7.         -9.198     ]
[-1.07999991 -0.40001495  3.         -1.08004012 -9.39998897 -9.40001281]
[-1.08 -0.4   3.   -1.08 -9.4  -9.4 ]
[-1.08 -0.4   3.   -1.08 -9.4  -9.4 ]


In [7]:
# To show that at state 393, how the move evolved

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [8]:
action_dict = {0:  "move south"
,1: "move north"
,2: "move east"
,3: "move west"
,4: "pickup passenger"
,5: "dropoff passenger"
}

ENV_STATE = env.reset()
print(env.render(mode='ansi'))
state_memory = [i[ENV_STATE] for i in training_memory]
printmd("For state **{}**".format(ENV_STATE))
for step, i in enumerate(state_memory):
    
    if step % 20==0:
        choice = np.argmax(i)
        printmd("for episode in {}, q table action is {} and it will ... **{}**".format(step*100, choice, action_dict[choice]))
        print(i)
        print()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[34;1mB[0m: |
+---------+




For state **312**

for episode in 0, q table action is 0 and it will ... **move south**

[ -1.2499597   -1.24998883  -1.24996969  -1.24998244 -10.24909704
 -10.24175059]



for episode in 2000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999216
 -10.24925745]



for episode in 4000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999929
 -10.24997995]



for episode in 6000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999929
 -10.24999946]



for episode in 8000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999998
 -10.25      ]



for episode in 10000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999998
 -10.25      ]



for episode in 12000, q table action is 1 and it will ... **move north**

[ -1.25        -1.25        -1.25        -1.25       -10.24999999
 -10.25      ]



for episode in 14000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 16000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 18000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 20000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 22000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 24000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 26000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 28000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 30000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 32000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 34000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 36000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 38000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 40000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 42000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 44000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 46000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



for episode in 48000, q table action is 1 and it will ... **move north**

[ -1.25  -1.25  -1.25  -1.25 -10.25 -10.25]



In [None]:
# Testing............

import time
def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Episode: {frame['episode']}")
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        time.sleep(0.8)

total_epochs, total_penalties = 0, 0
episodes = 10 # Try 10 rounds
frames = []

for ep in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state]) # deterministic (exploit), not stochastic (explore), only explore in training
        env
        state, reward, done, info = env.step(action) #gym

        if reward == -10:
            penalties += 1
        
        # Put each rendered frame into dict for animation, gym generated
        frames.append({
            'frame': env.render(mode='ansi'),
            'episode': ep, 
            'state': state,
            'action': action,
            'reward': reward
            }
        )
        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print_frames(frames)

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Episode: 7
Timestep: 98
State: 0
Action: 5
Reward: 20
