In [1]:
import gym
import random
import numpy as np
import time

In [8]:
# Environment
env = gym.make("Taxi-v3")

# Training parameters for Q learning
alpha = 0.9  # Learning rate
gamma = 0.9  # Future reward discount factor
epsilon = 0.3  # Probability of picking a random (exploratory) action
num_of_episodes = 1000
num_of_steps = 500  # Upper bound on steps taken in one episode

# Q table: one row per state, one column per action, initialised to zero.
# The shape is read from the environment's discrete spaces rather than being
# hard-coded, and the former "-100000 *" factor is dropped because scaling an
# all-zero array has no effect on the values.
Q_reward = np.zeros((env.observation_space.n, env.action_space.n))

for i in range(num_of_episodes):
    state = env.reset()

    # Enforce the per-episode step budget; the episode also stops as soon as
    # the environment reports `done`.
    for step in range(num_of_steps):
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            act = env.action_space.sample()  # explore
        else:
            act = np.argmax(Q_reward[state])  # exploit current estimate

        next_state, reward, done, info = env.step(act)

        # Tabular Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        prev_val = Q_reward[state, act]
        next_max = np.max(Q_reward[next_state])
        Q_reward[state, act] = (1 - alpha) * prev_val + alpha * (reward + gamma * next_max)

        state = next_state

        if done:
            break
        

        
        
        

In [12]:

# Testing: play one rendered episode, always taking the action with the
# highest learned Q value for the current state.
tot_steps = 0
tot_reward = 0
state = env.reset()

for t in range(50):
    action = np.argmax(Q_reward[state])
    state, reward, done, info = env.step(action)
    tot_steps = tot_steps + 1
    tot_reward = tot_reward + reward

    # Draw the board and pause so the animation can be followed by eye
    env.render()
    time.sleep(0.5)

    if not done:
        continue

    # Episode finished: report the totals and stop stepping
    print("Total rewards %d" % tot_reward)
    print("Total steps %d" % tot_steps)
    break

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR

In [13]:

# Evaluation settings, kept in one place so the result arrays and the loop
# bound cannot drift apart.
num_test_episodes = 10  # number of greedy roll-outs to run
max_test_steps = 50     # stop an episode after this many steps

# Per-episode totals; slot i is filled by roll-out i
rew_arr = np.zeros(num_test_episodes)
steps_arr = np.zeros(num_test_episodes)

for i in range(num_test_episodes):

    state = env.reset()
    tot_reward = 0
    tot_steps = 0

    for t in range(max_test_steps):
        # Greedy policy: take the best-valued action for the current state
        action = np.argmax(Q_reward[state, :])
        state, reward, done, info = env.step(action)
        tot_reward += reward
        tot_steps += 1
        if done:
            print(f'Total rewards {tot_reward}')
            print(f'Total steps {tot_steps}')
            print("===========================")
            break

    # Totals are stored whether or not the episode reached `done` within the
    # step budget.
    rew_arr[i] = tot_reward
    steps_arr[i] = tot_steps
    

Total rewards 10
Total steps 11
Total rewards 6
Total steps 15
Total rewards 9
Total steps 12
Total rewards 12
Total steps 9
Total rewards 9
Total steps 12
Total rewards 10
Total steps 11
Total rewards 11
Total steps 10
Total rewards 6
Total steps 15
Total rewards 7
Total steps 14
Total rewards 5
Total steps 16


In [15]:
# Summarise the evaluation runs: mean total reward and mean episode length
avg_reward = rew_arr.mean()
avg_steps = steps_arr.mean()
print(f'Average rewards: {avg_reward}')
print(f'Average steps: {avg_steps}')

Average rewards: 8.5
Average steps: 12.5
