In [36]:
import gym

env = gym.make('Taxi-v2')
state = env.reset()
env.render()

total_reward = 0
done = False
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
    env.render()
    print(env.action_space.sample())

print('Total reward:', total_reward)

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
1
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (Dropoff)
0
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (Dropoff)
0
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
2
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
2
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
1
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (Dropoff)
1
+-------

In [39]:
print(env.observation_space)
print(env.action_space)

Discrete(500)
Discrete(6)


# Random

In [45]:
episodes = 1000
rewards = []
max_steps = 99

for episode in range(episodes):
    state = env.reset()  # Assuming you already have env created as above
    total_rewards = 0
    
    for step in range(max_steps):
        action = env.action_space.sample()  # TODO your policy here!
        state, reward, done, info = env.step(env.action_space.sample())
        total_rewards += reward
        if done:
            break
    rewards.append(total_rewards)        

print('Average score over time:', sum(rewards) / episodes)

Average score over time: -389.438


In [58]:
from collections import defaultdict
import random
import numpy as np



# Initial Q-table

In [113]:
qtable_dict = defaultdict(lambda: np.zeros(6))  # array of actions

In [114]:
episodes = 1000
rewards = []
learning_rate = 0.6           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.8                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.1            # Minimum exploration probability 
decay_rate = 0.02             # Exponential decay rate for exploration prob

In [115]:
for episode in range(episodes):
    state = env.reset()  # Assuming you already have env created as above
    total_rewards = 0
    done = False
    
    for step in range(max_steps):
        
        if random.uniform(0, 1) < epsilon:
            # Time to explore!
            action = env.action_space.sample()
            
        else:
            # Exploit based on best available rewards
            action = np.argmax(qtable_dict[state])
            
        new_state, reward, done, info = env.step(action)
        
        
        qtable_dict[state][action] += (learning_rate *
                                        (reward + gamma *
                                        (np.max(qtable_dict[new_state])) -
                                        qtable_dict[state][action]))
        
        
        total_rewards += reward
        state = new_state
        if done:
            break
            
    #reduce exploration 
    epsilon = (min_epsilon +
               (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode))
    rewards.append(total_rewards)        

print('Average score over time:', sum(rewards) / episodes)

Average score over time: -41.79


Better than random

# Better Try

In [329]:
qtable_dict = defaultdict(lambda: np.zeros(6))  # array of actions

Good Intial optimization

In [437]:
episodes = 10000
learning_rate = 0.02           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.05                 # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.00005            # Minimum exploration probability 
decay_rate = 1.0             # Exponential decay rate for exploration prob

In [441]:
rewards = []

for episode in range(episodes):
    state = env.reset()  # Assuming you already have env created as above
    total_rewards = 0
    done = False
    
    for step in range(max_steps):
        
        if random.uniform(0, 1) < epsilon:
            # Time to explore!
            action = env.action_space.sample()
            
        else:
            # Exploit based on best available rewards
            action = np.argmax(qtable_dict[state])
            
        new_state, reward, done, info = env.step(action)
        
        
        qtable_dict[state][action] += (learning_rate *
                                       (reward + gamma *
                                        (np.max(qtable_dict[new_state]) -
                                         qtable_dict[state][action])))
        
        
        total_rewards += reward
        state = new_state
        if done:
            break
            
    #reduce exploration 
    epsilon = (min_epsilon +
               (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode))
    rewards.append(total_rewards)        

print('Average score over time:', sum(rewards) / episodes)

Average score over time: 8.4728


Better than the 8.467 goal

# Playing with trained model

In [455]:
rewards = []

for episode in range(episodes):
    state = env.reset()  # Assuming you already have env created as above
    total_rewards = 0
    done = False
    
    for step in range(max_steps):
        
        if random.uniform(0, 1) < epsilon:
            # Time to explore!
            action = env.action_space.sample()
            
        else:
            # Exploit based on best available rewards
            action = np.argmax(qtable_dict[state])
            
        new_state, reward, done, info = env.step(action)    
        
        total_rewards += reward
        state = new_state
        if done:
            break
            
    rewards.append(total_rewards)        

print('Average score over time:', sum(rewards) / episodes)

Average score over time: 8.4849
