In [1]:
import gym
import numpy as np

# Build the environment object that every later cell talks to (reset / step /
# render / observation_space / action_space).
# NOTE(review): the 'smart_cab:smart_cab-v1' id is presumably registered by a
# locally installed `smart_cab` package — confirm it is available on a fresh kernel.
env = gym.make('smart_cab:smart_cab-v1')

# Draw the current grid as text in the cell output.
env.render()

+-------------------+
| :A| : :B: : | :[35mC[0m| |
| : | : | : | : | : |
| : | : | : | : | : |
| : : : | : : : : : |
| : | : : : | : | : |
| : | : | : | : | : |
| : | : : : |[41m [0m: | : |
| | : : | : : | : : |
| :[34;1mD[0m| : :E: : : |F| |
| | : : | : | | : : |
+-------------------+



In [2]:
# Look up the transition-table entry for state 431. Judging by the displayed
# result, it maps each of the 6 action indices to a list of 4-tuples that look
# like (probability, next_state, reward, done) — TODO confirm against the
# environment's own documentation.
env.P[431]

{0: [(1.0, 851, -1, False)],
 1: [(1.0, 11, -1, False)],
 2: [(1.0, 473, -1, False)],
 3: [(1.0, 431, -1, False)],
 4: [(1.0, 431, -3, False)],
 5: [(1.0, 431, -1, False)]}

In [3]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

def q_learning(discount_factor, exploration_proba, episodes=1000,
               learning_rate=1/6, environment=None):
    """Train a tabular Q-learning agent and print per-episode averages.

    A zero-initialised Q-table is created on every call, trained with an
    epsilon-greedy policy, and then discarded; only the summary statistics
    are printed.

    Parameters
    ----------
    discount_factor : float
        Gamma, the weight applied to the best Q-value of the next state.
    exploration_proba : float
        Epsilon, the probability of sampling a random action instead of
        taking the greedy one.
    episodes : int, optional
        Number of training episodes. Defaults to 1000, the value that used
        to be hard-coded in the body.
    learning_rate : float, optional
        Step size of the Q-value update. Defaults to 1/6, the value that
        used to be hard-coded (chosen with the note "there are 6 actions").
    environment : object, optional
        Environment to train on. When omitted, the notebook-level ``env`` is
        used. It must provide ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` returning the initial state and
        ``step(action)`` returning ``(next_state, reward, done, info)``.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    ValueError
        If ``episodes`` is not positive (the averages would divide by zero).
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    # Fall back to the notebook-level environment when none is supplied.
    sim = env if environment is None else environment
    gamma = discount_factor

    # One row per state, one column per action, all starting at zero.
    q_table = np.zeros([sim.observation_space.n, sim.action_space.n])

    total_epochs, total_penalties, total_rewards = 0, 0, 0

    for _ in range(episodes):
        state = sim.reset()

        # Per-episode counters: steps taken, penalised steps, summed reward.
        epochs, penalties, rewards = 0, 0, 0
        done = False

        while not done:
            if random.uniform(0, 1) < exploration_proba:
                action = sim.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, _ = sim.step(action)

            current_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            # Blend the old estimate with the one-step bootstrapped target.
            q_table[state, action] = (
                (1 - learning_rate) * current_value
                + learning_rate * (reward + gamma * next_max)
            )

            # Rewards of -5 and -3 are the ones this notebook counts as penalties.
            if reward == -5 or reward == -3:
                penalties += 1

            rewards = rewards + reward
            state = next_state
            epochs += 1

        total_penalties += penalties
        total_epochs += epochs
        total_rewards += rewards

    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / episodes}")
    print(f"Average penalties per episode: {total_penalties / episodes}")
    print(f"Average rewards per episode: {total_rewards / episodes}")
            

        

    

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11 µs


In [4]:
# Sweep the exploration probability while holding the discount factor at 0.5.
# A blank line separates consecutive reports; nothing is printed after the last.
for position, epsilon in enumerate((0.1, 0.3, 0.5, 0.7)):
    if position > 0:
        print()
    q_learning(discount_factor=0.5, exploration_proba=epsilon)


Results after 1000 episodes:
Average timesteps per episode: 314.508
Average penalties per episode: 15.191
Average rewards per episode: -335.01

Results after 1000 episodes:
Average timesteps per episode: 334.576
Average penalties per episode: 22.841
Average rewards per episode: -371.272

Results after 1000 episodes:
Average timesteps per episode: 374.343
Average penalties per episode: 35.611
Average rewards per episode: -437.911

Results after 1000 episodes:
Average timesteps per episode: 446.038
Average penalties per episode: 55.04
Average rewards per episode: -550.336


In [5]:
# Sweep the discount factor while holding the exploration probability at 0.1.
# Every report, including the last one, is followed by a blank line.
for discount in (0.1, 0.3, 0.5, 0.7):
    q_learning(discount_factor=discount, exploration_proba=0.1)
    print()


Results after 1000 episodes:
Average timesteps per episode: 558.875
Average penalties per episode: 17.945
Average rewards per episode: -585.155

Results after 1000 episodes:
Average timesteps per episode: 399.479
Average penalties per episode: 15.635
Average rewards per episode: -420.889

Results after 1000 episodes:
Average timesteps per episode: 314.776
Average penalties per episode: 15.26
Average rewards per episode: -335.398

Results after 1000 episodes:
Average timesteps per episode: 268.985
Average penalties per episode: 15.454
Average rewards per episode: -290.013



# Realistic Initialisation

In [6]:
# Q-table layout: one row per state, one column per action, all zeros to start.
q_table_real = np.zeros([env.observation_space.n, env.action_space.n])

# Totals accumulated over every episode, used for the averages printed below.
total_epochs = 0
total_penalties = 0
total_rewards = 0

# Hyperparameters
learning_rate = 1/6        # step size (the notebook ties it to the 6 actions)
gamma = 0.7                # discount factor
exploration_proba = 0.1    # probability of taking a random action
episodes = 1000

# Per-episode traces: episode index, summed reward, number of steps.
episode_list_real = []
ep_reward_real = []
epoch_list_real = []

for i in range(episodes):
    episode_list_real.append(i)

    state = env.reset()
    done = False

    # Counters for this episode only.
    epochs = 0
    penalties = 0
    rewards = 0

    while not done:
        # Epsilon-greedy choice: random action with probability exploration_proba.
        explore = random.uniform(0, 1) < exploration_proba
        if explore:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table_real[state])

        next_state, reward, done, info = env.step(action)

        # Blend the previous estimate with the one-step bootstrapped target.
        current_value = q_table_real[state, action]
        next_max = np.max(q_table_real[next_state])
        q_table_real[state, action] = (
            (1 - learning_rate) * current_value
            + learning_rate * (reward + gamma * next_max)
        )

        # Rewards of -5 and -3 are the ones counted as penalties.
        if reward in (-5, -3):
            penalties += 1

        rewards += reward
        epochs += 1
        state = next_state

    ep_reward_real.append(rewards)
    epoch_list_real.append(epochs)

    total_penalties += penalties
    total_epochs += epochs
    total_rewards += rewards

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Average rewards per episode: {total_rewards / episodes}")


Results after 1000 episodes:
Average timesteps per episode: 275.069
Average penalties per episode: 15.74
Average rewards per episode: -296.663


# Using the realistic initialisation to find the optimistic Q-values for each action

In [7]:

# Average each action's learned Q-value over all states: axis=0 collapses the
# state rows and leaves one mean per action column.
mean_each_actions = np.mean(q_table_real, axis=0)
print("Mean value for each action: {}".format(mean_each_actions), '\n')

# Starting values for the next experiment: each action begins a fixed margin
# above its learned mean.
optimism_margin = 0.5
initialised_values = mean_each_actions + optimism_margin
print("Initialised values for each action: {}".format(initialised_values))




Mean value for each action: [-1.38337561 -1.36005791 -1.36137719 -1.37742652 -1.60532722 -1.33481749] 

Initialised values for each action: [-0.88337561 -0.86005791 -0.86137719 -0.87742652 -1.10532722 -0.83481749]


# Optimistic Initialisation

In [8]:

# Every state row starts from the per-action values computed earlier
# (broadcast across all states) instead of from zeros.
q_table_op = np.zeros([env.observation_space.n, env.action_space.n]) + initialised_values

# Totals accumulated over every episode, used for the averages printed below.
total_epochs = 0
total_penalties = 0
total_rewards = 0

# Hyperparameters
learning_rate = 1/6        # step size (the notebook ties it to the 6 actions)
gamma = 0.7                # discount factor
exploration_proba = 0.1    # assigned for parity with the earlier run; the loop below never reads it
episodes = 1000

# Per-episode traces: episode index, summed reward, number of steps.
episode_list_op = []
ep_reward_op = []
epoch_list_op = []

for i in range(episodes):
    episode_list_op.append(i)

    state = env.reset()
    done = False

    # Counters for this episode only.
    epochs = 0
    penalties = 0
    rewards = 0

    while not done:
        # Always act greedily; there is no random exploration in this run.
        action = np.argmax(q_table_op[state])

        next_state, reward, done, info = env.step(action)

        # NOTE(review): the bootstrap term reads q_table_op[next_state] even when
        # `done` is True, and with this non-zero initial table that row is not
        # zero — confirm this is intended.
        current_value = q_table_op[state, action]
        next_max = np.max(q_table_op[next_state])
        q_table_op[state, action] = (
            (1 - learning_rate) * current_value
            + learning_rate * (reward + gamma * next_max)
        )

        # Rewards of -5 and -3 are the ones counted as penalties.
        if reward in (-5, -3):
            penalties += 1

        rewards += reward
        epochs += 1
        state = next_state

    ep_reward_op.append(rewards)
    epoch_list_op.append(epochs)

    total_penalties += penalties
    total_epochs += epochs
    total_rewards += rewards

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Average reward per episode: {total_rewards / episodes}")



Results after 1000 episodes:
Average timesteps per episode: 256.954
Average penalties per episode: 9.548
Average reward per episode: -265.856
