# Q Learning Examples

## Q Table

In [98]:
import numpy as np
import gym
import random
import time

# Seed every random source this cell relies on so a fresh run is repeatable:
# numpy drives should_explore(), while the environment and its action space
# (used by env.action_space.sample()) each keep their own generator.
np.random.seed(1234)

# Alternative environment: gym.make('FrozenLake-v0')
env = gym.make('Taxi-v3')
env.seed(1234)
env.action_space.seed(1234)

# Sizes of the discrete observation and action spaces.
STATE_SPACE = env.observation_space.n
ACTION_SPACE = env.action_space.n

NUM_EPISODES = 10000
MAX_EPISODE_LENGTH = 100  # hard cap on steps per episode

a = 0.1 # alpha = learning rate
y = 0.99 # gamma = decay rate

# Epsilon-greedy schedule: start fully random, decay per episode down to the minimum.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_RATE_DECAY = 0.001

def decay_exploration_rate(episode):
    """Return the exploration rate to use once `episode` episodes have run.

    The rate falls linearly from MAX_EXPLORATION_RATE by
    EXPLORATION_RATE_DECAY per episode and is clamped from below at
    MIN_EXPLORATION_RATE.
    """
    # An exponential schedule would be the alternative:
    #   MIN + (MAX - MIN) * exp(-EXPLORATION_RATE_DECAY * episode)
    linear_rate = MAX_EXPLORATION_RATE - episode * EXPLORATION_RATE_DECAY
    if linear_rate < MIN_EXPLORATION_RATE:
        return MIN_EXPLORATION_RATE
    return linear_rate

def should_explore(exploration_rate):
    """Randomly decide whether the next action should be exploratory.

    One sample is drawn from the uniform distribution on [0, 1); the result
    is True when that sample is strictly below `exploration_rate`, so a lower
    rate makes exploration less frequent.
    """
    return np.random.uniform(0, 1) < exploration_rate

def get_next_action(q_table, state, exploration_rate):
    """Choose an action for `state` with an epsilon-greedy policy.

    When should_explore(exploration_rate) is False, the index of the largest
    value in the `state` row of `q_table` is returned; otherwise an action is
    sampled from the environment's action space.
    """
    if not should_explore(exploration_rate):
        # Exploit: take the action with the highest learned value.
        return np.argmax(q_table[state, :])
    # Explore: let the environment pick a random action.
    return env.action_space.sample()

# Tabular Q-learning: one row per discrete state, one column per action.
q_table = np.zeros((STATE_SPACE, ACTION_SPACE))
exploration_rate = MAX_EXPLORATION_RATE
total_rewards = []
for episode in range(NUM_EPISODES):
    state = env.reset()
    total_reward = 0

    for step in range(MAX_EPISODE_LENGTH):
        action = get_next_action(q_table, state, exploration_rate)
        new_state, reward, done, _ = env.step(action)

        # A finished episode has no future value, so its target is the reward
        # alone; otherwise add the discounted best value of the next state.
        if done:
            td_target = reward
        else:
            td_target = reward + y * np.max(q_table[new_state, :])
        # Blend the old estimate with the new target using learning rate `a`.
        q_table[state, action] = (1 - a) * q_table[state, action] + a * td_target

        total_reward += reward
        state = new_state
        if done:
            break
    total_rewards.append(total_reward)
    # The rate computed here is the one used by the next episode.
    exploration_rate = decay_exploration_rate(episode)

rewards_per_thousand_episodes = np.split(np.array(total_rewards), NUM_EPISODES/1000)

print('Average reward per thousand episodes:\n')
for count, chunk in enumerate(rewards_per_thousand_episodes, start=1):
    print(count * 1000, ": ", str(sum(chunk/1000)))

Average reward per thousand episodes:

1000 :  -210.67100000000065
2000 :  -2.9680000000000155
3000 :  6.958999999999959
4000 :  7.587999999999961
5000 :  7.247999999999963
6000 :  7.4709999999999654
7000 :  7.538999999999965
8000 :  7.385999999999964
9000 :  7.6209999999999765
10000 :  7.241999999999967


In [99]:
# Learned values of every action in state 454.
q_table[454]

array([ -6.1319182 ,   4.50780883,  -6.16080086,  -6.16168415,
       -10.87592696, -11.51100366])

## Deep Q Learning
### Training on last experience

In [163]:
import numpy as np
import gym
import random
import time
import tensorflow as tf

LEARNING_RATE = 1e-3  # step size of the Adam optimizer

def build_model(state_space, action_space):
    """Build and compile the Q-network for a discrete state space.

    The integer state id is embedded into a 10-dimensional vector, passed
    through three ReLU Dense layers of width `action_space`, and mapped by a
    final linear Dense layer to one value per action.

    Args:
        state_space: Number of discrete states (size of the embedding table).
        action_space: Number of discrete actions (width of every Dense layer).

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(state_space, 10, input_length=1))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    # Linear output: the predicted values are unbounded regression targets.
    model.add(tf.keras.layers.Dense(action_space))

    # `learning_rate` replaces the deprecated `lr` keyword. No accuracy metric
    # is tracked because the network regresses real values, for which
    # classification accuracy carries no meaning.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mae')
    model.summary()

    return model

np.random.seed(1234)

# Alternative environment: gym.make('FrozenLake-v0')
env = gym.make('Taxi-v3')
# Seed the environment only after it has been created. Calling env.seed()
# before gym.make() acts on whatever `env` an earlier cell left behind (or
# fails on a fresh kernel) and leaves the environment used below unseeded.
env.seed(1234)
# The action space keeps its own generator for env.action_space.sample().
env.action_space.seed(1234)

# Sizes of the discrete observation and action spaces.
STATE_SPACE = env.observation_space.n
ACTION_SPACE = env.action_space.n

NUM_EPISODES = 200
MAX_EPISODE_LENGTH = 50  # hard cap on steps per episode

y = 0.99 # gamma = decay rate

# Epsilon-greedy schedule: start fully random, decay per episode down to the minimum.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_RATE_DECAY = 0.01

def decay_exploration_rate(episode):
    """Return the exploration rate to use once `episode` episodes have run.

    The rate falls linearly from MAX_EXPLORATION_RATE by
    EXPLORATION_RATE_DECAY per episode and is clamped from below at
    MIN_EXPLORATION_RATE.
    """
    # An exponential schedule would be the alternative:
    #   MIN + (MAX - MIN) * exp(-EXPLORATION_RATE_DECAY * episode)
    linear_rate = MAX_EXPLORATION_RATE - episode * EXPLORATION_RATE_DECAY
    if linear_rate < MIN_EXPLORATION_RATE:
        return MIN_EXPLORATION_RATE
    return linear_rate

def should_explore(exploration_rate):
    """Randomly decide whether the next action should be exploratory.

    One sample is drawn from the uniform distribution on [0, 1); the result
    is True when that sample is strictly below `exploration_rate`, so a lower
    rate makes exploration less frequent.
    """
    return np.random.uniform(0, 1) < exploration_rate

def one_hot_encode(state, state_space):
    """Return integer `state` as a one-hot row vector.

    Args:
        state: Index of the active state; regular integer indexing rules
            apply (negative values count from the end, out-of-range values
            raise IndexError).
        state_space: Total number of states, i.e. the length of the vector.

    Returns:
        A float array of shape (1, state_space) that is 1.0 at `state` and
        0.0 everywhere else.
    """
    # Fill a single row directly instead of materialising a full
    # state_space x state_space identity matrix on every call.
    encoded = np.zeros((1, state_space))
    encoded[0, state] = 1.0
    return encoded

def encode(state):
    """Wrap a single state id in an array of shape (1, 1, 1).

    This is the input form handed to the model's predict() and fit() calls.
    """
    return np.array(state).reshape((1, 1, 1))

def get_next_action(q_model, state, exploration_rate):
    """Choose an action for `state` with an epsilon-greedy policy.

    When should_explore(exploration_rate) is False, the index of the largest
    value predicted by `q_model` for the encoded state is returned; otherwise
    an action is sampled from the environment's action space.
    """
    if not should_explore(exploration_rate):
        # Exploit: take the action the network currently rates highest.
        predicted_values = q_model.predict(encode(state))
        return np.argmax(predicted_values)
    # Explore: let the environment pick a random action.
    return env.action_space.sample()

# Online deep Q-learning: the network is fitted on every transition right
# after it has been observed.
q_model = build_model(STATE_SPACE, ACTION_SPACE)
exploration_rate = MAX_EXPLORATION_RATE
total_rewards = []
for episode in range(NUM_EPISODES):
    if episode % 10 == 0:
        print('Episode', episode)
    state = env.reset()
    total_reward = 0

    for step in range(MAX_EPISODE_LENGTH):
        action = get_next_action(q_model, state, exploration_rate)
        new_state, reward, done, _ = env.step(action)

        # A finished episode has no future value, so its target is the reward
        # alone; otherwise add the discounted best prediction for the next state.
        target = reward if done else reward + y * np.max(q_model.predict(encode(new_state)))
        # Start from the current predictions so only the taken action's
        # output differs from what the network already produces.
        fit_targets = q_model.predict(encode(state))
        fit_targets[0][action] = target
        q_model.fit(encode(state), fit_targets, epochs=1, verbose=0)

        total_reward += reward
        state = new_state
        if done:
            break
    total_rewards.append(total_reward)
    # The rate computed here is the one used by the next episode.
    exploration_rate = decay_exploration_rate(episode)

rewards_per_episodes = np.split(np.array(total_rewards), NUM_EPISODES/10)

print('Average reward per 10 episodes:\n')
for count, chunk in enumerate(rewards_per_episodes, start=1):
    print(count * 10, ": ", str(np.average(chunk)))

Model: "sequential_64"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 1, 10)             5000      
_________________________________________________________________
flatten_40 (Flatten)         (None, 10)                0         
_________________________________________________________________
dense_107 (Dense)            (None, 6)                 66        
_________________________________________________________________
dense_108 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_109 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_110 (Dense)            (None, 6)                 42        
Total params: 5,192
Trainable params: 5,192
Non-trainable params: 0
___________________________________________________

In [164]:
# Values the trained network predicts for state 328.
q_model.predict([328])

array([[-100.48277 ,  -99.97023 , -100.81094 , -100.345345, -108.639114,
        -109.62755 ]], dtype=float32)

### Replay Memory
Use a fixed-size replay memory. Store ```(state, action, new_state, reward, done)``` for each action taken and then use them as memory blocks during the replay. Thus the model trains on more data than the previous attempt with a single example per training iteration.

In [165]:
import numpy as np
import gym
import random
import time
import tensorflow as tf
from collections import deque

LEARNING_RATE = 1e-3  # step size of the Adam optimizer

def build_model(state_space, action_space):
    """Build and compile the Q-network for a discrete state space.

    The integer state id is embedded into a 10-dimensional vector, passed
    through three ReLU Dense layers of width `action_space`, and mapped by a
    final linear Dense layer to one value per action.

    Args:
        state_space: Number of discrete states (size of the embedding table).
        action_space: Number of discrete actions (width of every Dense layer).

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(state_space, 10, input_length=1))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    # Linear output: the predicted values are unbounded regression targets.
    model.add(tf.keras.layers.Dense(action_space))

    # `learning_rate` replaces the deprecated `lr` keyword. No accuracy metric
    # is tracked because the network regresses real values, for which
    # classification accuracy carries no meaning.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mae')
    model.summary()

    return model

# numpy drives should_explore(); the stdlib generator drives the
# random.sample() call that draws replay batches.
np.random.seed(1234)
random.seed(1234)

# Alternative environment: gym.make('FrozenLake-v0')
env = gym.make('Taxi-v3')
# Seed the environment only after it has been created. Calling env.seed()
# before gym.make() acts on whatever `env` an earlier cell left behind (or
# fails on a fresh kernel) and leaves the environment used below unseeded.
env.seed(1234)
# The action space keeps its own generator for env.action_space.sample().
env.action_space.seed(1234)

# Sizes of the discrete observation and action spaces.
STATE_SPACE = env.observation_space.n
ACTION_SPACE = env.action_space.n

NUM_EPISODES = 200
MAX_EPISODE_LENGTH = 20  # hard cap on steps per episode

y = 0.99 # gamma = decay rate

# Epsilon-greedy schedule: start fully random, decay per episode down to the minimum.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_RATE_DECAY = 0.01

def decay_exploration_rate(episode):
    """Return the exploration rate to use once `episode` episodes have run.

    The rate falls linearly from MAX_EXPLORATION_RATE by
    EXPLORATION_RATE_DECAY per episode and is clamped from below at
    MIN_EXPLORATION_RATE.
    """
    # An exponential schedule would be the alternative:
    #   MIN + (MAX - MIN) * exp(-EXPLORATION_RATE_DECAY * episode)
    linear_rate = MAX_EXPLORATION_RATE - episode * EXPLORATION_RATE_DECAY
    if linear_rate < MIN_EXPLORATION_RATE:
        return MIN_EXPLORATION_RATE
    return linear_rate

def should_explore(exploration_rate):
    """Randomly decide whether the next action should be exploratory.

    One sample is drawn from the uniform distribution on [0, 1); the result
    is True when that sample is strictly below `exploration_rate`, so a lower
    rate makes exploration less frequent.
    """
    return np.random.uniform(0, 1) < exploration_rate

def encode(state):
    """Wrap a single state id in an array of shape (1, 1, 1).

    This is the input form handed to the model's predict() and fit() calls.
    """
    return np.array(state).reshape((1, 1, 1))

def get_next_action(q_model, state, exploration_rate):
    """Choose an action for `state` with an epsilon-greedy policy.

    When should_explore(exploration_rate) is False, the index of the largest
    value predicted by `q_model` for the encoded state is returned; otherwise
    an action is sampled from the environment's action space.
    """
    if not should_explore(exploration_rate):
        # Exploit: take the action the network currently rates highest.
        predicted_values = q_model.predict(encode(state))
        return np.argmax(predicted_values)
    # Explore: let the environment pick a random action.
    return env.action_space.sample()

def replay_memory_training(q_model, replay_memory, batch_size = 64):
    """Fit `q_model` once on a random mini-batch drawn from `replay_memory`.

    Args:
        q_model: Model used both to estimate next-state values and to be
            fitted on the resulting targets.
        replay_memory: Sequence of (state, action, new_state, reward, done)
            transitions.
        batch_size: Upper bound on the number of transitions sampled; fewer
            are used while the memory holds less than this.
    """
    batch = random.sample(replay_memory, min(len(replay_memory), batch_size))

    # Split the sampled transitions into one array per field, in tuple order.
    states, actions, new_states, rewards, dones = (
        np.array([transition[field] for transition in batch])
        for field in range(5)
    )
    encoded_new_states = np.array([encode(new_state)[0] for new_state in new_states])
    encoded_states = np.array([encode(state)[0] for state in states])

    # Finished transitions use the reward alone; the others add the
    # discounted best predicted value of the next state.
    next_values = np.max(q_model.predict(encoded_new_states), axis=1)
    targets = np.where(dones, rewards, rewards + y * next_values)

    # Start from the current predictions and overwrite only the entry of the
    # action that was actually taken in each sampled transition.
    fit_targets = q_model.predict(encoded_states)
    fit_targets[np.arange(len(actions)), actions] = targets

    q_model.fit(encoded_states, fit_targets, epochs=1, verbose=0)

# Deep Q-learning with experience replay: every step stores its transition
# and then fits the network on a random batch of stored transitions.
q_model = build_model(STATE_SPACE, ACTION_SPACE)

# bounded replay memory: keeps at most the 10000 most recent transitions
replay_memory = deque(maxlen=10000)

exploration_rate = MAX_EXPLORATION_RATE
total_rewards = []
for episode in range(NUM_EPISODES):
    if episode % 10 == 0:
        print('Episode', episode)
    state = env.reset()
    total_reward = 0

    for step in range(MAX_EPISODE_LENGTH):
        action = get_next_action(q_model, state, exploration_rate)
        new_state, reward, done, _ = env.step(action)

        transition = (state, action, new_state, reward, done)
        replay_memory.append(transition)
        replay_memory_training(q_model, replay_memory)

        total_reward += reward
        state = new_state
        if done:
            break
    total_rewards.append(total_reward)
    # The rate computed here is the one used by the next episode.
    exploration_rate = decay_exploration_rate(episode)

rewards_per_episodes = np.split(np.array(total_rewards), NUM_EPISODES/10)

print('Average reward per 10 episodes:\n')
for count, chunk in enumerate(rewards_per_episodes, start=1):
    print(count * 10, ": ", str(np.average(chunk)))

Model: "sequential_65"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_49 (Embedding)     (None, 1, 10)             5000      
_________________________________________________________________
flatten_41 (Flatten)         (None, 10)                0         
_________________________________________________________________
dense_111 (Dense)            (None, 6)                 66        
_________________________________________________________________
dense_112 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_113 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_114 (Dense)            (None, 6)                 42        
Total params: 5,192
Trainable params: 5,192
Non-trainable params: 0
___________________________________________________

In [166]:
# Values the trained network predicts for state 454.
q_model.predict([454])

array([[-2.8120596, -3.1325543, -2.6421409, -3.1163185, -3.1478245,
        -4.2167587]], dtype=float32)

### Replay Memory with fixed targets
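Instead of computing the training targets from the constantly changing Q network, compute them from a separate target network whose weights are only moved slowly towards the Q network (a soft update with rate `TAU` after every step).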

In [167]:
import numpy as np
import gym
import random
import time
import tensorflow as tf
from collections import deque

LEARNING_RATE = 1e-3  # step size of the Adam optimizer

def build_model(state_space, action_space):
    """Build and compile the Q-network for a discrete state space.

    The integer state id is embedded into a 10-dimensional vector, passed
    through three ReLU Dense layers of width `action_space`, and mapped by a
    final linear Dense layer to one value per action.

    Args:
        state_space: Number of discrete states (size of the embedding table).
        action_space: Number of discrete actions (width of every Dense layer).

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(state_space, 10, input_length=1))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    model.add(tf.keras.layers.Dense(action_space, activation='relu'))
    # Linear output: the predicted values are unbounded regression targets.
    model.add(tf.keras.layers.Dense(action_space))

    # `learning_rate` replaces the deprecated `lr` keyword. No accuracy metric
    # is tracked because the network regresses real values, for which
    # classification accuracy carries no meaning.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mae')
    model.summary()

    return model

# numpy drives should_explore(); the stdlib generator drives the
# random.sample() call that draws replay batches.
np.random.seed(1234)
random.seed(1234)

# Alternative environment: gym.make('FrozenLake-v0')
env = gym.make('Taxi-v3')
# Seed the environment only after it has been created. Calling env.seed()
# before gym.make() acts on whatever `env` an earlier cell left behind (or
# fails on a fresh kernel) and leaves the environment used below unseeded.
env.seed(1234)
# The action space keeps its own generator for env.action_space.sample().
env.action_space.seed(1234)

# Sizes of the discrete observation and action spaces.
STATE_SPACE = env.observation_space.n
ACTION_SPACE = env.action_space.n

NUM_EPISODES = 500
MAX_EPISODE_LENGTH = 20  # hard cap on steps per episode

y = 0.99 # gamma = decay rate
TAU = 1e-2 # learning rate for target model parameter updates - similar to alpha in q_table

# Epsilon-greedy schedule: start fully random, decay per episode down to the minimum.
MAX_EXPLORATION_RATE = 1
MIN_EXPLORATION_RATE = 0.01
EXPLORATION_RATE_DECAY = 0.01

def decay_exploration_rate(episode):
    """Return the exploration rate to use once `episode` episodes have run.

    The rate falls linearly from MAX_EXPLORATION_RATE by
    EXPLORATION_RATE_DECAY per episode and is clamped from below at
    MIN_EXPLORATION_RATE.
    """
    # Alternative schedules that are not active here:
    #   exponential: MIN + (MAX - MIN) * exp(-EXPLORATION_RATE_DECAY * episode)
    #   geometric:   max(MAX * (1 - EXPLORATION_RATE_DECAY) ** episode, MIN)
    linear_rate = MAX_EXPLORATION_RATE - episode * EXPLORATION_RATE_DECAY
    if linear_rate < MIN_EXPLORATION_RATE:
        return MIN_EXPLORATION_RATE
    return linear_rate

def should_explore(exploration_rate):
    """Randomly decide whether the next action should be exploratory.

    One sample is drawn from the uniform distribution on [0, 1); the result
    is True when that sample is strictly below `exploration_rate`, so a lower
    rate makes exploration less frequent.
    """
    return np.random.uniform(0, 1) < exploration_rate

def encode(state):
    """Wrap a single state id in an array of shape (1, 1, 1).

    This is the input form handed to the model's predict() and fit() calls.
    """
    return np.array(state).reshape((1, 1, 1))

def get_next_action(q_model, state, exploration_rate):
    """Choose an action for `state` with an epsilon-greedy policy.

    When should_explore(exploration_rate) is False, the index of the largest
    value predicted by `q_model` for the encoded state is returned; otherwise
    an action is sampled from the environment's action space.
    """
    if not should_explore(exploration_rate):
        # Exploit: take the action the network currently rates highest.
        predicted_values = q_model.predict(encode(state))
        return np.argmax(predicted_values)
    # Explore: let the environment pick a random action.
    return env.action_space.sample()

def replay_memory_training(q_model, target_model, replay_memory, batch_size = 64):
    """Fit `q_model` once on a random mini-batch, bootstrapping from `target_model`.

    Args:
        q_model: Model that is fitted on the sampled batch.
        target_model: Model used only to estimate the value of the next state.
        replay_memory: Sequence of (state, action, new_state, reward, done)
            transitions.
        batch_size: Upper bound on the number of transitions sampled; fewer
            are used while the memory holds less than this.
    """
    sample_memory = random.sample(replay_memory, min(len(replay_memory), batch_size))
    states = np.array([state for (state, _, _, _, _) in sample_memory])
    encoded_states = np.array([encode(state)[0] for state in states])
    actions = np.array([action for (_, action, _, _, _) in sample_memory])
    new_states = np.array([new_state for (_, _, new_state, _, _) in sample_memory])
    encoded_new_states = np.array([encode(new_state)[0] for new_state in new_states])
    rewards = np.array([reward for (_, _, _, reward, _) in sample_memory])
    dones = np.array([done for (_, _, _, _, done) in sample_memory])

    # Finished transitions use the reward alone; the others add the
    # discounted best next-state value estimated by the target network.
    q_future = np.max(target_model.predict(encoded_new_states), axis=1)
    targets = np.where(dones, rewards, rewards + y * q_future)

    # The baseline must be q_model's own predictions: then only the action
    # actually taken produces a non-zero error. Taking the baseline from
    # target_model would also regress every untaken action towards the
    # target network's outputs on each fit.
    current_targets = q_model.predict(encoded_states)

    for index, action in enumerate(actions):
        current_targets[index][action] = targets[index]

    q_model.fit(encoded_states, current_targets, epochs=1, verbose=0)
    
def train_target_model(q_model, target_model, t=TAU):
    """Move the target model's weights a fraction `t` towards the Q model's.

    Every weight array of `target_model` is replaced by
    (1 - t) * target + t * q, using the matching array of `q_model`.
    """
    online_weights = q_model.get_weights()
    current_target_weights = target_model.get_weights()
    blended_weights = [
        (1 - t) * current_target_weights[i] + t * online_weights[i]
        for i in range(len(current_target_weights))
    ]
    target_model.set_weights(blended_weights)

# Deep Q-learning with experience replay and a slowly updated target network.
q_model = build_model(STATE_SPACE, ACTION_SPACE)
# clone_model() recreates the architecture with newly initialised weights, so
# copy the Q-network's weights explicitly; otherwise the first targets come
# from an unrelated random network.
target_model = tf.keras.models.clone_model(q_model)
target_model.set_weights(q_model.get_weights())

# bounded replay memory: keeps at most the 1000 most recent transitions
replay_memory = deque(maxlen=1000)

exploration_rate = MAX_EXPLORATION_RATE
total_rewards = []
for episode in range(NUM_EPISODES):
    if episode % 10 == 0:
        print('Episode', episode)
    state = env.reset()
    total_reward = 0

    for step in range(MAX_EPISODE_LENGTH):
        action = get_next_action(q_model, state, exploration_rate)
        new_state, reward, done, _ = env.step(action)

        replay_memory.append((state, action, new_state, reward, done))
        replay_memory_training(q_model, target_model, replay_memory)
        # Soft update: nudge the target network towards the Q-network after every step.
        train_target_model(q_model, target_model, TAU)

        state = new_state
        total_reward += reward
        if done:
            break
    # The rate computed here is the one used by the next episode.
    exploration_rate = decay_exploration_rate(episode)
    total_rewards.append(total_reward)

# Integer division: np.split expects a whole number of equally sized sections.
rewards_per_episodes = np.split(np.array(total_rewards), NUM_EPISODES // 10)

print('Average reward per 10 episodes:\n')
for index, r in enumerate(rewards_per_episodes):
    # np.average replaces the non-existent np.avg, which raised AttributeError.
    print((index + 1) * len(r), ": ", str(np.average(r)))

Model: "sequential_66"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_50 (Embedding)     (None, 1, 10)             5000      
_________________________________________________________________
flatten_42 (Flatten)         (None, 10)                0         
_________________________________________________________________
dense_115 (Dense)            (None, 6)                 66        
_________________________________________________________________
dense_116 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_117 (Dense)            (None, 6)                 42        
_________________________________________________________________
dense_118 (Dense)            (None, 6)                 42        
Total params: 5,192
Trainable params: 5,192
Non-trainable params: 0
___________________________________________________

Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x13d855dc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 536, in __del__
    gen_dataset_ops.delete_iterator(
  File "/usr/local/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1277, in delete_iterator
    _result = pywrap_tfe.TFE_Py_FastPathExecute(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [178]:
# Print the values predicted for state 454, then the index of the largest one.
print(q_model.predict([454]))
print(np.argmax(q_model.predict([454])))

[[-0.00341365 -0.0028316  -0.00295588 -0.00309611 -0.00400039 -0.00292171]]
1


In [179]:
# Components that the environment decodes from state id 454.
list(env.decode(454))

[4, 2, 3, 2]