In [166]:
import gym
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras

from collections import deque
import time
import random

In [167]:
#set parameters
num_gps = 5
num_slots = 5
num_pre_booked = 7
num_to_book = 3

In [168]:
# An episode a full game
train_episodes = 300
test_episodes = 100

def agent(state_shape, action_shape):
    """ The agent maps X-states to Y-actions
    e.g. The neural network output is [.1, .7, .05, 0.05, .05, .05]
    The highest value 0.7 is the Q-Value.
    The index of the highest action (0.7) is action #1.
    """
    learning_rate = 0.001
    init = tf.keras.initializers.constant()
    model = keras.Sequential()
    model.add(keras.layers.Dense(24, input_shape=state_shape, activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(12, activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(action_shape, activation='linear', kernel_initializer=init))
    model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(lr=learning_rate), metrics=['accuracy'])
    return model

def get_qs(model, state, step):
    return model.predict(state.reshape([1, state.shape[0]]))[0]

In [169]:
class SchedulerEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        
        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.num_to_book = num_to_book

        #set action space this is the gp to book the appointment for
        self.action_space = gym.spaces.Discrete(num_gps)
        
        #set observation space 
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(num_slots, num_gps), dtype=np.int32)

        #create a new diary for the day
        self.reset()
        
        #general prints for testing
        free_gps = self.state.isin([0]).all().sum()
        print("Number of GPs: ", self.num_gps)
        print("Number of GPs with no appointments: ", free_gps)
        
    #creates daily diary for each gp and randomly populates prebooked appointments
    def reset(self):

        #creates zero filled dataframe with row per time slot and column per gp
        self.state = pd.DataFrame(np.zeros((self.num_slots, self.num_gps))).astype(int)

        #randomly enters a 1 for each pre booked appointments
        while self.num_pre_booked>=0:
            self.num_pre_booked -= 1
            row_to_update = np.random.randint(self.num_slots, size=1)
            col_to_update = np.random.randint(self.num_gps, size=1)
            self.state.at[row_to_update[0],col_to_update[0]]=1

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.num_to_book = num_to_book
        self.num_pre_booked = num_pre_booked

        return self.state.to_numpy(dtype=np.int32)
    
    #books the appointment for the gp
    def step(self, action):
    
        #get diary for gp to update
        gp_to_update = self.state[action]

        #if the gp is full increase reward by -1
        full_gp = gp_to_update.all()
        if full_gp:
            self.reward -=1

        #if the gp has no current appointments increase reward by -2
        new_gp = gp_to_update.eq(0).all()
        if new_gp:
            self.reward -=2

        #fill the first available appointment for the gp
        next_appt = gp_to_update.eq(0).idxmax()
        self.state[action][next_appt] = 1
        self.num_to_book -=1

        self.done = (self.num_to_book==0)
        info = {}

        return self.state.to_numpy(dtype=np.int32), self.reward, self.done, info


In [170]:
env = SchedulerEnv()

Number of GPs:  5
Number of GPs with no appointments:  0


In [171]:
def train(env, replay_memory, model, target_model, done):
    learning_rate = 0.7 # Learning rate
    discount_factor = 0.618

    MIN_REPLAY_SIZE = 1000
    if len(replay_memory) < MIN_REPLAY_SIZE:
        return

    batch_size = 64 * 2
    mini_batch = random.sample(replay_memory, batch_size)
    current_states = np.array([encode_observation(transition[0], env.observation_space.shape) for transition in mini_batch])
    current_qs_list = model.predict(current_states)
    new_current_states = np.array([encode_observation(transition[3], env.observation_space.shape) for transition in mini_batch])
    future_qs_list = target_model.predict(new_current_states)

    X = []
    Y = []
    for index, (observation, action, reward, new_observation, done) in enumerate(mini_batch):
        if not done:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            max_future_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(encode_observation(observation, env.observation_space.shape))
        Y.append(current_qs)
    model.fit(np.array(X), np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)
    
def encode_observation(observation, n_dims):
    return observation

In [172]:
def main():
    epsilon = 1 # Epsilon-greedy algorithm in initialized at 1 meaning every step is random at the start
    max_epsilon = 1 # You can't explore more than 100% of the time
    min_epsilon = 0.01 # At a minimum, we'll always explore 1% of the time
    decay = 0.01

    # 1. Initialize the Target and Main models
    # Main Model (updated every 4 steps)
    model = agent(env.observation_space.shape, env.action_space.n)
    # Target Model (updated every 100 steps)
    target_model = agent(env.observation_space.shape, env.action_space.n)
    target_model.set_weights(model.get_weights())

    replay_memory = deque(maxlen=50_000)

    target_update_counter = 0

    # X = states, y = actions
    X = []
    y = []

    steps_to_update_target_model = 0

    for episode in range(train_episodes):
        total_training_rewards = 0
        observation = env.reset()
        print('starting', observation)
        free_gps = pd.DataFrame(observation).isin([0]).all().sum()
        print("Number of free GPs: ", free_gps)
        done = False
        while not done:
            steps_to_update_target_model += 1
            #if True:
            #    env.render()

            random_number = np.random.rand()
            # 2. Explore using the Epsilon Greedy Exploration Strategy
            if random_number <= epsilon:
                # Explore
                action = env.action_space.sample()
            else:
                # Exploit best known action
                # model dims are (batch, env.observation_space.n)
                encoded = encode_observation(observation, env.observation_space.shape[0])
                encoded_reshaped = encoded.reshape([1, encoded.shape[0], encoded.shape[1]])
                predicted = model.predict(encoded_reshaped).flatten()
                action = np.argmax(predicted)
            new_observation, reward, done, info = env.step(action)
            replay_memory.append([observation, action, reward, new_observation, done])

            # 3. Update the Main Network using the Bellman Equation
            if steps_to_update_target_model % 4 == 0 or done:
                train(env, replay_memory, model, target_model, done)

            observation = new_observation
            total_training_rewards += reward

            if done:
                print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))
                free_gps = pd.DataFrame(observation).isin([0]).all().sum()
                print("Number of free GPs: ", free_gps)
                print('end', episode, observation)
                total_training_rewards += 1

                if steps_to_update_target_model >= 100:
                    print('Copying main network weights to the target network weights')
                    target_model.set_weights(model.get_weights())
                    steps_to_update_target_model = 0
                break

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)
    env.close()


if __name__ == '__main__':
    main()

starting [[0 1 1 0 0]
 [0 1 0 0 0]
 [0 1 0 0 0]
 [1 1 0 1 0]
 [0 0 0 1 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 0 with final reward = 0
Number of free GPs:  1
end 0 [[0 1 1 1 0]
 [0 1 1 0 0]
 [0 1 0 0 0]
 [1 1 0 1 0]
 [0 1 0 1 0]]
starting [[0 0 0 0 0]
 [1 0 0 0 0]
 [0 1 1 0 0]
 [0 1 0 0 1]
 [1 0 1 0 0]]
Number of free GPs:  1
Total training rewards: -6 after n steps = 1 with final reward = -2
Number of free GPs:  0
end 1 [[0 0 0 1 0]
 [1 0 0 1 0]
 [0 1 1 1 0]
 [0 1 0 0 1]
 [1 0 1 0 0]]
starting [[1 1 0 0 1]
 [0 0 1 0 0]
 [0 0 0 0 0]
 [0 0 0 1 1]
 [0 0 0 1 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 2 with final reward = 0
Number of free GPs:  0
end 2 [[1 1 1 0 1]
 [0 1 1 0 0]
 [0 0 1 0 0]
 [0 0 0 1 1]
 [0 0 0 1 0]]
starting [[0 0 0 0 1]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [0 1 1 0 0]
 [1 0 0 0 0]]
Number of free GPs:  1
Total training rewards: -6 after n steps = 3 with final reward = -2
Number of free GPs:  0
end 3 [[0 0 0 1 1]
 [0 1 0 1 1]


starting [[1 0 1 0 0]
 [1 1 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 1]
 [0 0 0 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 46 with final reward = 0
Number of free GPs:  0
end 46 [[1 1 1 0 0]
 [1 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 1 1]
 [1 0 0 0 0]]
starting [[0 0 1 1 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 0]
 [0 0 1 0 1]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 47 with final reward = 0
Number of free GPs:  1
end 47 [[1 0 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 1 0]
 [0 0 1 0 1]]
starting [[1 0 0 1 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 1 1 0 1]
 [1 1 1 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 48 with final reward = 0
Number of free GPs:  0
end 48 [[1 1 1 1 0]
 [1 0 0 0 0]
 [0 0 0 0 0]
 [0 1 1 0 1]
 [1 1 1 0 0]]
starting [[0 0 0 0 1]
 [0 1 1 0 0]
 [0 0 0 0 0]
 [0 0 1 0 1]
 [0 1 0 1 0]]
Number of free GPs:  1
Total training rewards: -6 after n steps = 49 with final reward = -2
Number of free GPs:  0
end 49 [[1 1 0 0 1]
 [1 1 1

starting [[0 1 0 0 0]
 [1 0 0 1 0]
 [0 0 0 1 0]
 [0 0 0 0 1]
 [0 0 0 1 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 104 with final reward = 0
Number of free GPs:  1
end 104 [[1 1 0 0 0]
 [1 0 0 1 0]
 [1 0 0 1 0]
 [1 0 0 0 1]
 [0 0 0 1 0]]
starting [[1 0 1 0 0]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 1]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 105 with final reward = 0
Number of free GPs:  1
end 105 [[1 1 1 0 0]
 [1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 0 0 0]
 [1 0 0 0 1]]
starting [[0 0 1 0 0]
 [1 1 0 0 0]
 [0 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 1]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 106 with final reward = 0
Number of free GPs:  1
end 106 [[1 1 1 0 0]
 [1 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 1]]
starting [[0 0 0 1 0]
 [0 0 0 1 0]
 [0 0 0 0 0]
 [1 1 0 0 1]
 [1 0 0 1 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 107 with final reward = 0
Number of free GPs:  1
end 107 [[1 0 0 1 1]
 

Total training rewards: 0 after n steps = 146 with final reward = 0
Number of free GPs:  0
end 146 [[1 0 1 0 0]
 [1 0 0 0 0]
 [1 0 0 1 0]
 [1 1 0 0 1]
 [1 0 1 0 0]]
starting [[0 0 0 0 0]
 [0 0 1 1 1]
 [1 0 0 1 0]
 [0 1 0 1 0]
 [0 0 0 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 147 with final reward = 0
Number of free GPs:  0
end 147 [[1 1 1 0 0]
 [0 0 1 1 1]
 [1 0 0 1 0]
 [0 1 0 1 0]
 [0 0 0 0 0]]
starting [[1 1 1 0 0]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [1 0 0 1 1]
 [0 0 1 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 148 with final reward = 0
Number of free GPs:  0
end 148 [[1 1 1 0 0]
 [1 1 0 0 0]
 [1 1 0 0 0]
 [1 0 0 1 1]
 [0 0 1 0 0]]
starting [[0 0 1 1 1]
 [0 0 0 1 1]
 [1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 0 0 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 149 with final reward = 0
Number of free GPs:  1
end 149 [[1 0 1 1 1]
 [1 0 0 1 1]
 [1 0 0 1 0]
 [0 0 0 1 0]
 [0 0 0 0 0]]
starting [[0 0 0 0 0]
 [0 0 1 0 0]
 [0 0 0 0 1

end 181 [[1 1 0 0 1]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 0 0]
 [1 0 1 1 0]]
starting [[0 0 0 1 0]
 [1 0 0 0 1]
 [1 0 0 1 1]
 [0 0 0 0 1]
 [0 0 0 1 0]]
Number of free GPs:  2
Total training rewards: 0 after n steps = 182 with final reward = 0
Number of free GPs:  2
end 182 [[1 0 0 1 0]
 [1 0 0 0 1]
 [1 0 0 1 1]
 [1 0 0 0 1]
 [1 0 0 1 0]]
starting [[0 1 0 0 0]
 [0 0 0 0 1]
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 1 1 0 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 183 with final reward = 0
Number of free GPs:  1
end 183 [[1 1 0 0 0]
 [1 0 0 0 1]
 [1 0 0 0 0]
 [1 1 0 0 0]
 [0 1 1 0 0]]
starting [[0 0 0 0 0]
 [1 0 1 0 0]
 [1 0 1 0 1]
 [0 0 0 1 0]
 [0 1 0 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 184 with final reward = 0
Number of free GPs:  0
end 184 [[1 1 0 0 0]
 [1 0 1 0 0]
 [1 0 1 0 1]
 [1 0 0 1 0]
 [0 1 0 0 0]]
starting [[0 0 0 1 0]
 [0 1 0 0 0]
 [0 0 1 0 1]
 [1 0 0 0 0]
 [1 0 0 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps

Total training rewards: 0 after n steps = 217 with final reward = 0
Number of free GPs:  0
end 217 [[1 0 0 1 0]
 [1 1 0 1 1]
 [1 0 1 0 0]
 [1 0 0 0 0]
 [0 0 0 0 1]]
starting [[1 0 0 0 0]
 [0 0 0 0 0]
 [0 1 0 0 0]
 [1 1 0 0 0]
 [1 1 0 0 0]]
Number of free GPs:  3
Total training rewards: -1 after n steps = 218 with final reward = -1
Number of free GPs:  3
end 218 [[1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 0 0 0]
 [1 1 0 0 0]
 [1 1 0 0 0]]
starting [[1 0 0 0 0]
 [0 1 0 1 0]
 [0 1 0 1 0]
 [0 0 0 1 0]
 [1 0 0 0 0]]
Number of free GPs:  2
Total training rewards: -2 after n steps = 219 with final reward = -2
Number of free GPs:  1
end 219 [[1 0 0 0 1]
 [1 1 0 1 0]
 [1 1 0 1 0]
 [0 0 0 1 0]
 [1 0 0 0 0]]
starting [[0 0 1 0 0]
 [0 0 0 0 1]
 [0 1 0 0 1]
 [0 1 0 0 0]
 [0 0 1 0 1]]
Number of free GPs:  2
Total training rewards: -6 after n steps = 220 with final reward = -2
Number of free GPs:  1
end 220 [[1 0 1 0 0]
 [1 0 0 0 1]
 [1 1 0 0 1]
 [0 1 0 0 0]
 [0 0 1 0 1]]
starting [[1 0 1 0 0]
 [1 0 0 1 0]
 [0 0

Total training rewards: 0 after n steps = 258 with final reward = 0
Number of free GPs:  0
end 258 [[1 0 0 1 1]
 [1 0 0 1 0]
 [1 1 0 0 0]
 [1 0 0 0 1]
 [0 1 1 0 0]]
starting [[0 0 1 0 1]
 [0 1 0 0 0]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [0 1 1 1 0]]
Number of free GPs:  1
Total training rewards: -6 after n steps = 259 with final reward = -2
Number of free GPs:  0
end 259 [[1 0 1 0 1]
 [1 1 0 0 0]
 [1 1 0 0 0]
 [0 0 0 0 0]
 [0 1 1 1 0]]
starting [[1 0 1 0 0]
 [0 0 1 1 0]
 [1 0 0 0 0]
 [0 1 0 0 1]
 [0 1 0 0 0]]
Number of free GPs:  0
Total training rewards: 0 after n steps = 260 with final reward = 0
Number of free GPs:  0
end 260 [[1 0 1 0 0]
 [1 0 1 1 0]
 [1 0 0 0 0]
 [1 1 0 0 1]
 [1 1 0 0 0]]
starting [[0 0 0 1 1]
 [0 0 0 0 0]
 [0 0 1 1 1]
 [1 0 1 1 0]
 [0 0 0 0 0]]
Number of free GPs:  1
Total training rewards: 0 after n steps = 261 with final reward = 0
Number of free GPs:  1
end 261 [[1 0 0 1 1]
 [1 0 0 0 0]
 [1 0 1 1 1]
 [1 0 1 1 0]
 [0 0 0 0 0]]
starting [[0 0 0 0 0]
 [1 0 0 0 1]
 [0 0 0 0

Total training rewards: 0 after n steps = 299 with final reward = 0
Number of free GPs:  0
end 299 [[1 0 1 0 0]
 [1 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 1 1]
 [1 1 0 0 0]]
