In [None]:
import random
from collections import Counter, deque
from itertools import count
from types import SimpleNamespace

import gym
import numpy as np
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Hyperparameters
# These are plain module-level settings: there is no enclosing object in this
# cell, so they must not be written as ``self.<name>`` (that raises NameError).
batch_size = 32

episodes = 500
gamma = 0.99                # Discount factor applied to bootstrapped Q-values

epsilon = 1.0               # Exploration rate
epsilon_decay = 0.995
epsilon_min = 0.1           # Minimal exploration rate (epsilon-greedy)

learning_rate = 0.001
replay_buffer = deque(maxlen=2000)   # Oldest transitions are dropped once full

target_update = 1000        # Number of steps until updating the target network


In [None]:
#create model
class Model(nn.Module):
    """Fully connected Q-network over a flattened 2-D observation.

    Args:
        input_shape: ``(rows, cols)`` of one observation; the first layer has
            ``rows * cols`` input features.
        n_actions: number of outputs (one Q-value per action).
    """

    def __init__(self, input_shape, n_actions):
        super(Model, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_shape[0]*input_shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
        )

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)``.

        ``x`` may be a tensor or anything ``torch.as_tensor`` accepts (e.g. a
        NumPy array), either batched ``(batch, rows, cols)`` or a single
        unbatched ``(rows, cols)`` observation, which is treated as a batch of
        one.
        """
        # Convert/cast on the device the weights live on so NumPy observations
        # can be fed directly.
        device = self.net[0].weight.device
        x = torch.as_tensor(x, dtype=torch.float32, device=device)
        if x.dim() == 2:
            # A lone observation: add the batch axis that flatten(1, 2) expects.
            x = x.unsqueeze(0)
        # flatten the observation space Box to linear tensor
        x_flat = torch.flatten(x, 1, 2)
        return self.net(x_flat)

In [None]:
#replay once hit learning size
def replay(self, batch_size):
    """Run one batched DQN update on transitions sampled from replay memory.

    This is the single PyTorch definition of ``replay``; the notebook used to
    define it twice (the later one shadowing the earlier) using the Keras
    ``predict``/``fit`` API, which ``torch.nn.Module`` does not provide.

    ``self`` is any agent-like object exposing:
        memory        sequence of ``(state, action, reward, next_state, done)``
        gamma         discount factor
        model         online Q-network (``torch.nn.Module``)
        target_model  target Q-network used for the bootstrapped value
        optimizer     ``torch.optim`` optimizer over ``model``'s parameters

    Args:
        batch_size: number of transitions to sample (without replacement).

    Returns:
        The scalar loss of this update as a Python float.

    Raises:
        ValueError: from ``random.sample`` if ``batch_size`` exceeds the number
            of stored transitions.
    """
    minibatch = random.sample(self.memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    device = next(self.model.parameters()).device
    # assumes every stored state is an equally-shaped array — TODO confirm
    states = torch.as_tensor(np.stack(states), dtype=torch.float32, device=device)
    next_states = torch.as_tensor(np.stack(next_states), dtype=torch.float32, device=device)
    # int()/float() also accept one-element tensors, so either form may be stored.
    actions = torch.as_tensor([int(a) for a in actions], dtype=torch.int64, device=device).unsqueeze(1)
    rewards = torch.as_tensor([float(r) for r in rewards], dtype=torch.float32, device=device)
    not_done = torch.as_tensor([0.0 if d else 1.0 for d in dones], dtype=torch.float32, device=device)

    # Q(s, a) for the action that was actually taken in each transition.
    q_taken = self.model(states).gather(1, actions).squeeze(1)

    # Bootstrapped target from the target network; terminal transitions keep
    # only the immediate reward. No gradient flows through the target.
    with torch.no_grad():
        next_q = self.target_model(next_states).max(1)[0]
    targets = rewards + self.gamma * next_q * not_done

    loss = F.mse_loss(q_taken, targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()


In [None]:
#epsilon for epsilon greedy strategy
def set_epsilon(self, epsilon):
    """Store a new exploration rate on ``self``, applying one decay step.

    If ``epsilon`` is above ``self.epsilon_min`` it is multiplied by
    ``self.epsilon_decay`` and clamped so the result never falls below
    ``self.epsilon_min``. A value already at or below the minimum is stored
    unchanged.

    Args:
        epsilon: the exploration rate to decay and store.
    """
    if epsilon > self.epsilon_min:
        # Without the clamp a value just above the floor (e.g. 0.1004 * 0.995)
        # would end up below the configured minimum.
        epsilon = max(self.epsilon_min, epsilon * self.epsilon_decay)
    self.epsilon = epsilon

In [None]:

def train(self, epochs=10, batch_size=128, verbose=False):
    """Fit the online Q-network on ``epochs`` sampled mini-batches (PyTorch).

    ``self`` is any agent-like object exposing ``sample(batch_size)`` (returning
    ``states, actions, rewards, dones, next_states``), ``action_space.n``,
    ``gamma``, ``lr``, ``model`` and ``target_model``.

    Compared with the earlier TensorFlow draft (TensorFlow is not imported by
    this notebook): the bootstrapped value is the per-transition maximum of the
    target network's outputs rather than one maximum over the whole batch, and
    the Adam optimizer is built once per call instead of once per epoch.

    Args:
        epochs: number of mini-batches to train on.
        batch_size: size passed to ``self.sample``.
        verbose: when true, print targets and predictions for each batch.

    Returns:
        Mean of the per-epoch losses (``np.mean``).
    """
    # Take dtype/device from the network so inputs always match its weights.
    ref = next(self.model.parameters())
    optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    losses = []
    for epoch in range(epochs):
        #reads in mini batches
        states, actions, rewards, dones, next_states = self.sample(batch_size)

        #converts to tensors
        states = torch.as_tensor(np.asarray(states), dtype=ref.dtype, device=ref.device)
        next_states = torch.as_tensor(np.asarray(next_states), dtype=ref.dtype, device=ref.device)
        rewards = torch.as_tensor(np.asarray(rewards), dtype=ref.dtype, device=ref.device).view(-1)
        # 1.0 for non-terminal transitions, 0.0 for terminal ones
        done_masks = torch.as_tensor(~np.asarray(dones).astype(bool), dtype=ref.dtype, device=ref.device).view(-1)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=ref.device).view(-1)

        #gets target values by taking action taken from output of all actions
        action_masks = F.one_hot(actions, num_classes=self.action_space.n).to(ref.dtype)
        with torch.no_grad():
            # max over the action axis only: one bootstrapped value per row
            next_q = self.target_model(next_states).max(1)[0]
        target_values = (rewards + self.gamma * next_q * done_masks).unsqueeze(1)
        target_values = target_values * action_masks

        #updates qvalues
        q_values = self.model(states) * action_masks
        if verbose:
            print(target_values, q_values)
        # Mean over every (row, action) entry; masked-out entries contribute 0.
        loss = torch.mean((target_values - q_values) ** 2)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)




In [None]:
class SchedulerEnv(gym.Env):
    """Grid-world environment for booking appointments into GP diaries.

    The diary is a ``num_slots x num_gps`` array: one row per time slot, one
    column per GP, ``1`` for a booked slot and ``0`` for a free one. The agent
    occupies one cell and moves up/down/left/right. After every move the
    environment tries to place the next appointment from ``to_book`` as a run of
    consecutive free slots going down the current column, starting at the
    agent's cell.

    Reward is ``+1`` when the appointment is placed and ``-1`` otherwise. The
    episode is done once every appointment in ``to_book`` has been placed.

    Args:
        num_gps: number of diary columns.
        num_slots: number of time-slot rows.
        num_pre_booked: slots marked as booked at the start of each episode.
        to_book: appointment lengths (in slots) to place, in order. ``None``
            selects the built-in list of 22 appointments.
        reward_decay: stored for a step-decayed reward that is currently
            disabled; not used by the reward computed here.
    """

    def __init__(self, num_gps=100, num_slots=32, num_pre_booked=750,
                 to_book=None, reward_decay=0.95):

        if to_book is None:
            to_book = [2,1,2,2,1,1,1,3,3,1,2,1,3,2,1,1,2,1,3,2,3,2]

        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = list(to_book)
        self.num_to_book = len(self.to_book)
        self.diary_slots = num_gps*num_slots
        self.agent_pos = [0, 0]
        self.reward_decay = reward_decay

        #set action space to move around the grid
        self.action_space = gym.spaces.Discrete(4) #up, down, left, right

        #set observation space
        # NOTE(review): the declared dtype (int32) differs from the float array
        # built in reset(); confirm which one is intended.
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)

    def reset(self):
        """Start a new day: fresh diary, random pre-bookings, random agent cell.

        Returns:
            A copy of the diary array, so callers can store it without it
            changing under them on later steps.
        """
        #creates zero filled array with row per time slot and column per gp
        self.state = np.zeros((self.num_slots, self.num_gps), dtype=float)

        #marks distinct random cells as pre booked; sampling without replacement
        #guarantees exactly num_pre_booked slots are taken (capped at grid size)
        n_booked = min(max(self.num_pre_booked, 0), self.diary_slots)
        taken = np.random.choice(self.diary_slots, size=n_booked, replace=False)
        self.state.flat[taken] = 1

        #randomly sets the agent start space
        self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]

        #resets parameters for new episode
        self.done = len(self.to_book) == 0   # nothing to book -> already finished
        self.reward = 0
        self.appt_idx = 0
        self.decay_steps = 1

        return self.state.copy()

    def move_agent(self, action):
        """Return the ``[row, col]`` reached by ``action``, clipped to the grid.

        Actions: 0 = up, 1 = down, 2 = left, 3 = right. Any other value leaves
        the position unchanged. ``self.agent_pos`` itself is not modified.
        """
        #set boundaries for the grid
        max_row = self.num_slots - 1
        max_col = self.num_gps - 1

        new_row, new_col = self.agent_pos

        if action == 0:
            new_row = max(new_row - 1, 0)
        elif action == 1:
            new_row = min(new_row + 1, max_row)
        elif action == 2:
            new_col = max(new_col - 1, 0)
        elif action == 3:
            new_col = min(new_col + 1, max_col)

        return [new_row, new_col]

    def check_bookable(self):
        """Return whether the agent's current cell is free."""
        return self.state[self.agent_pos[0], self.agent_pos[1]] == 0.0

    def invalid_booking(self):
        """Record a failed booking attempt (reward -1)."""
        self.decay_steps += 1
        self.reward = -1

    def valid_booking(self):
        """Record a successful booking (reward +1) and advance to the next appointment."""
        self.appt_idx += 1
        self.decay_steps = 1
        self.reward = 1

    def check_and_book(self):
        """Try to place the current appointment starting at the agent's cell.

        The appointment needs ``to_book[appt_idx]`` consecutive free slots going
        down the agent's column. On success the slots are marked booked and the
        agent moves to the last slot of the appointment; otherwise the attempt
        is recorded as invalid.

        Returns:
            The live diary array (``self.state``).

        Raises:
            ValueError: if the appointment length is smaller than 1.
            IndexError: if every appointment has already been booked.
        """
        row, col = self.agent_pos
        length = self.to_book[self.appt_idx]
        if length < 1:
            raise ValueError("appointment length must be at least 1, got %r" % (length,))

        last_row = row + length - 1
        fits_in_grid = last_row <= self.num_slots - 1

        if fits_in_grid and not self.state[row:last_row + 1, col].any():
            self.state[row:last_row + 1, col] = 1
            self.valid_booking()
            #agent ends on the last slot of the appointment it just booked
            self.agent_pos = [last_row, col]
        else:
            self.invalid_booking()

        return self.state

    def step(self, action):
        """Move the agent, attempt a booking, and report the outcome.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is a copy of the
            diary and ``info`` is an empty dict. Once the episode is done,
            further calls change nothing and return reward 0 until ``reset``.
        """
        #stepping a finished episode would index past the end of to_book
        if self.done:
            return self.state.copy(), 0, True, {}

        #get new position of agent based on action
        new_agent_pos = self.move_agent(action)

        #if the agent is stuck on an edge then move to a new position
        if new_agent_pos == self.agent_pos:
            self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]
        else:
            self.agent_pos = new_agent_pos

        #check if it's possible to book then book
        if self.check_bookable():
            self.check_and_book()
        else:
            self.invalid_booking()

        #work out if episode complete
        if self.appt_idx == len(self.to_book):
            self.done = True

        info = {}
        #hand back a copy: the diary is mutated in place on every booking, and
        #a shared reference would make stored transitions alias each other
        return self.state.copy(), self.reward, self.done, info

In [None]:
# --- Setup ---------------------------------------------------------------
env = SchedulerEnv()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_actions = env.action_space.n

#create the current network and target network (target starts as an exact copy)
policy_net = Model(env.observation_space.shape, num_actions).to(device)
target_net = Model(env.observation_space.shape, num_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# The helper functions in this notebook take an agent-like ``self``; bundle the
# pieces they read into one object. Exploration settings and learning rate use
# the values listed with the hyperparameters (1.0 / 0.1 / 0.995 and 0.001).
agent = SimpleNamespace(
    model=policy_net,
    target_model=target_net,
    optimizer=optim.Adam(policy_net.parameters(), lr=0.001),
    memory=replay_buffer,       # one buffer shared by all episodes
    gamma=gamma,
    epsilon=1.0,
    epsilon_min=0.1,
    epsilon_decay=0.995,
)

num_episodes = 50
learning_starts = batch_size    # NOTE(review): warm-up length was never defined; confirm
num_steps = 0                   # environment steps across all episodes
num_param_updates = 0           # gradient updates performed so far
episode_rewards = []

# --- Training ------------------------------------------------------------
for i_episode in range(num_episodes):
    total_reward = 0

    # np.array(...) copies, so a stored state cannot change if the environment
    # later mutates its diary in place.
    state = np.array(env.reset(), dtype=np.float32)

    for t in count():
        num_steps += 1

        # Epsilon-greedy: purely random during warm-up, afterwards greedy with
        # probability 1 - epsilon.
        if num_steps > learning_starts and np.random.rand() >= agent.epsilon:
            with torch.no_grad():
                obs = torch.as_tensor(state, device=device).unsqueeze(0)
                action = int(policy_net(obs).max(1)[1].item())
        else:
            action = random.randrange(num_actions)

        #take exactly one step per iteration and save the info to the replay buffer
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state, dtype=np.float32)
        agent.memory.append((state, action, reward, next_state, done))

        # Move to the next state
        state = next_state

        # NOTE(review): as before, the extra -1 per step only affects the
        # logged return, not the stored transition; move it above the append
        # if it is meant to shape learning.
        total_reward += reward - 1

        if done:
            episode_rewards.append(total_reward)
            break

        #once we're ready to learn then start learning with mini batches
        if len(agent.memory) > batch_size:
            replay(agent, batch_size)
            num_param_updates += 1

            # Periodically copy the online weights into the target network
            if num_param_updates % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())

    # NOTE(review): decaying once per episode is an assumption; the decay was
    # never called before, which kept epsilon at 1.0 (always random).
    set_epsilon(agent, agent.epsilon)
    

