In [None]:
import gym
import ptan
import numpy as np
import argparse
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim

# discount factor: handed to the experience source and used (as gamma ** reward_steps)
# to discount the bootstrapped critic value in unpack_batch
gamma = 0.99
# number of experience items accumulated before each optimiser update
batch_size = 11
# number of SchedulerEnv instances created to feed the experience source
num_envs = 6
# steps_count given to the experience source, i.e. how many steps each
# experience item spans before the critic value is bootstrapped
reward_steps = 4

In [None]:
class Model(nn.Module):
    """Actor-critic network with a shared trunk and two separate heads.

    Args:
        input_shape: 2-element shape of one observation; the two dimensions
            are flattened into a single feature vector.
        n_actions: number of discrete actions (width of the actor output).
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()

        n_inputs = input_shape[0] * input_shape[1]
        trunk_width, head_width = 128, 512

        def build_head(n_outputs):
            # both heads share the same two-layer layout, only the output width differs
            return nn.Sequential(
                nn.Linear(trunk_width, head_width),
                nn.ReLU(),
                nn.Linear(head_width, n_outputs),
            )

        # shared feature extractor
        self.net = nn.Sequential(
            nn.Linear(n_inputs, 64),
            nn.ReLU(),
            nn.Linear(64, trunk_width),
        )
        # policy head: one raw score (logit) per action
        self.actor = build_head(n_actions)
        # value head: a single state-value estimate
        self.critic = build_head(1)

    def forward(self, x):
        """Return ``(actor_output, critic_output)`` for a batch of observations.

        ``x`` is flattened over dimensions 1 and 2 and cast to float32 before
        being fed through the shared trunk.
        """
        features = self.net(x.flatten(start_dim=1, end_dim=2).float())
        return self.actor(features), self.critic(features)

In [None]:
def unpack_batch(batch, model, device='cpu', discount=None, n_steps=None):
    """Turn a list of experience items into training tensors.

    Each item must expose ``state``, ``action``, ``reward`` and ``last_state``
    (``last_state`` is ``None`` when the episode ended inside the item).

    For every item whose ``last_state`` is not ``None`` the critic output of
    ``model`` for that state, scaled by ``discount ** n_steps``, is added to the
    item's reward, giving the value target used by the caller.

    Args:
        batch: sequence of experience items.
        model: callable returning ``(actor_output, critic_output)``; only the
            critic output (index 1, one column per row) is used here.
        device: torch device the returned tensors are moved to.
        discount: discount factor; defaults to the module-level ``gamma``.
        n_steps: exponent applied to the discount; defaults to the
            module-level ``reward_steps``.

    Returns:
        ``(states, actions, rewards)`` as float32, int64 and float32 tensors.
    """
    # fall back to the notebook-level hyperparameters when not overridden
    discount = gamma if discount is None else discount
    n_steps = reward_steps if n_steps is None else n_steps

    states = []
    actions = []
    rewards = []
    not_done_idx = []
    last_states = []
    # create lists of the states, actions and rewards
    for idx, exp in enumerate(batch):
        # np.asarray instead of np.array(..., copy=False): the latter raises on
        # NumPy >= 2.0 whenever a copy cannot be avoided
        states.append(np.asarray(exp.state))
        actions.append(int(exp.action))
        rewards.append(exp.reward)
        # separate out the last states to be able to bootstrap their value
        if exp.last_state is not None:
            not_done_idx.append(idx)
            last_states.append(np.asarray(exp.last_state))

    # convert to tensors for calculations
    states = torch.from_numpy(np.asarray(states, dtype=np.float32)).to(device)
    actions = torch.as_tensor(actions, dtype=torch.long).to(device)

    # handle rewards
    rewards_np = np.array(rewards, dtype=np.float32)
    if not_done_idx:
        last_states = torch.from_numpy(
            np.asarray(last_states, dtype=np.float32)).to(device)
        # the bootstrap value is a constant target: no autograd graph needed
        with torch.no_grad():
            last_vals = model(last_states)[1]
        last_vals_np = last_vals.detach().cpu().numpy()[:, 0]
        rewards_np[not_done_idx] += last_vals_np * (discount ** n_steps)

    rewards = torch.from_numpy(rewards_np).to(device)

    return states, actions, rewards

In [None]:
class SchedulerEnv(gym.Env):
    """Grid environment in which an agent books appointments into a GP diary.

    The diary is a ``(num_slots, num_gps)`` array: one row per time slot, one
    column per GP, 0 for a free slot and 1 for a booked one. On every step the
    agent moves one cell (up/down/left/right) and immediately tries to book the
    current appointment from ``to_book``, starting at its new cell and running
    down the column for as many slots as the appointment needs. A successful
    booking gives reward 1, a failed attempt reward -1. The episode ends once
    every appointment in ``to_book`` has been placed.
    """

    def __init__(self):

        #starting parameters
        num_gps = 100
        num_slots = 32
        num_pre_booked = 15
        to_book = [2,1,2,2,1,1,1]
        num_to_book = len(to_book)
        agent_pos = [0,0]
        reward_decay = 0.95

        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = to_book
        self.num_to_book = num_to_book
        self.diary_slots = num_gps*num_slots
        self.agent_pos = agent_pos
        self.reward_decay = reward_decay

        #set action space to move around the grid
        self.action_space = gym.spaces.Discrete(4) #up, down, left, right

        #set observation space; dtype matches the float diary built in reset()
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.float32)

    def reset(self):
        """Start a new day: fresh diary, random pre-bookings, random agent cell.

        Returns:
            A copy of the diary array (callers may keep it; later steps will
            not modify it).
        """
        #zero filled diary with a row per time slot and a column per gp
        self.state = np.zeros((self.num_slots, self.num_gps), dtype=np.float32)

        #mark exactly num_pre_booked distinct cells as taken; sampling without
        #replacement avoids two pre-bookings landing on the same cell
        n_booked = min(self.num_pre_booked, self.diary_slots)
        if n_booked > 0:
            flat_idx = np.random.choice(self.diary_slots, size=n_booked, replace=False)
            rows, cols = np.unravel_index(flat_idx, self.state.shape)
            self.state[rows, cols] = 1

        #randomly sets the agent start space
        self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.appt_idx = 0
        self.decay_steps = 1

        return self.state.copy()

    def move_agent(self, action):
        """Return the ``[row, col]`` the agent would occupy after ``action``.

        Actions: 0 = up, 1 = down, 2 = left, 3 = right. Moves that would leave
        the grid are clamped, so the result equals the current position when
        the agent pushes against an edge. ``self.agent_pos`` is not modified.
        """
        #set boundaries for the grid
        max_row = self.num_slots - 1
        max_col = self.num_gps - 1

        new_row, new_col = self.agent_pos[0], self.agent_pos[1]

        if action == 0:
            new_row = max(new_row - 1, 0)
        elif action == 1:
            new_row = min(new_row + 1, max_row)
        elif action == 2:
            new_col = max(new_col - 1, 0)
        elif action == 3:
            new_col = min(new_col + 1, max_col)

        return [new_row, new_col]

    def check_bookable(self):
        """Return True when the cell under the agent is free."""
        return self.state[self.agent_pos[0], self.agent_pos[1]] == 0.0

    def invalid_booking(self):
        """Record a failed booking attempt (reward -1)."""
        self.decay_steps += 1
        self.reward = -1

    def valid_booking(self):
        """Record a successful booking (reward 1) and advance to the next appointment."""
        self.appt_idx += 1
        self.decay_steps = 1
        self.reward = 1

    def check_and_book(self):
        """Try to book the current appointment starting at the agent's cell.

        The appointment occupies ``to_book[appt_idx]`` consecutive slots going
        down the agent's column. It is booked only if all of those slots exist
        inside the grid and are free; the agent then ends up on the last slot
        of the booking. Otherwise the attempt counts as invalid.

        Returns:
            The (possibly updated) internal diary array.
        """
        length = self.to_book[self.appt_idx]
        row, col = self.agent_pos[0], self.agent_pos[1]
        last_row = row + length - 1

        fits_in_grid = length >= 1 and last_row <= self.num_slots - 1
        if fits_in_grid and not self.state[row:last_row + 1, col].any():
            self.state[row:last_row + 1, col] = 1
            self.valid_booking()
            self.agent_pos = [last_row, col]
        else:
            self.invalid_booking()

        return self.state

    def step(self, action):
        """Move the agent, attempt the current booking and report the outcome.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is a
            copy of the diary, ``reward`` is 1 or -1, ``done`` is True once all
            appointments are booked and ``info`` is an empty dict.
        """
        #get new position of agent based on action
        new_agent_pos = self.move_agent(action)

        #if the agent is stuck on an edge then move to a new random position
        if new_agent_pos == self.agent_pos:
            self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]
        else:
            self.agent_pos = new_agent_pos

        #check if it's possible to book then book
        if self.check_bookable():
            self.state = self.check_and_book()
        else:
            self.invalid_booking()

        #work out if episode complete
        if self.appt_idx == len(self.to_book):
            self.done = True

        #note: reward_decay / decay_steps are tracked but not currently used
        #to shape the reward

        info = {}
        #hand out a copy: the diary is mutated in place on later steps, and
        #callers may hold on to earlier observations
        return self.state.copy(), self.reward, self.done, info

In [None]:
#device = "cuda"
device = "cpu"

#create several independent environments to feed the experience source
def make_env():
    """Build a fresh scheduler environment."""
    return SchedulerEnv()

envs = [make_env() for _ in range(num_envs)]

#start writing to tensorboard
writer = SummaryWriter(comment="Scheduler")

#initialise model, agent and run through episodes to get experience
model = Model(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
agent = ptan.agent.PolicyAgent(lambda x: model(x)[0], apply_softmax=True, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=gamma, steps_count=reward_steps)

optimizer = optim.Adam(model.parameters(), lr=0.001, eps=1e-3)

#weight of the entropy term in the loss
entropy_beta = 0.01
#report averaged metrics once per this many optimiser updates
log_every_batches = 10
#stop once this many experience items have been consumed
max_steps = 100000

#create list to capture batches
batch = []

#create lists to be used to record values for tracking averages
reward_stack = []
loss_stack = []

#number of optimiser updates performed so far
batch_idx = 0

try:
    #work through each experience item to capture state, actions etc
    for step_idx, exp in enumerate(exp_source):
        batch.append(exp)

        if len(batch) < batch_size:
            continue

        states, actions, rewards = unpack_batch(batch, model, device=device)
        batch.clear()

        optimizer.zero_grad()

        # using the network to give action scores and state values
        actor_val, critic_val = model(states)
        # critic output is [batch, 1]; drop the trailing dim so it lines up
        # with the [batch] reward targets everywhere below
        state_val = critic_val.squeeze(-1)

        # [CRITIC] loss between the predicted state value and the batch target
        critic_loss = F.mse_loss(state_val, rewards)

        # log-probabilities of every action under the current policy
        log_prob = F.log_softmax(actor_val, dim=1)
        # advantage = target minus predicted value, both [batch]; using the
        # un-squeezed [batch, 1] value here would broadcast to [batch, batch]
        advantage = rewards - state_val.detach()

        # weight the log-probability of each chosen action by its advantage
        log_prob_actions = advantage * log_prob[range(len(actions)), actions]
        # negated so that minimising the loss improves the policy
        actor_loss = -log_prob_actions.mean()

        # sum(p * log p) is the negative entropy; minimising it keeps the
        # policy from collapsing too early
        prob_val = F.softmax(actor_val, dim=1)
        entropy_loss = entropy_beta * (prob_val * log_prob).sum(dim=1).mean()

        # single backward pass over the combined objective
        loss = actor_loss + entropy_loss + critic_loss
        loss.backward()

        optimizer.step()
        batch_idx += 1

        #always record this batch, then report the running averages
        reward_stack.append(torch.mean(rewards).item())
        loss_stack.append(critic_loss.item())

        #send average loss and rewards to tensorboard
        if batch_idx % log_every_batches == 0:
            avg_rewards = np.mean(reward_stack)
            avg_loss = np.mean(loss_stack)
            writer.add_scalar('ave_batch_reward', avg_rewards, step_idx)
            writer.add_scalar('ave_batch_loss', avg_loss, step_idx)
            print('ave_batch_reward', avg_rewards, 'step', step_idx)
            print('ave_batch_loss', avg_loss, 'step', step_idx)
            reward_stack.clear()
            loss_stack.clear()

        if step_idx > max_steps:
            break
finally:
    #flush and close the tensorboard log even if training is interrupted
    writer.close()