In [16]:
import gym
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tensorboardX import SummaryWriter

In [17]:
class SchedulerEnv(gym.Env):

    def __init__(self):
        
        #starting parameters
        num_gps = 10
        num_slots = 15
        num_pre_booked = 75
        to_book = [2,3,1]
        num_to_book = len(to_book)
        agent_pos = [0,0]
        reward_decay = 0.95
        
        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = to_book
        self.num_to_book = num_to_book
        self.diary_slots = num_gps*num_slots
        self.agent_pos = agent_pos
        self.reward_decay = reward_decay

        #set action space to move around the grid
        self.action_space = gym.spaces.Discrete(4) #up, down, left, right
        
        #set observation space 
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
   
    #creates daily diary for each gp, randomly populates prebooked appointments and resets parameters
    def reset(self):

        #creates zero filled dataframe with row per time slot and column per gp
        self.state = np.zeros((self.num_slots, self.num_gps),dtype=float)

        #randomly enters a 1 for each pre booked appointments
        pre_booked = self.num_pre_booked
        while pre_booked>0:
            pre_booked -= 1
            self.state[np.random.randint(self.num_slots), np.random.randint(self.num_gps)] = 1
            
        #randomly sets the agent start space
        self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.appt_idx = 0
        self.decay_steps = 1
        
        #print('starting state', self.state.sum(), self.state)

        return self.state
    
    #calculates new position of the agent based on the action
    def move_agent(self, action):

        #set boundaries for the grid
        max_row = env.num_slots - 1
        max_col = env.num_gps - 1

        #set new co-ordinates for the agent
        new_row = self.agent_pos[0]
        new_col = self.agent_pos[1]

        #calculate what the new position may be based on the action without going out the griid
        if action == 0:
            #print('up')
            new_row = max(self.agent_pos[0] - 1, 0)
        if action == 1:
            #print('down')
            new_row = min(self.agent_pos[0] + 1, max_row)
        if action == 2:
            #print('left')
            new_col = max(self.agent_pos[1] - 1, 0)
        if action == 3:
            #print('right')
            new_col = min(self.agent_pos[1] + 1, max_col)

        new_pos = [new_row, new_col]
        #print('new pos', new_pos)

        return new_pos

    #checks if we can look to book appointment starting here
    def check_bookable(self):
        return self.state[self.agent_pos[0], self.agent_pos[1]] == 0.0
    
    #checks if the appointment fits
    def check_and_book(self):
        max_row = env.num_slots - 1

        #checks if the appointment fits
        cells_to_check = self.to_book[self.appt_idx]
        if cells_to_check==1:
            #print('good to check for single')
            if self.state[self.agent_pos[0], self.agent_pos[1]] == 0:
                self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                self.appt_idx += 1
                #print('go ahead and book')
                self.decay_steps = 1
                self.reward = 1
            else:
                #print('already taken')
                self.decay_steps += 1
                self.reward = -1
        if cells_to_check==2:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]<max_row:
                #check the next cells is also 0.0
                #print('good to check for double')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.appt_idx += 1
                    #print('go ahead and book')
                    self.decay_steps = 1
                    self.reward = 1
                else:
                    #print('already taken')
                    self.decay_steps += 1
                    self.reward = -1
            else:
                #print('not for double')
                self.decay_steps += 1
                self.reward = -1
        if cells_to_check==3:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]+1<max_row:
                #print('good to check for treble')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0 \
                 and self.state[(self.agent_pos[0]+2), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+2), self.agent_pos[1]] = 1
                    self.appt_idx += 1
                    #print('go ahead and book')
                    self.decay_steps = 1
                    self.reward = 1
                else:
                    #print('already taken')
                    self.decay_steps += 1
                    self.reward = -1
            else:
                #print('not for treble')
                self.decay_steps += 1
                self.reward = -1
        if cells_to_check==4:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]+2<max_row:
                #check the next cells is also 0.0
                #print('good for quad')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0 \
                 and self.state[(self.agent_pos[0]+2), self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+3), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+2), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+3), self.agent_pos[1]] = 1
                    self.appt_idx += 1
                    #print('go ahead and book')
                    self.decay_steps = 1
                    self.reward = 1
                else:
                    #print('already taken')
                    self.decay_steps += 1
                    self.reward = -1
            else:
                #print('not for quad')
                self.decay_steps += 1
                self.reward = -1

        next_state = self.state

        return next_state

    def step(self, action):

        #print('start step' , self.decay_steps)
        #get new positioin of agent based on action
        self.agent_pos = self.move_agent(action)
        #print('trying to book', self.to_book, self.appt_idx)
        
        #check if it's possible to book then book
        if self.check_bookable():
            self.state = self.check_and_book()
            #print('checked here')
        else:
            #print('not bookable')
            self.decay_steps += 1
            self.reward = -1
            if self.decay_steps > 100:
                self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]
                self.reward -= 10
                #print('break here')
        
        #work out if episode complete
        if self.appt_idx == len(self.to_book):
            #print('all booked')
            self.done = True
  
            
        #work out rewards
        #self.reward = (1 - (self.reward_decay**self.decay_steps))
        
        #print('step', self.decay_steps, self.reward)
        #print('end step')

        info = {}

        return self.state, self.reward, self.done, info

In [18]:
class Model(nn.Module):
    def __init__(self, input_size, action_size):
        super(Model, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128) 
        )

        self.actor = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, action_size)
        )

        self.critic = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        init_out = self.net(x)
        return self.actor(init_out), self.critic(init_out)

In [19]:
#convert numpy array to tensor for input
def tensor_convert(x):
    return torch.from_numpy(x).float()

def train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer):
    
    optimizer.zero_grad()
    #logits_v = b_actor
    #value_v = b_critic

    b_rewards = torch.Tensor(b_rewards)
    b_critic = torch.FloatTensor(b_critic)
    
    loss_value_v = F.mse_loss(b_critic, b_rewards)

    log_prob_v = F.log_softmax(b_actor, dim=1)
    adv_v = b_rewards - b_critic.detach()
    log_prob_actions_v = adv_v * log_prob_v[eps_steps, b_actions]
    loss_policy_v = -log_prob_actions_v.mean()

    prob_v = F.softmax(b_actor, dim=1)
    entropy_loss_v = 0.01 * (prob_v * log_prob_v).sum(dim=1).mean()

    # calculate policy gradients only
    loss_policy_v.backward(retain_graph=True)
    grads = np.concatenate([p.grad.data.cpu().numpy().flatten()
                            for p in net.parameters()
                            if p.grad is not None])

    # apply entropy and value gradients
    loss_v = entropy_loss_v + loss_value_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()
    # get full loss
    loss_v += loss_policy_v

In [20]:
def train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer):
    
    #unpack batches for training
    for i in range(eps_steps):
        state = b_states[i]
        action = b_actions[i]
        #print('actions', action)
        reward = b_rewards[i]
        actor = b_actor[i]
        critic = b_critic[i]

        obs_v = torch.FloatTensor(state)
        rewards_v = torch.tensor(reward)
        critic = torch.FloatTensor(critic)
        actions_t = torch.tensor(action)

        loss_value_v = F.mse_loss(critic, rewards_v)

        log_prob_v = F.log_softmax(actor)
        adv_v = rewards_v - critic
        log_prob_actions_v = adv_v * log_prob_v[action]
        loss_policy_v = -log_prob_actions_v.mean()

        prob_v = F.softmax(actor)
        #entropy_loss_v = (prob_v * log_prob_v).sum().mean()
        #loss_v = (entropy_beta * entropy_loss_v + loss_value_v + loss_policy_v)
        loss_v = (loss_value_v + loss_policy_v)
        loss_v = torch.tensor(loss_v, requires_grad = True)
        #print('loss_v', loss_v)

        loss_v.backward()

        optimizer.step()

    return loss_v

In [None]:
#initialise environment, model and optimiser
env = SchedulerEnv()
model = Model((env.observation_space.shape[0]*env.observation_space.shape[1]), env.action_space.n)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, 
                             amsgrad=False)
optimizer.zero_grad()

writer = SummaryWriter()

for i in range(300000):

    state = env.reset()
    #print('start pos', env.agent_pos)
    done = False
    b_states = []
    b_actions = []
    b_rewards = []
    b_new_state = []
    b_actor = []
    b_critic = []
    eps_reward = 0
    eps_steps = 0

    while not done:
        
        #create model input from flattened grid
        nn_input = torch.flatten(tensor_convert(state))
        actor, critic = model(nn_input)
        b_actor.append(actor)
        b_critic.append(critic)

        #print('get new action')
        if random.random() < 0.05:
            action = env.action_space.sample()
        else:
            action = torch.argmax(actor)
        

        #print(F.softmax(actor), action)

        #run through step to book appointment
        new_state, reward, done, info = env.step(action)
        b_states.append(nn_input)
        b_actions.append(action)
        b_rewards.append(reward)
        b_new_state.append(new_state)
        eps_reward += reward
        eps_steps += 1

        #print('done', done)
        state = new_state

    #print("train with this", b_rewards, b_actions)
    #print('end of episode', eps_reward, eps_steps)
    #print(rewards_np[0], b_rewards[0], b_critic[0], tensor_rewards[0])
    loss = train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer)
    writer.add_scalar("eps_reward", eps_reward, i )
    writer.add_scalar("loss", loss, i )
    if i%100 == 0:
        print('rewards for episode', i, eps_reward)

writer.close()

  from ipykernel import kernelapp as app


rewards for episode 0 -1
rewards for episode 100 -2
rewards for episode 200 -201
rewards for episode 300 0
rewards for episode 400 -42
rewards for episode 500 -144
rewards for episode 600 -5
rewards for episode 700 -122
rewards for episode 800 -193
rewards for episode 900 -30
rewards for episode 1000 -189
rewards for episode 1100 -116
rewards for episode 1200 -135
rewards for episode 1300 -2
rewards for episode 1400 -242
rewards for episode 1500 -147
rewards for episode 1600 -153
rewards for episode 1700 -353
rewards for episode 1800 -418
rewards for episode 1900 1
rewards for episode 2000 -101
rewards for episode 2100 -247
rewards for episode 2200 -112
rewards for episode 2300 0
rewards for episode 2400 -131
rewards for episode 2500 -153
rewards for episode 2600 -8
rewards for episode 2700 -112
rewards for episode 2800 -306
rewards for episode 2900 -189
rewards for episode 3000 -30
rewards for episode 3100 -293
rewards for episode 3200 -251
rewards for episode 3300 -166
rewards for ep

rewards for episode 27500 -113
rewards for episode 27600 -282
rewards for episode 27700 -4
rewards for episode 27800 -131
rewards for episode 27900 -234
rewards for episode 28000 -122
rewards for episode 28100 -15
rewards for episode 28200 -150
rewards for episode 28300 -378
rewards for episode 28400 -3
rewards for episode 28500 -149
rewards for episode 28600 -104
rewards for episode 28700 1
rewards for episode 28800 -3
rewards for episode 28900 -2
rewards for episode 29000 -146
rewards for episode 29100 -355
rewards for episode 29200 -3
rewards for episode 29300 -227
rewards for episode 29400 -138
rewards for episode 29500 -3
rewards for episode 29600 -2
rewards for episode 29700 -121
rewards for episode 29800 -2
rewards for episode 29900 -127
rewards for episode 30000 -65
rewards for episode 30100 -156
rewards for episode 30200 0
rewards for episode 30300 -122
rewards for episode 30400 -212
rewards for episode 30500 -138
rewards for episode 30600 -206
rewards for episode 30700 -116
r

rewards for episode 54500 -179
rewards for episode 54600 -274
rewards for episode 54700 1
rewards for episode 54800 -114
rewards for episode 54900 -3
rewards for episode 55000 -367
rewards for episode 55100 -121
rewards for episode 55200 -110
rewards for episode 55300 -2
rewards for episode 55400 -201
rewards for episode 55500 0
rewards for episode 55600 -267
rewards for episode 55700 -185
rewards for episode 55800 -208
rewards for episode 55900 -139
rewards for episode 56000 -1325
rewards for episode 56100 -408
rewards for episode 56200 -218
rewards for episode 56300 -112
rewards for episode 56400 -9
rewards for episode 56500 -186
rewards for episode 56600 -113
rewards for episode 56700 -80
rewards for episode 56800 -110
rewards for episode 56900 -146
rewards for episode 57000 -1
rewards for episode 57100 -153
rewards for episode 57200 -112
rewards for episode 57300 -245
rewards for episode 57400 -4
rewards for episode 57500 -121
rewards for episode 57600 -127
rewards for episode 5770

rewards for episode 81600 -161
rewards for episode 81700 -163
rewards for episode 81800 -2
rewards for episode 81900 -361
rewards for episode 82000 -185
rewards for episode 82100 -153
rewards for episode 82200 -115
rewards for episode 82300 -22
rewards for episode 82400 -111
rewards for episode 82500 -259
rewards for episode 82600 -192
rewards for episode 82700 -37
rewards for episode 82800 -178
rewards for episode 82900 -91
rewards for episode 83000 -135
rewards for episode 83100 -285
rewards for episode 83200 -1
rewards for episode 83300 -176
rewards for episode 83400 -151
rewards for episode 83500 -124
rewards for episode 83600 -113
rewards for episode 83700 -119
rewards for episode 83800 -115
rewards for episode 83900 -131
rewards for episode 84000 -150
rewards for episode 84100 -9
rewards for episode 84200 -88
rewards for episode 84300 -168
rewards for episode 84400 -435
rewards for episode 84500 -135
rewards for episode 84600 -179
rewards for episode 84700 -149
rewards for episod

rewards for episode 108400 -166
rewards for episode 108500 -110
rewards for episode 108600 -111
rewards for episode 108700 -130
rewards for episode 108800 -162
rewards for episode 108900 -277
rewards for episode 109000 -116
rewards for episode 109100 -319
rewards for episode 109200 -111
rewards for episode 109300 -109
rewards for episode 109400 -45
rewards for episode 109500 -147
rewards for episode 109600 0
rewards for episode 109700 -130
rewards for episode 109800 -6
rewards for episode 109900 1
rewards for episode 110000 -232
rewards for episode 110100 -249
rewards for episode 110200 -6
rewards for episode 110300 -145
rewards for episode 110400 -297
rewards for episode 110500 -182
rewards for episode 110600 -9
rewards for episode 110700 -120
rewards for episode 110800 -69
rewards for episode 110900 -131
rewards for episode 111000 -2
rewards for episode 111100 -129
rewards for episode 111200 -171
rewards for episode 111300 -200
rewards for episode 111400 -135
rewards for episode 1115

rewards for episode 134600 -2
rewards for episode 134700 1
rewards for episode 134800 -145
rewards for episode 134900 -119
rewards for episode 135000 -254
rewards for episode 135100 -134
rewards for episode 135200 -332
rewards for episode 135300 -115
rewards for episode 135400 -153
rewards for episode 135500 -56
rewards for episode 135600 -1
rewards for episode 135700 -1
rewards for episode 135800 -143
rewards for episode 135900 -118
rewards for episode 136000 -228
rewards for episode 136100 -135
rewards for episode 136200 -88
rewards for episode 136300 -25
rewards for episode 136400 -62
rewards for episode 136500 -119
rewards for episode 136600 -130
rewards for episode 136700 -344
rewards for episode 136800 -305
rewards for episode 136900 -124
rewards for episode 137000 -137
rewards for episode 137100 -172
rewards for episode 137200 -6
rewards for episode 137300 -11
rewards for episode 137400 -2
rewards for episode 137500 -1
rewards for episode 137600 -5
rewards for episode 137700 -26

In [None]:
actor

In [None]:
env.action_space.sample()

In [None]:
random.random()