In [None]:
import gym
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tensorboardX import SummaryWriter

In [None]:
class SchedulerEnv(gym.Env):
    """Grid-world environment for booking appointments into GP diaries.

    The diary is a ``num_slots`` x ``num_gps`` grid (rows are time slots,
    columns are GPs) holding 0 for a free cell and 1 for a booked cell.
    An agent walks the grid one cell per step; after every move the
    environment tries to book the current appointment from ``to_book``
    starting at the agent's cell and running down the same column.

    Rewards are +1 for a successful booking and -1 otherwise. The episode
    is done once every appointment in ``to_book`` has been placed.
    """

    #row/column offsets for each discrete action: up, down, left, right
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self):
        #parameters for the day
        self.num_gps = 10
        self.num_slots = 15
        self.num_pre_booked = 75
        self.to_book = [2, 3, 1]  #length, in consecutive slots, of each appointment to place
        self.num_to_book = len(self.to_book)
        self.diary_slots = self.num_gps * self.num_slots
        self.agent_pos = [0, 0]
        self.reward_decay = 0.95

        #set action space to move around the grid
        self.action_space = gym.spaces.Discrete(4)  #up, down, left, right

        #set observation space; dtype matches the float grid built in reset()
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.float64
        )

    def reset(self):
        """Build a fresh diary, pre-book random cells and reset episode state.

        Returns:
            The ``(num_slots, num_gps)`` float grid. The same array object is
            mutated in place by later bookings.
        """
        #zero filled grid with a row per time slot and a column per gp
        self.state = np.zeros((self.num_slots, self.num_gps), dtype=float)

        #mark exactly num_pre_booked distinct cells as taken; sampling without
        #replacement stops two draws landing on the same cell
        n_booked = min(self.num_pre_booked, self.diary_slots)
        taken = np.random.choice(self.diary_slots, size=n_booked, replace=False)
        self.state.flat[taken] = 1

        #randomly sets the agent start space
        self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.appt_idx = 0
        self.decay_steps = 1

        return self.state

    def move_agent(self, action):
        """Return the agent position after ``action``, clamped to the grid.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right. Any other value
                leaves the position unchanged.

        Returns:
            ``[row, col]`` of the new position; ``self.agent_pos`` itself is
            not modified.
        """
        d_row, d_col = self._MOVES.get(int(action), (0, 0))

        #bounds come from this instance, not from a notebook-level variable
        new_row = min(max(self.agent_pos[0] + d_row, 0), self.num_slots - 1)
        new_col = min(max(self.agent_pos[1] + d_col, 0), self.num_gps - 1)

        return [new_row, new_col]

    def check_bookable(self):
        """Return True when the cell under the agent is free."""
        return self.state[self.agent_pos[0], self.agent_pos[1]] == 0.0

    def check_and_book(self):
        """Try to book the current appointment starting at the agent's cell.

        The appointment occupies ``to_book[appt_idx]`` consecutive rows in the
        agent's column. On success the cells are set to 1, ``appt_idx``
        advances, ``decay_steps`` resets to 1 and ``reward`` becomes 1. If the
        appointment would run off the bottom of the grid or overlaps a booked
        cell, ``decay_steps`` is incremented and ``reward`` becomes -1.

        Returns:
            The (possibly updated) state grid.
        """
        row, col = self.agent_pos
        length = self.to_book[self.appt_idx]
        end_row = row + length  #exclusive end of the span to book

        fits_in_grid = length >= 1 and end_row <= self.num_slots
        if fits_in_grid and np.all(self.state[row:end_row, col] == 0):
            self.state[row:end_row, col] = 1
            self.appt_idx += 1
            self.decay_steps = 1
            self.reward = 1
        else:
            self.decay_steps += 1
            self.reward = -1

        return self.state

    def step(self, action):
        """Move the agent, attempt a booking and report the outcome.

        Args:
            action: discrete action understood by ``move_agent``.

        Returns:
            ``(state, reward, done, info)`` where ``info`` is an empty dict.
        """
        #nothing left to book: do not index past the end of to_book
        if self.done:
            return self.state, 0, self.done, {}

        #get new position of agent based on action
        self.agent_pos = self.move_agent(action)

        #check if it's possible to book then book
        if self.check_bookable():
            self.state = self.check_and_book()
        else:
            self.decay_steps += 1
            self.reward = -1

        #work out if episode complete
        if self.appt_idx == len(self.to_book):
            self.done = True

        #decay_steps is tracked for a decayed reward, but the reward is
        #currently the flat +1 / -1 set above

        info = {}

        return self.state, self.reward, self.done, info

In [None]:
class Model(nn.Module):
    """Actor-critic network: a shared trunk feeding a policy head and a value head."""

    def __init__(self, input_size, action_size):
        super().__init__()
        trunk_out = 128

        #shared feature extractor
        self.net = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, trunk_out),
        )

        #policy head: one logit per action
        self.actor = self._head(trunk_out, action_size)

        #value head: single scalar estimate
        self.critic = self._head(trunk_out, 1)

    @staticmethod
    def _head(in_features, out_features):
        """Two-layer head with a 512-unit hidden layer."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Linear(512, out_features),
        )

    def forward(self, x):
        """Return ``(actor_logits, critic_value)`` for input ``x``."""
        features = self.net(x)
        logits = self.actor(features)
        value = self.critic(features)
        return logits, value

In [None]:
def tensor_convert(x):
    """Turn a NumPy array into a float32 torch tensor for use as model input."""
    as_tensor = torch.from_numpy(x)
    return as_tensor.to(torch.float32)

def train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer,
          entropy_beta=0.01, clip_grad=0.1):
    """Run one batched actor-critic update over a collected episode.

    Args:
        b_states: model inputs for each step (not used by the update; kept so
            the call signature matches the rollout buffers).
        b_actions: action index taken at each step.
        b_rewards: value target for each step; used as given, with no
            discounting applied here.
        b_actor: per-step actor logits as produced by the model. They must
            still carry their autograd history.
        b_critic: per-step critic outputs, also with autograd history.
        eps_steps: number of leading steps to train on.
        optimizer: optimizer holding the model parameters. The parameters to
            clip are read from ``optimizer.param_groups``, so no global model
            reference is needed.
        entropy_beta: weight of the entropy term.
        clip_grad: maximum gradient norm applied before the optimizer step.

    Returns:
        The total loss (policy + entropy + value) as a float, or ``None`` when
        ``eps_steps`` is not positive.
    """
    n_steps = int(eps_steps)
    if n_steps <= 0:
        return None

    optimizer.zero_grad()

    rewards_v = torch.as_tensor(b_rewards[:n_steps], dtype=torch.float32)
    actions_t = torch.as_tensor(b_actions[:n_steps], dtype=torch.long)

    #stack the per-step outputs instead of re-wrapping them in new tensors,
    #which would drop the graph back to the model parameters
    logits_v = torch.stack([a.reshape(-1) for a in b_actor[:n_steps]])   #(steps, actions)
    value_v = torch.cat([c.reshape(1) for c in b_critic[:n_steps]])      #(steps,)

    loss_value_v = F.mse_loss(value_v, rewards_v)

    log_prob_v = F.log_softmax(logits_v, dim=1)
    adv_v = rewards_v - value_v.detach()
    #pick the log-probability of the action actually taken on every row
    chosen_log_prob_v = log_prob_v[torch.arange(n_steps), actions_t]
    loss_policy_v = -(adv_v * chosen_log_prob_v).mean()

    #sum(p * log p) is the negative entropy, so minimising it favours exploration
    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = entropy_beta * (prob_v * log_prob_v).sum(dim=1).mean()

    loss_v = loss_policy_v + entropy_loss_v + loss_value_v
    loss_v.backward()

    params = [p for group in optimizer.param_groups for p in group["params"]]
    nn.utils.clip_grad_norm_(params, clip_grad)
    optimizer.step()

    return loss_v.item()

In [None]:
def train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer,
          summary_writer=None):
    """Update the model from one episode, accumulating gradients step by step.

    Each step's loss is the squared value error plus the policy-gradient term
    for the action taken. Gradients from all steps are averaged and applied in
    a single optimizer step; stepping inside the loop would change the
    parameters that later steps' graphs still depend on.

    Args:
        b_states: model inputs for each step (not used by the update; kept so
            the call signature matches the rollout buffers).
        b_actions: action index taken at each step.
        b_rewards: value target for each step; used as given, with no
            discounting applied here.
        b_actor: per-step actor logits with their autograd history intact.
        b_critic: per-step critic outputs with their autograd history intact.
        eps_steps: number of leading steps to train on.
        optimizer: optimizer holding the model parameters.
        summary_writer: object with ``add_scalar``/``flush`` used for loss
            logging. When omitted, a module-level ``writer`` is used if one
            exists; otherwise nothing is logged.

    Returns:
        Mean per-step loss as a float, or ``None`` when ``eps_steps`` is not
        positive.
    """
    n_steps = int(eps_steps)
    if n_steps <= 0:
        return None

    if summary_writer is None:
        summary_writer = globals().get("writer")

    #running step count kept on the function so successive episodes do not
    #log over each other's x-axis positions
    step_offset = getattr(train, "_logged_steps", 0)

    optimizer.zero_grad()
    loss_sum = 0.0

    #unpack batches for training
    for i in range(n_steps):
        logits = b_actor[i].reshape(-1)
        value = b_critic[i].reshape(())
        reward = torch.as_tensor(b_rewards[i], dtype=torch.float32).reshape(())
        action = int(b_actions[i])

        loss_value_v = F.mse_loss(value, reward)

        log_prob_v = F.log_softmax(logits, dim=0)
        #advantage is a constant weight for the policy term
        adv_v = (reward - value).detach()
        loss_policy_v = -(adv_v * log_prob_v[action])

        loss_v = loss_value_v + loss_policy_v

        #backpropagate through the original graph; scaling gives the mean gradient
        (loss_v / n_steps).backward()

        step_loss = loss_v.item()
        loss_sum += step_loss
        if summary_writer is not None:
            summary_writer.add_scalar("loss", step_loss, step_offset + i)

    optimizer.step()
    train._logged_steps = step_offset + n_steps

    #flush rather than close so the writer stays usable for later episodes
    if summary_writer is not None:
        summary_writer.flush()

    return loss_sum / n_steps

In [None]:
#initialise environment, model and optimiser
NUM_EPISODES = 3

env = SchedulerEnv()
grid_rows, grid_cols = env.observation_space.shape
model = Model(grid_rows * grid_cols, env.action_space.n)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
                             amsgrad=False)

writer = SummaryWriter()

#close the writer even if an episode fails part-way through
try:
    for episode in range(NUM_EPISODES):

        state = env.reset()
        done = False
        b_states = []
        b_actions = []
        b_rewards = []
        b_new_state = []
        b_actor = []
        b_critic = []
        eps_reward = 0
        eps_steps = 0

        while not done:

            #create model input from flattened grid
            nn_input = torch.flatten(tensor_convert(state))
            actor, critic = model(nn_input)
            b_actor.append(actor)
            b_critic.append(critic)

            #actions are sampled uniformly at random, not from the actor output
            action = env.action_space.sample()

            #run through step to book appointment
            new_state, reward, done, info = env.step(action)
            b_states.append(nn_input)
            b_actions.append(action)
            b_rewards.append(reward)
            #the env mutates one grid in place, so store a snapshot rather than
            #another reference to the same array
            b_new_state.append(new_state.copy())
            eps_reward += reward
            eps_steps += 1

            state = new_state

        #start every update from clean gradients
        optimizer.zero_grad()
        train(b_states, b_actions, b_rewards, b_actor, b_critic, eps_steps, optimizer)
finally:
    writer.close()


In [None]:
#peek at the first few actor outputs from the last episode; the full list has
#one tensor per step and can be very long
b_actor[:5]