In [5]:
import gym
import ptan
import argparse
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
#set parameters
num_gps = 3
num_slots = 4
num_pre_booked = 3
to_book = torch.tensor([1,2,3,1])
num_to_book = len(to_book)

GAMMA = 0.99
LEARNING_RATE = 0.001
ENTROPY_BETA = 0.01
BATCH_SIZE = 128
NUM_ENVS = 50

REWARD_STEPS = 4
CLIP_GRAD = 0.1
device = torch.device("cpu")

In [7]:
#convert numpy array to tensor for input
def tensor_convert(x):
    return torch.from_numpy(x).float()

In [8]:
class Model(nn.Module):
    def __init__(self, input_size, diary_shape):
        super(Model, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128) 
        )

        self.actor = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, diary_shape),
            nn.Tanh()  
        )

        self.critic = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        init_out = self.net(x)
        return self.actor(init_out), self.critic(init_out)

In [9]:
class SchedulerEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        
        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = to_book
        self.num_to_book = num_to_book
        self.diary_slots = num_gps*num_slots

        #set action space this format of the diary
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
        
        #set observation space 
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
   
    #creates daily diary for each gp and randomly populates prebooked appointments
    def reset(self):

        #creates zero filled dataframe with row per time slot and column per gp
        self.state = np.zeros((self.num_slots, self.num_gps),dtype=float)

        #randomly enters a 1 for each pre booked appointments
        while self.num_pre_booked>0:
            self.num_pre_booked -= 1
            row_to_update = np.random.randint(self.num_slots, size=1)
            col_to_update = np.random.randint(self.num_gps, size=1)
            #self.state.at[row_to_update[0],col_to_update[0]]=1
            self.state[row_to_update,col_to_update]=1

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.num_to_book = num_to_book
        self.num_pre_booked = num_pre_booked

        return self.state

    def step(self, action):
    
        tot_appts = self.num_pre_booked + self.num_to_book
        #print('total appts to book = ', tot_appts)
        final_diary = action
        #print('final appts in diary = ', action.sum())
        
        #rewards if keeps original appointments in same place
        pre_booked_position = np.transpose(np.nonzero(self.state))
        for i in (pre_booked_position):
            if action[i[0],i[1]]:
                self.reward +=1
                #print('plus 1 matching')
            else:
                self.reward -=1
                #print('minus 1 appt lost')

        #rewards if all new appts are booked
        if tot_appts == action.sum():
            self.reward +=5
            #print('plus 1 all booked')
        else:
            self.reward -=5
            #print('not all booked')
        
        #rewards if all longer appts are booked together

        self.done = True
        info = {}

        return action, self.reward, self.done, info
        

In [10]:
def train(state, action, rewards, critic, max_rewards, optimizer):
    
    advantage = rewards - max_rewards
        
    log_prob_v = F.log_softmax(action, dim=1) 

    actor_loss = (-log_prob_v) * advantage
    adam_actor.zero_grad()
    actor_loss.backward()
    adam_actor.step()

    critic_loss = advantage.pow(2)
    adam_critic.zero_grad()
    critic_loss.backward()
    grad = []
    #for param in critic.parameters():
    #    grad.append(param.grad.view(-1))
    adam_critic.step()



    return obs_v

    optimizer.zero_grad()
    mb_adv = mb_rewards - mb_values
    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)
    logits_v, values_v = net(obs_v)
    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    log_prob_v = F.log_softmax(logits_v, dim=1)
    log_prob_actions_v = adv_v * log_prob_v[range(len(mb_actions)), actions_t]
    loss_policy_v = -log_prob_actions_v.mean()

    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()
    loss_v = ENTROPY_BETA * entropy_loss_v + loss_value_v + loss_policy_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

In [11]:
# run a number of episodes to gather values and rewards

In [12]:
env = SchedulerEnv()
model = Model((env.diary_slots+env.num_to_book), env.diary_slots)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, 
                             amsgrad=False)

for j in range(5):

    state = env.reset()

    flat_state = torch.flatten(tensor_convert(state))
    nn_input = torch.cat((flat_state, env.to_book))
    action, critic = model(nn_input)

    #convert output to match action space
    end_diary = action.reshape(env.observation_space.shape)
    end_diary[end_diary>0] = 1 
    end_diary[end_diary<0] = 0 
    _, rewards, done, _ = env.step(end_diary)

    #calc max rewards
    max_rewards = num_pre_booked + 5

    train(state, action, rewards, critic, max_rewards, optimizer)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

In [None]:
# train function
def train(memory, q_val):
    values = torch.stack(memory.values)
    q_vals = np.zeros((len(memory), 1))
    
    # target values are calculated backward
    # it's super important to handle correctly done states,
    # for those cases we want our to target to be equal to the reward only
    for i, (_, _, reward, done) in enumerate(memory.reversed()):
        q_val = reward + gamma*q_val*(1.0-done)
        q_vals[len(memory)-1 - i] = q_val # store values from the end to the beginning
        
    advantage = torch.Tensor(q_vals) - values
    
    critic_loss = advantage.pow(2).mean()
    adam_critic.zero_grad()
    critic_loss.backward()
    adam_critic.step()
    
    actor_loss = (-torch.stack(memory.log_probs)*advantage.detach()).mean()
    adam_actor.zero_grad()
    actor_loss.backward()
    adam_actor.step()
