In [1]:
import gym
import ptan
import argparse
import numpy as np
import pandas as pd

from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

  for external in metadata.entry_points().get(self.group, []):


In [2]:
#set parameters
num_gps = 3
num_slots = 4
num_pre_booked = 3
to_book = torch.tensor([1,2,3,1])
num_to_book = len(to_book)
entropy_beta = 0.01

In [3]:
#convert numpy array to tensor for input
def tensor_convert(x):
    return torch.from_numpy(x).float()

In [4]:
class Model(nn.Module):
    def __init__(self, input_size, diary_shape):
        super(Model, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128) 
        )

        self.actor = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, diary_shape)
        )

        self.critic = nn.Sequential(
            nn.Linear(128, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x):
        init_out = self.net(x)
        return self.actor(init_out), self.critic(init_out)

In [5]:
class SchedulerEnv(gym.Env):
    
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        
        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = to_book
        self.num_to_book = num_to_book
        self.diary_slots = num_gps*num_slots

        #set action space this format of the diary
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
        
        #set observation space 
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
   
    #creates daily diary for each gp and randomly populates prebooked appointments
    def reset(self):

        #creates zero filled dataframe with row per time slot and column per gp
        self.state = np.zeros((self.num_slots, self.num_gps),dtype=float)

        #randomly enters a 1 for each pre booked appointments
        while self.num_pre_booked>0:
            self.num_pre_booked -= 1
            row_to_update = np.random.randint(self.num_slots, size=1)
            col_to_update = np.random.randint(self.num_gps, size=1)
            #self.state.at[row_to_update[0],col_to_update[0]]=1
            self.state[row_to_update,col_to_update]=1

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.num_to_book = num_to_book
        self.num_pre_booked = num_pre_booked

        return self.state

    def step(self, action):
    
        tot_appts = self.num_pre_booked + self.num_to_book
        #print('total appts to book = ', tot_appts)
        final_diary = action
        #print('final appts in diary = ', action.sum())
        
        #rewards if keeps original appointments in same place
        pre_booked_position = np.transpose(np.nonzero(self.state))
        for i in (pre_booked_position):
            if action[i[0],i[1]]:
                self.reward +=1
                #print('plus 1 matching')
            else:
                self.reward -=1
                #print('minus 1 appt lost')

        #rewards if all new appts are booked
        if tot_appts == action.sum():
            self.reward +=5
            #print('plus 1 all booked')
        else:
            self.reward -=5
            #print('not all booked')
            
        #print('tot reward', self.reward)
        
        #rewards if all longer appts are booked together

        self.done = True
        info = {}

        return action, self.reward, self.done, info
        

In [6]:
def train(states, actions, rewards, critics, max_rewards, optimizer, batch_count):
    
    #unpack batches for training
    for i in range(len(rewards)):
        state = states[i]
        action = actions[i]
        reward = rewards[i]
        max_reward = max_rewards[i]
        critic = critics[i]

        obs_v = torch.FloatTensor(state)
        rewards_v = torch.tensor(reward)
        critic = torch.FloatTensor(critic)
        actions_t = torch.FloatTensor(action)
        max_reward = torch.tensor(max_reward)

        loss_value_v = F.mse_loss(critic, rewards_v)

        log_prob_v = F.log_softmax(action)
        adv_v = max_reward - critic
        log_prob_actions_v = adv_v * log_prob_v[range(len(action))]
        loss_policy_v = -log_prob_actions_v.mean()

        prob_v = F.softmax(action)
        #entropy_loss_v = (prob_v * log_prob_v).sum().mean()
        #loss_v = (entropy_beta * entropy_loss_v + loss_value_v + loss_policy_v)
        loss_v = (loss_value_v + loss_policy_v)
        loss_v = torch.tensor(loss_v, requires_grad = True)
        print('loss_v', loss_v)
        writer.add_scalar("loss", loss_v, i + (len(rewards)*batch_count))

        loss_v.backward()

        optimizer.step()

    writer.close()
    return obs_v

In [7]:
# run a number of episodes to gather values and rewards

In [8]:
#initialise environment, model and optimiser
env = SchedulerEnv()
model = Model((env.diary_slots+env.num_to_book), env.diary_slots)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, 
                             amsgrad=False)
optimizer.zero_grad()
writer = SummaryWriter()


#create batches to train model. only use top 25% for training

for i in range(25):
    #create empty lists to store batches
    batch_inputs = []
    batch_actions = []
    batch_rewards = []
    batch_critic = []
    batch_max_rewards = []
    
    for j in range(10):

        state = env.reset()

        flat_state = torch.flatten(tensor_convert(state))
        nn_input = torch.cat((flat_state, env.to_book))
        action, critic = model(nn_input)
        
        #convert output to be between -1 and 1
        action = F.tanh(action)

        #convert output to match action space
        end_diary = action.reshape(env.observation_space.shape)
        end_diary[end_diary>0] = 1 
        end_diary[end_diary<0] = 0 
        _, rewards, done, _ = env.step(end_diary)

        #calc max rewards
        max_rewards = num_pre_booked + 5
        
        batch_inputs.append(nn_input)
        batch_actions.append(action)
        batch_rewards.append(rewards)
        batch_critic.append(critic)
        batch_max_rewards.append(max_rewards)
        #print(batch_rewards)

    train(batch_inputs, batch_actions, batch_rewards, batch_critic, batch_max_rewards, optimizer, i)



loss_v tensor(36.6578, requires_grad=True)
loss_v tensor(56.1652, requires_grad=True)
loss_v tensor(84.0317, requires_grad=True)
loss_v tensor(55.9152, requires_grad=True)
loss_v tensor(56.0401, requires_grad=True)
loss_v tensor(36.2927, requires_grad=True)
loss_v tensor(36.2927, requires_grad=True)
loss_v tensor(36.7034, requires_grad=True)
loss_v tensor(36.4241, requires_grad=True)
loss_v tensor(55.9428, requires_grad=True)
loss_v tensor(83.6877, requires_grad=True)
loss_v tensor(57.8201, requires_grad=True)
loss_v tensor(24.7897, requires_grad=True)
loss_v tensor(56.1148, requires_grad=True)
loss_v tensor(56.1334, requires_grad=True)
loss_v tensor(83.7591, requires_grad=True)
loss_v tensor(36.6972, requires_grad=True)
loss_v tensor(55.9428, requires_grad=True)
loss_v tensor(55.5834, requires_grad=True)
loss_v tensor(56.0964, requires_grad=True)
loss_v tensor(55.9791, requires_grad=True)
loss_v tensor(36.4444, requires_grad=True)
loss_v tensor(83.3195, requires_grad=True)
loss_v tens

loss_v tensor(36.5135, requires_grad=True)
loss_v tensor(24.7646, requires_grad=True)
loss_v tensor(68.9385, requires_grad=True)
loss_v tensor(36.2456, requires_grad=True)
loss_v tensor(36.5392, requires_grad=True)
loss_v tensor(24.7589, requires_grad=True)
loss_v tensor(45.4231, requires_grad=True)
loss_v tensor(56.2211, requires_grad=True)
loss_v tensor(29.6008, requires_grad=True)
loss_v tensor(55.8825, requires_grad=True)
loss_v tensor(68.7235, requires_grad=True)
loss_v tensor(36.3289, requires_grad=True)
loss_v tensor(36.4069, requires_grad=True)
loss_v tensor(56.2819, requires_grad=True)


In [9]:
dir(torch.nn.functional)

['GRID_SAMPLE_INTERPOLATION_MODES',
 'GRID_SAMPLE_PADDING_MODES',
 'List',
 'Optional',
 'Tensor',
 'Tuple',
 '_Reduction',
 '_VF',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_adaptive_max_pool1d',
 '_adaptive_max_pool2d',
 '_adaptive_max_pool3d',
 '_add_docstr',
 '_fractional_max_pool2d',
 '_fractional_max_pool3d',
 '_get_softmax_dim',
 '_infer_size',
 '_list_with_default',
 '_max_pool1d',
 '_max_pool2d',
 '_max_pool3d',
 '_no_grad_embedding_renorm_',
 '_overload',
 '_pad',
 '_pad_circular',
 '_pair',
 '_single',
 '_threshold',
 '_triple',
 '_unpool_output_size',
 '_verify_batch_size',
 'adaptive_avg_pool1d',
 'adaptive_avg_pool2d',
 'adaptive_avg_pool3d',
 'adaptive_max_pool1d',
 'adaptive_max_pool1d_with_indices',
 'adaptive_max_pool2d',
 'adaptive_max_pool2d_with_indices',
 'adaptive_max_pool3d',
 'adaptive_max_pool3d_with_indices',
 'affine_grid',
 'alpha_dropout',
 'assert_int_or_pair',
 'avg_pool1d',
 'avg_

In [10]:
print('Hello Becca')

Hello Becca


In [11]:
batch_inputs

[tensor([0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 2., 3., 1.]),
 tensor([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 2., 3., 1.]),
 tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 2., 3., 1.]),
 tensor([0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 2., 3., 1.]),
 tensor([0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 2., 3., 1.])]