In [1]:
import math
from collections import namedtuple, deque
from itertools import count
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import elevator

In [2]:
# DQN scaffolding adapted from the PyTorch reinforcement-learning tutorial:
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
#
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device_type = "cuda"
else:
    torch_device_type = "cpu"
device = torch.device(torch_device_type)

In [3]:
# Utility setup
# One replay-memory record: the state the agent was in, the action it took,
# the state that followed, and the reward that was received.
_TRANSITION_FIELDS = ["state", "action", "next_state", "reward"]
Transition = namedtuple("Transition", _TRANSITION_FIELDS)

class RingBuffer(deque):
    """Fixed-capacity replay memory built on ``collections.deque``.

    Because the deque is created with ``maxlen=capacity``, appending to a
    full buffer silently drops the oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` items."""
        super().__init__(maxlen=capacity)

    def push(self, *args):
        """Wrap ``args`` in a ``Transition`` and store it in the buffer."""
        record = Transition(*args)
        self.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored items, chosen at random."""
        snapshot = list(self)
        return random.sample(snapshot, batch_size)

In [4]:
# Setup Q Agent
class DQN(nn.Module):
    """LSTM-based Q-network.

    The LSTM is run over the second axis of the input (one step per floor),
    its per-step outputs are passed through a ReLU, flattened per sample and
    mapped by a linear head to one Q-value per action.
    """

    def __init__(self, nfeats, nfloors, hidden_dim, outputs):
        """
        LSTM based DQN model

        Args:
            nfeats: size of the feature vector at each step of the sequence.
            nfloors: sequence length; the head expects ``hidden_dim * nfloors``
                flattened LSTM outputs per sample.
            hidden_dim: LSTM hidden-state size.
            outputs: number of Q-values (actions) produced per sample.
        """
        super(DQN, self).__init__()

        self.hidden_dim = hidden_dim
        # batch_first=True makes dim 0 the batch axis, matching forward(),
        # which keeps dim 0 and flattens the rest. Without it the LSTM would
        # treat the samples of a replay batch as consecutive time steps and
        # leak information from one sample into the next.
        self.lstm = nn.LSTM(nfeats, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim * nfloors, outputs)

    def forward(self, state):
        """Return Q-values of shape ``(batch, outputs)``.

        Args:
            state: float tensor of shape ``(batch, nfloors, nfeats)``.
        """
        lstm_out, _ = self.lstm(state)
        state_relu = F.relu(lstm_out)
        # reshape (not view): the batch-first LSTM output is not guaranteed
        # to be contiguous in memory.
        return self.head(state_relu.reshape(state_relu.size(0), -1))

In [5]:
# Environment setup

# Four actions, move up, move down, stay (head up), stay (head down)
n_actions = elevator.N_ACTIONS
NUM_FLOORS = 10
NUM_CARS = 1
max_lambda = 5

# The first floor gets its own arrival-rate function from the elevator module;
# each of the remaining NUM_FLOORS - 1 floors uses a constant rate of
# 0.4 * max_lambda (max_lambda is looked up when the function is called).
first_floor_func = elevator.gen_lambda_func(10, 5, max_lambda, 0.7)
lambda_funcs = [first_floor_func] + [
    (lambda x: max_lambda * 0.4) for _ in range(NUM_FLOORS - 1)
]

env = elevator.Elevator(NUM_FLOORS, NUM_CARS, lambda_funcs)

# Passenger arrivals follow a Poisson distribution, so the passenger arrival
# count at each time step may be determined in advance.

In [6]:
# RL params
BATCH_SIZE = 128       # transitions sampled per optimization step
GAMMA = 0.999          # discount factor applied to next-state values
EPS_START = 0.9        # initial exploration threshold for epsilon-greedy
EPS_END = 0.05         # asymptotic exploration threshold
EPS_DECAY = 200        # decay constant (in steps) of the exponential schedule
TARGET_UPDATE = 10     # copy policy weights into the target net every N epochs
# The replay memory must be able to hold at least BATCH_SIZE transitions:
# optimize_agent() returns early while len(memory) < BATCH_SIZE, so a smaller
# buffer would mean the agent never trains.
MEMORY_CAPACITY = 10000

# DQN params
HIDDEN_DIM = 3

# Two identically shaped networks: the policy net is trained, the target net
# starts as a copy of it and is kept in eval mode.
policy_net = DQN(env.nfeats, NUM_FLOORS, HIDDEN_DIM,
                 n_actions).to(device)
target_net = DQN(env.nfeats, NUM_FLOORS, HIDDEN_DIM,
                 n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Expose the optimizer under both names: `optimizer` (the conventional name)
# and `optimized` (the name this cell originally bound), so code using either
# name keeps working.
optimizer = optim.RMSprop(policy_net.parameters())
optimized = optimizer
memory = RingBuffer(MEMORY_CAPACITY)

# Number of actions selected so far; drives the epsilon decay schedule.
steps_done = 0
def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as ``steps_done`` grows; every call increments ``steps_done``.

    Returns:
        A long tensor of action indices: shape (1, 1) holding the greedy
        action of ``policy_net`` when exploiting, or shape (1, NUM_CARS)
        holding uniformly random actions when exploring.
    """
    global steps_done

    roll = random.random()
    decay = math.exp(-1 * steps_done / EPS_DECAY)
    eps_thresh = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if roll <= eps_thresh:
        # Explore: one random action per car.
        random_actions = [random.randrange(n_actions) for _ in range(NUM_CARS)]
        return torch.tensor([random_actions], device=device, dtype=torch.long)

    # Exploit: index of the largest Q-value, without tracking gradients.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1)[1].view(1, 1)

In [7]:
def optimize_agent():
    """Run one optimization step of ``policy_net`` on a sampled replay batch.

    Samples BATCH_SIZE transitions from ``memory`` and minimizes the smooth-L1
    (Huber) loss between the policy net's Q(s, a) and the target
    ``reward + GAMMA * max_a' Q_target(s', a')``, where the next-state value is
    taken from ``target_net`` and is zero for transitions whose ``next_state``
    is None. Gradients are clamped to [-1, 1] before the optimizer step.

    Does nothing while the memory holds fewer than BATCH_SIZE transitions.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # Turn a list of Transitions into one Transition of per-field tuples.
    batch = Transition(*zip(*transitions))

    # Boolean mask (required for mask indexing) of non-terminal transitions.
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device=device, dtype=torch.bool)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) of the actions that were actually taken, from the policy net.
    sa_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a'); stays zero where next_state is None.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_mask.any():
        # torch.cat raises on an empty list, so only build the batch when at
        # least one transition has a next state.
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        next_state_values[non_final_mask] = target_net(
            non_final_next_states).max(1)[0].detach()
    # Cast rewards so the target has the same dtype as the value estimates.
    expected_sa_values = (next_state_values * GAMMA) + \
        reward_batch.to(next_state_values.dtype)

    loss = F.smooth_l1_loss(sa_values, expected_sa_values.unsqueeze(1))

    # The setup cell binds the RMSprop optimizer to the module-level name
    # `optimized`.
    optimized.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        # Parameters that did not take part in the loss have no gradient.
        if param.grad is not None:
            param.grad.data.clamp_(-1, 1)
    optimized.step()

In [8]:
# Training loop.
# Full-length run settings (currently disabled in favour of a short smoke run):
#num_epochs = 50
#epoch_duration = 24 * 60
num_epochs = 2
epoch_duration = 100

for ep_idx in range(num_epochs):
    # Start every epoch from a freshly reset environment.
    env.reset()
    state = torch.tensor([env.state()], device=device, dtype=torch.float)

    for t in range(epoch_duration):
        # Act, advance the simulation one step, and observe the outcome.
        actions = select_action(state)
        env.run_iteration(t, actions[0])

        # The reward is the negated total cost reported by the environment.
        reward = torch.tensor([-1 * env.total_cost()], device=device)
        next_state = torch.tensor([env.state()], device=device,
                                  dtype=torch.float)

        # Remember the transition, move on, and train on a replay batch.
        memory.push(state, actions, next_state, reward)
        state = next_state
        optimize_agent()

    # Periodically sync the target network with the policy network.
    if not ep_idx % TARGET_UPDATE:
        target_net.load_state_dict(policy_net.state_dict())

RuntimeError: Set changed size during iteration