# Import Dependencies

In [None]:
import gym
from gym import Env
import gym.spaces as spaces
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

# Import Helpers
import numpy as np
import random
import os
import torch 

# Import Stable Baselines Dependencies
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback


# Class Elevator - 3 Floors

In [None]:
class WaitingTimeCallback(BaseCallback):
    '''
        Training callback that samples, at every step, the latest value stored
        in the ``waiting_times`` list of the first wrapped environment and
        accumulates it in ``self.waiting_times``.
    '''

    def __init__(self, verbose: int = 0):
        super(WaitingTimeCallback, self).__init__(verbose)
        # One entry per training step, once the environment has produced a value.
        self.waiting_times = []

    def _on_step(self) -> bool:
        '''
            Record the most recent waiting time published by the environment.
            Returns True so that training always continues.
        '''
        # ``training_env`` is the vectorized wrapper; the ``waiting_times``
        # attribute lives on the underlying environment, not on the wrapper.
        env = self.training_env.envs[0]
        env_waiting_times = env.waiting_times
        # The list stays empty until the first episode finishes, so there is
        # nothing to record (and ``[-1]`` would raise) before that point.
        if env_waiting_times:
            self.waiting_times.append(env_waiting_times[-1])
        return True

In [None]:
class Elevator3(gym.Env):
    '''
        Gym environment simulating one elevator that serves 3 floors (0, 1, 2).

        Observation: vector of 7 integers
            states[0]: floor_0_up   - somebody is waiting on floor 0
            states[1]: floor_1_up   - somebody on floor 1 wants to reach floor 2
            states[2]: floor_1_down - somebody on floor 1 wants to reach floor 0
            states[3]: floor_2_up   - flag cleared once nobody on floor 2 wants floor 1
            states[4]: floor_2_down - flag cleared once nobody on floor 2 wants floor 0
            states[5]: occupancy    - number of passengers inside (0..max_occupancy)
            states[6]: position     - current floor of the elevator (0..2)

        Actions:
            0: Stop (passengers leave / board, costs 10 time units)
            1: Go Up (costs 2 time units)
            2: Go Down (costs 2 time units)
        An impossible move (up from floor 2, down from floor 0) costs 100 time
        units and a reward penalty of 100.

        Passengers are represented by their destination floor, both in the
        per-floor waiting lists and in ``inside_list``.
    '''
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(Elevator3, self).__init__()

        self.max_occupancy = 7
        # Total time of every finished episode; deliberately NOT cleared by
        # reset() so it accumulates over a whole training run.
        self.waiting_times = []
        self.action_space = spaces.Discrete(3)  # 0 stop, 1 up, 2 down
        # Occupancy takes the values 0..max_occupancy inclusive, hence the +1.
        self.observation_space = spaces.MultiDiscrete(
            [2, 2, 2, 2, 2, self.max_occupancy + 1, 3])
        # Build the full episode state so step() is usable even before reset().
        self._reset_state()

    def _reset_state(self):
        '''
            Put every per-episode attribute back to its initial value.
        '''
        # Integer dtype so observations match the MultiDiscrete space.
        self.states = np.zeros(7, dtype=np.int64)
        # suppose that there are already 2 people
        # waiting on the first floor at the beginning of the session
        self.states[0] = 1
        self.last_time = 0
        self.time = 0
        self.floor_0_waiting = 2
        self.floor_0_waiting_list = [1, 2]
        self.floor_1_waiting = 0
        self.floor_1_waiting_list = []
        self.floor_2_waiting = 0
        self.floor_2_waiting_list = []
        self.inside_list = []
        self.done = 0
        self.reward = 0
        self.time_check = []

    def reset(self):
        '''
            Reset the env to its initial state and return the first observation.
        '''
        self._reset_state()
        return self.states

    def timecheck(self):
        '''
            Compare the current time with the time of the previous step and
            inject the scheduled arrivals whose threshold has just been crossed:
                - time 5:  5 people appear on floor 1 (destination 0 or 2)
                - time 60: 5 people appear on floor 2 (destination 0 or 1)
                - a single step lasting >= 60 before time 120 may add one
                  extra person on floor 0
        '''
        if (self.last_time < 5) and (self.time >= 5):
            self.floor_1_waiting = 5
            self.floor_1_waiting_list.extend(
                np.random.choice([0, 2], size=self.floor_1_waiting, p=[0.75, 0.25]).tolist())
            # Both flags are raised here; waiting_list_check() lowers the one
            # that has no matching passenger.
            self.states[1] = 1
            self.states[2] = 1

        # Independent ``if``: one long step can cross both thresholds at once,
        # and the floor-2 batch must not be lost in that case.
        if (self.last_time < 60) and (self.time >= 60):
            self.floor_2_waiting = 5
            self.floor_2_waiting_list.extend(
                np.random.choice([0, 1], size=self.floor_2_waiting, p=[0.75, 0.25]).tolist())
            self.states[3] = 1
            self.states[4] = 1

        if (self.time - self.last_time >= 60) and (self.time < 120):
            late_arrivals = np.random.choice(
                [1, 2], size=np.random.choice(2, p=[0.9, 0.1])).tolist()
            self.floor_0_waiting_list.extend(late_arrivals)
            if late_arrivals:
                # Make the new call visible in the observation.
                self.states[0] = 1

        self.last_time = self.time

    def waiting_list_check(self):
        '''
            Lower every call flag whose waiting list no longer contains a
            matching passenger.
        '''
        if not self.floor_0_waiting_list:
            self.states[0] = 0
        if 0 not in self.floor_1_waiting_list:
            self.states[2] = 0
        if 2 not in self.floor_1_waiting_list:
            self.states[1] = 0
        if 0 not in self.floor_2_waiting_list:
            self.states[4] = 0
        if 1 not in self.floor_2_waiting_list:
            self.states[3] = 0

    def done_check(self):
        '''
            Decide whether the episode is over and return the ``done`` flag.

            The episode ends when no call flag is raised and the elevator is on
            floor 0, or when the time reaches 900 (15 minutes): people who wait
            that long are assumed to find another way (e.g. the stairs).
            The final time of every finished episode is appended to
            ``waiting_times``.
        '''
        # NOTE(review): states[6] is the position, not the occupancy
        # (states[5]); an episode can therefore end with passengers still
        # inside. Confirm whether "elevator empty" was the intended condition.
        nobody_waiting = not self.states[:5].any()
        if nobody_waiting and (self.states[6] == 0):
            self.done = 1
            self.time_check.append(self.time)

        elif self.time >= 900:
            self.done = 1
            print("--Waiting More Than 15 Minutes--")

        if self.done:
            self.waiting_times.append(self.time)

        return self.done

    def rotating_people(self):
        '''
            Exchange passengers on the current floor: everybody whose
            destination is this floor leaves, then people from this floor's
            waiting list board (in list order) until the elevator is full.
        '''
        position = int(self.states[6])
        self.inside_list = [dest for dest in self.inside_list if dest != position]
        remaining_places = self.max_occupancy - len(self.inside_list)

        # Slicing handles both "everybody fits" and "only some fit".
        queue_attr = 'floor_{}_waiting_list'.format(position)
        queue = getattr(self, queue_attr)
        self.inside_list.extend(queue[:remaining_places])
        setattr(self, queue_attr, queue[remaining_places:])

        self.states[5] = len(self.inside_list)

    def _move(self, direction):
        '''
            Move one floor up (+1) or down (-1); penalize an impossible move.
        '''
        target = self.states[6] + direction
        if target < 0 or target > 2:
            print("INVALID ACTION!")
            # If the agent takes an invalid action the rewards will decrease
            self.reward = self.reward - 100
            self.time += 100
        else:
            self.time += 2
            self.states[6] = target

    def step(self, action):
        '''
            Apply one action to the elevator:
                - 0 Stop
                - 1 Go Up
                - 2 Go Down
            Returns the tuple (observation, reward, done, info).
        '''
        info = {}
        if self.done:
            print("END SESSION")
            return self.states, self.reward, self.done, info

        if action == 0:
            self.rotating_people()
            self.time += 10
        elif action == 1:
            self._move(+1)
        elif action == 2:
            self._move(-1)

        # Every raised call flag and every passenger inside costs 1 per step.
        # NOTE(review): self.reward is never reset between steps, so the value
        # returned is the running total of the episode, not the per-step reward.
        self.reward = self.reward - float(self.states[:6].sum())
        self.timecheck()
        self.waiting_list_check()
        self.done = self.done_check()

        return self.states, self.reward, self.done, info



# Train RL Models

In [None]:
# Wrap a fresh Elevator3 instance in a vectorized environment.
# The factory must build the environment itself: referring to
# ``env_elevator_3`` inside the lambda would fail because that name does not
# exist yet when DummyVecEnv calls the factory.
env_elevator_3 = DummyVecEnv([lambda: Elevator3()])
env_elevator_3.reset()

In [None]:
# TensorBoard log directories, one per algorithm.
PPO_PATH = os.path.join('TRAIN_PPO', 'LOGS_PPO')
DQN_PATH = os.path.join('TRAIN_DQN', 'LOGS_DQN')
A2C_PATH = os.path.join('TRAIN_A2C', 'LOGS_A2C')

# Hyper-parameters shared by the three agents.
_common_hparams = dict(learning_rate=0.01, gamma=0.6, verbose=1)

# All agents use the multi layer perceptron policy on the same environment.
model_elevator_PPO_3 = PPO(
    'MlpPolicy', env_elevator_3, tensorboard_log=PPO_PATH, **_common_hparams)

model_elevator_DQN_3 = DQN(
    'MlpPolicy', env_elevator_3, tensorboard_log=DQN_PATH, **_common_hparams)

model_elevator_A2C_3 = A2C(
    'MlpPolicy', env_elevator_3, tensorboard_log=A2C_PATH, **_common_hparams)

In [None]:
# Train the DQN agent for 500k environment steps.
model_elevator_DQN_3.learn(total_timesteps=500_000)

In [None]:
# Train the PPO agent for 500k environment steps.
model_elevator_PPO_3.learn(total_timesteps=500_000)

In [None]:
# Train the A2C agent for 500k environment steps.
model_elevator_A2C_3.learn(total_timesteps=500_000)

# Save and Reload Model

In [None]:
# Destination of each trained model. Every algorithm gets a directory named
# after itself (the DQN and A2C models were previously written to a folder
# called SAVED_MODELS_PPO, a copy-paste leftover).
PPO_PATH_save = os.path.join('TRAIN_PPO', 'SAVED_MODELS_PPO', 'ppo_saved')
DQN_PATH_save = os.path.join('TRAIN_DQN', 'SAVED_MODELS_DQN', 'dqn_saved')
A2C_PATH_save = os.path.join('TRAIN_A2C', 'SAVED_MODELS_A2C', 'a2c_saved')

model_elevator_PPO_3.save(PPO_PATH_save)
model_elevator_DQN_3.save(DQN_PATH_save)
model_elevator_A2C_3.save(A2C_PATH_save)

In [None]:
# Evaluate the A2C agent over 10k episodes.
evaluate_policy(model_elevator_A2C_3, env_elevator_3, n_eval_episodes=10_000)

In [None]:
# Evaluate the PPO agent over 10k episodes.
evaluate_policy(model_elevator_PPO_3, env_elevator_3, n_eval_episodes=10_000)

In [None]:
# Evaluate the DQN agent over 10k episodes.
evaluate_policy(model_elevator_DQN_3, env_elevator_3, n_eval_episodes=10_000)

In [None]:
# Play one episode with the trained PPO policy, then show the final info.
obs = env_elevator_3.reset()
done = False
while not done:
    action, _states = model_elevator_PPO_3.predict(obs)
    obs, rewards, done, info = env_elevator_3.step(action)
print('info', info)


In [None]:
# Play one episode with the trained DQN policy, then show the final info.
obs = env_elevator_3.reset()
done = False
while not done:
    action, _states = model_elevator_DQN_3.predict(obs)
    obs, rewards, done, info = env_elevator_3.step(action)
print('info', info)

In [None]:
# Play one episode with the trained A2C policy, then show the final info.
obs = env_elevator_3.reset()
done = False
while not done:
    action, _states = model_elevator_A2C_3.predict(obs)
    obs, rewards, done, info = env_elevator_3.step(action)
print('info', info)

In [None]:
# Run sub-directories (inside each algorithm's log folder) to open in TensorBoard.
training_log_path_ppo, training_log_path_dqn, training_log_path_a2c = (
    os.path.join(log_root, run_name)
    for log_root, run_name in (
        (PPO_PATH, 'PPO_3'),
        (DQN_PATH, 'DQN_3'),
        (A2C_PATH, 'A2C_1'),
    )
)


In [None]:
# Launch TensorBoard on the A2C run directory, served on port 6015.
!tensorboard --logdir={training_log_path_a2c} --port={6015}
# OK: judging from these dashboards, DQN works very well.

In [None]:
# Launch TensorBoard on the PPO run directory, served on port 6011.
!tensorboard --logdir={training_log_path_ppo} --port={6011}

In [None]:
# Launch TensorBoard on the DQN run directory, served on port 6012.
!tensorboard --logdir={training_log_path_dqn} --port={6012}

# Callback Training Phase - PPO


In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [None]:
# Output folders of the callback-driven run: best-model checkpoints and logs.
path_to_save, path_log = (
    os.path.join('training', leaf) for leaf in ('saved_models', 'logs')
)

                            

In [None]:
# Training environment. The factory builds the Elevator3 instance itself
# instead of closing over a name that is rebound on the same line.
env_elevator = DummyVecEnv([lambda: Elevator3()])
# Dedicated evaluation environment: evaluating on the training environment
# would reset it in the middle of a rollout and disturb data collection.
eval_env_elevator = DummyVecEnv([lambda: Elevator3()])

# NOTE(review): Elevator3.step only ever subtracts from the reward, so a mean
# reward of 200 cannot be reached and this callback never stops training.
# Pick a (negative) threshold that reflects an acceptable policy.
stop_CB = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluate every 10000 steps, keep the best model on disk and run the stop
# check whenever a new best mean reward is found.
eval_CB = EvalCallback(eval_env_elevator,
                       callback_on_new_best=stop_CB,
                       eval_freq=10000,
                       best_model_save_path=path_to_save,
                       verbose=1)


In [None]:
# PPO agent with default hyper-parameters, logging to ``path_log``.
model = PPO(
    'MlpPolicy',
    env_elevator,
    verbose=1,
    tensorboard_log=path_log,
)
# Train for 20k steps with the evaluation callback attached.
model.learn(callback=eval_CB, total_timesteps=20_000)


In [None]:
# Reload the checkpoint stored under training/saved_models/best_model and
# attach it to the elevator environment.
model_path = os.path.join('training', 'saved_models', 'best_model')
model = PPO.load(
    model_path,
    env=env_elevator,
)

In [None]:
# Evaluate the reloaded model over 10k episodes.
evaluate_policy(model, env_elevator, n_eval_episodes=10_000)

In [None]:
# Run directory of the callback-driven PPO training; shown as the cell output.
ppo = os.path.join(path_log, 'PPO_2')
ppo

In [None]:
# Launch TensorBoard on the callback-driven PPO run, served on port 6013.
!tensorboard --logdir={ppo} --port={6013}