In [1]:
import random
from collections import defaultdict

import numpy as np
from stable_baselines3 import PPO,DQN,A2C
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

from rl_env import StatesGenerator, get_benchmark_rewards,compute_reward, critical_task_reward, CustomEnv

In [2]:
from config import get_config
config, _ = get_config()
import numpy as np
import time

# Take the timestamp once so the log and model directories of a single run
# always share the same identifier. Two independent time.time() calls can
# fall on either side of a second boundary and yield mismatched names.
run_id = int(time.time())
logdir = f"../logs/{run_id}/"
models_dir = f"../models/{run_id}/"

In [3]:
# Build the custom environment and run it through the Stable-Baselines3
# environment checker before handing it to the agent.
env = CustomEnv(config)
check_env(env)

# PPO agent with an MLP policy; TensorBoard logs are written under `logdir`.
model = PPO(
    policy='MlpPolicy',
    env=env,
    batch_size=64,
    tensorboard_log=logdir,
    verbose=1,
)

# Last expression of the cell: displays the observation returned by reset().
env.reset()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


array([0.56428571, 0.33714286, 0.54571429, 0.28071429, 0.565     ,
       0.21571429, 0.36285714, 0.55357143, 0.11214286, 0.33928571,
       0.57142857, 0.71428571, 0.85714286, 1.        ])

In [4]:

# Evaluate on a dedicated environment instance instead of the one the agent
# trains on. Sharing a single env object means every periodic evaluation
# resets and steps the training environment in the middle of a rollout, so
# the agent's next training step continues from a state that no longer
# matches its last observation. Monitor records episode returns/lengths for
# the evaluation statistics.
eval_env = Monitor(CustomEnv(config))
eval_callback = EvalCallback(eval_env, best_model_save_path=models_dir,
                             log_path=logdir, eval_freq=10000,
                             deterministic=True, render=False)

In [5]:
# Run Training
# Training is switched off by default; set `is_train` to True to start it.
is_train = False
if is_train:
    # Number of timesteps requested from each model.learn() call.
    TIMESTEPS = 25_000
    iters = 0
    # NOTE(review): this loop has no exit condition, so once enabled it only
    # stops when the kernel is interrupted and the cells below never run in a
    # "Run All" pass. Consider bounding the number of iterations.
    while True:
        iters += 1
        # reset_num_timesteps=False presumably keeps the step counter (and the
        # TensorBoard run) continuous across successive learn() calls - confirm
        # against the Stable-Baselines3 `learn` documentation.
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f"PPO_Testing_new_bin_penalty",callback=eval_callback)
        # Checkpoint after every chunk; the file name is the cumulative number
        # of requested timesteps. `models_dir` already ends with "/", so the
        # resulting path contains a double slash.
        model.save(f"{models_dir}/{TIMESTEPS * iters}")


In [19]:
def evaluate(model, env, num_episodes=100, num_tasks=10, capacity_scale=1400,
             deterministic=False):
    """
    Evaluate a RL agent on a single (non-vectorized) environment.

    :param model: agent exposing ``predict(obs, deterministic=...)`` that
        returns ``(action, state)``
    :param env: environment whose ``reset()`` returns an observation and whose
        ``step(action)`` returns ``(obs, reward, done, info)``. On the terminal
        step ``info`` must contain ``'is_success'``, ``'termination_cause'``
        and ``'episode_len'``.
    :param num_episodes: (int) number of episodes to evaluate; must be > 0
    :param num_tasks: (int) number of leading observation entries that describe
        tasks; the remaining entries are treated as node/bin capacities
    :param capacity_scale: factor the capacity entries are multiplied by
        before the occupancy ratio is computed
    :param deterministic: (bool) forwarded to ``model.predict``; the default
        ``False`` keeps the sampled-action behaviour of a bare ``predict(obs)``
    :return: tuple ``(mean_episode_reward, mean_episode_len,
        termination_cause, is_success, occupancy_ratio)`` where
        ``termination_cause`` maps each cause to its episode count,
        ``is_success`` is the number of successful episodes and
        ``occupancy_ratio`` lists the occupancy percentage of each
        successful episode
    :raises ValueError: if ``num_episodes`` is not positive
    """
    if num_episodes <= 0:
        raise ValueError("num_episodes must be a positive integer")

    all_episode_rewards = []
    all_episodes_len = []
    termination_cause = defaultdict(int)
    is_success = 0
    occupancy_ratio = []

    for _ in range(num_episodes):
        episode_reward = 0
        done = False
        obs = env.reset()
        # Capacities at the start of the episode, used as the occupancy baseline.
        total_bins = np.asarray(obs[num_tasks:]) * capacity_scale
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # Single environment: reward and done are scalars, info is a dict.
            obs, reward, done, info = env.step(action)
            episode_reward += reward

            if done:
                termination_cause[info['termination_cause']] += 1
                all_episodes_len.append(info['episode_len'])
                # Success is an episode-level outcome, so it is tallied once,
                # on the terminal step, rather than on every step that
                # happens to report it.
                if info['is_success']:
                    is_success += 1
                    remaining_node_capacities = np.asarray(obs[num_tasks:]) * capacity_scale
                    free_fraction = np.mean(remaining_node_capacities / total_bins)
                    occupancy_ratio.append(round(100 - free_fraction * 100, 2))

        all_episode_rewards.append(episode_reward)

    mean_episode_reward = np.mean(all_episode_rewards)
    mean_episode_len = sum(all_episodes_len) / num_episodes

    return mean_episode_reward, mean_episode_len, termination_cause, is_success, occupancy_ratio

# Quick smoke test of the evaluation helper on the freshly built model,
# using only two episodes; the occupancy list is discarded.
(
    mean_reward,
    mean_episode_len,
    termination_cause,
    is_success,
    _,
) = evaluate(model, env, num_episodes=2)

print(f"mean_reward:{mean_reward:.2f}")
print(f"mean_episode_len:{mean_episode_len:.2f}")
print(termination_cause)


mean_reward:-12.00
mean_episode_len:3.00
defaultdict(<class 'int'>, {'DUBLICATE_PICK': 1, 'BIN_OVERFLOW': 1})


In [20]:
# Load a previously saved checkpoint. Forward slashes are used so the path
# resolves on both Windows and POSIX systems, consistent with the
# "../models/..." form used for `models_dir`.
loaded_model = PPO.load('../models/1674683760/1125000', env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [21]:
import time

In [32]:
# evaluation over N episodes
# The episode count lives in one place so the evaluate() call and the
# per-episode timing below cannot drift apart.
N_EVAL_EPISODES = 100

# perf_counter is a monotonic clock intended for measuring intervals; the
# elapsed time is taken before printing so output cost is not included.
t1 = time.perf_counter()
mean_reward,mean_episode_len,termination_cause,is_success,avg_occupancy_ratio=evaluate(loaded_model, env, N_EVAL_EPISODES)
elapsed = time.perf_counter() - t1

print(f"mean_reward:{mean_reward:.2f}")
print(f"mean_episode_len:{mean_episode_len:.2f}")
print(termination_cause)
# `avg_occupancy_ratio` holds one value per successful episode; with no
# successes there is nothing to average, so report NaN without triggering
# NumPy's empty-mean warning.
mean_occupancy = np.mean(avg_occupancy_ratio) if len(avg_occupancy_ratio) else float("nan")
print("Average Occupancy ratio:",mean_occupancy)
print("Average time per input:",elapsed / N_EVAL_EPISODES)

mean_reward:5.63
mean_episode_len:7.51
defaultdict(<class 'int'>, {'SUCCESS': 28, 'BIN_OVERFLOW': 72})
Average Occupancy ratio: 87.76321428571428
Average time per input: 0.003847689628601074


In [26]:
# Roll out one episode with the loaded policy and print what happened.
FACTOR = 1400  # multiplier applied to observation entries before printing

done = False
obs = env.reset()

# First ten observation entries are printed as task costs, the rest as
# node capacities.
print("Task Costs:", obs[:10] * FACTOR, "All tasks sum: ", sum(obs[:10]) * FACTOR)
print("Node capacities:", obs[10:] * FACTOR)

while not done:
    # The second value returned by predict() only matters for recurrent policies.
    action, _states = loaded_model.predict(obs)
    # Single environment: `done` is used directly as the loop condition.
    obs, reward, done, info = env.step(action)

# Final-step diagnostics, each followed by a blank line.
print("\n", info, "\n", sep="")
print("Assignment=", info['assignment_status'], end="\n\n")
print(f"Last Action: Allocating Task # {action[0]} in Node/Bin # {action[1]}", end="\n\n")

print("Tasks after episodes:", obs[:10] * FACTOR)
print("Remaining Node capacities:", obs[10:] * FACTOR)



Task capacities: [430. 492. 298. 385. 242. 563. 199. 476. 773. 365.] All tasks sum:  4222.999999999999
Node capacities: [ 800. 1000. 1200. 1400.]

{'is_success': False, 'episode_len': 9, 'termination_cause': 'BIN_OVERFLOW', 'assignment_status': [[8], [7, 0], [1, 3, 6], [5, 9, 2]]}

Assignment= [[8], [7, 0], [1, 3, 6], [5, 9, 2]]

Last Action: Allocating Task # 4 in Node/Bin # 3

Tasks after episodes: [  0.   0.   0.   0. 242.   0.   0.   0.   0.   0.]
Remaining Node capacities: [ 27.  94. 124. 174.]


In [10]:
# Node-capacity entries of the last observation (index 10 onwards), scaled by FACTOR.
remaining_node_capacities=obs[10:]*FACTOR

In [None]:
# Occupancy (%) = 100 - mean(remaining / initial) * 100, rounded to 2 decimals.
# The observation carries four node-capacity entries (800, 1000, 1200, 1400
# after scaling), so the baseline must have four elements as well; a
# five-element baseline cannot be broadcast against the four remaining values.
initial_node_capacities = np.array([800, 1000, 1200, 1400])
avg_occupancy = round(100 - np.mean(np.asarray(remaining_node_capacities) / initial_node_capacities) * 100, 2)

In [None]:
# Show the occupancy percentage computed in the previous cell.
print(avg_occupancy)