In [1]:
import random
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO,DQN,A2C
from stable_baselines3.common import env_checker
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
import numpy as np
from rl_env import StatesGenerator, get_benchmark_rewards,compute_reward, critical_task_reward, CustomEnv
from collections import defaultdict

In [2]:
from config import get_config
# Load the experiment configuration; get_config() returns a pair and only the
# first element is kept (the second is discarded via `_`).
config, _ = get_config()
import numpy as np
import time

# Take the timestamp once so the log and model directories always share the
# same run id; two separate time.time() calls can straddle a second boundary
# and produce mismatched directory names.
run_timestamp = int(time.time())

logdir = f"../logs/{run_timestamp}/"
models_dir = f"../models/{run_timestamp}/"

In [3]:
# Build the environment from the loaded config and run the Stable-Baselines3
# environment checker on it before any training code touches it.
env = CustomEnv(config)
check_env(env)

# PPO agent with an MLP policy; TensorBoard logs are written under `logdir`.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=logdir,
    batch_size=64,
)

# Start a fresh episode. As the last expression of the cell, the returned
# initial observation is displayed as the cell output.
env.reset()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


array([0.55073586, 0.45003873, 0.14639814, 0.44848954, 0.5120062 ,
       0.51432998, 0.22385747, 0.49883811, 0.20681642, 0.40975988,
       1.        , 0.92796282, 0.64910922, 0.75290473])

In [4]:

# Evaluate on a dedicated environment instance. `model` was built on `env`, so
# handing that same object to the callback would let every periodic evaluation
# reset and step the environment PPO is currently collecting rollouts from.
eval_env = CustomEnv(config)

# Every `eval_freq` training steps the callback evaluates the agent with
# deterministic actions, logs the results under `logdir`, and stores the best
# model seen so far under `models_dir`.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=models_dir,
    log_path=logdir,
    eval_freq=10000,
    deterministic=True,
    render=False,
)

In [5]:
# Run Training
# Set to True to train; when False the training loop below is skipped.
is_train = False

# Environment steps per learn() call, i.e. the spacing between checkpoints.
TIMESTEPS = 25_000
# Overall training budget. Bounding the loop lets the cell terminate on its own
# instead of running until the kernel is interrupted; 1,000,000 matches the
# name of the checkpoint this notebook loads for evaluation.
TOTAL_TIMESTEPS = 1_000_000

if is_train:
    for iters in range(1, TOTAL_TIMESTEPS // TIMESTEPS + 1):
        # reset_num_timesteps=False keeps one continuous step counter (and one
        # TensorBoard curve) across the successive learn() calls.
        model.learn(
            total_timesteps=TIMESTEPS,
            reset_num_timesteps=False,
            tb_log_name="PPO_Testing_new_bin_penalty",
            callback=eval_callback,
        )
        # Checkpoint named after the cumulative number of timesteps trained.
        model.save(f"{models_dir}/{TIMESTEPS * iters}")


In [6]:
def evaluate(model, env, num_episodes=100, deterministic=False):
    """
    Roll out an RL agent for several episodes and summarise the outcome.

    This function only works for a single (non-vectorized) environment whose
    ``step`` returns ``(obs, reward, done, info)`` with ``info`` being a dict.
    On the terminal step ``info`` must contain ``'termination_cause'`` and
    ``'episode_len'``; ``'is_success'`` is read from the terminal ``info`` too
    and treated as False when absent.

    :param model: (BaseRLModel object) the RL Agent; only ``predict`` is used
    :param env: the environment to run the episodes in
    :param num_episodes: (int) number of episodes to evaluate; must be >= 1
    :param deterministic: (bool) forwarded to ``model.predict``; the default
        ``False`` matches calling ``predict`` without the argument
    :return: (tuple) ``(mean_episode_reward, mean_episode_len,
        termination_cause, is_success)`` where ``termination_cause`` is a
        ``defaultdict(int)`` counting episodes per cause and ``is_success`` is
        the number of episodes that ended successfully
    :raises ValueError: if ``num_episodes`` is smaller than 1
    """
    if num_episodes < 1:
        raise ValueError(f"num_episodes must be >= 1, got {num_episodes}")

    all_episode_rewards = []
    all_episodes_len = []
    termination_cause = defaultdict(int)
    is_success = 0

    for _ in range(num_episodes):
        episode_reward = 0
        done = False
        obs = env.reset()
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs, deterministic=deterministic)
            obs, reward, done, info = env.step(action)
            episode_reward = episode_reward + reward

        # Per-episode bookkeeping uses the terminal step's info only, so each
        # episode contributes at most one success and exactly one cause.
        if info.get('is_success', False):
            is_success += 1
        termination_cause[info['termination_cause']] += 1
        all_episodes_len.append(info['episode_len'])
        all_episode_rewards.append(episode_reward)

    mean_episode_reward = np.mean(all_episode_rewards)
    mean_episode_len = sum(all_episodes_len) / num_episodes

    return mean_episode_reward, mean_episode_len, termination_cause, is_success

# Baseline: evaluate `model` (on which learn() has not been called in this
# notebook when is_train is False) over 100 episodes.
(mean_reward,
 mean_episode_len,
 termination_cause,
 is_success) = evaluate(model, env, num_episodes=100)

print(f"mean_reward:{mean_reward:.2f}")
print(f"mean_episode_len:{mean_episode_len:.2f}")
print(termination_cause)


mean_reward:-13.75
mean_episode_len:2.95
defaultdict(<class 'int'>, {'DUBLICATE_PICK': 67, 'BIN_OVERFLOW': 33})


In [7]:
# Checkpoint to evaluate. Forward slashes are used so the path resolves on
# Windows, Linux and macOS alike (a backslash-separated literal is
# Windows-only), consistent with the other relative paths in this notebook.
PRETRAINED_MODEL_PATH = '../models/1674683760/1000000'

# Load the saved PPO agent and attach the current environment to it.
loaded_model = PPO.load(PRETRAINED_MODEL_PATH, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
# Evaluate the loaded checkpoint over 100 episodes and report the summary.
mean_reward, mean_episode_len, termination_cause, is_success = evaluate(
    loaded_model, env, num_episodes=100
)

for summary_line in (
    f"mean_reward:{mean_reward:.2f}",
    f"mean_episode_len:{mean_episode_len:.2f}",
    termination_cause,
    is_success,
):
    print(summary_line)

mean_reward:6.21
mean_episode_len:6.35
defaultdict(<class 'int'>, {'SUCCESS': 34, 'BIN_OVERFLOW': 66})
34


In [9]:
# Roll out one episode with the loaded agent and print the resulting allocation.

# Multiplier applied to observation values before they are printed.
FACTOR = 1

done = False
obs = env.reset()

# As sliced below, the first 10 observation entries are reported as tasks and
# the remaining entries as nodes.
print("Task capacities:", obs[:10] * FACTOR, "All tasks sum: ", sum(obs[:10]) * FACTOR)
print("Node capacities:", obs[10:] * FACTOR)

while True:
    # _states are only useful when using LSTM policies
    action, _states = loaded_model.predict(obs)
    # `env` is stepped directly here: `done` is used as a plain flag and
    # `info` as a dict.
    obs, reward, done, info = env.step(action)
    if done:
        break

# Final step's info dict, surrounded by blank lines.
print("", info, "", sep="\n")
print("Assignment=", info['assignment_status'])
print()
print(f"Last Action: Allocating Task # {action[0]} in Node/Bin # {action[1]}")
print()

# Observation returned by the terminal step.
print("Tasks after episodes:", obs[:10] * FACTOR)
print("Remaining Node capacities:", obs[10:] * FACTOR)



Task capacities: [0.34077381 0.26488095 0.58705357 0.40178571 0.57738095 0.3296131
 0.2202381  0.36160714 0.43154762 0.49925595] All tasks sum:  4.014136904761905
Node capacities: [0.8764881  1.         0.83630952 0.62127976]
[4 0]
[8 1]
[9 2]
[2 3]
[3 1]
[7 0]

{'is_success': False, 'episode_len': 5, 'termination_cause': 'BIN_OVERFLOW', 'assignment_status': [[4], [8, 3], [9], [2]]}

Assignment= [[4], [8, 3], [9], [2]]

Last Action: Allocating Task # 7 in Node/Bin # 0

Tasks after episodes: [0.34077381 0.26488095 0.         0.         0.         0.3296131
 0.2202381  0.36160714 0.         0.        ]
Remaining Node capacities: [0.29910714 0.16666667 0.33705357 0.03422619]


In [10]:
# Node part of the last observation (entries from index 10 onwards), scaled by FACTOR.
remaining_node_capacities = obs[10:] * FACTOR

In [None]:
# Total capacity of each node, one entry per node, expected to be in the same
# units as `remaining_node_capacities`.
# NOTE(review): the rollout output recorded in this notebook shows 4 node
# capacities with values in [0, 1], so this 5-entry table looks stale for the
# current config -- confirm and update before relying on the result.
NODE_TOTAL_CAPACITIES = np.array([800, 1000, 1200, 1400, 1600])

remaining_capacities = np.asarray(remaining_node_capacities)

# Fail with an explicit message instead of an opaque broadcasting error (or a
# silently broadcast, meaningless result) when the node counts disagree.
if remaining_capacities.shape != NODE_TOTAL_CAPACITIES.shape:
    raise ValueError(
        "remaining_node_capacities has shape "
        f"{remaining_capacities.shape} but NODE_TOTAL_CAPACITIES has shape "
        f"{NODE_TOTAL_CAPACITIES.shape}; provide one total capacity per node"
    )

# Mean free fraction across nodes, turned into an occupied percentage and
# rounded to two decimals.
avg_occupancy = round(100 - np.mean(remaining_capacities / NODE_TOTAL_CAPACITIES) * 100, 2)

In [None]:
# Show the average occupancy value computed in the previous cell.
print(avg_occupancy)