In [1]:
import random
from collections import defaultdict

import numpy as np
from stable_baselines3 import PPO,DQN,A2C
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

from cades_env import CadesEnv
from rl_env import get_benchmark_rewards,compute_reward, critical_task_reward
from states_generator import StatesGenerator
from utils import evaluate

In [2]:
import time

import numpy as np

from config import get_config

# get_config() returns a pair; only its first element is used by this notebook.
config, _ = get_config()

# Take the timestamp once so the TensorBoard logs and the saved models of one
# run always share the same id. Two separate time.time() calls can straddle a
# second boundary and produce two different directory names.
RUN_ID = int(time.time())
logdir = f"../logs/{RUN_ID}/"
models_dir = f"../models/{RUN_ID}/"

In [3]:
# Instantiate the environment and run SB3's environment checker on it before
# handing it to the algorithm.
env = CadesEnv(config)
check_env(env)

# PPO settings gathered in one place; TensorBoard logs are written under `logdir`.
ppo_settings = dict(
    verbose=1,
    tensorboard_log=logdir,
    batch_size=128,
    device='cuda',
)
model = PPO('MultiInputPolicy', env, **ppo_settings)

env.reset()
# Last expression of the cell, so its return value is shown as the cell output.
env.get_env_info()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


{'tasks_total_cost': 5204.0,
 'nodes_total_capacity': 6400.0,
 'extra_capacity': 19.0}

In [4]:

# Evaluate on a dedicated environment instance rather than on the training
# `env`: EvalCallback resets and steps whatever env it is given, which would
# otherwise disturb the rollout PPO is collecting on the training env.
# Monitor records per-episode reward/length for the evaluation episodes.
eval_env = Monitor(CadesEnv(config))

# Every `eval_freq` callback steps the policy is evaluated deterministically;
# the best model so far is stored under `models_dir`, results under `logdir`.
eval_callback = EvalCallback(eval_env, best_model_save_path=models_dir,
                             log_path=logdir, eval_freq=10000,
                             deterministic=True, render=False)

In [None]:
# Run Training
is_train = True
EPOCHS = 10
if is_train:
    TIMESTEPS = 25000
    iters = 0  # stays 0 when EPOCHS is 0, otherwise ends up equal to EPOCHS
    # Train in EPOCHS chunks of TIMESTEPS steps and checkpoint after each chunk.
    # reset_num_timesteps=False keeps a single running timestep counter, so all
    # chunks continue the same TensorBoard run.
    for iters in range(1, EPOCHS + 1):
        model.learn(
            total_timesteps=TIMESTEPS,
            reset_num_timesteps=False,
            tb_log_name="ppo_Test",
            callback=eval_callback,
        )
        model.save(f"{models_dir}/{iters}")



Logging to ../logs/1677948409/ppo_Test_0
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.36     |
|    ep_rew_mean     | -14.4    |
| time/              |          |
|    fps             | 493      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 3.85         |
|    ep_rew_mean          | -14.8        |
| time/                   |              |
|    fps                  | 460          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0071161725 |
|    clip_fraction        | 0.0139       |
|    clip_range           | 0.2          |
|    entropy_loss         | -3.69        |
|    explained_va



New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 4.1      |
|    ep_rew_mean     | -13      |
| time/              |          |
|    fps             | 506      |
|    iterations      | 5        |
|    time_elapsed    | 20       |
|    total_timesteps | 10240    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.4         |
|    ep_rew_mean          | -12.2       |
| time/                   |             |
|    fps                  | 513         |
|    iterations           | 6           |
|    time_elapsed         | 23          |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.010881149 |
|    clip_fraction        | 0.0929      |
|    clip_range           | 0.2         |
|    entropy_loss         | -3.62       |
|    explained_variance   | -0.00327    |
|    lea

In [49]:
# Load a previously saved checkpoint and attach the current environment to it.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS, whereas a backslash-separated path only works on Windows; this
# also matches the separator used for `logdir` / `models_dir`.
# NOTE(review): the run id and checkpoint name are hard-coded -- point them at
# the run you actually want to evaluate.
loaded_model = PPO.load('../models/1677681542/50000', env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [50]:
# Evaluate the loaded policy over 100 episodes and report the aggregate metrics.
# The fifth element returned by evaluate() is not needed in this cell.
eval_result = evaluate(loaded_model, env, 100)
mean_reward, mean_episode_len, termination_cause, is_success, _ = eval_result

print(f"mean_reward:{mean_reward:.2f}")
print(f"mean_episode_len:{mean_episode_len:.2f}")
# Printed as-is: a mapping from termination cause to number of episodes.
print(termination_cause)


mean_reward:-8.35
mean_episode_len:2.35
defaultdict(<class 'int'>, {'BIN_OVERFLOW': 93, 'DUBLICATE_PICK': 7})


In [40]:
import time

In [41]:
# evaluation over N episodes
# A single constant drives both the number of rollouts and the per-episode
# average below, so the two can never drift apart.
N_EVAL_EPISODES = 100

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
mean_reward, mean_episode_len, termination_cause, is_success, avg_occupancy_ratio = evaluate(
    loaded_model, env, N_EVAL_EPISODES
)
# Stop the clock right after the rollouts so that printing and the NumPy
# reduction below are not counted as evaluation time.
elapsed = time.perf_counter() - t1

print(f"mean_reward:{mean_reward:.2f}")
print(f"mean_episode_len:{mean_episode_len:.2f}")
print(termination_cause)
print("Average Occupancy ratio:", np.array(avg_occupancy_ratio).mean())
print("Average time per input:", elapsed / N_EVAL_EPISODES)

mean_reward:5.11
mean_episode_len:6.80
defaultdict(<class 'int'>, {'SUCCESS': 29, 'BIN_OVERFLOW': 70, 'DUBLICATE_PICK': 1})
Average Occupancy ratio: 76.4451724137931
Average time per input: 0.003796532154083252


In [35]:
# Roll out one episode with the loaded policy and print every allocation step.
done = False
obs = env.reset()

# Observation arrays are multiplied by the env's norm_factor before printing
# (presumably to show them in un-normalised units -- confirm in CadesEnv).
FACTOR = env.norm_factor

task_costs = obs['tasks'] * FACTOR
print("Task Costs:", task_costs, "All tasks sum: ", sum(obs['tasks']) * FACTOR)
print("Node capacities:", obs['nodes'] * FACTOR)

while True:
    # predict() also returns a recurrent state, which only matters for LSTM policies.
    action, _states = loaded_model.predict(obs)
    # `env` is the plain (non-vectorised) environment here, so step() yields a
    # single observation, reward, done flag and info dict.
    obs, reward, done, info = env.step(action)
    print(action)
    print("Remaining Node capacities:", obs['nodes'] * FACTOR, end="\n\n")
    if done:
        break

# Summary of the finished episode, taken from the last step's info dict.
print()
print(info, end="\n\n")
print("Assignment=", info['assignment_status'], end="\n\n")
print(f"Last Action: Allocating Task # {action[0]} in Node/Bin # {action[1]}", end="\n\n")





Task Costs: [103. 271. 685. 588. 277. 605. 205. 302. 411. 626.] All tasks sum:  4073.0000000000005
Node capacities: [1000. 1800. 1800. 1400.]
[2 1]
Remaining Node capacities: [1000. 1115. 1800. 1400.]

[3 2]
Remaining Node capacities: [1000. 1115. 1212. 1400.]

[9 3]
Remaining Node capacities: [1000. 1115. 1212.  774.]

[5 1]
Remaining Node capacities: [1000.  510. 1212.  774.]

[8 2]
Remaining Node capacities: [1000.  510.  801.  774.]

[1 0]
Remaining Node capacities: [729. 510. 801. 774.]

[7 0]
Remaining Node capacities: [427. 510. 801. 774.]

[4 3]
Remaining Node capacities: [427. 510. 801. 497.]

[6 2]
Remaining Node capacities: [427. 510. 596. 497.]

[0 2]
Remaining Node capacities: [427. 510. 493. 497.]


{'is_success': True, 'episode_len': 10, 'termination_cause': 'SUCCESS', 'assignment_status': [[1, 7], [2, 5], [3, 8, 6, 0], [9, 4]]}

Assignment= [[1, 7], [2, 5], [3, 8, 6, 0], [9, 4]]

Last Action: Allocating Task # 0 in Node/Bin # 2



In [29]:
# Node capacities left at the end of the episode, scaled by FACTOR.
# Relies on `obs` and `FACTOR` still being defined by the single-episode
# rollout cell, so that cell must be run first.
remaining_node_capacities = obs['nodes'] * FACTOR

In [None]:
# Average node occupancy in percent: 100 - mean(remaining / initial) * 100.
# NOTE(review): the initial capacities are hard-coded; they must describe the
# same nodes (count and order) as the episode that produced
# `remaining_node_capacities` -- confirm against the environment config.
INITIAL_NODE_CAPACITIES = np.array([800, 1000, 1200, 1400, 1600])

remaining = np.asarray(remaining_node_capacities)
if remaining.shape != INITIAL_NODE_CAPACITIES.shape:
    # Without this check a length mismatch either fails with an opaque
    # broadcasting error or, for a single-element input, silently broadcasts
    # into a meaningless ratio.
    raise ValueError(
        f"Expected {INITIAL_NODE_CAPACITIES.size} remaining node capacities to match "
        f"INITIAL_NODE_CAPACITIES, got shape {remaining.shape}"
    )

avg_occupancy = round(100 - np.mean(remaining / INITIAL_NODE_CAPACITIES) * 100, 2)

In [None]:
# Show the average occupancy percentage computed in the previous cell.
print(avg_occupancy)