### 1. Import Dependencies

In [1]:
# Optional one-time environment setup; every line is commented out, so running
# this cell installs nothing.
# NOTE(review): the cells visible below import `gym` and `stable_baselines3`,
# but stable-baselines3 is not listed here, and tensorflow / keras / keras-rl2
# are not imported by any visible cell -- confirm this list is still current.
#!pip install tensorflow==2.7.0
#!pip install gym
#!pip install keras
#!pip install keras-rl2

In [1]:
from collections import Counter
from VMAllocationEnvironment import VMAllocationEnvironment

### 2. Create Environment

In [2]:
# Build the environment instance that the cells below step through.
# NOTE(review): the meaning of the two positional arguments is defined in
# VMAllocationEnvironment.py, which is not shown here -- confirm, and consider
# passing them by keyword. The output saved with this cell is
# `ValueError: high <= 0`, so verify that this call succeeds on a fresh kernel.
env = VMAllocationEnvironment(10, 20)

ValueError: high <= 0

### 3. Run control test

In [14]:
# Control run: drive the environment with randomly sampled actions and report
# per-episode score, step count and placement statistics.
MAX_STEPS = 1000  # upper bound on steps per episode, in case `done` is never set
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0

    # Per-episode totals of the outcome counts reported in each step's `info`.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Counter.update() adds the counts in place and keeps zero-count keys.
        # The previous `session_info += Counter(info)` silently removed every
        # key whose total was not positive, so an outcome that never occurred
        # in an episode disappeared and the report below failed to unpack.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the totals by key instead of relying on a sort order of the keys.
    placed = session_info['placed']
    misplaced = session_info['misplaced']
    discarded = session_info['discarded']

    # Share of placement attempts that were correct; 0.0 when nothing was
    # placed or misplaced, which avoids a ZeroDivisionError.
    attempts = placed + misplaced
    accuracy = (placed / attempts) * 100 if attempts else 0.0
    print('Accuracy of placed items: {:.3}%'.format(accuracy))

    # Share of all counted outcomes (including discards) that were correct.
    total = attempts + discarded
    first_time = (placed / total) * 100 if total else 0.0
    print('Items placed correctly first time: {:.3}%'.format(first_time))

    ordered_info = [
        ('placed', placed), ('misplaced', misplaced), ('discarded', discarded)
    ]
    print(ordered_info)

    print()

Episode:1 Score:-8275
Total steps: 405
Accuracy of placed items: 9.66%
Items placed correctly first time: 8.4%
[('placed', 34), ('misplaced', 318), ('discarded', 53)]

Episode:2 Score:-26405
Total steps: 1000
Accuracy of placed items: 3.37%
Items placed correctly first time: 3.1%
[('placed', 31), ('misplaced', 889), ('discarded', 80)]

Episode:3 Score:-25656
Total steps: 1000
Accuracy of placed items: 3.96%
Items placed correctly first time: 3.6%
[('placed', 36), ('misplaced', 873), ('discarded', 91)]

Episode:4 Score:-20915
Total steps: 1000
Accuracy of placed items: 5.12%
Items placed correctly first time: 4.6%
[('placed', 46), ('misplaced', 852), ('discarded', 102)]

Episode:5 Score:-26441
Total steps: 1000
Accuracy of placed items: 4.1%
Items placed correctly first time: 3.8%
[('placed', 38), ('misplaced', 888), ('discarded', 74)]

Episode:6 Score:-29334
Total steps: 1000
Accuracy of placed items: 4.7%
Items placed correctly first time: 4.3%
[('placed', 43), ('misplaced', 871), ('d

### 4. Train RL model

In [5]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [6]:
# Directory that receives the TensorBoard logs written during training.
# It is created up front (a no-op when it already exists) so that a fresh
# checkout does not depend on the folder having been made by hand.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [7]:
# Rebinds `env` to the environment instance that is handed to PPO below.
# NOTE(review): `BinPacking2D` is not imported or defined in any cell visible
# in this notebook (only `VMAllocationEnvironment` is imported), so this line
# raises NameError on a fresh kernel -- confirm which class is intended and
# import it explicitly.
env = BinPacking2D(num_bins=10, capacity=20)

In [8]:
# Instantiate a PPO agent with the 'MlpPolicy' policy on `env`; verbose=1 turns
# on progress output and training metrics are logged for TensorBoard under
# `log_path`.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
# Train the agent with a budget of 80,000 environment timesteps. The call is
# the last expression in the cell, so its return value (the PPO object) is
# what the notebook displays as the cell result.
model.learn(total_timesteps=80000)

Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 280  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 365         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007914768 |
|    clip_fraction        | 0.037       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.39       |
|    explained_variance   | -0.00159    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.1e+05     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00968    |
|    value_loss           | 2.14e+05    |
-----------------------------------------
---

-----------------------------------------
| time/                   |             |
|    fps                  | 561         |
|    iterations           | 13          |
|    time_elapsed         | 47          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.012252329 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.17       |
|    explained_variance   | 1.35e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 178         |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0131     |
|    value_loss           | 565         |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 566         |
|    iterations           | 14          |
|    time_elapsed         | 50          |
|    total_timesteps      | 28672 

------------------------------------------
| time/                   |              |
|    fps                  | 557          |
|    iterations           | 24           |
|    time_elapsed         | 88           |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0006069033 |
|    clip_fraction        | 0.000195     |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.08        |
|    explained_variance   | 0.0937       |
|    learning_rate        | 0.0003       |
|    loss                 | 287          |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.00185     |
|    value_loss           | 1.79e+03     |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 558          |
|    iterations           | 25           |
|    time_elapsed         | 91           |
|    total_

-----------------------------------------
| time/                   |             |
|    fps                  | 549         |
|    iterations           | 35          |
|    time_elapsed         | 130         |
|    total_timesteps      | 71680       |
| train/                  |             |
|    approx_kl            | 0.011336904 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.09       |
|    explained_variance   | 1.19e-07    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0145      |
|    n_updates            | 340         |
|    policy_gradient_loss | -0.00182    |
|    value_loss           | 1.35        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 547         |
|    iterations           | 36          |
|    time_elapsed         | 134         |
|    total_timesteps      | 73728 

<stable_baselines3.ppo.ppo.PPO at 0x1a169a60eb0>

### 5. Save model

In [10]:
# Location used by the save and load cells for the trained model.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    '2D_Bin_Packing',
)

In [11]:
# Persist the trained model to the location held in `PPO_Path`.
model.save(PPO_Path)

### 6. Load model

In [7]:
# Reload the saved model from `PPO_Path` and attach the current `env` to it,
# replacing the in-memory `model`.
# NOTE(review): this cell's execution count (In [7]) is lower than that of the
# save cell (In [11]), so it was last run in a different order than the one
# shown -- confirm the notebook works under Restart Kernel -> Run All.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [15]:
# Evaluation run: let the loaded model choose the actions and report
# per-episode score, step count and placement statistics.
MAX_STEPS = 1000  # upper bound on steps per episode, in case `done` is never set
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0

    # Per-episode totals of the outcome counts reported in each step's `info`.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        # NOTE(review): predict() is called with its default arguments; pass
        # deterministic=True if a deterministic evaluation is wanted.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Counter.update() adds the counts in place and keeps zero-count keys.
        # The previous `session_info += Counter(info)` silently removed every
        # key whose total was not positive, so an outcome that never occurred
        # in an episode disappeared and the report below failed to unpack.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the totals by key instead of relying on a sort order of the keys.
    placed = session_info['placed']
    misplaced = session_info['misplaced']
    discarded = session_info['discarded']

    # Share of placement attempts that were correct; 0.0 when nothing was
    # placed or misplaced, which avoids a ZeroDivisionError.
    attempts = placed + misplaced
    accuracy = (placed / attempts) * 100 if attempts else 0.0
    print('Accuracy of placed items: {:.3}%'.format(accuracy))

    # Share of all counted outcomes (including discards) that were correct.
    total = attempts + discarded
    first_time = (placed / total) * 100 if total else 0.0
    print('Items placed correctly first time: {:.3}%'.format(first_time))

    ordered_info = [
        ('placed', placed), ('misplaced', misplaced), ('discarded', discarded)
    ]
    print(ordered_info)

    print()

Episode:1 Score:-10257
Total steps: 1000
Accuracy of placed items: 4.73%
Items placed correctly first time: 3.5%
[('placed', 35), ('misplaced', 705), ('discarded', 260)]

Episode:2 Score:-14833
Total steps: 992
Accuracy of placed items: 4.82%
Items placed correctly first time: 4.33%
[('placed', 43), ('misplaced', 849), ('discarded', 100)]

Episode:3 Score:-4941
Total steps: 1000
Accuracy of placed items: 10.3%
Items placed correctly first time: 3.9%
[('placed', 39), ('misplaced', 340), ('discarded', 621)]

Episode:4 Score:-13389
Total steps: 1000
Accuracy of placed items: 4.44%
Items placed correctly first time: 4.0%
[('placed', 40), ('misplaced', 860), ('discarded', 100)]

Episode:5 Score:-773
Total steps: 233
Accuracy of placed items: 26.1%
Items placed correctly first time: 17.2%
[('placed', 40), ('misplaced', 113), ('discarded', 80)]

Episode:6 Score:-11410
Total steps: 1000
Accuracy of placed items: 4.17%
Items placed correctly first time: 3.9%
[('placed', 39), ('misplaced', 896),