### 1. Import Dependencies

In [1]:
# NOTE(review): the visible cells import `gym` and `stable_baselines3`; none of
# them import tensorflow, keras or keras-rl2 -- confirm this list is current.
# If these are re-enabled, prefer `%pip install ...` over `!pip install ...` so
# the packages are installed into the kernel's own environment.
#!pip install tensorflow==2.7.0
#!pip install gym
#!pip install keras
#!pip install keras-rl2

In [4]:
from collections import Counter
from BinPackingEnvironment2D import BinPacking2D

### 2. Create Environment

In [5]:
# Environment used by the control run below. The arguments are positional here;
# a later cell builds the same class with num_bins=10, capacity=20 -- presumably
# the same order, TODO confirm against BinPacking2D's signature.
env = BinPacking2D(10, 20)

### 3. Run control test (random-action baseline)

In [24]:
MAX_STEPS = 1000
episodes = 10

# Control run: play `episodes` episodes with actions sampled uniformly from the
# action space, capped at MAX_STEPS steps each, and report per-episode totals.
for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0

    # Running totals of the per-step outcome counts reported by the environment.
    # Assumes `info` maps outcome names to numeric counts -- TODO confirm
    # against BinPacking2D.step.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Counter.update() adds the counts but, unlike `+=`, does not drop keys
        # whose total is zero, so all three outcomes are always present.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the totals by key instead of relying on sort order and unpacking,
    # which breaks when a key is missing or an extra key shows up in `info`.
    placed = session_info['placed']
    misplaced = session_info['misplaced']
    discarded = session_info['discarded']
    ordered_info = [
        ('placed', placed),
        ('misplaced', misplaced),
        ('discarded', discarded),
    ]

    # Guard the ratios: an episode may end without any placement attempt.
    attempts = placed + misplaced
    accuracy = (placed / attempts) * 100 if attempts else 0.0
    # '.3g' rather than '.3': the latter renders 100.0 as '1e+02'.
    print('Accuracy of placed items: {:.3g}%'.format(accuracy))

    total_items = attempts + discarded
    first_time = (placed / total_items) * 100 if total_items else 0.0
    print('Items placed correctly first time: {:.3g}%'.format(first_time))

    print(ordered_info)

    print()

    #print(env.logs)
    #control_data.log(env)
    #env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:-19443
Total steps: 1000
Accuracy of placed items: 3.99%
Items placed correctly first time: 3.6%
[('placed', 36), ('misplaced', 867), ('discarded', 97)]

Episode:2 Score:-18411
Total steps: 1000
Accuracy of placed items: 4.57%
Items placed correctly first time: 4.1%
[('placed', 41), ('misplaced', 856), ('discarded', 103)]

Episode:3 Score:-24898
Total steps: 1000
Accuracy of placed items: 4.23%
Items placed correctly first time: 3.9%
[('placed', 39), ('misplaced', 883), ('discarded', 78)]

Episode:4 Score:-21682
Total steps: 1000
Accuracy of placed items: 3.88%
Items placed correctly first time: 3.5%
[('placed', 35), ('misplaced', 868), ('discarded', 97)]

Episode:5 Score:-18271
Total steps: 1000
Accuracy of placed items: 4.34%
Items placed correctly first time: 3.9%
[('placed', 39), ('misplaced', 860), ('discarded', 101)]

Episode:6 Score:-26122
Total steps: 1000
Accuracy of placed items: 3.71%
Items placed correctly first time: 3.4%
[('placed', 34), ('misplaced', 883)

### 4. Train RL model

In [7]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [8]:
# Directory for TensorBoard logs. Create it up front (a no-op when it already
# exists) so a missing folder cannot cause a failure later in the notebook.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [9]:
# Rebind `env` to a fresh environment instance for training.
env = BinPacking2D(
    num_bins=10,
    capacity=20,
)

In [10]:
# Fixed seed so that training can be reproduced on Restart & Run All.
# NOTE(review): this only helps if the environment draws its randomness from
# the RNGs that stable-baselines3 seeds -- confirm for BinPacking2D.
SEED = 42

# PPO agent with an MLP policy; verbose=1 prints training progress and
# tensorboard_log points TensorBoard output at `log_path`.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    seed=SEED,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [11]:
# Train for 80k timesteps. The call is the cell's last expression, so the
# returned object's repr is displayed as the cell output.
model.learn(
    total_timesteps=80_000,
)

Logging to Training\Logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 78   |
|    iterations      | 1    |
|    time_elapsed    | 26   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 144          |
|    iterations           | 2            |
|    time_elapsed         | 28           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0063706646 |
|    clip_fraction        | 0.0131       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.4         |
|    explained_variance   | -0.000475    |
|    learning_rate        | 0.0003       |
|    loss                 | 7.28e+04     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.007       |
|    value_loss           | 1.5e+05      |
----------------------------

-----------------------------------------
| time/                   |             |
|    fps                  | 551         |
|    iterations           | 13          |
|    time_elapsed         | 48          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.016442388 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.25       |
|    explained_variance   | 2.46e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 1.18e+03    |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0107     |
|    value_loss           | 2.77e+03    |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 572         |
|    iterations           | 14          |
|    time_elapsed         | 50          |
|    total_timesteps      | 28672 

-----------------------------------------
| time/                   |             |
|    fps                  | 725         |
|    iterations           | 24          |
|    time_elapsed         | 67          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.025423264 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.04       |
|    explained_variance   | 5.96e-08    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.182       |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00183    |
|    value_loss           | 8.11        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 736         |
|    iterations           | 25          |
|    time_elapsed         | 69          |
|    total_timesteps      | 51200 

-----------------------------------------
| time/                   |             |
|    fps                  | 818         |
|    iterations           | 35          |
|    time_elapsed         | 87          |
|    total_timesteps      | 71680       |
| train/                  |             |
|    approx_kl            | 0.001889704 |
|    clip_fraction        | 0.0133      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.99       |
|    explained_variance   | 0.761       |
|    learning_rate        | 0.0003      |
|    loss                 | 14.1        |
|    n_updates            | 340         |
|    policy_gradient_loss | -8.96e-05   |
|    value_loss           | 38.2        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 825         |
|    iterations           | 36          |
|    time_elapsed         | 89          |
|    total_timesteps      | 73728 

<stable_baselines3.ppo.ppo.PPO at 0x1fc809eb670>

### 5. Save model

In [12]:
# Path (relative to the working directory) under which the model is saved and
# from which it is loaded again further down.
_model_path_parts = ('Training', 'Saved Models', '2D_Bin_Packing')
PPO_Path = os.path.join(*_model_path_parts)

In [13]:
# Persist the trained model at PPO_Path so it can be reloaded without retraining.
model.save(PPO_Path)

### 6. Load model

In [7]:
# Rebind `model` to the checkpoint stored at PPO_Path, attached to `env`.
# NOTE(review): this cell needs `PPO`, `PPO_Path` and `env` from earlier cells;
# its execution count (In [7]) comes after In [13], which suggests it was run
# out of order -- verify the notebook with Restart Kernel -> Run All.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [23]:
MAX_STEPS = 1000
episodes = 10

# Evaluation run: play `episodes` episodes with actions chosen by the loaded
# model, capped at MAX_STEPS steps each, and report per-episode totals.
for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0

    # Running totals of the per-step outcome counts reported by the environment.
    # Assumes `info` maps outcome names to numeric counts -- TODO confirm
    # against BinPacking2D.step.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Counter.update() adds the counts but, unlike `+=`, does not drop keys
        # whose total is zero, so all three outcomes are always present.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the totals by key instead of relying on sort order and unpacking,
    # which breaks when a key is missing (e.g. the agent never misplaces an
    # item) or an extra key shows up in `info`.
    placed = session_info['placed']
    misplaced = session_info['misplaced']
    discarded = session_info['discarded']
    ordered_info = [
        ('placed', placed),
        ('misplaced', misplaced),
        ('discarded', discarded),
    ]

    # Guard the ratios: an episode may end without any placement attempt.
    attempts = placed + misplaced
    accuracy = (placed / attempts) * 100 if attempts else 0.0
    # '.3g' rather than '.3': the latter renders 100.0 as '1e+02'.
    print('Accuracy of placed items: {:.3g}%'.format(accuracy))

    total_items = attempts + discarded
    first_time = (placed / total_items) * 100 if total_items else 0.0
    print('Items placed correctly first time: {:.3g}%'.format(first_time))

    print(ordered_info)

    print()
    #real_data.log(env)
    #env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:-786
Total steps: 1000
Accuracy of placed items: 57.8%
Items placed correctly first time: 5.2%
[('placed', 52), ('misplaced', 38), ('discarded', 910)]

Episode:2 Score:-912
Total steps: 1000
Accuracy of placed items: 39.3%
Items placed correctly first time: 4.6%
[('placed', 46), ('misplaced', 71), ('discarded', 883)]

Episode:3 Score:-1690
Total steps: 1000
Accuracy of placed items: 8.7%
Items placed correctly first time: 4.9%
[('placed', 49), ('misplaced', 514), ('discarded', 437)]

Episode:4 Score:-1329
Total steps: 1000
Accuracy of placed items: 25.1%
Items placed correctly first time: 5.7%
[('placed', 57), ('misplaced', 170), ('discarded', 773)]

Episode:5 Score:-1037
Total steps: 1000
Accuracy of placed items: 53.9%
Items placed correctly first time: 4.8%
[('placed', 48), ('misplaced', 41), ('discarded', 911)]

Episode:6 Score:-1323
Total steps: 1000
Accuracy of placed items: 18.4%
Items placed correctly first time: 4.9%
[('placed', 49), ('misplaced', 218), ('disca