### 1. Import Dependencies

In [1]:
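# Uncomment to install the dependencies into the kernel's own environment
# (%pip targets the running kernel; !pip may hit a different interpreter).
# stable-baselines3 is required by the training cells in section 4.
#%pip install tensorflow==2.7.0
#%pip install gym
#%pip install keras
#%pip install keras-rl2
#%pip install stable-baselines3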

In [1]:
from collections import Counter
from BinPackingEnvironment2D import BinPacking2D

### 2. Create Environment

In [2]:
# Environment instance driven by the random-action control test below.
# NOTE(review): the positional arguments presumably map to num_bins=10,
# capacity=20 (the keyword form used when the environment is rebuilt for
# training) — confirm against BinPacking2D's signature.
env = BinPacking2D(10, 20)

### 3. Run control test

In [3]:
MAX_STEPS = 1000
episodes = 10

# Control run: drive the environment with randomly sampled actions so the
# trained agent's scores and outcome counts have a baseline to compare against.
for episode in range(1, episodes+1):
    env.reset()
    steps = 0
    done = False
    score = 0

    # Per-episode tally of step outcomes, seeded with zeros so every category
    # is reported even when it never occurs.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    # Cap the episode at MAX_STEPS in case the environment never reports done.
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        _next_state, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Fold this step's outcome into the tally (previously the tally was
        # never updated, so the report below was always all zeros).
        # update() is used instead of `+=` because Counter addition drops
        # keys whose count is not positive.
        # Assumes `info` maps outcome names to counts, as the evaluation loop
        # in section 7 relies on via Counter(info) — TODO confirm.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))

    # Sort by key, descending, so the report always reads
    # placed / misplaced / discarded regardless of insertion order.
    ordered_info = \
        sorted(dict(session_info).items(), key=lambda x:x[0], reverse=True)

    print(ordered_info)

Episode:1 Score:-24448
Episode:2 Score:-23442
Episode:3 Score:-23800
Episode:4 Score:-24532
Episode:5 Score:-21684
Episode:6 Score:-25411
Episode:7 Score:-19218
Episode:8 Score:-20587
Episode:9 Score:-23839
Episode:10 Score:-22060


### 4. Train RL model

In [4]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [5]:
# Directory handed to PPO as `tensorboard_log`.
# Create it (and any missing parents) up front so the notebook runs from a
# clean checkout instead of depending on a folder made by hand.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [6]:
# Fresh environment instance for training; rebinds `env`, replacing the
# instance used by the control test.
env = BinPacking2D(
    num_bins=10,
    capacity=20,
)

In [7]:
# PPO agent using the 'MlpPolicy' policy on the bin-packing environment.
# verbose=1 prints the per-iteration training tables; tensorboard_log sends
# TensorBoard data to log_path.
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
# Train the agent for 80,000 environment timesteps.
model.learn(total_timesteps=80_000)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 202  |
|    iterations      | 1    |
|    time_elapsed    | 10   |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 290          |
|    iterations           | 2            |
|    time_elapsed         | 14           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065387916 |
|    clip_fraction        | 0.0242       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.39        |
|    explained_variance   | 2.67e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | 1.06e+05     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00584     |
|    value_loss           | 1.93e+05     |
----------------------------

-----------------------------------------
| time/                   |             |
|    fps                  | 356         |
|    iterations           | 13          |
|    time_elapsed         | 74          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.012903697 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.19       |
|    explained_variance   | 1.06e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 309         |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0084     |
|    value_loss           | 667         |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 361         |
|    iterations           | 14          |
|    time_elapsed         | 79          |
|    total_timesteps      | 28672 

------------------------------------------
| time/                   |              |
|    fps                  | 428          |
|    iterations           | 24           |
|    time_elapsed         | 114          |
|    total_timesteps      | 49152        |
| train/                  |              |
|    approx_kl            | 0.0016385126 |
|    clip_fraction        | 0.00659      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.13        |
|    explained_variance   | 0.00825      |
|    learning_rate        | 0.0003       |
|    loss                 | 203          |
|    n_updates            | 230          |
|    policy_gradient_loss | -0.0051      |
|    value_loss           | 730          |
------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 434          |
|    iterations           | 25           |
|    time_elapsed         | 117          |
|    total_

------------------------------------------
| time/                   |              |
|    fps                  | 469          |
|    iterations           | 35           |
|    time_elapsed         | 152          |
|    total_timesteps      | 71680        |
| train/                  |              |
|    approx_kl            | 0.0076543456 |
|    clip_fraction        | 0.0337       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.07        |
|    explained_variance   | 0.839        |
|    learning_rate        | 0.0003       |
|    loss                 | 40.2         |
|    n_updates            | 340          |
|    policy_gradient_loss | -0.00485     |
|    value_loss           | 88.3         |
------------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 472         |
|    iterations           | 36          |
|    time_elapsed         | 155         |
|    total_times

<stable_baselines3.ppo.ppo.PPO at 0x2224330ad60>

### 5. Save model

In [9]:
# Path the trained model is saved to and later reloaded from.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    '2D_Bin_Packing',
)

In [10]:
# Persist the trained agent at PPO_Path so it can be reloaded without retraining.
model.save(path=PPO_Path)

### 6. Load model

In [7]:
# Restore the saved agent from PPO_Path and attach the current environment.
model = PPO.load(path=PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [18]:
MAX_STEPS = 1000
episodes = 10

# Evaluation run: same protocol as the control test, but actions come from
# the trained model instead of random sampling.
for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0

    # Per-episode tally of step outcomes, seeded with zeros so every category
    # is reported even when it never occurs.
    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    # Cap the episode at MAX_STEPS in case the environment never reports done.
    while not done and steps < MAX_STEPS:
        # NOTE(review): predict() is called with its defaults; SB3 samples
        # actions stochastically unless deterministic=True is passed —
        # confirm that is intended for evaluation.
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # update() rather than `+= Counter(info)`: Counter addition discards
        # keys whose count is not positive, which silently removed categories
        # that never occurred and made key order depend on event order.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))

    # Sort by key, descending, so the report always reads
    # placed / misplaced / discarded regardless of insertion order.
    ordered_info = \
        sorted(dict(session_info).items(), key=lambda x:x[0], reverse=True)

    print(ordered_info)

Episode:1 Score:-1592
[('placed', 43), ('misplaced', 127), ('discarded', 830)]
Episode:2 Score:-3600
[('placed', 44), ('misplaced', 267), ('discarded', 689)]
Episode:3 Score:-1280
[('placed', 45), ('misplaced', 107), ('discarded', 848)]
Episode:4 Score:-2075
[('placed', 46), ('misplaced', 326), ('discarded', 628)]
Episode:5 Score:-1973
[('placed', 46), ('misplaced', 189), ('discarded', 765)]
Episode:6 Score:-2001
[('placed', 42), ('misplaced', 120), ('discarded', 838)]
Episode:7 Score:-795
[('placed', 42), ('misplaced', 70), ('discarded', 888)]
Episode:8 Score:-4513
[('placed', 48), ('misplaced', 513), ('discarded', 439)]
Episode:9 Score:-1836
[('placed', 45), ('misplaced', 124), ('discarded', 831)]
Episode:10 Score:-695
[('placed', 42), ('misplaced', 101), ('discarded', 138)]
