### 0. Helper functions

In [1]:
def get_percentage_filled(env, num_bins=10):
    """Print how full each bin is, as a percentage of capacity.

    The value reported for each entry is ``100 - (env.state / env.capacity) * 100``.
    NOTE(review): this reads ``env.state`` as the space still free in each bin;
    confirm against the environment implementation.

    Args:
        env: Object exposing ``state`` (indexable, supporting element-wise
            arithmetic) and ``capacity``.
        num_bins: How many leading entries of ``env.state`` to report.
            Defaults to 10, the bin count used throughout this notebook.

    Returns:
        None. The report is written to stdout, one line per bin.
    """
    percentages = 100 - ((env.state / env.capacity) * 100)
    for i in range(num_bins):
        print(f"Bin {i+1}: {percentages[i]}%")
        
def print_metrics(env):
    """Print the action counters, their total, and the per-bin fill report.

    Args:
        env: Object exposing ``logs`` (a mapping from counter name to count)
            and whatever ``get_percentage_filled`` reads from it.

    Returns:
        None. Everything is written to stdout.
    """
    print(env.logs)
    print(f'Total timesteps: {sum(env.logs.values())}')
    # get_percentage_filled prints its own report and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    get_percentage_filled(env)

### 1. Import dependencies

In [None]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2

In [2]:
from BinPackingEnvironment1D import BinPacking

### 2. Create environment

In [3]:
# Environment used for the random baseline: 10 bins, a capacity of 20 and a
# minimum item size of 1.
env = BinPacking(num_bins=10, capacity=20, min_item_size=1)

In [4]:
# Sanity check: draw one sample from the action space to see what an action looks like.
env.action_space.sample()

6

In [5]:
# Sanity check: draw one sample from the observation space to see its shape and value range.
env.observation_space.sample()

array([ 2, 19, 19,  1,  1, 13, 12, 20, 19, 12, 19])

### 3. Run baseline test (No ML)

In [6]:
# Per-episode step cap, so an episode that never reports done still terminates.
MAX_STEPS = 1000
episodes = 10

# Baseline without learning: every action is sampled from the action space.
for episode in range(1, episodes+1):
    # Zero the counters *before* the episode so the metrics printed below cover
    # this episode only, whatever state the environment was left in earlier.
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }
    env.reset()
    steps = 0
    done = False
    score = 0

    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        # Only the reward and the done flag are needed for the baseline.
        _obs, reward, done, _info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print_metrics(env)

Episode:1 Score:-2866
{'placed': 44, 'misplaced': 303, 'discarded': 32}
Total timesteps: 379
Bin 1: 95.0%
Bin 2: 100.0%
Bin 3: 100.0%
Bin 4: 95.0%
Bin 5: 95.0%
Bin 6: 95.0%
Bin 7: 100.0%
Bin 8: 100.0%
Bin 9: 100.0%
Bin 10: 100.0%
None
Episode:2 Score:-4298
{'placed': 35, 'misplaced': 445, 'discarded': 43}
Total timesteps: 523
Bin 1: 95.0%
Bin 2: 95.0%
Bin 3: 100.0%
Bin 4: 100.0%
Bin 5: 100.0%
Bin 6: 100.0%
Bin 7: 95.0%
Bin 8: 95.0%
Bin 9: 95.0%
Bin 10: 100.0%
None
Episode:3 Score:-561
{'placed': 49, 'misplaced': 75, 'discarded': 6}
Total timesteps: 130
Bin 1: 100.0%
Bin 2: 95.0%
Bin 3: 100.0%
Bin 4: 100.0%
Bin 5: 95.0%
Bin 6: 100.0%
Bin 7: 95.0%
Bin 8: 95.0%
Bin 9: 100.0%
Bin 10: 95.0%
None
Episode:4 Score:-3633
{'placed': 49, 'misplaced': 379, 'discarded': 42}
Total timesteps: 470
Bin 1: 100.0%
Bin 2: 100.0%
Bin 3: 100.0%
Bin 4: 95.0%
Bin 5: 100.0%
Bin 6: 100.0%
Bin 7: 100.0%
Bin 8: 100.0%
Bin 9: 100.0%
Bin 10: 100.0%
None
Episode:5 Score:-2367
{'placed': 42, 'misplaced': 254, 'discar

### 4. Train an RL Model

In [7]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [8]:
# Directory handed to PPO as its TensorBoard log location. It is created up
# front so that logging cannot fail because the path is missing.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [9]:
# Re-create the environment so training starts from a fresh instance rather
# than the one already stepped by the random baseline.
env = BinPacking(num_bins=10, capacity=20, min_item_size=1)

In [10]:
# PPO agent using the 'MlpPolicy' policy on the environment above; verbose=1
# enables progress output and TensorBoard logs are written under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [11]:
# Train the agent for 20,000 environment timesteps.
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_40
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 663      |
|    ep_rew_mean     | -5.4e+03 |
| time/              |          |
|    fps             | 672      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 432         |
|    ep_rew_mean          | -3.24e+03   |
| time/                   |             |
|    fps                  | 867         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015183326 |
|    clip_fraction        | 0.0927      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.39       |
|    explained_variance   | -0.00103    

<stable_baselines3.ppo.ppo.PPO at 0x1ecb4577e50>

### 5. Save model

In [12]:
# Optional: uncomment to set the path the trained model is saved to.
#PPO_Path = os.path.join('Training', 'Saved Models', 'Constant_PPO_Model_Discard_Penalty')

In [13]:
# Optional: uncomment to save the trained model (requires PPO_Path to be defined).
#model.save(PPO_Path)

### 6. Load model

In [14]:
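# Optional: uncomment to load a previously saved model instead of training one
# (requires PPO_Path to be defined).
#model = PPO.load(PPO_Path, env=env)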

### 7. Test model

In [16]:
# Per-episode step cap, so an episode that never reports done still terminates.
MAX_STEPS = 1000
episodes = 10

# Evaluate the trained policy: actions come from model.predict, not random sampling.
for episode in range(1, episodes+1):
    # Zero the counters *before* the episode. The model was trained on this same
    # env instance, so counts accumulated during training could otherwise leak
    # into the first episode's metrics.
    # NOTE(review): this assumes BinPacking.reset() does not clear env.logs itself — confirm.
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }
    obs = env.reset()
    steps = 0
    done = False
    score = 0

    while not done and steps < MAX_STEPS:
        # The second return value of predict is unused; it is not bound to `_`
        # because that would clobber IPython's last-output variable.
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print_metrics(env)

Episode:1 Score:-359
{'placed': 51, 'misplaced': 47, 'discarded': 86}
Total timesteps: 184
Bin 1: 100.0%
Bin 2: 100.0%
Bin 3: 100.0%
Bin 4: 95.0%
Bin 5: 100.0%
Bin 6: 100.0%
Bin 7: 95.0%
Bin 8: 95.0%
Bin 9: 100.0%
Bin 10: 100.0%
None
Episode:2 Score:-522
{'placed': 45, 'misplaced': 62, 'discarded': 101}
Total timesteps: 208
Bin 1: 100.0%
Bin 2: 100.0%
Bin 3: 95.0%
Bin 4: 100.0%
Bin 5: 100.0%
Bin 6: 100.0%
Bin 7: 100.0%
Bin 8: 100.0%
Bin 9: 100.0%
Bin 10: 100.0%
None
Episode:3 Score:-145
{'placed': 50, 'misplaced': 28, 'discarded': 62}
Total timesteps: 140
Bin 1: 100.0%
Bin 2: 100.0%
Bin 3: 95.0%
Bin 4: 100.0%
Bin 5: 100.0%
Bin 6: 100.0%
Bin 7: 95.0%
Bin 8: 100.0%
Bin 9: 100.0%
Bin 10: 95.0%
None
Episode:4 Score:-39
{'placed': 46, 'misplaced': 20, 'discarded': 35}
Total timesteps: 101
Bin 1: 95.0%
Bin 2: 100.0%
Bin 3: 100.0%
Bin 4: 100.0%
Bin 5: 95.0%
Bin 6: 95.0%
Bin 7: 100.0%
Bin 8: 100.0%
Bin 9: 100.0%
Bin 10: 95.0%
None
Episode:5 Score:-524
{'placed': 53, 'misplaced': 63, 'discarded