### 0. Helper functions

In [56]:
import numpy as np

class knapsack_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0)
        self.min_value = 1_000_000
        self.max_value = -1
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):        
        self.total_capacity += np.add(self.total_capacity, env.num_knapsacks * env.capacity)
        self.capacity_used += np.add(self.capacity_used, env.state[0][:-1])
        #print(env.state)
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        self.total_values += values
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

In [37]:
class bin_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):
        self.total_capacity = np.add(self.total_capacity, env.capacity)
        self.capacity_used = np.add(self.capacity_used, env.state[:-1])
        self.steps_taken += sum(env.logs.values())
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        self.data_points += 1
        

    def get_avg(self):
        percentages = 100 - ((self.capacity_used/self.total_capacity) * 100)
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : sum(percentages) / len(percentages),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')

### 1. Import dependencies

In [2]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2



In [52]:
from KnapsackEnvironment1D import KnapsackPacking

### 2. Create environment

In [53]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [54]:
env.action_space.sample()

9

In [57]:
env.observation_space.sample()

array([[   9,    3,   11,   14,   12,    6,   10,   18,    4,   19,   16],
       [1413, 1440,   10,  587, 1805,  455,  882, 1584, 1000,   37,  376]])

### 3. Run baseline test (No ML)

In [59]:
control_data = knapsack_data(env)

In [60]:
print(env.state)

[[20 20 20 20 20 20 20 20 20 20  7]
 [ 0  0  0  0  0  0  0  0  0  0 89]]


In [61]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    control_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode: 1 Score: -2153
{'placed': 44, 'misplaced': 381, 'discarded': 36}
Episode: 2 Score: -111
{'placed': 46, 'misplaced': 217, 'discarded': 31}
Episode: 3 Score: -2369
{'placed': 47, 'misplaced': 426, 'discarded': 39}
Episode: 4 Score: -2912
{'placed': 46, 'misplaced': 475, 'discarded': 58}
Episode: 5 Score: -1667
{'placed': 55, 'misplaced': 437, 'discarded': 45}
Episode: 6 Score: -1519
{'placed': 42, 'misplaced': 310, 'discarded': 41}
Episode: 7 Score: -3161
{'placed': 44, 'misplaced': 469, 'discarded': 46}
Episode: 8 Score: -1222
{'placed': 52, 'misplaced': 371, 'discarded': 35}
Episode: 9 Score: 784
{'placed': 42, 'misplaced': 106, 'discarded': 17}
Episode: 10 Score: -702
{'placed': 44, 'misplaced': 289, 'discarded': 32}


In [63]:
control_data.print_data()

Average number of steps taken: 432.3
Average bin utilization: 99.91%
Accuracy: 11.72%
Min value: 77
Max value: 443
Average value: 235.78


In [64]:
print(env.state)

[[  0   0   1   0   1   1   0   0   0   0   1]
 [216 182 345 196 199 229 243 281 390 227  85]]


### 4. Train an RL Model

In [65]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [66]:
# Will throw an error if these don't exist
log_path = os.path.join('Training', 'Logs')

In [67]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [68]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [69]:
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_74
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 340      |
|    ep_rew_mean     | -500     |
| time/              |          |
|    fps             | 895      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -1.19e+03    |
| time/                   |              |
|    fps                  | 652          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0061404393 |
|    clip_fraction        | 0.0145       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.39        |
|    explained_variance   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 355          |
|    ep_rew_mean          | -862         |
| time/                   |              |
|    fps                  | 513          |
|    iterations           | 11           |
|    time_elapsed         | 43           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0052267676 |
|    clip_fraction        | 0.00488      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.35        |
|    explained_variance   | 0.275        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.02e+04     |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00672     |
|    value_loss           | 3.18e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 330         |
|    ep_rew_mean          | -612        |
| time/                   |             |
|    fps                  | 530         |
|    iterations           | 21          |
|    time_elapsed         | 81          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006454389 |
|    clip_fraction        | 0.0204      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.35       |
|    explained_variance   | 0.503       |
|    learning_rate        | 0.0003      |
|    loss                 | 6.99e+03    |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00997    |
|    value_loss           | 2.2e+04     |
-----------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 317       

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 320          |
|    ep_rew_mean          | -424         |
| time/                   |              |
|    fps                  | 531          |
|    iterations           | 31           |
|    time_elapsed         | 119          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0049488945 |
|    clip_fraction        | 0.0262       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.3         |
|    explained_variance   | 0.572        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.7e+04      |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.00716     |
|    value_loss           | 2.31e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 336        |
|    ep_rew_mean          | -547       |
| time/                   |            |
|    fps                  | 534        |
|    iterations           | 41         |
|    time_elapsed         | 157        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00594241 |
|    clip_fraction        | 0.0182     |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.28      |
|    explained_variance   | 0.603      |
|    learning_rate        | 0.0003     |
|    loss                 | 7.94e+03   |
|    n_updates            | 400        |
|    policy_gradient_loss | -0.00821   |
|    value_loss           | 2.42e+04   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 334         |
|    ep_rew_m

<stable_baselines3.ppo.ppo.PPO at 0x2af6cd2d370>

### 5. Save model

In [70]:
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [None]:
model.save(PPO_Path)

### 6. Load model

In [None]:
model = PPO.load(PPO_Path, env=env)

### 7. Test model

In [71]:
real_data = knapsack_data(env)
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [72]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    real_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode: 1 Score: -97
{'placed': 46, 'misplaced': 168, 'discarded': 64}
Episode: 2 Score: -2305
{'placed': 55, 'misplaced': 336, 'discarded': 191}
Episode: 3 Score: 448
{'placed': 48, 'misplaced': 122, 'discarded': 63}
Episode: 4 Score: 459
{'placed': 43, 'misplaced': 114, 'discarded': 75}
Episode: 5 Score: 683
{'placed': 48, 'misplaced': 140, 'discarded': 50}
Episode: 6 Score: 8
{'placed': 46, 'misplaced': 148, 'discarded': 77}
Episode: 7 Score: 326
{'placed': 45, 'misplaced': 112, 'discarded': 80}
Episode: 8 Score: -1289
{'placed': 44, 'misplaced': 220, 'discarded': 91}
Episode: 9 Score: 631
{'placed': 47, 'misplaced': 107, 'discarded': 78}
Episode: 10 Score: -997
{'placed': 47, 'misplaced': 246, 'discarded': 109}


In [73]:
control_data.print_data()

Average number of steps taken: 432.3
Average bin utilization: 99.91%
Accuracy: 11.72%
Min value: 77
Max value: 443
Average value: 235.78


In [74]:
real_data.print_data()

Average number of steps taken: 306.0
Average bin utilization: 99.88%
Accuracy: 21.49%
Min value: 44
Max value: 460
Average value: 237.77
