### 0. Helper functions

In [9]:
import numpy as np

class knapsack_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0)
        self.min_value = 1_000_000
        self.max_value = -1
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):        
        self.total_capacity += np.add(self.total_capacity, env.num_knapsacks * env.capacity)
        self.capacity_used += np.add(self.capacity_used, env.state[0][:-1])
        #print(env.state)
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        self.total_values += values
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

### 1. Import dependencies

In [None]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2

In [2]:
from KnapsackEnvironment1D import KnapsackPacking

### 2. Create environment

In [10]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [11]:
env.action_space.sample()

4

In [12]:
env.observation_space.sample()

array([[   2,    3,    4,    5,    8,    5,    5,    2,    6,    5,   14],
       [ 214,  568, 1565, 1240, 1547, 1849, 1944, 1986, 1851, 1465, 1563]])

### 3. Run baseline test (No ML)

In [13]:
control_data = knapsack_data(env)

In [14]:
print(env.state)

[[20 20 20 20 20 20 20 20 20 20  9]
 [ 0  0  0  0  0  0  0  0  0  0 59]]


In [15]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print(env.logs)
    control_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:2472
{'placed': 48, 'misplaced': 780, 'discarded': 91}
Episode:2 Score:1704
{'placed': 42, 'misplaced': 420, 'discarded': 40}
Episode:3 Score:2529
{'placed': 47, 'misplaced': 246, 'discarded': 25}
Episode:4 Score:2673
{'placed': 45, 'misplaced': 156, 'discarded': 18}
Episode:5 Score:2324
{'placed': 44, 'misplaced': 514, 'discarded': 54}
Episode:6 Score:2451
{'placed': 49, 'misplaced': 525, 'discarded': 69}
Episode:7 Score:2270
{'placed': 43, 'misplaced': 317, 'discarded': 31}
Episode:8 Score:2413
{'placed': 46, 'misplaced': 276, 'discarded': 36}
Episode:9 Score:2303
{'placed': 45, 'misplaced': 408, 'discarded': 52}
Episode:10 Score:2420
{'placed': 49, 'misplaced': 150, 'discarded': 17}


In [16]:
control_data.print_data()

Average number of steps taken: 468.3
Average bin utilization: 99.93%
Accuracy: 10.78%
Min value: 64
Max value: 529
Average value: 235.59


### 4. Train an RL Model

In [18]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [19]:
# Will throw an error if these don't exist
log_path = os.path.join('Training', 'Logs')

In [20]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [13]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [14]:
model.learn(total_timesteps=80000)

Logging to Training\Logs\PPO_57
Adding 77 to value of bin 5. Total value: 77
Adding 63 to value of bin 5. Total value: 140
Adding 92 to value of bin 5. Total value: 232
Adding 64 to value of bin 5. Total value: 296
-----------------------------
| time/              |      |
|    fps             | 251  |
|    iterations      | 1    |
|    time_elapsed    | 8    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 369        |
|    iterations           | 2          |
|    time_elapsed         | 11         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.02180513 |
|    clip_fraction        | 0.133      |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.38      |
|    explained_variance   | -0.000467  |
|    learning_rate        | 0.0003     |
|    loss                 | 4.63e+03   |
|

-----------------------------------------
| time/                   |             |
|    fps                  | 588         |
|    iterations           | 13          |
|    time_elapsed         | 45          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.005596636 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.841      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 2.15e+05    |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0155     |
|    value_loss           | 4.44e+05    |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 591          |
|    iterations           | 14           |
|    time_elapsed         | 48           |
|    total_timesteps      | 2

-------------------------------------------
| time/                   |               |
|    fps                  | 601           |
|    iterations           | 24            |
|    time_elapsed         | 81            |
|    total_timesteps      | 49152         |
| train/                  |               |
|    approx_kl            | 0.00023630317 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.118        |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 3.22e+05      |
|    n_updates            | 230           |
|    policy_gradient_loss | -0.000485     |
|    value_loss           | 6.25e+05      |
-------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 600          |
|    iterations           | 25           |
|    time_elapsed         | 85      

-------------------------------------------
| time/                   |               |
|    fps                  | 608           |
|    iterations           | 35            |
|    time_elapsed         | 117           |
|    total_timesteps      | 71680         |
| train/                  |               |
|    approx_kl            | 7.1148825e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0358       |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 3.1e+05       |
|    n_updates            | 340           |
|    policy_gradient_loss | -0.000394     |
|    value_loss           | 6.34e+05      |
-------------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 609          |
|    iterations           | 36           |
|    time_elapsed         | 121     

<stable_baselines3.ppo.ppo.PPO at 0x21544d75910>

### 5. Save model

In [21]:
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [20]:
model.save(PPO_Path)

### 6. Load model

In [22]:
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [23]:
real_data = knapsack_data(env)
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [24]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print(env.logs)
    real_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:2647
{'placed': 49, 'misplaced': 73, 'discarded': 228}
Episode:2 Score:2037
{'placed': 40, 'misplaced': 23, 'discarded': 937}
Episode:3 Score:2484
{'placed': 47, 'misplaced': 73, 'discarded': 880}
Episode:4 Score:2082
{'placed': 48, 'misplaced': 110, 'discarded': 842}
Episode:5 Score:2264
{'placed': 44, 'misplaced': 133, 'discarded': 708}
Episode:6 Score:2607
{'placed': 46, 'misplaced': 83, 'discarded': 871}
Episode:7 Score:2348
{'placed': 50, 'misplaced': 109, 'discarded': 391}
Episode:8 Score:2396
{'placed': 50, 'misplaced': 85, 'discarded': 865}
Episode:9 Score:2222
{'placed': 51, 'misplaced': 41, 'discarded': 908}
Episode:10 Score:2448
{'placed': 43, 'misplaced': 19, 'discarded': 938}


In [25]:
control_data.print_data()

Average number of steps taken: 468.3
Average bin utilization: 99.93%
Accuracy: 10.78%
Min value: 64
Max value: 529
Average value: 235.59


In [26]:
real_data.print_data()

Average number of steps taken: 878.5
Average bin utilization: 99.43%
Accuracy: 38.46%
Min value: 68
Max value: 519
Average value: 235.35
