### 0. Helper functions

In [1]:
import numpy as np

class knapsack_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0)
        self.min_value = 1_000_000
        self.max_value = -1
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):        
        self.total_capacity += np.add(self.total_capacity, env.num_knapsacks * env.capacity)
        self.capacity_used += np.add(self.capacity_used, env.state[0][:-1])
        #print(env.state)
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        self.total_values += values
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

### 1. Import dependencies

In [None]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2

In [2]:
from KnapsackEnvironment1D import KnapsackPacking

### 2. Create environment

In [3]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [4]:
env.action_space.sample()

5

In [5]:
env.observation_space.sample()

array([[   7,    1,   13,   17,   10,   20,    3,   16,   14,   13,    8],
       [1997,  203,  768, 1892,   80, 1322, 1948,  610,  170,  631, 1879]])

### 3. Run baseline test (No ML)

In [6]:
control_data = knapsack_data(env)

In [7]:
print(env.state)

[[20 20 20 20 20 20 20 20 20 20  4]
 [ 0  0  0  0  0  0  0  0  0  0 75]]


In [8]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print(env.logs)
    control_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:1773
{'placed': 39, 'misplaced': 382, 'discarded': 39}
Episode:2 Score:1740
{'placed': 38, 'misplaced': 208, 'discarded': 21}
Episode:3 Score:2190
{'placed': 44, 'misplaced': 328, 'discarded': 34}
Episode:4 Score:2173
{'placed': 44, 'misplaced': 175, 'discarded': 35}
Episode:5 Score:2550
{'placed': 47, 'misplaced': 386, 'discarded': 56}
Episode:6 Score:2358
{'placed': 48, 'misplaced': 405, 'discarded': 45}
Episode:7 Score:2223
{'placed': 50, 'misplaced': 432, 'discarded': 40}
Episode:8 Score:2143
{'placed': 46, 'misplaced': 391, 'discarded': 40}
Episode:9 Score:2354
{'placed': 48, 'misplaced': 499, 'discarded': 59}
Episode:10 Score:2132
{'placed': 45, 'misplaced': 440, 'discarded': 45}


In [9]:
control_data.print_data()

Average number of steps taken: 450.9
Average bin utilization: 99.84%
Accuracy: 10.96%
Min value: 34
Max value: 384
Average value: 216.36


### 4. Train an RL Model

In [10]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [11]:
# Will throw an error if these don't exist
log_path = os.path.join('Training', 'Logs')

In [12]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [13]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [14]:
model.learn(total_timesteps=80000)

Logging to Training\Logs\PPO_59
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 440      |
|    ep_rew_mean     | 2.36e+03 |
| time/              |          |
|    fps             | 305      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 439         |
|    ep_rew_mean          | 2.44e+03    |
| time/                   |             |
|    fps                  | 404         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011048477 |
|    clip_fraction        | 0.0926      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.39       |
|    explained_variance   | 0.000623    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 394         |
|    ep_rew_mean          | 2.33e+03    |
| time/                   |             |
|    fps                  | 563         |
|    iterations           | 11          |
|    time_elapsed         | 40          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.006002997 |
|    clip_fraction        | 0.0324      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.36       |
|    explained_variance   | -1.68e-05   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.24e+04    |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 2.36e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 394 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 390         |
|    ep_rew_mean          | 2.29e+03    |
| time/                   |             |
|    fps                  | 584         |
|    iterations           | 21          |
|    time_elapsed         | 73          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008904998 |
|    clip_fraction        | 0.0568      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.31       |
|    explained_variance   | -2.38e-06   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.4e+04     |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0185     |
|    value_loss           | 2.51e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 380   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 425          |
|    ep_rew_mean          | 2.31e+03     |
| time/                   |              |
|    fps                  | 579          |
|    iterations           | 31           |
|    time_elapsed         | 109          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0055621658 |
|    clip_fraction        | 0.0397       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.24        |
|    explained_variance   | 0.0121       |
|    learning_rate        | 0.0003       |
|    loss                 | 6.32e+03     |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.00986     |
|    value_loss           | 1.64e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

<stable_baselines3.ppo.ppo.PPO at 0x2487b6556a0>

### 5. Save model

In [None]:
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [None]:
model.save(PPO_Path)

### 6. Load model

In [None]:
model = PPO.load(PPO_Path, env=env)

### 7. Test model

In [15]:
real_data = knapsack_data(env)
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [16]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode:{} Score:{}'.format(episode,score))
    print(env.logs)
    real_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:2524
{'placed': 51, 'misplaced': 386, 'discarded': 53}
Episode:2 Score:2594
{'placed': 48, 'misplaced': 466, 'discarded': 53}
Episode:3 Score:2063
{'placed': 43, 'misplaced': 222, 'discarded': 29}
Episode:4 Score:2420
{'placed': 52, 'misplaced': 768, 'discarded': 93}
Episode:5 Score:2139
{'placed': 44, 'misplaced': 103, 'discarded': 15}
Episode:6 Score:2022
{'placed': 41, 'misplaced': 118, 'discarded': 19}
Episode:7 Score:2257
{'placed': 43, 'misplaced': 261, 'discarded': 34}
Episode:8 Score:2369
{'placed': 44, 'misplaced': 470, 'discarded': 55}
Episode:9 Score:1986
{'placed': 46, 'misplaced': 349, 'discarded': 46}
Episode:10 Score:2270
{'placed': 48, 'misplaced': 351, 'discarded': 34}


In [17]:
control_data.print_data()

Average number of steps taken: 450.9
Average bin utilization: 99.84%
Accuracy: 10.96%
Min value: 34
Max value: 384
Average value: 216.36


In [18]:
real_data.print_data()

Average number of steps taken: 438.5
Average bin utilization: 99.81%
Accuracy: 11.63%
Min value: 19
Max value: 526
Average value: 226.44
