### 0. Helper functions

In [1]:
import numpy as np

class knapsack_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0)
        self.min_value = 1_000_000
        self.max_value = -1
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):        
        self.total_capacity += np.add(self.total_capacity, env.num_knapsacks * env.capacity)
        self.capacity_used += np.add(self.capacity_used, env.state[0][:-1])
        #print(env.state)
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        self.total_values += values
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

### 1. Import dependencies

In [None]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2

In [2]:
from KnapsackEnvironment1D import KnapsackPacking

### 2. Create environment

In [3]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [4]:
env.action_space.sample()

2

In [5]:
env.observation_space.sample()

array([[  19,   19,   18,   19,   13,    9,    5,   15,    4,    4,   18],
       [ 839,  857, 1504, 1263, 1213, 1798, 1525, 1038,   20, 1003,  590]])

### 3. Run baseline test (No ML)

In [6]:
control_data = knapsack_data(env)

In [7]:
print(env.state)

[[20 20 20 20 20 20 20 20 20 20  8]
 [ 0  0  0  0  0  0  0  0  0  0 46]]


In [8]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    control_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:97076.14
{'placed': 48, 'misplaced': 389, 'discarded': 52}
Episode:2 Score:108429.1
{'placed': 42, 'misplaced': 565, 'discarded': 60}
Episode:3 Score:78958.25
{'placed': 42, 'misplaced': 380, 'discarded': 30}
Episode:4 Score:106429.2
{'placed': 48, 'misplaced': 535, 'discarded': 60}
Episode:5 Score:103415.31000000001
{'placed': 45, 'misplaced': 218, 'discarded': 21}
Episode:6 Score:126850.35999999999
{'placed': 46, 'misplaced': 105, 'discarded': 18}
Episode:7 Score:78026.27999999998
{'placed': 46, 'misplaced': 185, 'discarded': 33}
Episode:8 Score:95537.94999999997
{'placed': 45, 'misplaced': 544, 'discarded': 58}
Episode:9 Score:118437.72000000004
{'placed': 46, 'misplaced': 225, 'discarded': 35}
Episode:10 Score:80638.81000000001
{'placed': 44, 'misplaced': 400, 'discarded': 52}


In [9]:
control_data.print_data()

Average number of steps taken: 441.7
Average bin utilization: 99.78%
Accuracy: 11.31%
Min value: 47
Max value: 390
Average value: 213.0


In [10]:
print(env.state)

[[  0   0   0   0   0   0   0   0   0   0   1]
 [143 250 110 219 260  82 209  48 172 258  67]]


### 4. Train an RL Model

In [11]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [12]:
# Will throw an error if these don't exist
log_path = os.path.join('Training', 'Logs')

In [13]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [14]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [15]:
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_65
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 369      |
|    ep_rew_mean     | 1.01e+05 |
| time/              |          |
|    fps             | 270      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 409           |
|    ep_rew_mean          | 1.11e+05      |
| time/                   |               |
|    fps                  | 390           |
|    iterations           | 2             |
|    time_elapsed         | 10            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00034982347 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.4          |
|    explain

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 500           |
|    ep_rew_mean          | 1.07e+05      |
| time/                   |               |
|    fps                  | 612           |
|    iterations           | 11            |
|    time_elapsed         | 36            |
|    total_timesteps      | 22528         |
| train/                  |               |
|    approx_kl            | 0.00030931467 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.39         |
|    explained_variance   | -4.77e-07     |
|    learning_rate        | 0.0003        |
|    loss                 | 2.53e+07      |
|    n_updates            | 100           |
|    policy_gradient_loss | -0.00158      |
|    value_loss           | 4.51e+07      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 426          |
|    ep_rew_mean          | 1.09e+05     |
| time/                   |              |
|    fps                  | 638          |
|    iterations           | 20           |
|    time_elapsed         | 64           |
|    total_timesteps      | 40960        |
| train/                  |              |
|    approx_kl            | 9.229779e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.39        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 4.3e+07      |
|    n_updates            | 190          |
|    policy_gradient_loss | -0.000812    |
|    value_loss           | 8.57e+07     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 376          |
|    ep_rew_mean          | 1.12e+05     |
| time/                   |              |
|    fps                  | 635          |
|    iterations           | 29           |
|    time_elapsed         | 93           |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 5.452102e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.39        |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0003       |
|    loss                 | 3.31e+07     |
|    n_updates            | 280          |
|    policy_gradient_loss | -0.00083     |
|    value_loss           | 9.09e+07     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 391           |
|    ep_rew_mean          | 1.14e+05      |
| time/                   |               |
|    fps                  | 625           |
|    iterations           | 38            |
|    time_elapsed         | 124           |
|    total_timesteps      | 77824         |
| train/                  |               |
|    approx_kl            | 0.00045903737 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.38         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 1.31e+07      |
|    n_updates            | 370           |
|    policy_gradient_loss | -0.00195      |
|    value_loss           | 2.98e+07      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 391           |
|    ep_rew_mean          | 1.13e+05      |
| time/                   |               |
|    fps                  | 619           |
|    iterations           | 47            |
|    time_elapsed         | 155           |
|    total_timesteps      | 96256         |
| train/                  |               |
|    approx_kl            | 0.00026132504 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.38         |
|    explained_variance   | 0             |
|    learning_rate        | 0.0003        |
|    loss                 | 7.98e+07      |
|    n_updates            | 460           |
|    policy_gradient_loss | -0.00129      |
|    value_loss           | 6.17e+07      |
-------------------------------------------
------------------------------------------
| rollout/                |      

<stable_baselines3.ppo.ppo.PPO at 0x1cb7edc4700>

### 5. Save model

In [16]:
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [17]:
model.save(PPO_Path)

### 6. Load model

In [None]:
model = PPO.load(PPO_Path, env=env)

### 7. Test model

In [16]:
real_data = knapsack_data(env)
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [17]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    real_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode:1 Score:107060.54999999999
{'placed': 43, 'misplaced': 95, 'discarded': 19}
Episode:2 Score:90523.84000000001
{'placed': 43, 'misplaced': 183, 'discarded': 34}
Episode:3 Score:174652.81000000003
{'placed': 51, 'misplaced': 137, 'discarded': 25}
Episode:4 Score:166382.34999999995
{'placed': 43, 'misplaced': 462, 'discarded': 77}
Episode:5 Score:114070.89000000003
{'placed': 49, 'misplaced': 185, 'discarded': 36}
Episode:6 Score:136171.81000000003
{'placed': 40, 'misplaced': 146, 'discarded': 33}
Episode:7 Score:134696.93999999997
{'placed': 47, 'misplaced': 60, 'discarded': 11}
Episode:8 Score:106022.64
{'placed': 43, 'misplaced': 133, 'discarded': 28}
Episode:9 Score:85201.35999999999
{'placed': 44, 'misplaced': 95, 'discarded': 23}
Episode:10 Score:105080.50999999998
{'placed': 44, 'misplaced': 249, 'discarded': 49}


In [20]:
control_data.print_data()

Average number of steps taken: 441.7
Average bin utilization: 99.78%
Accuracy: 11.31%
Min value: 47
Max value: 390
Average value: 213.0


In [21]:
real_data.print_data()

Average number of steps taken: 252.7
Average bin utilization: 99.8%
Accuracy: 20.39%
Min value: 83
Max value: 466
Average value: 233.16
