### 0. Helper functions

In [9]:
import numpy as np

class knapsack_data:
    """Running statistics collected over repeated trials of a knapsack environment.

    Call `log(env)` once after every finished trial, then `get_avg()` or
    `print_data()` to summarise everything logged so far.

    The environment object must expose `num_knapsacks`, `capacity`, `state`
    (indexable as `state[0]` and `state[1]`) and a `logs` dict holding the
    keys "placed", "misplaced" and "discarded".
    """

    def __init__(self, env):
        """Create empty accumulators sized for `env.num_knapsacks` knapsacks."""
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Sum of env.state[0][:-1] across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0) # Sum of env.state[1][:-1] across all trials
        self.min_value = 1_000_000 # Sentinel; replaced by the first logged value below it
        self.max_value = -1 # Sentinel; replaced by the first logged value above it
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):
        """Fold the outcome of one finished trial into the running totals.

        Reads `env.capacity`, `env.state` and `env.logs`; `env` itself is not
        modified.
        """
        # Rebind rather than `+=` an `np.add(...)` result: adding the sum back
        # onto the accumulator would count every earlier trial twice.
        # Each slot tracks a single knapsack, so it grows by the per-knapsack
        # capacity (same convention as `bin_data.log`).
        self.total_capacity = np.add(self.total_capacity, env.capacity)
        # NOTE(review): state[0][:-1] is assumed to hold one entry per knapsack
        # with the trailing entry describing the pending item -- confirm
        # against the environment implementation.
        self.capacity_used = np.add(self.capacity_used, env.state[0][:-1])
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        # Rebinding (instead of in-place `+=`) also lets float values be
        # accumulated into what starts out as an integer array.
        self.total_values = np.add(self.total_values, values)
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        # One step is assumed per logged outcome, so the counters sum to the
        # episode length.
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        """Summarise everything logged so far.

        Returns:
            dict with keys "steps" (mean steps per trial), "utilization" and
            "accuracy" (both percentages), "min_value", "max_value" and
            "avg_value" (mean value per knapsack per trial).

        Raises:
            ZeroDivisionError: if no trial has been logged yet.
        """
        # NOTE(review): the result is reported as `100 - percentage`, which
        # reads as "utilization" only if the logged state entries are the
        # capacity still remaining in each knapsack -- confirm.
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        # Accuracy is defined over placement attempts only (discards excluded);
        # with no attempts at all report 0 instead of dividing by zero.
        attempts = self.placed + self.misplaced
        accuracy = (self.placed / attempts) * 100 if attempts else 0.0
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : accuracy,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        """Print a human-readable version of `get_avg()`."""
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

In [15]:
class bin_data:
    """Running statistics collected over repeated trials of a bin-packing environment.

    Call `log(env)` once after every finished trial, then `get_avg()` or
    `print_data()` to summarise everything logged so far.

    The environment object must expose `num_knapsacks`, `capacity`, `state`
    and a `logs` dict holding the keys "placed", "misplaced" and "discarded".
    """

    def __init__(self, env):
        """Create empty accumulators sized for `env.num_knapsacks` bins."""
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Sum of env.state[:-1] across all trials
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):
        """Fold the outcome of one finished trial into the running totals.

        Reads `env.capacity`, `env.state` and `env.logs`; `env` itself is not
        modified.
        """
        self.total_capacity = np.add(self.total_capacity, env.capacity)
        # NOTE(review): the final state entry is dropped; it is presumably the
        # pending item rather than a bin -- confirm against the environment.
        self.capacity_used = np.add(self.capacity_used, env.state[:-1])
        # One step is assumed per logged outcome, so the counters sum to the
        # episode length.
        self.steps_taken += sum(env.logs.values())
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        self.data_points += 1
        

    def get_avg(self):
        """Summarise everything logged so far.

        Returns:
            dict with keys "steps" (mean steps per trial), "utilization" and
            "accuracy" (both percentages).

        Raises:
            ZeroDivisionError: if no trial has been logged yet.
        """
        # NOTE(review): `100 - ...` reads as "utilization" only if the logged
        # state entries are the capacity still remaining in each bin -- confirm.
        percentages = 100 - ((self.capacity_used/self.total_capacity) * 100)
        
        # Accuracy is defined over placement attempts only (discards excluded);
        # with no attempts at all report 0 instead of dividing by zero.
        attempts = self.placed + self.misplaced
        accuracy = (self.placed / attempts) * 100 if attempts else 0.0
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : sum(percentages) / len(percentages),
            "accuracy" : accuracy
        }
    

    def print_data(self):
        """Print a human-readable version of `get_avg()`."""
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')

### 1. Import dependencies

In [2]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2



In [2]:
from KnapsackEnvironment1D_2 import KnapsackPacking

### 2. Create environment

In [16]:
# Instantiate the packing environment with 10 knapsacks and capacity 20.
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [17]:
# Sanity check: draw one random action from the action space.
env.action_space.sample()

7

In [18]:
# Sanity check: draw one random observation from the observation space.
env.observation_space.sample()

array([17, 20,  7, 10,  5, 20, 14, 18,  2, 20,  8])

### 3. Run baseline test (No ML)

In [19]:
# Statistics collector for the random-action baseline run below.
control_data = bin_data(env)

In [20]:
# Inspect the environment state before any baseline episode has been run.
print(env.state)

[20 20 20 20 20 20 20 20 20 20  7]


In [21]:
MAX_STEPS = 1000  # Hard cap on timesteps per episode in case `done` is never signalled
episodes = 10     # Number of baseline episodes to run

for episode in range(1, episodes+1):
    state = env.reset()
    # Zero the counters at the start of every episode (not after it) so the
    # first episode never inherits counts left behind by earlier cells or by
    # an interrupted run of this cell.
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        # Baseline policy: pick a uniformly sampled action, no learning involved.
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    # Fold this episode's final state and counters into the baseline statistics.
    control_data.log(env)

Episode: 1 Score: 137
{'placed': 46, 'misplaced': 223, 'discarded': 18}
Episode: 2 Score: -2522
{'placed': 45, 'misplaced': 480, 'discarded': 40}
Episode: 3 Score: -1448
{'placed': 42, 'misplaced': 331, 'discarded': 40}
Episode: 4 Score: -2625
{'placed': 48, 'misplaced': 471, 'discarded': 54}
Episode: 5 Score: -2792
{'placed': 43, 'misplaced': 488, 'discarded': 51}
Episode: 6 Score: -642
{'placed': 55, 'misplaced': 317, 'discarded': 31}
Episode: 7 Score: 516
{'placed': 40, 'misplaced': 162, 'discarded': 23}
Episode: 8 Score: -2771
{'placed': 38, 'misplaced': 511, 'discarded': 54}
Episode: 9 Score: -5592
{'placed': 48, 'misplaced': 793, 'discarded': 75}
Episode: 10 Score: -1939
{'placed': 44, 'misplaced': 418, 'discarded': 53}


In [22]:
# Summarise the random-action baseline.
control_data.print_data()

Average number of steps taken: 508.2
Average bin utilization: 98.7%
Accuracy: 9.67%


In [23]:
# Inspect the state left behind by the last baseline episode.
print(env.state)

[0 0 1 0 0 1 0 0 0 0 1]


### 4. Train an RL Model

In [24]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [25]:
# TensorBoard log directory; created up front so a fresh checkout does not
# fail on a missing folder.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [27]:
# Fresh environment instance for training; rebinds `env`, replacing the one used for the baseline.
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [28]:
# PPO agent using the 'MlpPolicy' policy; `tensorboard_log` points at log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [29]:
# Train the agent for 100,000 timesteps (verbose=1 prints the progress tables below).
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_72
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 358      |
|    ep_rew_mean     | -1e+03   |
| time/              |          |
|    fps             | 275      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 335         |
|    ep_rew_mean          | -438        |
| time/                   |             |
|    fps                  | 382         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013396248 |
|    clip_fraction        | 0.0707      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.39       |
|    explained_variance   | -0.000637   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 243         |
|    ep_rew_mean          | 1.05e+03    |
| time/                   |             |
|    fps                  | 598         |
|    iterations           | 11          |
|    time_elapsed         | 37          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.009162987 |
|    clip_fraction        | 0.0454      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | 0.327       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.22e+04    |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0136     |
|    value_loss           | 3.86e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 229 

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 197         |
|    ep_rew_mean          | 1.95e+03    |
| time/                   |             |
|    fps                  | 616         |
|    iterations           | 21          |
|    time_elapsed         | 69          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.004796072 |
|    clip_fraction        | 0.0259      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.53       |
|    explained_variance   | 0.242       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.58e+04    |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0133     |
|    value_loss           | 4.16e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 192   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 182         |
|    ep_rew_mean          | 2.18e+03    |
| time/                   |             |
|    fps                  | 617         |
|    iterations           | 31          |
|    time_elapsed         | 102         |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.004061784 |
|    clip_fraction        | 0.0205      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.35       |
|    explained_variance   | 0.356       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.63e+04    |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.013      |
|    value_loss           | 4.67e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 180 

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 162          |
|    ep_rew_mean          | 2.21e+03     |
| time/                   |              |
|    fps                  | 615          |
|    iterations           | 41           |
|    time_elapsed         | 136          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0029272728 |
|    clip_fraction        | 0.0157       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.25        |
|    explained_variance   | 0.427        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.02e+04     |
|    n_updates            | 400          |
|    policy_gradient_loss | -0.0121      |
|    value_loss           | 4.79e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

<stable_baselines3.ppo.ppo.PPO at 0x2af64c38ee0>

### 5. Save model

In [17]:
# Location the trained model is saved to and later loaded from.
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [18]:
# Persist the trained agent to PPO_Path.
model.save(PPO_Path)

### 6. Load model

In [19]:
# Reload the agent from PPO_Path and attach it to the current environment.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [31]:
# Statistics collector for the trained agent.
real_data = bin_data(env)
# Start evaluation from zeroed placement counters.
# NOTE(review): presumably needed because training on this same `env` leaves counts in `env.logs` -- confirm.
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [32]:
MAX_STEPS = 1000  # Hard cap on timesteps per episode in case `done` is never signalled
episodes = 10     # Number of evaluation episodes to run

for episode in range(1, episodes+1):
    obs = env.reset()
    # Zero the counters at the start of every episode (not after it) so the
    # first episode never inherits counts left behind by training or by an
    # interrupted run of this cell.
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        # NOTE(review): `predict` is called with its default arguments; if
        # reproducible evaluation is wanted, consider `deterministic=True`.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    # Fold this episode's final state and counters into the trained-agent statistics.
    real_data.log(env)

Episode: 1 Score: 2954
{'placed': 60, 'misplaced': 15, 'discarded': 135}
Episode: 2 Score: 2222
{'placed': 46, 'misplaced': 3, 'discarded': 45}
Episode: 3 Score: 2461
{'placed': 52, 'misplaced': 15, 'discarded': 69}
Episode: 4 Score: 2122
{'placed': 43, 'misplaced': 8, 'discarded': 39}
Episode: 5 Score: 2377
{'placed': 49, 'misplaced': 16, 'discarded': 75}
Episode: 6 Score: 2613
{'placed': 50, 'misplaced': 12, 'discarded': 85}
Episode: 7 Score: 2665
{'placed': 57, 'misplaced': 16, 'discarded': 117}
Episode: 8 Score: 1981
{'placed': 49, 'misplaced': 25, 'discarded': 225}
Episode: 9 Score: 2142
{'placed': 49, 'misplaced': 28, 'discarded': 109}
Episode: 10 Score: 2137
{'placed': 48, 'misplaced': 19, 'discarded': 57}


In [33]:
# Random-action baseline, repeated here for side-by-side comparison.
control_data.print_data()

Average number of steps taken: 508.2
Average bin utilization: 98.7%
Accuracy: 9.67%


In [34]:
# Trained PPO agent.
real_data.print_data()

Average number of steps taken: 161.6
Average bin utilization: 98.5%
Accuracy: 76.21%
