### 0. Helper functions

In [9]:
import numpy as np

class knapsack_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_knapsacks, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_knapsacks, 0) # Total capacity used across all trials
        
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        
        self.total_values = np.full(env.num_knapsacks, 0)
        self.min_value = 1_000_000
        self.max_value = -1
        
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):        
        self.total_capacity += np.add(self.total_capacity, env.num_knapsacks * env.capacity)
        self.capacity_used += np.add(self.capacity_used, env.state[0][:-1])
        #print(env.state)
        
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        
        values = env.state[1][:-1]
        
        self.total_values += values
        self.max_value = max(self.max_value, max(values))
        self.min_value = min(self.min_value, min(values))
        
        self.steps_taken += sum(env.logs.values())
        
        self.data_points += 1
        

    def get_avg(self):
        percentages = (self.capacity_used / self.total_capacity) * 100
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : 100 - (sum(percentages) / len(percentages)),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100,
            "min_value" : self.min_value,
            "max_value" : self.max_value,
            "avg_value" : sum(self.total_values) / (len(self.total_values) * self.data_points)
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')
        print(f'Min value: {data["min_value"]}')
        print(f'Max value: {data["max_value"]}')
        print(f'Average value: {data["avg_value"]}')

In [None]:
class bin_data:
    def __init__(self, env):
        self.total_capacity = np.full(env.num_bins, 0) # Total capacity seen across all trials
        self.capacity_used = np.full(env.num_bins, 0) # Total capacity used across all trials
        self.steps_taken = 0 # Number of timesteps taken before trial terminated
        self.placed = 0 # Number of items correctly placed
        self.misplaced = 0
        self.discarded = 0
        self.data_points = 0 # Number of trials we've witnessed
    

    def log(self, env):
        self.total_capacity = np.add(self.total_capacity, env.capacity)
        self.capacity_used = np.add(self.capacity_used, env.state[:-1])
        self.steps_taken += sum(env.logs.values())
        self.placed += env.logs["placed"]
        self.misplaced += env.logs["misplaced"]
        self.discarded += env.logs["discarded"]
        self.data_points += 1
        

    def get_avg(self):
        percentages = 100 - ((self.capacity_used/self.total_capacity) * 100)
        
        return {
            "steps" : self.steps_taken / self.data_points,
            "utilization" : sum(percentages) / len(percentages),
            "accuracy" : (self.placed / (self.placed + self.misplaced)) * 100
        }
    

    def print_data(self):
        data = self.get_avg()
        print(f'Average number of steps taken: {data["steps"]}')
        print(f'Average bin utilization: {round(data["utilization"], 2)}%')
        print(f'Accuracy: {round(data["accuracy"], 2)}%')

### 1. Import dependencies

In [2]:
!pip install tensorflow==2.7.0
!pip install gym
!pip install keras
!pip install keras-rl2



In [2]:
from KnapsackEnvironment1D_2 import KnapsackPacking

### 2. Create environment

In [3]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [4]:
env.action_space.sample()

3

In [5]:
env.observation_space.sample()

array([13,  5, 10,  0,  5, 14, 16, 13, 14, 17, 11])

### 3. Run baseline test (No ML)

In [6]:
control_data = knapsack_data(env)

In [7]:
print(env.state)

[20 20 20 20 20 20 20 20 20 20  3]


In [8]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    control_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode: 1 Score: -3119
{'placed': 43, 'misplaced': 534, 'discarded': 73}


IndexError: invalid index to scalar variable.

In [10]:
control_data.print_data()

Average number of steps taken: 500.1
Average bin utilization: 99.91%
Accuracy: 10.25%
Min value: 84
Max value: 464
Average value: 241.11


In [11]:
print(env.state)

[[  0   0   0   0   0   0   0   0   0   0   1]
 [356 255 288 333 188 140 378 188 204 464  93]]


### 4. Train an RL Model

In [12]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [13]:
# Will throw an error if these don't exist
log_path = os.path.join('Training', 'Logs')

In [14]:
env = KnapsackPacking(num_knapsacks=10, capacity=20)

In [15]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [16]:
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_71
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 390       |
|    ep_rew_mean     | -1.14e+03 |
| time/              |           |
|    fps             | 279       |
|    iterations      | 1         |
|    time_elapsed    | 7         |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 436         |
|    ep_rew_mean          | -1.6e+03    |
| time/                   |             |
|    fps                  | 403         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008475637 |
|    clip_fraction        | 0.0253      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.39       |
|    explained_variance   | -0

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 373          |
|    ep_rew_mean          | -972         |
| time/                   |              |
|    fps                  | 636          |
|    iterations           | 11           |
|    time_elapsed         | 35           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0019325024 |
|    clip_fraction        | 4.88e-05     |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.33        |
|    explained_variance   | 0.262        |
|    learning_rate        | 0.0003       |
|    loss                 | 7.56e+03     |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00511     |
|    value_loss           | 2.66e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 336          |
|    ep_rew_mean          | -564         |
| time/                   |              |
|    fps                  | 637          |
|    iterations           | 21           |
|    time_elapsed         | 67           |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0039636213 |
|    clip_fraction        | 0.00659      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.3         |
|    explained_variance   | 0.453        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.24e+04     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00531     |
|    value_loss           | 2.44e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 337          |
|    ep_rew_mean          | -559         |
| time/                   |              |
|    fps                  | 626          |
|    iterations           | 31           |
|    time_elapsed         | 101          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0037339504 |
|    clip_fraction        | 0.0105       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.25        |
|    explained_variance   | 0.595        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.3e+04      |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.00542     |
|    value_loss           | 2.1e+04      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 344         |
|    ep_rew_mean          | -592        |
| time/                   |             |
|    fps                  | 626         |
|    iterations           | 41          |
|    time_elapsed         | 133         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004054798 |
|    clip_fraction        | 0.00659     |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.29       |
|    explained_variance   | 0.628       |
|    learning_rate        | 0.0003      |
|    loss                 | 8.31e+03    |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0073     |
|    value_loss           | 2.23e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 348   

<stable_baselines3.ppo.ppo.PPO at 0x208cb2abaf0>

### 5. Save model

In [17]:
PPO_Path = os.path.join('Training', 'Saved Models', 'Knapsack_model')

In [18]:
model.save(PPO_Path)

### 6. Load model

In [19]:
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [20]:
real_data = knapsack_data(env)
env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

In [21]:
MAX_STEPS = 1000
episodes = 10

for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0
    
    while not done and steps < MAX_STEPS:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1

    print('Episode: {} Score: {}'.format(episode,score))
    print(env.logs)
    real_data.log(env)
    env.logs = { 'placed':0, 'misplaced':0, 'discarded':0 }

Episode: 1 Score: -856
{'placed': 51, 'misplaced': 266, 'discarded': 114}
Episode: 2 Score: -198
{'placed': 50, 'misplaced': 201, 'discarded': 98}
Episode: 3 Score: 1128
{'placed': 43, 'misplaced': 82, 'discarded': 27}
Episode: 4 Score: 1057
{'placed': 40, 'misplaced': 70, 'discarded': 34}
Episode: 5 Score: 1179
{'placed': 41, 'misplaced': 79, 'discarded': 27}
Episode: 6 Score: 123
{'placed': 51, 'misplaced': 118, 'discarded': 117}
Episode: 7 Score: -466
{'placed': 50, 'misplaced': 225, 'discarded': 90}
Episode: 8 Score: -1938
{'placed': 44, 'misplaced': 328, 'discarded': 59}
Episode: 9 Score: -15
{'placed': 41, 'misplaced': 142, 'discarded': 65}
Episode: 10 Score: 1384
{'placed': 45, 'misplaced': 45, 'discarded': 72}


In [22]:
control_data.print_data()

Average number of steps taken: 500.1
Average bin utilization: 99.91%
Accuracy: 10.25%
Min value: 84
Max value: 464
Average value: 241.11


In [23]:
real_data.print_data()

Average number of steps taken: 271.5
Average bin utilization: 99.89%
Accuracy: 22.66%
Min value: 92
Max value: 457
Average value: 239.88
