### 1. Import Dependencies

In [1]:
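# Optional installs (uncomment to run in a fresh environment).
# Note: section 4 also imports `stable_baselines3`, which is not in this list.
#!pip install tensorflow==2.7.0
#!pip install gym
#!pip install keras
#!pip install keras-rl2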

In [1]:
from collections import Counter
from VMAllocationEnvironment import VMAllocationEnvironment

### 2. Create Environment

In [15]:
# Instantiate the custom environment used for the random-action control run.
# NOTE(review): the meaning of the two positional arguments is defined in
# VMAllocationEnvironment.py, not here -- consider passing them by keyword.
env = VMAllocationEnvironment(10, 10)

In [16]:
# Display the environment's current state array (last expression of the cell).
env.state

array([[ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.        , 10.        , 10.        , 10.        ],
       [ 0.08700111, -0.37007874, -0.04905854, -0.02919099]],
      dtype=float32)

In [17]:
# Draw one random sample from the observation space to inspect its shape and dtype.
env.observation_space.sample()

array([[20.842003  ,  9.268549  ,  8.491825  ,  9.248245  ],
       [ 7.993942  ,  5.837442  ,  8.538009  ,  8.524099  ],
       [56.32744   ,  6.018296  ,  1.3318549 ,  5.052313  ],
       [17.787182  ,  8.201955  ,  0.91848487,  5.255874  ],
       [66.11823   ,  7.2543774 ,  9.299523  ,  1.6687855 ],
       [ 1.5231788 ,  2.648834  ,  5.2091193 ,  3.1549041 ],
       [42.98362   ,  9.342381  ,  5.484462  ,  5.595374  ],
       [53.622566  ,  8.072747  ,  0.47493726,  2.0717106 ],
       [89.79644   ,  6.46535   ,  3.8974352 ,  7.2549415 ],
       [11.811794  ,  3.754883  ,  8.440101  ,  5.153111  ],
       [25.798693  ,  3.2170134 ,  9.835156  ,  2.1679347 ]],
      dtype=float32)

### 3. Run control test

In [18]:
MAX_STEPS = 1000  # upper bound on steps per episode (the loop also stops on `done`)
episodes = 10     # number of episodes to run

# Control baseline: drive the environment with randomly sampled actions and,
# for each episode, report the total reward and how the step outcomes split
# between 'placed', 'misplaced' and 'discarded'.
for episode in range(1, episodes+1):
    state = env.reset()
    steps = 0
    done = False
    score = 0

    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Use update() rather than `+=`: in-place Counter addition discards
        # every key whose count is not positive, so an outcome that never
        # occurred would vanish and break the per-outcome reporting below.
        # Assumes `info` maps outcome names to per-step counts -- confirm
        # against VMAllocationEnvironment.step.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the tallies by name in a fixed reporting order instead of relying
    # on sorting and positional unpacking.
    ordered_info = [
        (outcome, session_info[outcome])
        for outcome in ('placed', 'misplaced', 'discarded')
    ]
    placed, misplaced, discarded = ordered_info

    # Share of successful placements among all placement attempts.
    # Undefined (nan) when nothing was placed or misplaced.
    attempted = placed[1] + misplaced[1]
    accuracy = (placed[1] / attempted) * 100 if attempted else float('nan')
    # '.1f' instead of '.3': the latter renders 100.0 as '1e+02'.
    print('Accuracy of placed items: {:.1f}%'.format(accuracy))

    # Share of successful placements among all recorded outcomes.
    total = attempted + discarded[1]
    first_time = (placed[1] / total) * 100 if total else float('nan')
    print('Items placed correctly first time: {:.1f}%'.format(first_time))

    print(ordered_info)

    print()

    # NOTE: per-episode logging (control_data.log(env) / resetting env.logs)
    # is currently disabled.

Episode:1 Score:-525.9621966497798
Total steps: 1000
Accuracy of placed items: 48.7%
Items placed correctly first time: 44.4%
[('placed', 444), ('misplaced', 468), ('discarded', 88)]

Episode:2 Score:-533.9990901288256
Total steps: 1000
Accuracy of placed items: 47.9%
Items placed correctly first time: 43.7%
[('placed', 437), ('misplaced', 476), ('discarded', 87)]

Episode:3 Score:-549.2607511374663
Total steps: 1000
Accuracy of placed items: 46.4%
Items placed correctly first time: 42.1%
[('placed', 421), ('misplaced', 487), ('discarded', 92)]

Episode:4 Score:-493.119188636163
Total steps: 1000
Accuracy of placed items: 52.5%
Items placed correctly first time: 48.0%
[('placed', 480), ('misplaced', 434), ('discarded', 86)]

Episode:5 Score:-487.6864903255628
Total steps: 1000
Accuracy of placed items: 53.5%
Items placed correctly first time: 48.5%
[('placed', 485), ('misplaced', 421), ('discarded', 94)]

Episode:6 Score:-525.1763892765157
Total steps: 1000
Accuracy of placed items: 49

### 4. Train RL model

In [19]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [20]:
# TensorBoard log directory, handed to PPO through `tensorboard_log`.
# Create it up front so the notebook does not depend on the folder already
# existing on disk.
log_path = os.path.join('Training', 'Logs')
os.makedirs(log_path, exist_ok=True)

In [21]:
# Rebind `env` to a fresh environment for training, built with the same
# arguments as the control run.
env = VMAllocationEnvironment(10, 10)

In [22]:
# Build a PPO agent on `env` using the 'MlpPolicy' policy; verbose=1 prints
# progress information and training metrics are logged under `log_path`.
# NOTE(review): no seed is passed, so results will presumably differ between runs.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [23]:
# Train for 100,000 timesteps. As the last expression in the cell, the return
# value of learn() is echoed after the training log.
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 833      |
|    ep_rew_mean     | -357     |
| time/              |          |
|    fps             | 292      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.18e+03   |
|    ep_rew_mean          | -704       |
| time/                   |            |
|    fps                  | 404        |
|    iterations           | 2          |
|    time_elapsed         | 10         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01738692 |
|    clip_fraction        | 0.136      |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.39      |
|    explained_variance   | -0.179     |
|    learning_

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.43e+03    |
|    ep_rew_mean          | -945        |
| time/                   |             |
|    fps                  | 581         |
|    iterations           | 11          |
|    time_elapsed         | 38          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.015539047 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.31       |
|    explained_variance   | 0.941       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.9         |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0134     |
|    value_loss           | 34.8        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.46e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.47e+03    |
|    ep_rew_mean          | -991        |
| time/                   |             |
|    fps                  | 596         |
|    iterations           | 21          |
|    time_elapsed         | 72          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010556149 |
|    clip_fraction        | 0.125       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.27       |
|    explained_variance   | 0.957       |
|    learning_rate        | 0.0003      |
|    loss                 | 1.07        |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00984    |
|    value_loss           | 30.5        |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.47e+03

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.48e+03    |
|    ep_rew_mean          | -993        |
| time/                   |             |
|    fps                  | 590         |
|    iterations           | 31          |
|    time_elapsed         | 107         |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.014009278 |
|    clip_fraction        | 0.141       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.25       |
|    explained_variance   | 0.942       |
|    learning_rate        | 0.0003      |
|    loss                 | 5.22        |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.00677    |
|    value_loss           | 61          |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.49e+

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1.47e+03     |
|    ep_rew_mean          | -987         |
| time/                   |              |
|    fps                  | 583          |
|    iterations           | 41           |
|    time_elapsed         | 143          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0110415025 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.25        |
|    explained_variance   | 0.961        |
|    learning_rate        | 0.0003       |
|    loss                 | 17.4         |
|    n_updates            | 400          |
|    policy_gradient_loss | -0.00738     |
|    value_loss           | 30.3         |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

<stable_baselines3.ppo.ppo.PPO at 0x2061711afd0>

### 5. Save model

In [24]:
# Location of the saved model; reused by the save and load cells below.
PPO_Path = os.path.join('Training', 'Saved Models', 'VM_Allocation')

In [25]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)



### 6. Load model

In [26]:
# Reload the model from PPO_Path and attach the existing `env`. This rebinds
# `model`, so the cells below run against the reloaded copy.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### 7. Test model

In [27]:
MAX_STEPS = 1000  # upper bound on steps per episode (the loop also stops on `done`)
episodes = 10     # number of evaluation episodes

# Evaluation: let the loaded model choose every action and, for each episode,
# report the total reward and how the step outcomes split between 'placed',
# 'misplaced' and 'discarded'.
for episode in range(1, episodes+1):
    obs = env.reset()
    steps = 0
    done = False
    score = 0

    session_info = Counter({ 'placed':0, 'misplaced':0, 'discarded':0 })

    while not done and steps < MAX_STEPS:
        # NOTE(review): predict() is called with its default arguments;
        # consider deterministic=True for evaluation -- confirm against the
        # Stable-Baselines3 documentation.
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        steps += 1
        # Use update() rather than `+=`: in-place Counter addition discards
        # every key whose count is not positive, so an outcome that never
        # occurred would vanish and break the per-outcome reporting below.
        # Assumes `info` maps outcome names to per-step counts -- confirm
        # against VMAllocationEnvironment.step.
        session_info.update(info)

    print('Episode:{} Score:{}'.format(episode,score))
    print('Total steps: {}'.format(steps))

    # Read the tallies by name in a fixed reporting order instead of relying
    # on sorting and positional unpacking.
    ordered_info = [
        (outcome, session_info[outcome])
        for outcome in ('placed', 'misplaced', 'discarded')
    ]
    placed, misplaced, discarded = ordered_info

    # Share of successful placements among all placement attempts.
    # Undefined (nan) when nothing was placed or misplaced.
    attempted = placed[1] + misplaced[1]
    accuracy = (placed[1] / attempted) * 100 if attempted else float('nan')
    # '.1f' instead of '.3': the latter renders 100.0 as '1e+02'.
    print('Accuracy of placed items: {:.1f}%'.format(accuracy))

    # Share of successful placements among all recorded outcomes.
    total = attempted + discarded[1]
    first_time = (placed[1] / total) * 100 if total else float('nan')
    print('Items placed correctly first time: {:.1f}%'.format(first_time))

    print(ordered_info)

    print()

    # NOTE: per-episode logging (real_data.log(env) / resetting env.logs)
    # is currently disabled.

Episode:1 Score:-530.8549420638155
Total steps: 1000
Accuracy of placed items: 50.7%
Items placed correctly first time: 44.2%
[('placed', 442), ('misplaced', 429), ('discarded', 129)]

Episode:2 Score:-513.0729227057745
Total steps: 1000
Accuracy of placed items: 52.9%
Items placed correctly first time: 46.0%
[('placed', 460), ('misplaced', 409), ('discarded', 131)]

Episode:3 Score:-489.4733477924892
Total steps: 1000
Accuracy of placed items: 54.3%
Items placed correctly first time: 48.3%
[('placed', 483), ('misplaced', 406), ('discarded', 111)]

Episode:4 Score:-487.63329112260544
Total steps: 1000
Accuracy of placed items: 57.7%
Items placed correctly first time: 48.5%
[('placed', 485), ('misplaced', 355), ('discarded', 160)]

Episode:5 Score:-510.94449898543826
Total steps: 1000
Accuracy of placed items: 53.8%
Items placed correctly first time: 46.2%
[('placed', 462), ('misplaced', 396), ('discarded', 142)]

Episode:6 Score:-535.8375195217959
Total steps: 1000
Accuracy of placed i

In [28]:
# Print the environment's state as left by the last test episode.
print(env.state)

[[ 2.5455301e+00  7.8696869e-03  8.1553640e+00  6.9690075e+00]
 [ 2.4491055e+00  7.8723319e-03  8.4670677e+00  7.2796016e+00]
 [ 2.8138154e+00  7.8677535e-03  7.7076912e+00  7.7496271e+00]
 [ 2.8888593e+00  7.8734122e-03  7.5815210e+00  6.9494562e+00]
 [ 3.2366431e+00  7.8690015e-03  7.2140975e+00  8.5514917e+00]
 [ 4.0037842e+00  2.3621943e-02  7.6126523e+00  8.0060730e+00]
 [ 2.8303866e+00  7.8695603e-03  7.7997885e+00  6.8577323e+00]
 [ 2.6254239e+00  1.5743826e-02  7.8740835e+00  8.8141489e+00]
 [ 2.7915351e+00  7.8712814e-03  8.2274799e+00  7.2857556e+00]
 [ 2.6157935e+00  2.3620624e-02  8.3170147e+00  8.5263863e+00]
 [ 1.0388374e-02 -5.5118110e-02 -3.9707948e-03 -2.9190991e-02]]
