In [1]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
Box(0,255,shape=(3,3), dtype=int).sample()

array([[181, 224,   1],
       [112,  34, 230],
       [168,  73, 240]])

In [98]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Observation box
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        self.open = 0
        self.rate = 0
        # Set total time 
        self.time = 23 
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 sell
        # 1 -1 = 0 hold
        # 2 -1 = 1 buy 
       
        self.currentprice = self.priceArray[self.currentIndex]
        # Reduce time by 1 h
        self.time -= 1 
        
        # If state exceeds boundaries after the action due to power limit, set state to 100 or 0
        if 100 < self.state < (self.max_state + self.power):
            self.rate = (self.state - self.max_state) / self.power
            self.state = 100
            #action = 0
        elif (0 - self.power) < self.state < 0:
            self.rate = (0 - self.state) / self.power
            self.state = 0
            #action = 2
        else: 
            self.rate = 1
        
        #if self.state >= 90 and len(self.priceArray - 1) - self.currentIndex < 2:
        #    action = 1
        #elif (self.max_state - self.state) / self.power >= len(self.priceArray - 1) - self.currentIndex:
        #    action = 2
        
        # Lose money when buy, and get when sell
        self.temp = -abs(self.currentprice) * (action - 1) * self.rate
        
        # Calculate cumulative profit
        self.change += self.temp
        
        # Give reward based on the profit
        if self.change >= 0: 
            reward = 1
        else:
            reward = -1
            
        # Update the state
        self.state += (action - 1) * self.power 
        
        # Move to the next price
        self.currentIndex += 1
        
        # Set Low boundary and Up boundary
        #Set end state
        if self.time <= 0: 
            done = True
            if self.state < 80:
                done = True
                reward = -1000
        elif self.state <= -20 or self.state >= 120:
            done = True
            reward = -1000
        else:
            done = False       

        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = np.array([20 + random.randint(-3,3)])#.astype(float)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        self.rate = 0
        # Set median
        self.mid = np.median(self.priceArray)
        self.open = 0
        # Set total time 
        self.time = 23 
        return self.state

In [99]:
env=ChargeEnv()

In [100]:
env.observation_space.sample()

array([25.448338], dtype=float32)

In [101]:
env.reset()

array([23])

In [102]:
from stable_baselines3.common.env_checker import check_env

In [103]:
check_env(env, warn=True)

AssertionError: The observation returned by the `reset()` method does not match the given observation space

In [104]:
episodes = 50
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))
env.close()

Episode:1 Score:-988 State:-20
Episode:2 Score:-1014 State:[-20]
Episode:3 Score:-1018 State:-20
Episode:4 Score:-1007 State:[-20]
Episode:5 Score:-1018 State:[77]
Episode:6 Score:-1017 State:-20
Episode:7 Score:-997 State:-20
Episode:8 Score:-995 State:[-20]
Episode:9 Score:-1011 State:-20
Episode:10 Score:-990 State:[-20]
Episode:11 Score:23 State:100
Episode:12 Score:-992 State:-20
Episode:13 Score:-1022 State:[42]
Episode:14 Score:-1009 State:120
Episode:15 Score:-1005 State:-20
Episode:16 Score:-981 State:120
Episode:17 Score:-1011 State:[120]
Episode:18 Score:-1018 State:-20
Episode:19 Score:-1005 State:-20
Episode:20 Score:-999 State:[-20]
Episode:21 Score:-999 State:-20
Episode:22 Score:-999 State:-20
Episode:23 Score:-999 State:[-20]
Episode:24 Score:-999 State:-20
Episode:25 Score:-1009 State:120
Episode:26 Score:-999 State:-20
Episode:27 Score:-998 State:-20
Episode:28 Score:-997 State:[-20]
Episode:29 Score:-1004 State:-20
Episode:30 Score:-995 State:-20
Episode:31 Score:-1

In [105]:
log_path = os.path.join('Training', 'Logs')

In [106]:
model = PPO("MlpPolicy", env, learning_rate=0.0001, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [107]:
model.learn(total_timesteps=500000)

Logging to Training\Logs\PPO_35
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 14       |
|    ep_rew_mean     | -943     |
| time/              |          |
|    fps             | 587      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 14          |
|    ep_rew_mean          | -885        |
| time/                   |             |
|    fps                  | 416         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010261548 |
|    clip_fraction        | 0.0223      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000861    

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 19.8         |
|    ep_rew_mean          | -711         |
| time/                   |              |
|    fps                  | 345          |
|    iterations           | 11           |
|    time_elapsed         | 65           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0014151854 |
|    clip_fraction        | 0.0104       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.695       |
|    explained_variance   | -0.000866    |
|    learning_rate        | 0.0001       |
|    loss                 | 9.68e+04     |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.0027      |
|    value_loss           | 2.11e+05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 21.9         |
|    ep_rew_mean          | -485         |
| time/                   |              |
|    fps                  | 331          |
|    iterations           | 21           |
|    time_elapsed         | 129          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0045178076 |
|    clip_fraction        | 0.0233       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.493       |
|    explained_variance   | -2.48e-05    |
|    learning_rate        | 0.0001       |
|    loss                 | 9.86e+04     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00407     |
|    value_loss           | 1.78e+05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 21.6         |
|    ep_rew_mean          | -546         |
| time/                   |              |
|    fps                  | 332          |
|    iterations           | 31           |
|    time_elapsed         | 191          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0018989067 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.511       |
|    explained_variance   | -1.67e-06    |
|    learning_rate        | 0.0001       |
|    loss                 | 5.83e+04     |
|    n_updates            | 300          |
|    policy_gradient_loss | -0.000496    |
|    value_loss           | 1.37e+05     |
------------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mea

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.3         |
|    ep_rew_mean          | -378         |
| time/                   |              |
|    fps                  | 332          |
|    iterations           | 41           |
|    time_elapsed         | 252          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0022047285 |
|    clip_fraction        | 0.00635      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.445       |
|    explained_variance   | -1.91e-06    |
|    learning_rate        | 0.0001       |
|    loss                 | 4.19e+04     |
|    n_updates            | 400          |
|    policy_gradient_loss | -0.00119     |
|    value_loss           | 1.15e+05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.6         |
|    ep_rew_mean          | -240         |
| time/                   |              |
|    fps                  | 334          |
|    iterations           | 51           |
|    time_elapsed         | 312          |
|    total_timesteps      | 104448       |
| train/                  |              |
|    approx_kl            | 0.0025389714 |
|    clip_fraction        | 0.0158       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.39        |
|    explained_variance   | -3.58e-07    |
|    learning_rate        | 0.0001       |
|    loss                 | 4.27e+04     |
|    n_updates            | 500          |
|    policy_gradient_loss | -0.00311     |
|    value_loss           | 8.54e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.6         |
|    ep_rew_mean          | -181         |
| time/                   |              |
|    fps                  | 334          |
|    iterations           | 61           |
|    time_elapsed         | 373          |
|    total_timesteps      | 124928       |
| train/                  |              |
|    approx_kl            | 0.0004590089 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.395       |
|    explained_variance   | -2.38e-07    |
|    learning_rate        | 0.0001       |
|    loss                 | 2.61e+04     |
|    n_updates            | 600          |
|    policy_gradient_loss | -0.000248    |
|    value_loss           | 9.33e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.3         |
|    ep_rew_mean          | -331         |
| time/                   |              |
|    fps                  | 335          |
|    iterations           | 71           |
|    time_elapsed         | 433          |
|    total_timesteps      | 145408       |
| train/                  |              |
|    approx_kl            | 0.0022487123 |
|    clip_fraction        | 0.0374       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.402       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 5.49e+04     |
|    n_updates            | 700          |
|    policy_gradient_loss | -0.00393     |
|    value_loss           | 8.19e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.9         |
|    ep_rew_mean          | -291         |
| time/                   |              |
|    fps                  | 334          |
|    iterations           | 81           |
|    time_elapsed         | 495          |
|    total_timesteps      | 165888       |
| train/                  |              |
|    approx_kl            | 0.0005684624 |
|    clip_fraction        | 0.0215       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.341       |
|    explained_variance   | 1.79e-07     |
|    learning_rate        | 0.0001       |
|    loss                 | 3.43e+04     |
|    n_updates            | 800          |
|    policy_gradient_loss | -0.000545    |
|    value_loss           | 6.61e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.9        |
|    ep_rew_mean          | -382        |
| time/                   |             |
|    fps                  | 331         |
|    iterations           | 91          |
|    time_elapsed         | 562         |
|    total_timesteps      | 186368      |
| train/                  |             |
|    approx_kl            | 0.011848494 |
|    clip_fraction        | 0.0373      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.308      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0001      |
|    loss                 | 2.23e+04    |
|    n_updates            | 900         |
|    policy_gradient_loss | 0.000359    |
|    value_loss           | 3.85e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.7

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.3         |
|    ep_rew_mean          | -211         |
| time/                   |              |
|    fps                  | 328          |
|    iterations           | 101          |
|    time_elapsed         | 629          |
|    total_timesteps      | 206848       |
| train/                  |              |
|    approx_kl            | 0.0020881486 |
|    clip_fraction        | 0.0698       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.32        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 2.59e+04     |
|    n_updates            | 1000         |
|    policy_gradient_loss | -0.00584     |
|    value_loss           | 4.34e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.6         |
|    ep_rew_mean          | -212         |
| time/                   |              |
|    fps                  | 325          |
|    iterations           | 111          |
|    time_elapsed         | 698          |
|    total_timesteps      | 227328       |
| train/                  |              |
|    approx_kl            | 0.0008422673 |
|    clip_fraction        | 0.0397       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.333       |
|    explained_variance   | 1.19e-07     |
|    learning_rate        | 0.0001       |
|    loss                 | 4.82e+04     |
|    n_updates            | 1100         |
|    policy_gradient_loss | -0.000986    |
|    value_loss           | 5.39e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.7        |
|    ep_rew_mean          | -292        |
| time/                   |             |
|    fps                  | 320         |
|    iterations           | 121         |
|    time_elapsed         | 773         |
|    total_timesteps      | 247808      |
| train/                  |             |
|    approx_kl            | 0.005596339 |
|    clip_fraction        | 0.0388      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.302      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0001      |
|    loss                 | 2.18e+04    |
|    n_updates            | 1200        |
|    policy_gradient_loss | 0.00303     |
|    value_loss           | 4.42e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 21.7  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.5         |
|    ep_rew_mean          | -162         |
| time/                   |              |
|    fps                  | 315          |
|    iterations           | 131          |
|    time_elapsed         | 849          |
|    total_timesteps      | 268288       |
| train/                  |              |
|    approx_kl            | 0.0044479906 |
|    clip_fraction        | 0.0457       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.271       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 2.38e+04     |
|    n_updates            | 1300         |
|    policy_gradient_loss | -0.00437     |
|    value_loss           | 5.06e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.4         |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 311          |
|    iterations           | 141          |
|    time_elapsed         | 925          |
|    total_timesteps      | 288768       |
| train/                  |              |
|    approx_kl            | 0.0017595881 |
|    clip_fraction        | 0.0937       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.328       |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0001       |
|    loss                 | 2.37e+04     |
|    n_updates            | 1400         |
|    policy_gradient_loss | 0.00123      |
|    value_loss           | 4.89e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.7         |
|    ep_rew_mean          | -192         |
| time/                   |              |
|    fps                  | 309          |
|    iterations           | 151          |
|    time_elapsed         | 1000         |
|    total_timesteps      | 309248       |
| train/                  |              |
|    approx_kl            | 0.0056762258 |
|    clip_fraction        | 0.047        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.294       |
|    explained_variance   | 1.79e-07     |
|    learning_rate        | 0.0001       |
|    loss                 | 1.87e+04     |
|    n_updates            | 1500         |
|    policy_gradient_loss | 0.00325      |
|    value_loss           | 4.83e+04     |
------------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mea

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.4         |
|    ep_rew_mean          | -132         |
| time/                   |              |
|    fps                  | 306          |
|    iterations           | 161          |
|    time_elapsed         | 1075         |
|    total_timesteps      | 329728       |
| train/                  |              |
|    approx_kl            | 0.0020530596 |
|    clip_fraction        | 0.0419       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.253       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 1.01e+04     |
|    n_updates            | 1600         |
|    policy_gradient_loss | -0.00181     |
|    value_loss           | 3.8e+04      |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.7        |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 171         |
|    time_elapsed         | 1150        |
|    total_timesteps      | 350208      |
| train/                  |             |
|    approx_kl            | 0.001800021 |
|    clip_fraction        | 0.0383      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.251      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0001      |
|    loss                 | 1.29e+04    |
|    n_updates            | 1700        |
|    policy_gradient_loss | 0.00251     |
|    value_loss           | 3.63e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.1

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.3        |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 303         |
|    iterations           | 181         |
|    time_elapsed         | 1223        |
|    total_timesteps      | 370688      |
| train/                  |             |
|    approx_kl            | 0.014530772 |
|    clip_fraction        | 0.0856      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.266      |
|    explained_variance   | 1.19e-07    |
|    learning_rate        | 0.0001      |
|    loss                 | 1.77e+04    |
|    n_updates            | 1800        |
|    policy_gradient_loss | 0.000485    |
|    value_loss           | 2.91e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.3

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.9         |
|    ep_rew_mean          | -72.8        |
| time/                   |              |
|    fps                  | 301          |
|    iterations           | 191          |
|    time_elapsed         | 1296         |
|    total_timesteps      | 391168       |
| train/                  |              |
|    approx_kl            | 0.0010879282 |
|    clip_fraction        | 0.0183       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.163       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 2.01e+03     |
|    n_updates            | 1900         |
|    policy_gradient_loss | -0.000127    |
|    value_loss           | 1.19e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.7        |
|    ep_rew_mean          | -72.7       |
| time/                   |             |
|    fps                  | 298         |
|    iterations           | 201         |
|    time_elapsed         | 1377        |
|    total_timesteps      | 411648      |
| train/                  |             |
|    approx_kl            | 0.004797438 |
|    clip_fraction        | 0.0522      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.208      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0001      |
|    loss                 | 2.29e+03    |
|    n_updates            | 2000        |
|    policy_gradient_loss | 0.000703    |
|    value_loss           | 1.25e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.9

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.2         |
|    ep_rew_mean          | -102         |
| time/                   |              |
|    fps                  | 296          |
|    iterations           | 211          |
|    time_elapsed         | 1458         |
|    total_timesteps      | 432128       |
| train/                  |              |
|    approx_kl            | 0.0067807604 |
|    clip_fraction        | 0.0564       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.136       |
|    explained_variance   | -1.19e-07    |
|    learning_rate        | 0.0001       |
|    loss                 | 1.02e+04     |
|    n_updates            | 2100         |
|    policy_gradient_loss | 0.00534      |
|    value_loss           | 1.26e+04     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.7        |
|    ep_rew_mean          | -52.7       |
| time/                   |             |
|    fps                  | 294         |
|    iterations           | 221         |
|    time_elapsed         | 1538        |
|    total_timesteps      | 452608      |
| train/                  |             |
|    approx_kl            | 0.027675878 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.224      |
|    explained_variance   | 0           |
|    learning_rate        | 0.0001      |
|    loss                 | 1.2e+04     |
|    n_updates            | 2200        |
|    policy_gradient_loss | -0.00182    |
|    value_loss           | 1.52e+04    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 23  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 22.9       |
|    ep_rew_mean          | -143       |
| time/                   |            |
|    fps                  | 293        |
|    iterations           | 231        |
|    time_elapsed         | 1614       |
|    total_timesteps      | 473088     |
| train/                  |            |
|    approx_kl            | 0.01728965 |
|    clip_fraction        | 0.0416     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.101     |
|    explained_variance   | 1.19e-07   |
|    learning_rate        | 0.0001     |
|    loss                 | 3.13e+03   |
|    n_updates            | 2300       |
|    policy_gradient_loss | 0.00117    |
|    value_loss           | 1.03e+04   |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 23          |
|    ep_rew_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.7         |
|    ep_rew_mean          | -42.7        |
| time/                   |              |
|    fps                  | 292          |
|    iterations           | 241          |
|    time_elapsed         | 1690         |
|    total_timesteps      | 493568       |
| train/                  |              |
|    approx_kl            | 0.0008037207 |
|    clip_fraction        | 0.00977      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.106       |
|    explained_variance   | 0            |
|    learning_rate        | 0.0001       |
|    loss                 | 287          |
|    n_updates            | 2400         |
|    policy_gradient_loss | -0.000541    |
|    value_loss           | 511          |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

<stable_baselines3.ppo.ppo.PPO at 0x2d021162940>

In [13]:
#evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [14]:
#model.save('PPO')

In [97]:
episodes = 100
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action, _states = model.predict(obs)
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))
env.close()

Episode:1 Score:-1010 State:120
Episode:2 Score:-7 State:100
Episode:3 Score:-23 State:[99]
Episode:4 Score:-19 State:100
Episode:5 Score:-21 State:100
Episode:6 Score:-1015 State:120
Episode:7 Score:-1014 State:120
Episode:8 Score:-19 State:100
Episode:9 Score:-1013 State:[120]
Episode:10 Score:-1012 State:[62]
Episode:11 Score:-1016 State:[77]
Episode:12 Score:-1011 State:120
Episode:13 Score:-21 State:[100]
Episode:14 Score:-19 State:100
Episode:15 Score:-15 State:100
Episode:16 Score:-21 State:[83]
Episode:17 Score:-23 State:[98]
Episode:18 Score:-23 State:[80]
Episode:19 Score:-988 State:[43]
Episode:20 Score:-1020 State:[57]
Episode:21 Score:-11 State:[100]
Episode:22 Score:-21 State:[83]
Episode:23 Score:-1022 State:[77]
Episode:24 Score:-3 State:120
Episode:25 Score:-1012 State:[78]
Episode:26 Score:-1016 State:[120]
Episode:27 Score:-1015 State:120
Episode:28 Score:-17 State:[103]
Episode:29 Score:-21 State:[99]
Episode:30 Score:-1010 State:[120]
Episode:31 Score:-19 State:100

In [None]:
env.close()

In [None]:
#Save the model
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [None]:
model.save(PPO_path)

In [None]:
del model

In [None]:
model = PPO.load('PPO_model', env=env)

In [None]:
#View logs in Tensorboard
training_log_path = os.path.join(log_path, 'PPO')
!tensorboard --logdir={training_log_path}