In [1]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
Box(0,255,shape=(3,3), dtype=int).sample()

array([[120, 246,  73],
       [ 85,  99, 102],
       [ 52,  42, 100]])

In [557]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Observation box
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        self.rate = 0
        self.profit = 0
        self.per = 0
        # Set median
        self.mid = np.median(self.priceArray)
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 sell 
        # 1 -1 = 0 hold
        # 2 -1 = 1 buy 
        
        self.currentprice = self.priceArray[self.currentIndex]
        
        # If state exceeds boundaries after the action due to power limit, set state to 100 or 0
        # change action based on new state
        if self.state == 0:
            if self.currentprice < self.mid - 2:
                action = 2
            else:
                action = 1
                
        # Hold when state >= 90 while approaching the end
        if self.state >= 80 and (len(self.priceArray) - 1) - self.currentIndex < 5:
            if self.profit - self.currentprice >= 0:
                action = 2
            else:
                action = 1
        # Buy when state < 100 while approaching the end
        elif (self.max_state - self.state) / self.power >= (len(self.priceArray) - 10) - self.currentIndex:
            if self.currentprice < self.mid - 2:
                action = 2
            else:
                action = 1
                
        if self.state == 100:
            if self.profit - self.currentprice >= 0 and (len(self.priceArray) - 1) - self.currentIndex > 5:
                action = 1
            else:
                action = 0
            

        # Calculate reward    
        # BUy
        # Give - reward

        
        if 0 <= self.state < 20:
            
            if action == 2:
                if self.currentprice < self.mid - 2:
                    reward = self.currentprice * 10 
                else:
                    reward = self.currentprice * 0.1
            elif action == 0:
                reward = -self.currentprice * 10
            else:
                if self.currentprice < self.mid - 2:
                    reward = -self.currentprice * 0.5
                else:
                    reward = 0                
            self.temp = - self.currentprice * (action - 1) * self.rate
        
        elif 20 <= self.state <= 80:
            if self.currentprice < self.mid - 2:
                reward = self.currentprice * (action - 1) + self.profit 
            elif self.currentprice > self.mid + 2:
                reward = -self.currentprice * (action - 1) + self.profit 
            else:
                reward = abs(abs(action) - 1) * self.mid
            self.temp = - self.currentprice * (action - 1) * self.rate
            
        else:
            if action == 0:
                if self.currentprice > self.mid + 2 and (len(self.priceArray) - 1) - self.currentIndex > 5:
                    reward = self.currentprice * 10 #* (self.state / self.max_state)
                else:
                    reward = self.currentprice * 0.1
                    
            elif action == 1:
                if (len(self.priceArray) - 1) - self.currentIndex > 5:
                    if self.currentprice > self.mid + 2:
                        reward = -self.currentprice * 0.1
                    else:
                        reward = 0
                else:
                    reward = 10
                    
            else:
                reward = -self.currentprice * 10
                
            self.temp = - self.currentprice * (action - 1) * self.rate
            
            
        
        self.profit += self.temp
        
        self.state += (action - 1) * self.power
        
        if (0 - self.power) < self.state < 0:
            self.state = 0
            self.rate = (0 - self.state) / self.power
        elif 100 < self.state < (self.max_state + self.power):
            self.state = 100
            self.rate = (self.state - self.max_state) / self.power
        else: 
            self.rate = 1
            
        # Move to the next price
        self.currentIndex += 1
        
        # Set Low boundary and Up boundary
        #Set end state
        if self.currentIndex == len(self.priceArray)-1: 
            done = True
            if self.state < 80:
                reward = -10000
            if self.profit < 0:
                reward = -1000
            if self.state >= 80 and self.profit >= 0:
                reward = 1000
        elif self.state < -0 or self.state > 100:
                done = True
                reward = -1000
        elif self.profit >=0 and self.state >= 90:
            done = True
            reward = 2000
        else:
            done = False       

        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info ,self.profit

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = np.array([20 + random.randint(-3,3)])#.astype(float)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        self.rate = 0
        self.profit = 0
        # Set median
        self.mid = np.median(self.priceArray)
        return self.state

In [558]:
env=ChargeEnv()

In [559]:
env.observation_space.sample()

array([57.180286], dtype=float32)

In [560]:
env.reset()

array([17])

In [561]:
from stable_baselines3.common.env_checker import check_env

In [562]:
check_env(env, warn=True)

AssertionError: The observation returned by the `reset()` method does not match the given observation space

In [563]:
episodes = 50
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info, p = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{} p:{}'.format(episode, score, n_state,p))
env.close()

Episode:1 Score:-1609.533001422882 State:80 p:-51.66999822854996
Episode:2 Score:-1543.4179942131043 State:80 p:-53.03999948501587
Episode:3 Score:-1490.8199931383133 State:80 p:-45.0099995136261
Episode:4 Score:1226.197998213768 State:80 p:13.999999105930328
Episode:5 Score:2152.687994027138 State:100 p:16.30999916791916
Episode:6 Score:-1343.6440046548844 State:80 p:-0.37999898195266724
Episode:7 Score:-1606.4290097832682 State:80 p:-32.41999953985214
Episode:8 Score:1941.2600049972534 State:100 p:6.3899986743927
Episode:9 Score:-1511.460014438629 State:80 p:-19.94000005722046
Episode:10 Score:2386.559998512268 State:100 p:16.569998741149902
Episode:11 Score:-1174.6219953298569 State:80 p:-15.489999532699585
Episode:12 Score:-1309.7300105452537 State:80 p:-6.679999649524689
Episode:13 Score:-1080.5039920628071 State:80 p:-15.209999084472656
Episode:14 Score:1789.2899844646454 State:100 p:7.289999425411224
Episode:15 Score:-985.1030048131943 State:80 p:-0.5100007653236389
Episode:16 S

In [509]:
log_path = os.path.join('Training', 'Logs')

In [510]:
model = PPO("MlpPolicy", env, learning_rate=0.001, verbose=1, tensorboard_log=log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [511]:
model.learn(total_timesteps=400000)

Logging to Training\Logs\PPO_77
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.4     |
|    ep_rew_mean     | -275     |
| time/              |          |
|    fps             | 575      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 20.7        |
|    ep_rew_mean          | 292         |
| time/                   |             |
|    fps                  | 407         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011389356 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 5.27e-05    

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.5          |
|    ep_rew_mean          | 788           |
| time/                   |               |
|    fps                  | 224           |
|    iterations           | 11            |
|    time_elapsed         | 100           |
|    total_timesteps      | 22528         |
| train/                  |               |
|    approx_kl            | 0.00022242437 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.937        |
|    explained_variance   | 0.0307        |
|    learning_rate        | 0.001         |
|    loss                 | 5.6e+05       |
|    n_updates            | 100           |
|    policy_gradient_loss | -3.84e-05     |
|    value_loss           | 8.96e+05      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 21.3          |
|    ep_rew_mean          | 527           |
| time/                   |               |
|    fps                  | 228           |
|    iterations           | 21            |
|    time_elapsed         | 188           |
|    total_timesteps      | 43008         |
| train/                  |               |
|    approx_kl            | 0.00041404506 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.799        |
|    explained_variance   | 0.0146        |
|    learning_rate        | 0.001         |
|    loss                 | 5.21e+05      |
|    n_updates            | 200           |
|    policy_gradient_loss | -0.000222     |
|    value_loss           | 7.38e+05      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.7          |
|    ep_rew_mean          | 815           |
| time/                   |               |
|    fps                  | 229           |
|    iterations           | 30            |
|    time_elapsed         | 267           |
|    total_timesteps      | 61440         |
| train/                  |               |
|    approx_kl            | 5.5566168e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.824        |
|    explained_variance   | 0.0601        |
|    learning_rate        | 0.001         |
|    loss                 | 2.88e+05      |
|    n_updates            | 290           |
|    policy_gradient_loss | -0.000176     |
|    value_loss           | 6.53e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 20.8         |
|    ep_rew_mean          | 734          |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 40           |
|    time_elapsed         | 350          |
|    total_timesteps      | 81920        |
| train/                  |              |
|    approx_kl            | 0.0002454064 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.873       |
|    explained_variance   | 0.0612       |
|    learning_rate        | 0.001        |
|    loss                 | 4.57e+05     |
|    n_updates            | 390          |
|    policy_gradient_loss | -0.00035     |
|    value_loss           | 7.78e+05     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.9          |
|    ep_rew_mean          | 665           |
| time/                   |               |
|    fps                  | 242           |
|    iterations           | 49            |
|    time_elapsed         | 413           |
|    total_timesteps      | 100352        |
| train/                  |               |
|    approx_kl            | 5.0396688e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.892        |
|    explained_variance   | 0.0944        |
|    learning_rate        | 0.001         |
|    loss                 | 4.8e+05       |
|    n_updates            | 480           |
|    policy_gradient_loss | -6.41e-05     |
|    value_loss           | 6.71e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 21.2          |
|    ep_rew_mean          | 534           |
| time/                   |               |
|    fps                  | 253           |
|    iterations           | 58            |
|    time_elapsed         | 469           |
|    total_timesteps      | 118784        |
| train/                  |               |
|    approx_kl            | 0.00026908447 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.899        |
|    explained_variance   | 0.0878        |
|    learning_rate        | 0.001         |
|    loss                 | 4.22e+05      |
|    n_updates            | 570           |
|    policy_gradient_loss | -0.000146     |
|    value_loss           | 6.55e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 20.5         |
|    ep_rew_mean          | 859          |
| time/                   |              |
|    fps                  | 261          |
|    iterations           | 67           |
|    time_elapsed         | 524          |
|    total_timesteps      | 137216       |
| train/                  |              |
|    approx_kl            | 0.0010504951 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.844       |
|    explained_variance   | 0.104        |
|    learning_rate        | 0.001        |
|    loss                 | 4.28e+05     |
|    n_updates            | 660          |
|    policy_gradient_loss | -0.00193     |
|    value_loss           | 7.13e+05     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 20.9        |
|    ep_rew_mean          | 652         |
| time/                   |             |
|    fps                  | 268         |
|    iterations           | 76          |
|    time_elapsed         | 579         |
|    total_timesteps      | 155648      |
| train/                  |             |
|    approx_kl            | 9.91406e-05 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.872      |
|    explained_variance   | 0.107       |
|    learning_rate        | 0.001       |
|    loss                 | 2.99e+05    |
|    n_updates            | 750         |
|    policy_gradient_loss | -0.000333   |
|    value_loss           | 6.87e+05    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 21  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 20.4         |
|    ep_rew_mean          | 846          |
| time/                   |              |
|    fps                  | 274          |
|    iterations           | 86           |
|    time_elapsed         | 641          |
|    total_timesteps      | 176128       |
| train/                  |              |
|    approx_kl            | 0.0009093853 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.895       |
|    explained_variance   | 0.119        |
|    learning_rate        | 0.001        |
|    loss                 | 3.37e+05     |
|    n_updates            | 850          |
|    policy_gradient_loss | -0.000218    |
|    value_loss           | 6.57e+05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 20.7         |
|    ep_rew_mean          | 653          |
| time/                   |              |
|    fps                  | 279          |
|    iterations           | 96           |
|    time_elapsed         | 704          |
|    total_timesteps      | 196608       |
| train/                  |              |
|    approx_kl            | 0.0007606392 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.929       |
|    explained_variance   | 0.125        |
|    learning_rate        | 0.001        |
|    loss                 | 3.75e+05     |
|    n_updates            | 950          |
|    policy_gradient_loss | -0.000457    |
|    value_loss           | 7.11e+05     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.2          |
|    ep_rew_mean          | 975           |
| time/                   |               |
|    fps                  | 283           |
|    iterations           | 106           |
|    time_elapsed         | 766           |
|    total_timesteps      | 217088        |
| train/                  |               |
|    approx_kl            | 2.0447333e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.947        |
|    explained_variance   | 0.142         |
|    learning_rate        | 0.001         |
|    loss                 | 3.8e+05       |
|    n_updates            | 1050          |
|    policy_gradient_loss | -5.46e-05     |
|    value_loss           | 6.46e+05      |
-------------------------------------------
-----------------------------------------
| rollout/                |       

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20            |
|    ep_rew_mean          | 1.07e+03      |
| time/                   |               |
|    fps                  | 286           |
|    iterations           | 116           |
|    time_elapsed         | 829           |
|    total_timesteps      | 237568        |
| train/                  |               |
|    approx_kl            | 6.9737725e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.952        |
|    explained_variance   | 0.119         |
|    learning_rate        | 0.001         |
|    loss                 | 4.03e+05      |
|    n_updates            | 1150          |
|    policy_gradient_loss | -0.000199     |
|    value_loss           | 7.02e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 21            |
|    ep_rew_mean          | 476           |
| time/                   |               |
|    fps                  | 289           |
|    iterations           | 126           |
|    time_elapsed         | 892           |
|    total_timesteps      | 258048        |
| train/                  |               |
|    approx_kl            | 0.00080188294 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.988        |
|    explained_variance   | 0.146         |
|    learning_rate        | 0.001         |
|    loss                 | 4.05e+05      |
|    n_updates            | 1250          |
|    policy_gradient_loss | -0.000189     |
|    value_loss           | 7.21e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.5          |
|    ep_rew_mean          | 854           |
| time/                   |               |
|    fps                  | 291           |
|    iterations           | 136           |
|    time_elapsed         | 954           |
|    total_timesteps      | 278528        |
| train/                  |               |
|    approx_kl            | 0.00048952695 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.902        |
|    explained_variance   | 0.136         |
|    learning_rate        | 0.001         |
|    loss                 | 3.72e+05      |
|    n_updates            | 1350          |
|    policy_gradient_loss | -0.000783     |
|    value_loss           | 7.14e+05      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.4          |
|    ep_rew_mean          | 945           |
| time/                   |               |
|    fps                  | 293           |
|    iterations           | 145           |
|    time_elapsed         | 1010          |
|    total_timesteps      | 296960        |
| train/                  |               |
|    approx_kl            | 0.00023349913 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.909        |
|    explained_variance   | 0.141         |
|    learning_rate        | 0.001         |
|    loss                 | 3.18e+05      |
|    n_updates            | 1440          |
|    policy_gradient_loss | -0.00112      |
|    value_loss           | 6.09e+05      |
-------------------------------------------
-----------------------------------------
| rollout/                |       

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.5          |
|    ep_rew_mean          | 915           |
| time/                   |               |
|    fps                  | 296           |
|    iterations           | 155           |
|    time_elapsed         | 1072          |
|    total_timesteps      | 317440        |
| train/                  |               |
|    approx_kl            | 0.00014634497 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.883        |
|    explained_variance   | 0.134         |
|    learning_rate        | 0.001         |
|    loss                 | 2.65e+05      |
|    n_updates            | 1540          |
|    policy_gradient_loss | -0.000439     |
|    value_loss           | 5.74e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.6          |
|    ep_rew_mean          | 817           |
| time/                   |               |
|    fps                  | 297           |
|    iterations           | 164           |
|    time_elapsed         | 1129          |
|    total_timesteps      | 335872        |
| train/                  |               |
|    approx_kl            | 0.00023172703 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.914        |
|    explained_variance   | 0.133         |
|    learning_rate        | 0.001         |
|    loss                 | 2.48e+05      |
|    n_updates            | 1630          |
|    policy_gradient_loss | 6.69e-05      |
|    value_loss           | 6.88e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.6          |
|    ep_rew_mean          | 781           |
| time/                   |               |
|    fps                  | 299           |
|    iterations           | 174           |
|    time_elapsed         | 1190          |
|    total_timesteps      | 356352        |
| train/                  |               |
|    approx_kl            | 0.00016141607 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.904        |
|    explained_variance   | 0.119         |
|    learning_rate        | 0.001         |
|    loss                 | 2.52e+05      |
|    n_updates            | 1730          |
|    policy_gradient_loss | -4.15e-05     |
|    value_loss           | 5.56e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 21           |
|    ep_rew_mean          | 554          |
| time/                   |              |
|    fps                  | 300          |
|    iterations           | 184          |
|    time_elapsed         | 1253         |
|    total_timesteps      | 376832       |
| train/                  |              |
|    approx_kl            | 9.602605e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.899       |
|    explained_variance   | 0.134        |
|    learning_rate        | 0.001        |
|    loss                 | 3.33e+05     |
|    n_updates            | 1830         |
|    policy_gradient_loss | -3.44e-05    |
|    value_loss           | 6.13e+05     |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 20.8          |
|    ep_rew_mean          | 715           |
| time/                   |               |
|    fps                  | 301           |
|    iterations           | 193           |
|    time_elapsed         | 1309          |
|    total_timesteps      | 395264        |
| train/                  |               |
|    approx_kl            | 0.00047337235 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.9          |
|    explained_variance   | 0.149         |
|    learning_rate        | 0.001         |
|    loss                 | 3.59e+05      |
|    n_updates            | 1920          |
|    policy_gradient_loss | -0.000702     |
|    value_loss           | 6.55e+05      |
-------------------------------------------
------------------------------------------
| rollout/                |      

<stable_baselines3.ppo.ppo.PPO at 0x24c0719d4c0>

In [13]:
#evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [14]:
#model.save('PPO')

In [512]:
episodes = 100
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action, _states = model.predict(obs)
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))
env.close()

Episode:1 Score:398.5720014810562 State:80
Episode:2 Score:437.7120009899139 State:80
Episode:3 Score:-1157.1280219316482 State:80
Episode:4 Score:335.4770023226738 State:80
Episode:5 Score:-1668.4480102419852 State:80
Episode:6 Score:-771.23898999691 State:80
Episode:7 Score:2503.5419944763185 State:100
Episode:8 Score:-1231.1779921412467 State:80
Episode:9 Score:-956.5680038690567 State:80
Episode:10 Score:2329.9399967193604 State:[100]
Episode:11 Score:-1351.0959739208222 State:80
Episode:12 Score:-1542.0749986052513 State:80
Episode:13 Score:1899.6000027656555 State:100
Episode:14 Score:2346.9660001277925 State:100
Episode:15 Score:209.9280012726784 State:[80]
Episode:16 Score:-1035.4280193805694 State:80
Episode:17 Score:-1376.7200158596038 State:80
Episode:18 Score:-1377.2380139589309 State:80
Episode:19 Score:191.27800159454347 State:[80]
Episode:20 Score:1900.030002772808 State:100
Episode:21 Score:-1085.9230110645294 State:80
Episode:22 Score:-1359.918018555641 State:80
Episod

In [None]:
env.close()

In [None]:
#Save the model
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [None]:
model.save(PPO_path)

In [None]:
del model

In [None]:
model = PPO.load('PPO_model', env=env)

In [None]:
#View logs in Tensorboard
training_log_path = os.path.join(log_path, 'PPO')
!tensorboard --logdir={training_log_path}