In [38]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load old models that we trained in the other script (`train_shower_sb3`) and test them

## Setup

In [39]:
from shower_environment import Shower
from stable_baselines3 import PPO,A2C
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import os
from stable_baselines3.common.env_checker import check_env

In [40]:
# Smoke-test the environment with a purely random policy before evaluating
# any trained model.
env = Shower()

print("Random valid temperature:", env.observation_space.sample())
# The sampled action is shifted by -1 so it is displayed as a temperature change.
print("Random valid temperature change:", env.action_space.sample()-1)

print("\nA few random showers:")
episodes = 5
for episode in range(1, episodes+1):
    # reset() returns an (observation, info) pair; unpack it the same way the
    # model-evaluation cells below do instead of binding the whole tuple.
    state, info = env.reset()
    terminated = truncated = False
    score = 0

    # An episode is finished when it is either terminated or truncated;
    # checking only `terminated` would keep stepping a truncated episode.
    while not (terminated or truncated):
        action = env.action_space.sample()
        state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")

Random valid temperature: 68
Random valid temperature change: 0

A few random showers:
Episode: 1 Score: -50
Episode: 2 Score: -60
Episode: 3 Score: 18
Episode: 4 Score: -60
Episode: 5 Score: -60


## Load & Test old models

### PPO

In [46]:
# Load the saved PPO checkpoint and roll it out for a few episodes.
# (No standalone env.reset() is needed here: every episode below resets the env.)
model_path = "models/PPO/best_model.zip"
# Passing env= makes SB3 wrap the environment internally (see the "Wrapping ..."
# messages in the output); the rollout loop below still steps the raw `env`.
model = PPO.load(model_path, env=env)
# Run SB3's environment checker against the raw environment.
check_env(env)
episodes = 5
for episode in range(1, episodes+1):
    state, info = env.reset()
    terminated = truncated = False
    score = 0
    # Stop on termination OR truncation so a truncated episode is never
    # stepped past its end.
    while not (terminated or truncated):
        # deterministic=True: no action sampling, so repeated runs of the same
        # state sequence give the same actions.
        action, _ = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Episode: 1 Score: 52
Episode: 2 Score: 46
Episode: 3 Score: 58
Episode: 4 Score: 60
Episode: 5 Score: 60


### A2C

In [49]:
# Load the saved A2C checkpoint and roll it out for a few episodes.
# (No standalone env.reset() is needed here: every episode below resets the env.)
model_path = "models/A2C/best_model.zip"
# Passing env= makes SB3 wrap the environment internally (see the "Wrapping ..."
# messages in the output); the rollout loop below still steps the raw `env`.
model = A2C.load(model_path, env=env)
# Run SB3's environment checker against the raw environment.
check_env(env)
episodes = 5
for episode in range(1, episodes+1):
    state, info = env.reset()
    terminated = truncated = False
    score = 0
    # Stop on termination OR truncation so a truncated episode is never
    # stepped past its end.
    while not (terminated or truncated):
        # deterministic=True: no action sampling, so repeated runs of the same
        # state sequence give the same actions.
        action, _ = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f"Episode: {episode} Score: {score}")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Episode: 1 Score: 58
Episode: 2 Score: 56
Episode: 3 Score: 58
Episode: 4 Score: 54
Episode: 5 Score: 60


Neither of these really performs as well as the Q-table that we trained with the Bellman equation!