<a href="https://colab.research.google.com/github/arac22/keras-demo/blob/main/shower_discrete_ok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym[box2d]
!pip install stable_baselines3[extra]

In [22]:
import gym

In [16]:
from time import sleep
import numpy as np
import matplotlib.pyplot as plt
import gym
import random

from gym import Env
from gym.spaces import Discrete, Box

# Inclusive bounds of the integer temperature scale used as the observation.
TEMPERATURE_MIN, TEMPERATURE_MAX = 0, 5

# Episodes start at INITIAL_STATE plus a random offset; sitting exactly on
# TARGET_STATE is what earns the positive reward.
INITIAL_STATE = 3
TARGET_STATE = 3

# Number of steps in one episode.
SHOWER_LENGTH = 20


class ShowerEnv(Env):
    """Toy gym environment: keep a shower at the target temperature.

    The state is a single integer temperature kept inside
    ``[TEMPERATURE_MIN, TEMPERATURE_MAX]``. On every step the agent picks one
    of three actions (0 = colder, 1 = hold, 2 = warmer). The reward is +1 when
    the resulting temperature equals ``TARGET_STATE`` and -1 otherwise. An
    episode ends after ``SHOWER_LENGTH`` steps.

    Observations are returned as a NumPy array of shape ``(1,)`` with the
    dtype of ``observation_space`` so that they are valid members of the
    declared ``Box`` space.
    """

    def validate_state(self):
        """Clamp ``self.state`` into the observation-space bounds."""
        # Index the bounds explicitly: calling int() on a 1-element array is
        # deprecated in recent NumPy releases.
        low = int(self.observation_space.low[0])
        high = int(self.observation_space.high[0])
        self.state = min(max(int(self.state), low), high)

    def _random_start(self):
        """Pick a random start temperature and clamp it into range."""
        self.state = INITIAL_STATE + random.randint(-3, 3)
        self.validate_state()

    def _get_obs(self):
        """Return the current state as an array matching ``observation_space``."""
        return np.array([self.state], dtype=self.observation_space.dtype)

    def __init__(self):
        super().__init__()
        # Actions we can take: down, stay, up
        self.action_space = Discrete(3)
        # One-element integer temperature observation
        self.observation_space = Box(low=TEMPERATURE_MIN, high=TEMPERATURE_MAX, shape=(1,), dtype=np.int32)
        # Set start temp
        self._random_start()
        # Set shower length
        self.shower_length = SHOWER_LENGTH

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: 0, 1 or 2, as a Python int, NumPy integer or
                single-element array (what ``model.predict`` returns).

        Returns:
            Tuple ``(observation, reward, done, info)``.

        Raises:
            ValueError: if ``action`` is not a single value in {0, 1, 2}.
        """
        # Normalise to a plain int so self.state never silently turns into an
        # ndarray when the action comes from model.predict().
        action = int(np.asarray(action).item())
        if action not in (0, 1, 2):
            raise ValueError(f"invalid action {action!r}; expected 0, 1 or 2")

        # 0 -> -1 temperature, 1 -> unchanged, 2 -> +1 temperature
        self.state += action - 1
        self.validate_state()

        # Reduce shower length by 1 second
        self.shower_length -= 1

        # Calculate reward
        reward = 1 if self.state == TARGET_STATE else -1

        # Check if shower is done
        done = self.shower_length <= 0

        # Set placeholder for info
        info = {}

        # Return step information
        return self._get_obs(), reward, done, info

    def render(self, mode="human"):
        """No visualisation is implemented; accepts gym's ``mode`` argument."""
        pass

    def reset(self):
        """Start a new episode and return the initial observation."""
        # Reset shower temperature
        self._random_start()
        # Reset shower time
        self.shower_length = SHOWER_LENGTH
        return self._get_obs()

In [23]:
# Build the environment and show the interface the agent will see.
env = ShowerEnv()

print(env.metadata)
print(f'Action space: {env.action_space}')
print(f'Observation space: {env.observation_space}')

{'render.modes': []}
Action space: Discrete(3)
Observation space: Box([0], [5], (1,), int32)


In [None]:

from stable_baselines3 import A2C
from stable_baselines3 import PPO

# Train a PPO agent on the shower environment.
# (A2C takes the same arguments if you want to compare algorithms.)
model = PPO(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=100_000)



In [None]:


# Roll out the trained policy and print the total reward of each episode.
episodes = 10

for episode in range(episodes):
    ep_score, obs, done = 0, env.reset(), False
    while not done:
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        ep_score += reward
    print(ep_score)

In [36]:
# save models

import gym
from stable_baselines3 import PPO
import os


models_dir = "models/PPO"
log_dir = "logs"

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)


env = ShowerEnv()
env.reset()

# Pass the log directory to PPO so the TensorBoard cell pointed at `logs`
# actually has event files to display.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_dir)

TIMESTEPS = 1000
iters = 0

for i in range(10):
    # Advance the counter so each checkpoint gets its own file name; without
    # this every save overwrote "models/PPO/0".
    iters += 1
    # reset_num_timesteps=False keeps one continuous timestep counter and
    # TensorBoard curve across the repeated learn() calls. The training log
    # shows PPO advancing 2048 steps per call, so the number in the file name
    # is a label, not the exact step count.
    model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False)
    model.save(f"{models_dir}/{TIMESTEPS*iters}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20       |
|    ep_rew_mean     | -13.5    |
| time/              |          |
|    fps             | 1530     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20       |
|    ep_rew_mean     | -10.9    |
| time/              |          |
|    fps             | 1549     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 4096     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 20       |
|    ep_rew_mean     | -7.74    |
| time/              |          |
|    fps             | 1522     |
|    iterations      |

In [37]:
!tensorboard --logdir=logs

2022-10-20 14:12:03.209083: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.9.1 at http://localhost:6006/ (Press CTRL+C to quit)


In [32]:
# load model

import gym
from stable_baselines3 import PPO

models_dir = "models/PPO"
logdir = "logs"

# Fresh environment for the restored agent.
env = ShowerEnv()
env.reset()

# NOTE(review): this checkpoint must already exist on disk — confirm the
# file name matches what the training cell writes.
model_path = models_dir + "/60000.zip"
model = PPO.load(model_path, env=env)

episodes = 5

# Roll out the restored policy and print the total reward of each episode.
for episode in range(episodes):
    obs = env.reset()
    ep_score = 0
    done = False
    while True:
        if done:
            break
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        ep_score += reward
    print(ep_score)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
20
20
20
20
20
