In [1]:
# Auto-reload imported modules so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

# Shower Temperature Beginner Reinforcement Learning Example

This notebook uses the Bellman equation to train a Q-table (tabular Q-learning) on the shower environment, then tests the resulting greedy policy.

In [None]:
from shower_environment import Shower
import numpy as np

## Sanity checks on our shower environment

In [8]:
def run_random_shower(shower):
    """Play one shower with randomly sampled actions and return its total reward."""
    shower.reset()
    total = 0
    finished = False
    while not finished:
        random_action = shower.action_space.sample()
        _, step_reward, finished, _ = shower.step(random_action)
        total += step_reward
    return total


env = Shower()

print("Random valid temperature:", env.observation_space.sample())
# NOTE(review): the -1 presumably shifts action indices to a signed
# temperature change -- confirm against the Shower implementation.
print("Random valid temperature change:", env.action_space.sample()-1)

print("\nA few random showers:")
episodes = 5
for episode in range(1, episodes + 1):
    score = run_random_shower(env)
    print(f"Episode: {episode} Score: {score}")

Random valid temperature: 55
Random valid temperature change: 1

A few random showers:
Episode: 1 Score: -40
Episode: 2 Score: -60
Episode: 3 Score: -60
Episode: 4 Score: -38
Episode: 5 Score: 22


## Update q table function
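Applies one Q-learning (Bellman) update, based on:
* the initial and final states,
* the action that took us from the initial state to the final state,
* and the reward assigned to that action/final state.

$$Q(s_i, a) \leftarrow Q(s_i, a) + \alpha \left[ r + \gamma \max_{a'} Q(s_f, a') - Q(s_i, a) \right]$$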

In [4]:
def update_q_table(q_table, state_i, state_f, action, step_reward, alpha=0.9, gamma=1.0):
    """Apply one Q-learning (Bellman) update to ``q_table`` in place.

    The entry for ``(state_i, action)`` is moved towards the target
    ``step_reward + gamma * max(q_table[state_f])``::

        Q[s_i, a] <- Q[s_i, a] + alpha * (r + gamma * max(Q[s_f]) - Q[s_i, a])

    Parameters
    ----------
    q_table : 2-D array indexed as ``q_table[state, action]``; each row holds
        one value per action.
    state_i : state before the action was taken.
    state_f : state reached after the action.
    action : index of the action that led from ``state_i`` to ``state_f``.
    step_reward : reward received for this step.
    alpha : learning rate (default 0.9).
    gamma : discount rate (default 1.0).

    Returns
    -------
    The same ``q_table`` object, modified in place.

    Raises
    ------
    IndexError
        If ``state_i``, ``state_f`` or ``action`` is out of bounds for
        ``q_table``. Raising (rather than printing and returning ``None``)
        keeps the caller from silently replacing its table with ``None``.
    """
    try:
        # q_table[state_i, action] is the action-th element of the
        # state_i-th row.
        old_q_value = q_table[state_i, action]
        # max q value given the state after this temp change
        next_max = np.max(q_table[state_f])
    except IndexError as err:
        raise IndexError(
            f"q_table of shape {np.shape(q_table)} cannot be indexed with "
            f"state_i={state_i}, state_f={state_f}, action={action}"
        ) from err

    q_target = step_reward + gamma * next_max
    q_delta = q_target - old_q_value
    q_table[state_i, action] = old_q_value + alpha * q_delta

    return q_table

## Train and test function
Loop over showers (episodes); when training, update the Q-table after every step.

In [5]:
# loop showers
def train_test(env, q_table, episodes = 200, do_train = True):
    """Run a batch of showers (episodes), optionally training the Q-table.

    When ``do_train`` is true, every action is sampled from
    ``env.action_space`` and ``update_q_table`` is applied after each step.
    Otherwise the greedy action ``np.argmax(q_table[state])`` is taken and
    the table is not updated.

    Parameters
    ----------
    env : environment providing ``reset()``, ``step(action)``, ``state``,
        ``shower_time``, ``action_space`` and ``observation_space``.
    q_table : 2-D array indexed as ``q_table[state, action]``.
    episodes : number of showers to run.
    do_train : True to act randomly and learn, False to evaluate greedily.

    Returns
    -------
    The Q-table: the result of the last ``update_q_table`` call when
    training, otherwise the ``q_table`` argument itself.

    Side effects
    ------------
    Prints the average reward per shower (``nan`` when ``episodes`` is 0),
    and a diagnostic line if the environment yields a state outside
    ``env.observation_space``.
    """
    total_reward = 0
    for i_shower in range(episodes):
        env.reset()
        state_i = env.state
        shower_reward = 0
        done = False
        while not done:
            # choose action: random while training, greedy while testing
            if do_train:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state_i])

            # take a step
            state_f, reward, done, _ = env.step(action)

            # Explicit membership check rather than `assert`, so the
            # validation still runs when Python is started with -O.
            if state_f not in env.observation_space:
                print("Invalid state obtained", state_f, i_shower, env.shower_time, action)
                break

            # update q table
            if do_train:
                q_table = update_q_table(q_table, state_i, state_f, action, reward)

            # increment reward and advance to the new state
            shower_reward += reward
            state_i = state_f

        total_reward += shower_reward

    # Avoid ZeroDivisionError when no episodes were requested.
    average_reward = total_reward / episodes if episodes else float("nan")
    print("average reward:", average_reward)
    return q_table

## Train

In [6]:
# Fresh environment and an all-zero table with one row per state and one
# column per action.
env = Shower()
n_states, n_actions = env.observation_space.n, env.action_space.n
init_q_table = np.zeros((n_states, n_actions))

# Train on 100 showers; train_test prints the average reward.
q_table = train_test(env, init_q_table, episodes=100, do_train=True)

average reward: -22.78


## Test

In [7]:
# Evaluate the trained table three times without further training;
# each call prints its own average reward.
for _trial in range(3):
    train_test(env, q_table, episodes=100, do_train=False)

average reward: 58.96
average reward: 59.0
average reward: 59.0
