In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [95]:
class ChargeEnv(Env):
    """Gym environment for charging a battery against a fixed price curve.

    The observation is the current state of charge (SOC), a plain number
    that is valid in the range [0, ``max_state``].  Each step the agent
    picks one of three discrete actions:

    * ``0`` -- sell (discharge): SOC decreases by ``power``
    * ``1`` -- hold: SOC is unchanged
    * ``2`` -- buy (charge): SOC increases by ``power``

    The reward depends on how the current price compares to the median of
    the price curve (``mid``); see :meth:`step` for the exact rules.
    """

    # Price curve used by every episode, one entry per time step.
    _PRICE_CURVE = (36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                    20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                    16.11, 21.42, 25.87, 30.71, 18.44)

    def __init__(self):
        # Actions we can take: sell (0), hold (1), buy (2)
        self.action_space = Discrete(3)
        # Observation space: the SOC, a single value between 0 and 100
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # All per-episode attributes are initialised in one place.
        self.reset()

    def step(self, action):
        """Apply ``action`` for one time step.

        Args:
            action: 0 (sell), 1 (hold) or 2 (buy).

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the new
            SOC, ``reward`` is the step reward, ``done`` tells whether the
            episode has ended and ``info`` is an empty dict.

        Reward rules:
            * price below ``mid - 2``: buying earns ``|mid - price|``,
              selling loses the same amount, holding earns 0.
            * price above ``mid + 2``: selling earns ``|price - mid|``,
              buying loses the same amount, holding earns 0.
            * otherwise (price near the median): holding earns the current
              price, buying or selling earns 0.
            * leaving the valid SOC range ends the episode with a reward of
              -1000 (this replaces the step reward).
            * reaching the end of the price curve ends the episode and adds
              a bonus/penalty based on the final SOC.
        """
        # Map the action to a direction: 0 -> -1 (sell), 1 -> 0 (hold), 2 -> +1 (buy)
        direction = action - 1
        self.state += direction * self.power
        self.currentprice = self.priceArray[self.currentIndex]

        # An overshoot smaller than one full power step is clamped to the
        # limit; a full step past the limit stays out of range and is
        # penalised below.
        if self.max_state < self.state < (self.max_state + self.power):
            self.state = self.max_state
        elif (0 - self.power) < self.state < 0:
            self.state = 0

        # Calculate reward
        if self.currentprice < (self.mid - 2):
            # Cheap price: reward charging, punish discharging, 0 for hold
            reward = abs(self.mid - self.currentprice) * direction
        elif self.currentprice > (self.mid + 2):
            # Expensive price: reward discharging, punish charging, 0 for hold
            reward = -abs(self.currentprice - self.mid) * direction
        else:
            # Median price range: reward holding, 0 for buy/sell.
            # (1 - |direction|) is 1 only for the hold action.
            reward = (1 - abs(direction)) * self.currentprice

        # Move to the next price
        self.currentIndex += 1

        # NOTE: the episode ends as soon as the index reaches the last entry,
        # and the price is read before the index is advanced, so the final
        # entry of the price curve is never used for a reward.
        at_deadline = self.currentIndex == len(self.priceArray) - 1
        out_of_range = self.state < 0 or self.state > self.max_state

        # The out-of-range check must come first: otherwise an overcharged
        # battery on the final step would collect a positive end-of-episode
        # bonus instead of the penalty.
        if out_of_range:
            done = True
            reward = -1000
        elif at_deadline:
            done = True
            reward += self._terminal_bonus()
        else:
            done = False

        # Set placeholder for info
        info = {}

        # Return step information
        return self.state, reward, done, info

    def _terminal_bonus(self):
        """Return the end-of-episode bonus (or penalty) for the final SOC."""
        if self.state == self.max_state:
            return 50
        if self.state >= 90:
            return 30
        if self.state >= 80:
            return 20
        return -100

    def render(self, mode='human'):
        """No visualisation is implemented; ``mode`` is accepted and ignored."""
        pass

    def reset(self):
        """Start a new episode and return the initial SOC."""
        # Set start SOC: 20 with a random integer offset in [-3, 3]
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power (SOC change per buy/sell step)
        self.power = 20
        # Set price data (fresh float32 array for every episode)
        self.priceArray = np.array(self._PRICE_CURVE, dtype=np.float32)
        # Set current index into the price curve and the last price read
        self.currentIndex = 0
        self.currentprice = 0
        # The three attributes below are not read anywhere in this class;
        # they are kept so existing code that accesses them keeps working.
        self.temp = 0
        self.change = 0
        self.open = 0
        # Set median of the price curve, the reference for the reward rules
        self.mid = np.median(self.priceArray)
        return self.state

In [96]:
# Create the charging environment used by all cells below
env = ChargeEnv()

In [97]:
# Draw one random observation from the observation space as a sanity check
env.observation_space.sample()

array([30.072573], dtype=float32)

In [98]:
# Baseline: play episodes with randomly sampled actions and print each
# episode's total reward and final state.
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    score = 0
    done = False

    while True:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))

Episode:1 Score:-996.3349981307983 State:120
Episode:2 Score:-978.9249992370605 State:-20
Episode:3 Score:-922.835000038147 State:120
Episode:4 Score:-29.17000102996826 State:40
Episode:5 Score:-1003.3150043487549 State:-20
Episode:6 Score:-95.33999729156494 State:40
Episode:7 Score:13.15000057220459 State:60
Episode:8 Score:-974.3549995422363 State:120
Episode:9 Score:-971.4549989700317 State:-20
Episode:10 Score:-1000.6699981689453 State:120
Episode:11 Score:-99.69999980926514 State:60
Episode:12 Score:-978.9249992370605 State:-20
Episode:13 Score:-970.2750015258789 State:-20
Episode:14 Score:-26.38999652862549 State:81
Episode:15 Score:-6.190000534057617 State:3
Episode:16 Score:-96.5550012588501 State:40
Episode:17 Score:-974.2849969863892 State:120
Episode:18 Score:-990.3650007247925 State:-20
Episode:19 Score:-123.82000064849854 State:79
Episode:20 Score:-1019.3599996566772 State:120
Episode:21 Score:-1002.7450017929077 State:-20
Episode:22 Score:-947.5499992370605 State:-20
Epis

In [99]:
# Observation shape and number of discrete actions; both are passed to
# build_model / build_agent below.
states = env.observation_space.shape
actions = env.action_space.n

In [100]:
# Display the number of discrete actions
actions

3

In [101]:
def build_model(states, actions):
    """Build the network that maps an observation to one value per action.

    Args:
        states: input shape passed to the first layer (``input_shape``).
        actions: number of output units, one per action.

    Returns:
        An uncompiled ``Sequential`` model: two ReLU layers of 24 units
        followed by a linear output layer.
    """
    layers = [
        Dense(24, activation='relu', input_shape=states),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

In [102]:
# Drop any model left over from a previous run before building a new one.
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy'.
# Guarded so the cell also works on a fresh kernel, where `model` does not
# exist yet and a bare `del model` would raise NameError.
try:
    del model
except NameError:
    pass

In [103]:
# Build the network from the observation shape and action count
model = build_model(states, actions)

In [104]:
# Print the layers and parameter counts of the network
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 24)                48        
                                                                 
 dense_19 (Dense)            (None, 24)                600       
                                                                 
 dense_20 (Dense)            (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [105]:
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl ``DQNAgent``.

    Args:
        model: the Keras model the agent uses.
        actions: number of discrete actions (``nb_actions``).

    Returns:
        An uncompiled ``DQNAgent`` configured with a ``BoltzmannQPolicy``,
        a ``SequentialMemory`` of 50000 entries (window length 1), 100
        warm-up steps and ``target_model_update=1e-2``.
    """
    replay_memory = SequentialMemory(limit=50000, window_length=1)
    action_policy = BoltzmannQPolicy()
    return DQNAgent(
        model=model,
        memory=replay_memory,
        policy=action_policy,
        nb_actions=actions,
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )

In [106]:
# Build the agent, compile it and train it on the charging environment.
dqn = build_agent(model, actions)
# Use `learning_rate`: the `lr` keyword is a deprecated alias in tf.keras
# optimizers and is rejected by newer Keras releases.
# Metric names: 'mse' = mean squared error, 'mae' = mean absolute error.
dqn.compile(Adam(learning_rate=1e-4), metrics=['mse'])

#my_callbacks = [
    #EarlyStopping(patience=2),
    #ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    #CSVLogger('training.log')]

# Train for 50000 environment steps; `history` holds the recorded metrics.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
694 episodes - episode_reward: -586.676 [-1024.010, 33.625] - loss: 14886.497 - mse: 10783.019 - mean_q: -10.155

Interval 2 (10000 steps performed)
437 episodes - episode_reward: -83.494 [-978.925, -6.025] - loss: 12120.757 - mse: 10030.620 - mean_q: -19.426

Interval 3 (20000 steps performed)
435 episodes - episode_reward: -78.756 [-128.875, -5.740] - loss: 5982.479 - mse: 8963.064 - mean_q: -46.978

Interval 4 (30000 steps performed)
435 episodes - episode_reward: -81.835 [-121.070, -17.840] - loss: 3013.865 - mse: 16049.631 - mean_q: -64.845

Interval 5 (40000 steps performed)
done, took 622.405 seconds


In [108]:
# Run the trained agent for 100 test episodes and print the mean episode reward
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.asarray(scores.history['episode_reward']).mean())

Testing for 100 episodes ...
Episode 1: reward: -40.070, steps: 23
Episode 2: reward: -40.070, steps: 23
Episode 3: reward: -40.070, steps: 23
Episode 4: reward: -40.070, steps: 23
Episode 5: reward: -40.070, steps: 23
Episode 6: reward: -40.070, steps: 23
Episode 7: reward: -40.070, steps: 23
Episode 8: reward: -40.070, steps: 23
Episode 9: reward: -40.070, steps: 23
Episode 10: reward: -40.070, steps: 23
Episode 11: reward: -40.070, steps: 23
Episode 12: reward: -40.070, steps: 23
Episode 13: reward: -40.070, steps: 23
Episode 14: reward: -40.070, steps: 23
Episode 15: reward: -40.070, steps: 23
Episode 16: reward: -40.070, steps: 23
Episode 17: reward: -40.070, steps: 23
Episode 18: reward: -40.070, steps: 23
Episode 19: reward: -40.070, steps: 23
Episode 20: reward: -40.070, steps: 23
Episode 21: reward: -40.070, steps: 23
Episode 22: reward: -40.070, steps: 23
Episode 23: reward: -40.070, steps: 23
Episode 24: reward: -40.070, steps: 23
Episode 25: reward: -40.070, steps: 23
Episo

In [None]:
# Optional: uncomment the next line to save the trained agent's weights to disk
#dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [None]:
# List the metric names recorded in the training history
print(history.history.keys())