In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [2]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Observation box
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        self.open = 0
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 sell
        # 1 -1 = 0 hold
        # 2 -1 = 1 buy 
        self.state += (action - 1) * self.power  
        self.currentprice = self.priceArray[self.currentIndex]

        
        
        if 100 < self.state < (self.max_state + self.power):
            self.state = 100
            action = 0
        elif (0 - self.power) < self.state < 0:
            self.state = 0
            action = 2
        
        # Buy when state < 100 while approaching the end
        if self.max_state == self.state and (len(self.priceArray) - 1) - self.currentIndex <= 1:
            action = 2
            
        # Calculate reward    
        # BUy
        # Give plus reward when charge at low proce, and minus at high price, and 0 at hold
        if self.currentprice < (self.mid - 2):
            temp = abs(self.currentprice) * (action - 1) #* ((0.001 + self.max_state - self.state) / self.max_state)
            
            
        # Sell
        # Give minus reward when discharge at low proce, and plus at high price, and 0 at hold
        elif self.currentprice > (self.mid + 2):
            temp = -abs(self.currentprice) * (action - 1) #* ((0.001 + self.state) / self.max_state)
            
            
        # Hold
        # Give plus reward when hold in median price range, and 0 reward otherwise
        else:
            temp = abs(abs(action) - 1) * 0.001
            
        change += temp
        reward = temp
        


        # Move to the next price
        self.currentIndex += 1
        
        
        # Set Low boundary and Up boundary
        # Set end state
        if self.currentIndex == len(self.priceArray)-1:
            done = True
            if self.state < 80:
                reward = -1000
        elif self.state <= -0 or self.state >= 100:
            done = True
            reward = -1000
        else:
            done = False       

        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        self.open = 0
        return self.state

In [3]:
env = ChargeEnv()

  logger.warn(


In [4]:
env.observation_space.sample()

array([22.723347], dtype=float32)

In [5]:
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))

Episode:1 Score:-7.770964238577481 State:80
Episode:2 Score:59.26626057405652 State:100
Episode:3 Score:-1021.5201695520848 State:40
Episode:4 Score:-1035.7138128341621 State:20
Episode:5 Score:-1004.4587609432059 State:20
Episode:6 Score:-1008.3474792587978 State:20
Episode:7 Score:-995.6015696825987 State:40
Episode:8 Score:3.8566295512932562 State:80
Episode:9 Score:-1015.3876083809322 State:40
Episode:10 Score:-1051.882654131251 State:42
Episode:11 Score:49.79850429360093 State:100
Episode:12 Score:48.52319086188734 State:100
Episode:13 Score:24.934841618378762 State:100
Episode:14 Score:44.70567005585433 State:100
Episode:15 Score:-982.377748162232 State:0
Episode:16 Score:-991.6736742015219 State:0
Episode:17 Score:-40.29308317648828 State:80
Episode:18 Score:-1010.7607481053358 State:0
Episode:19 Score:-1013.035677367329 State:0
Episode:20 Score:-1006.1093289513797 State:77
Episode:21 Score:-972.4024512809807 State:0
Episode:22 Score:-981.005925485869 State:40
Episode:23 Score:-

In [6]:
states = env.observation_space.shape
actions = env.action_space.n

In [7]:
actions

3

In [8]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [9]:
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy'
del model 

NameError: name 'model' is not defined

In [10]:
model = build_model(states, actions)

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                48        
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [12]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [14]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-4), metrics=['mse']) #mean squared error mse, mean absloute error mae.

#my_callbacks = [
    #EarlyStopping(patience=2),
    #ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    #CSVLogger('training.log')]

history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
  250/10000 [..............................] - ETA: 1:12 - reward: -37.4824done, took 1.976 seconds


In [15]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -943.341, steps: 23
Episode 2: reward: -943.341, steps: 23
Episode 3: reward: -942.254, steps: 23
Episode 4: reward: -942.616, steps: 23
Episode 5: reward: -943.341, steps: 23
Episode 6: reward: -943.341, steps: 23
Episode 7: reward: -943.341, steps: 23
Episode 8: reward: -942.254, steps: 23
Episode 9: reward: -943.341, steps: 23
Episode 10: reward: -943.341, steps: 23
Episode 11: reward: -943.341, steps: 23
Episode 12: reward: -942.978, steps: 23
Episode 13: reward: -942.616, steps: 23
Episode 14: reward: -943.341, steps: 23
Episode 15: reward: -943.341, steps: 23
Episode 16: reward: -943.341, steps: 23
Episode 17: reward: -942.616, steps: 23
Episode 18: reward: -942.978, steps: 23
Episode 19: reward: -942.616, steps: 23
Episode 20: reward: -942.616, steps: 23
Episode 21: reward: -943.341, steps: 23
Episode 22: reward: -943.341, steps: 23
Episode 23: reward: -942.978, steps: 23
Episode 24: reward: -943.341, steps: 23
Episode 25: reward: 

In [None]:
#dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [None]:
print(history.history.keys())