In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [48]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Peice array
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 #+ random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 sell
        # 1 -1 = 0 hold
        # 2 -1 = 1 buy 
        self.state += (action - 1) * self.power 
        self.currentprice = self.priceArray[self.currentIndex]

         
        if 100 < self.state < (self.max_state + self.power):
            self.state = 100
        elif (0 - self.power) < self.state < 0:
            self.state = 0
            
        # Calculate reward    
        # BUy
        # Give plus reward is charge at low proce, and minus at high price

        if self.state < 80:
            self.change = -10
        self.temp = -self.currentprice * (action - 1)
        reward = self.temp + self.change

        
        # Move to the next price
        self.currentIndex += 1
        
        # Set Low boundary and Up boundary
        #Set end state
        # Set Low boundary and Up boundary
        # Set end state
        if self.currentIndex == len(self.priceArray)-1 and self.state >= 80:
            done = True               
        elif self.currentIndex == len(self.priceArray)-1 and self.state < 80:
            done = True
            reward = -1000           
        elif self.state < 0 or self.state > 100:
            done = True
            reward = -1000
        else:
            done = False       

        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = 20 #+ random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        return self.state

In [49]:
env = ChargeEnv()

In [50]:
env.observation_space.sample()

array([27.755512], dtype=float32)

In [51]:
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))

Episode:1 Score:-1012.0200004577637 State:-20
Episode:2 Score:-973.7799987792969 State:-20
Episode:3 Score:-1067.7400000095367 State:-20
Episode:4 Score:-1212.0899993777275 State:120
Episode:5 Score:-973.7799987792969 State:-20
Episode:6 Score:-1157.1300001740456 State:-20
Episode:7 Score:-1052.7100000977516 State:-20
Episode:8 Score:-1199.4300003647804 State:-20
Episode:9 Score:-1208.980003118515 State:60
Episode:10 Score:-1191.3900003433228 State:120
Episode:11 Score:-1002.0200004577637 State:-20
Episode:12 Score:-1066.5499987006187 State:-20
Episode:13 Score:-1097.2400012612343 State:-20
Episode:14 Score:-1154.2399985194206 State:120
Episode:15 Score:-1228.6699987053871 State:120
Episode:16 Score:-973.7799987792969 State:-20
Episode:17 Score:-1155.2500021457672 State:-20
Episode:18 Score:-1038.8600015640259 State:-20
Episode:19 Score:-992.0200004577637 State:-20
Episode:20 Score:-1163.5500014424324 State:-20
Episode:21 Score:-1222.0799995064735 State:-20
Episode:22 Score:-1003.77999

In [52]:
states = env.observation_space.shape
actions = env.action_space.n

In [53]:
actions

3

In [54]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [55]:
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy'
del model 

In [56]:
model = build_model(states, actions)

In [57]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 24)                48        
                                                                 
 dense_10 (Dense)            (None, 24)                600       
                                                                 
 dense_11 (Dense)            (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [58]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [59]:
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-4), metrics=['mse']) #mean squared error mse, mean absloute error mae.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
758 episodes - episode_reward: -1075.892 [-1284.200, -260.070] - loss: 46435.200 - mse: 31706.523 - mean_q: -14.587

Interval 2 (10000 steps performed)
435 episodes - episode_reward: -1206.679 [-1242.660, -1158.250] - loss: 26660.461 - mse: 27079.703 - mean_q: -87.068

Interval 3 (20000 steps performed)
496 episodes - episode_reward: -1184.827 [-1252.100, -973.780] - loss: 11714.465 - mse: 106823.898 - mean_q: -347.106

Interval 4 (30000 steps performed)
1026 episodes - episode_reward: -1089.360 [-1318.760, -272.030] - loss: 8976.312 - mse: 164480.594 - mean_q: -383.066

Interval 5 (40000 steps performed)
done, took 449.419 seconds


<keras.callbacks.History at 0x28f802fad30>

In [61]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -1217.000, steps: 23
Episode 2: reward: -1217.000, steps: 23
Episode 3: reward: -1217.000, steps: 23
Episode 4: reward: -1217.000, steps: 23
Episode 5: reward: -1217.000, steps: 23
Episode 6: reward: -1217.000, steps: 23
Episode 7: reward: -1217.000, steps: 23
Episode 8: reward: -1217.000, steps: 23
Episode 9: reward: -1217.000, steps: 23
Episode 10: reward: -1217.000, steps: 23
Episode 11: reward: -1217.000, steps: 23
Episode 12: reward: -1217.000, steps: 23
Episode 13: reward: -1217.000, steps: 23
Episode 14: reward: -1217.000, steps: 23
Episode 15: reward: -1217.000, steps: 23
Episode 16: reward: -1217.000, steps: 23
Episode 17: reward: -1217.000, steps: 23
Episode 18: reward: -1217.000, steps: 23
Episode 19: reward: -1217.000, steps: 23
Episode 20: reward: -1217.000, steps: 23
Episode 21: reward: -1217.000, steps: 23
Episode 22: reward: -1217.000, steps: 23
Episode 23: reward: -1217.000, steps: 23
Episode 24: reward: -1217.000, steps:

In [None]:
#dqn.save_weights('dqn_weights.h5f', overwrite=True)