In [1]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [124]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Observation box
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        self.currentarray = [0]
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        self.rate = 0
        # Set median
        self.mid = np.median(self.priceArray)
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 sell 
        # 1 -1 = 0 hold
        # 2 -1 = 1 buy 
        
        self.currentprice = self.priceArray[self.currentIndex]
        
        # If state exceeds boundaries after the action due to power limit, set state to 100 or 0
        if 100 < self.state < (self.max_state + self.power):
            self.rate = (self.state - self.max_state) / self.power
            self.state = 100
            #action = 0
        elif (0 - self.power) < self.state < 0:
            self.rate = (0 - self.state) / self.power
            self.state = 0
            #action = 2
        else: 
            self.rate = 1
        
        # Hold when state >= 90 while approaching the end
        if self.state >= 90 and len(self.priceArray - 1) - self.currentIndex < 2:
            action = 1
        # Buy when state < 100 while approaching the end
        elif (self.max_state - self.state) / self.power >= len(self.priceArray - 1) - self.currentIndex:
            action = 2
         
        # Lose money when buy, and get when sell
        self.temp = -abs(self.currentprice) * (action - 1) * self.rate
        
        # Calculate cumulative profit
        self.change += self.temp
        
        # Give reward based on the profit
        if self.change >= 0: 
            reward = 1
        else:
            reward = -1
        
        # Update the state
        self.state += (action - 1) * self.power 
        
        # Move to the next price
        self.currentIndex += 1

        # Set Low boundary and Up boundary
        # Set end state
        if self.currentIndex == len(self.priceArray)-1:
            done = True
            if self.state < 80:
                reward = -1000
        elif self.state <= -20 or self.state >= 120:
            done = True
            reward = -1000
        else:
            done = False       
   
        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = 20 + random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        self.currentarray = [0]
        # Set current index
        self.currentIndex = 0
        self.currentprice = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        self.rate = 0
        # Set median
        self.mid = np.median(self.priceArray)
        return self.state

In [125]:
env = ChargeEnv()

In [126]:
env.observation_space.sample()

array([68.060776], dtype=float32)

In [127]:
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))

Episode:1 Score:-999 State:-20
Episode:2 Score:-23 State:80
Episode:3 Score:-19 State:80
Episode:4 Score:-998 State:-20
Episode:5 Score:-1020 State:62
Episode:6 Score:-1008 State:120
Episode:7 Score:-1016 State:-20
Episode:8 Score:-993 State:-20
Episode:9 Score:-989 State:-20
Episode:10 Score:-23 State:80
Episode:11 Score:-1022 State:42
Episode:12 Score:-999 State:-20
Episode:13 Score:-999 State:-20
Episode:14 Score:-994 State:-20
Episode:15 Score:-997 State:-20
Episode:16 Score:-999 State:-20
Episode:17 Score:-998 State:-20
Episode:18 Score:-1005 State:-20
Episode:19 Score:-999 State:-20
Episode:20 Score:-993 State:-20
Episode:21 Score:-993 State:-20
Episode:22 Score:-1007 State:-20
Episode:23 Score:-984 State:60
Episode:24 Score:-978 State:60
Episode:25 Score:-1002 State:-20
Episode:26 Score:-1008 State:120
Episode:27 Score:-11 State:80
Episode:28 Score:-998 State:-20
Episode:29 Score:19 State:80
Episode:30 Score:-1007 State:-20
Episode:31 Score:-982 State:62
Episode:32 Score:-1009 S

In [128]:
states = env.observation_space.shape
actions = env.action_space.n

In [129]:
actions

3

In [130]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [131]:
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy'
del model 

In [132]:
model = build_model(states, actions)

In [133]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 24)                48        
                                                                 
 dense_25 (Dense)            (None, 24)                600       
                                                                 
 dense_26 (Dense)            (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [134]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [135]:
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-4), metrics=['mse']) #mean squared error mse, mean absloute error mae.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
510 episodes - episode_reward: -562.792 [-1022.000, 21.000] - loss: 23484.485 - mse: 17892.421 - mean_q: -20.774

Interval 2 (10000 steps performed)
434 episodes - episode_reward: -405.691 [-980.000, 17.000] - loss: 10512.207 - mse: 24502.879 - mean_q: -58.121

Interval 3 (20000 steps performed)
435 episodes - episode_reward: -452.851 [-980.000, 17.000] - loss: 8816.167 - mse: 47525.887 - mean_q: -83.035

Interval 4 (30000 steps performed)
435 episodes - episode_reward: -400.136 [-980.000, 17.000] - loss: 8013.395 - mse: 89044.547 - mean_q: -118.111

Interval 5 (40000 steps performed)
done, took 555.613 seconds


<keras.callbacks.History at 0x18bfda16700>

In [136]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -980.000, steps: 23
Episode 2: reward: 17.000, steps: 23
Episode 3: reward: -980.000, steps: 23
Episode 4: reward: -980.000, steps: 23
Episode 5: reward: -980.000, steps: 23
Episode 6: reward: 17.000, steps: 23
Episode 7: reward: 17.000, steps: 23
Episode 8: reward: -980.000, steps: 23
Episode 9: reward: 17.000, steps: 23
Episode 10: reward: 17.000, steps: 23
Episode 11: reward: 17.000, steps: 23
Episode 12: reward: 17.000, steps: 23
Episode 13: reward: 17.000, steps: 23
Episode 14: reward: -980.000, steps: 23
Episode 15: reward: 17.000, steps: 23
Episode 16: reward: -980.000, steps: 23
Episode 17: reward: 17.000, steps: 23
Episode 18: reward: -980.000, steps: 23
Episode 19: reward: -980.000, steps: 23
Episode 20: reward: 17.000, steps: 23
Episode 21: reward: -980.000, steps: 23
Episode 22: reward: 17.000, steps: 23
Episode 23: reward: -980.000, steps: 23
Episode 24: reward: 17.000, steps: 23
Episode 25: reward: -980.000, steps: 23
Episod

In [None]:
#dqn.save_weights('dqn_weights.h5f', overwrite=True)