In [132]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [214]:
class ChargeEnv(Env):
    """Gym environment that trades battery charge against a fixed price series.

    The observation is the state of charge (SOC). Every step applies one
    action, moves the SOC by at most ``power`` units and advances to the next
    entry of ``priceArray``.

    Actions:
        0 - buy  (SOC goes up, the price is paid)
        1 - sell (SOC goes down, the price is earned)
        2 - hold (SOC unchanged)
    """

    # Price series walked through one entry per step (24 values).
    PRICES = (36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
              20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
              16.11, 21.42, 25.87, 30.71, 18.44)

    def __init__(self):
        # Three discrete actions: 0 = buy, 1 = sell, 2 = hold
        self.action_space = Discrete(3)
        # Observation box: a single SOC value between 0 and 100
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # All per-episode attributes are created by reset() so that a fresh
        # instance and a reset instance can never drift apart.
        self.reset()

    def step(self, action):
        """Apply ``action`` for the current price and advance one step.

        The requested action may be overridden near the end of the series
        (see the comments below). The per-step reward depends on the action,
        the current price relative to the median price, and the profit
        accumulated *before* this step; it is replaced by a fixed terminal
        reward when the episode ends.

        Returns:
            A 5-tuple ``(state, reward, done, info, profit)`` where ``state``
            is the new SOC, ``info`` is an empty dict and ``profit`` is the
            accumulated profit after this step.

        NOTE(review): the classic gym ``step`` contract is the 4-tuple
        ``(obs, reward, done, info)``. The extra trailing ``profit`` is kept
        because existing callers unpack five values; code that unpacks four
        values will fail on this return.
        """
        # Number of steps that remain before the last index of the series
        steps_left = (len(self.priceArray) - 1) - self.currentIndex

        # Forced hold: SOC is already >= 90 and fewer than 3 steps remain
        if self.state >= 90 and steps_left < 3:
            action = 2
        # Forced buy: (max_state - state - 20) / power is at least the number
        # of remaining steps
        elif (self.max_state - self.state - 20) / self.power >= steps_left:
            action = 0

        price = self.priceArray[self.currentIndex]

        # Buy
        if action == 0:
            self.change = 1
            if (self.max_state - self.state) < self.power:
                # Less than one full power step of headroom: only the missing
                # part is bought. State is set to 80 so that the generic
                # "+ change * power" update below lands exactly on 100.
                self.per = self.max_state - self.state
                self.rate = self.per / self.power
                self.state = 80
            else:
                self.rate = 1

            # Reward buying only when the price is more than 2 below the
            # median and the accumulated profit is not negative
            if price < self.mid - 2 and self.profit >= 0:
                reward = price * 5
            else:
                reward = -1 * price

            # Buying costs money, scaled by the fraction actually bought
            self.temp = -price * self.rate

        # Sell
        elif action == 1:
            self.change = -1
            if self.state < self.power:
                # Less than one full power step of charge: only what is left
                # is sold. State is set to 20 so that the generic update below
                # lands exactly on 0.
                self.per = self.state
                self.rate = self.state / self.power
                self.state = 20
            else:
                self.rate = 1

            # Reward selling only when the price is more than 2 above the
            # median and the accumulated profit is not negative
            if price > self.mid + 2 and self.profit >= 0:
                reward = 1 * price * 5
            else:
                reward = -1 * price

            # Selling earns money, scaled by the fraction actually sold
            self.temp = price * self.rate

        # Hold (any action other than 0 or 1)
        else:
            self.change = 0
            if abs(price - self.mid) >= 2:
                reward = price
            else:
                reward = -1 * price
            self.temp = 0

        self.profit += self.temp

        self.state += self.change * self.power
        self.per = self.state

        # Move to the next price
        self.currentIndex += 1

        # Episode end conditions; each one replaces the per-step reward.
        if self.currentIndex == len(self.priceArray) - 1:
            # Reached the last index of the series (that last price itself is
            # never traded).
            done = True
            if self.state < 80:
                reward = -10000
            # NOTE(review): this second check overrides the -10000 above, so
            # ending under-charged *and* at a loss is penalised less than
            # ending under-charged with a profit. Confirm this is intended.
            if self.profit < 0:
                reward = -1000
        elif self.state < 0 or self.state > 100:
            # NOTE(review): the clamping in the buy/sell branches appears to
            # keep the state inside [0, 100], so this branch looks unreachable.
            done = True
            reward = -1000
        elif self.state >= 80 and self.profit > 0:
            # Early success: charged to at least 80 with a positive profit
            done = True
            reward = 1000
        else:
            done = False

        # Placeholder for info
        info = {}

        # Return step information (profit is appended as a fifth element)
        return self.state, reward, done, info, self.profit

    def render(self, mode='human'):
        """No visualisation is implemented.

        ``mode`` is accepted (and ignored) so that callers passing
        ``render(mode=...)`` do not raise ``TypeError``.
        """
        pass

    def reset(self):
        """Start a new episode and return the initial SOC.

        The initial SOC is 20 plus a random integer offset in [-3, 3]; every
        other per-episode attribute is restored to its starting value.
        """
        # Set start SOC
        self.state = 20 + random.randint(-3, 3)
        # Set max SOC
        self.max_state = 100
        # SOC change of one full buy/sell step
        self.power = 20
        # Set price data
        self.priceArray = np.array(self.PRICES, dtype=np.float32)
        # Set current index into the price data
        self.currentIndex = 0
        # Per-step scratch values: cash flow of the step, direction of the
        # SOC change (-1/0/+1), fraction of a full power step actually traded
        self.temp = 0
        self.change = 0
        self.rate = 0
        # Accumulated profit of the episode
        self.profit = 0
        # Partial amount traded / last SOC (scratch value written by step)
        self.per = 0
        # Median price, the reference level for the reward rules
        self.mid = np.median(self.priceArray)
        return self.state

In [215]:
# Create the environment instance that the following cells sample from, train on and test on.
env = ChargeEnv()

In [216]:
# Sanity check: draw one random observation from the environment's observation space.
env.observation_space.sample()

array([67.73381], dtype=float32)

In [218]:
# Random-policy baseline: play `episodes` episodes with uniformly sampled
# actions and print, per episode, the summed reward, the final state and the
# final profit reported by the environment.
episodes = 100
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # Every episode takes at least one step; stop as soon as the environment
    # reports that it is done.
    while True:
        action = env.action_space.sample()
        n_state, reward, done, info, p = env.step(action)
        score = score + reward
        if done:
            break

    print('Episode:{} Score:{} State:{} p:{}'.format(episode, score, n_state, p))

Episode:1 Score:-977.4400048851967 State:80 p:-74.15000180006027
Episode:2 Score:-1020.480000436306 State:60 p:-62.95049940347671
Episode:3 Score:-619.4399985671043 State:100 p:-32.71099944114685
Episode:4 Score:-1180.860003054142 State:80 p:-129.02700178027152
Episode:5 Score:-579.1399993300438 State:80 p:-70.76099820137024
Episode:6 Score:-1165.4400028586388 State:80 p:-136.42000246047974
Episode:7 Score:-754.8200015425682 State:60 p:-15.95000247955322
Episode:8 Score:-1170.180002629757 State:80 p:-106.21799936294556
Episode:9 Score:-1186.9200006127357 State:100 p:-52.56000143289566
Episode:10 Score:-884.7399994730949 State:80 p:-102.96800132989884
Episode:11 Score:-1010.3399997353554 State:80 p:-61.94700312614441
Episode:12 Score:-1185.5000029206276 State:100 p:-97.89900096654893
Episode:13 Score:-659.3800048232079 State:80 p:-64.91000058650971
Episode:14 Score:-543.3799949288368 State:63 p:-25.389998018741608
Episode:15 Score:-1042.9200039505959 State:80 p:-78.86999970674515
Episod

In [204]:
# Observation shape and number of discrete actions, read from the environment's spaces;
# both are used to size the network built below.
states = env.observation_space.shape
actions = env.action_space.n

In [205]:
# Display the number of discrete actions.
actions

3

In [206]:
def build_model(states, actions):
    """Return an uncompiled feed-forward network with one output per action.

    Args:
        states: shape tuple used as ``input_shape`` of the first layer.
        actions: number of units in the output layer.
    """
    return Sequential([
        Dense(24, activation='relu', input_shape=states),
        Dense(24, activation='relu'),
        # Linear output layer (softmax was the alternative noted by the author)
        Dense(actions, activation='linear'),
    ])

In [207]:
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy':
# drop a model left over from an earlier run so the next cell builds a fresh one.
# On a fresh kernel there is no `model` yet, so the missing name is ignored
# instead of aborting a top-to-bottom run with NameError.
try:
    del model
except NameError:
    pass

In [208]:
# Build a fresh network sized from the observation shape and the action count.
model = build_model(states, actions)

In [209]:
# Print the layers, output shapes and parameter counts of the network.
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 24)                48        
                                                                 
 dense_22 (Dense)            (None, 24)                600       
                                                                 
 dense_23 (Dense)            (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [210]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent and return it (not yet compiled).

    The agent uses a Boltzmann Q policy and a sequential replay memory
    (limit 50000, window length 1), with 100 warm-up steps and a target
    model update value of 1e-2.

    Args:
        model: the network handed to the agent as ``model``.
        actions: number of actions, handed to the agent as ``nb_actions``.
    """
    return DQNAgent(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )

In [211]:
# Build the DQN agent around the network, compile it and train it on the environment.
dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword of tf.keras optimizers; `lr` is a deprecated alias.
# Metric: 'mse' = mean squared error ('mae' = mean absolute error is the alternative).
dqn.compile(Adam(learning_rate=1e-4), metrics=['mse'])
dqn.fit(env, nb_steps=40000, visualize=False, verbose=1)

Training for 40000 steps ...
Interval 1 (0 steps performed)
435 episodes - episode_reward: -735.230 [-10052.250, 1401.600] - loss: 29416.277 - mse: 50414.204 - mean_q: -102.666

Interval 2 (10000 steps performed)
435 episodes - episode_reward: -427.843 [-660.740, -348.360] - loss: 20839.211 - mse: 98825.258 - mean_q: -174.019

Interval 3 (20000 steps performed)
435 episodes - episode_reward: -419.600 [-419.600, -419.600] - loss: 17412.863 - mse: 92638.172 - mean_q: -150.864

Interval 4 (30000 steps performed)
done, took 394.041 seconds


<keras.callbacks.History at 0x2216eaed6a0>

In [212]:
# Run the trained agent for 100 test episodes and print the mean of the recorded episode rewards.
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -419.600, steps: 23
Episode 2: reward: -419.600, steps: 23
Episode 3: reward: -419.600, steps: 23
Episode 4: reward: -419.600, steps: 23
Episode 5: reward: -419.600, steps: 23
Episode 6: reward: -419.600, steps: 23
Episode 7: reward: -419.600, steps: 23
Episode 8: reward: -419.600, steps: 23
Episode 9: reward: -419.600, steps: 23
Episode 10: reward: -419.600, steps: 23
Episode 11: reward: -419.600, steps: 23
Episode 12: reward: -419.600, steps: 23
Episode 13: reward: -419.600, steps: 23
Episode 14: reward: -419.600, steps: 23
Episode 15: reward: -419.600, steps: 23
Episode 16: reward: -419.600, steps: 23
Episode 17: reward: -419.600, steps: 23
Episode 18: reward: -419.600, steps: 23
Episode 19: reward: -419.600, steps: 23
Episode 20: reward: -419.600, steps: 23
Episode 21: reward: -419.600, steps: 23
Episode 22: reward: -419.600, steps: 23
Episode 23: reward: -419.600, steps: 23
Episode 24: reward: -419.600, steps: 23
Episode 25: reward: 

In [None]:
#dqn.save_weights('dqn_weights.h5f', overwrite=True)