In [102]:
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,  ModelCheckpoint, TensorBoard, CSVLogger

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [133]:
class ChargeEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Observation box
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))
        # Set start SOC
        self.state = 20 #+ random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        
    def step(self, action):
        # Apply action
        # 0 - buy
        # 1 - sell
        # 2 - hold 
 

        
        # If state exceeds boundaries after the action due to power limit, set state to 100 or 0
        # change action based on new state
        if 100 < self.state < 120:
            self.state = 100
            action = 1
        if -20 < self.state < 0:
            self.state = 0
            action = 0
            
        # Buy when state < 100 while approaching the end    
        if (self.max_state - self.state) / self.power >= len(self.priceArray - 1) - self.currentIndex:
            action = 0
            
        # Calculate reward    
        # BUy
        # Give - reward
        if action == 0:
            self.change = 1
            #if (self.max_state - self.state) < self.power:
                #self.temp = self.max_state - self.state
                #self.state = 100
            #if self.priceArray[self.currentIndex] < self.mid - 2:
            #    reward = self.priceArray[self.currentIndex]
            #else:
            #    reward = -1 * self.priceArray[self.currentIndex]
            reward = -self.priceArray[self.currentIndex] #* ((0.001 + self.state) / self.max_state)
                    
        # Sell
        # Give + reward
        elif action == 1:
            self.change = -1
            #if self.state < self.power :
                #self.temp = self.state - 0
                #self.state = 0
            #if self.priceArray[self.currentIndex] > self.mid + 2:
            #    reward = 1 * self.priceArray[self.currentIndex]
            #else:
            #    reward = -1 * self.priceArray[self.currentIndex]
            reward = self.priceArray[self.currentIndex] #* ((0.001 + self.state) / self.max_state)
                
        # Hold        
        else:
            self.change = 0
            reward = 0
        
        self.state += self.change * self.power 
            
        # Move to the next price
        self.currentIndex += 1
        
        # Set Low boundary and Up boundary
        #Set end state
        if self.currentIndex == len(self.priceArray)-1: 
            done = True
            if self.state < 80:
                reward = -1000
        elif self.state <= -20 or self.state >= 120:
                done = True
                reward = -1000
        else:
            done = False       

        

        # Set placeholder for info
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        # Reset shower temperature
        self.state = 20 #+ random.randint(-3,3)
        # Set max SOC
        self.max_state = 100
        # Set power
        self.power = 20
        # Set price data
        self.priceArray = np.float32(np.array([36.22, 27.98, 4.6, 9.38, -0.43, 2.26, 4.02, 8.48, 18.73,
                                               20.64, 22.99, 18.93, 16.67, 9.51, 3.59, 2.31, 2.78, 6.3, 14.18,
                                               16.11, 21.42, 25.87, 30.71, 18.44]))
        # Set current index
        self.currentIndex = 0
        # Set temp 
        self.temp = 0
        self.change = 0
        # Set median
        self.mid = np.median(self.priceArray)
        return self.state

In [134]:
env = ChargeEnv()

In [135]:
env.observation_space.sample()

array([46.79038], dtype=float32)

In [136]:
episodes = 100
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} State:{}'.format(episode, score, n_state))

Episode:1 Score:-972.9677431232196 State:60
Episode:2 Score:-1047.1819594575072 State:60
Episode:3 Score:-47.82798325872123 State:80
Episode:4 Score:-62.1081006480807 State:80
Episode:5 Score:-34.289843985569476 State:80
Episode:6 Score:-96.62173851014794 State:80
Episode:7 Score:-43.4697852816087 State:80
Episode:8 Score:-31.849850562810293 State:80
Episode:9 Score:-980.3459290659475 State:60
Episode:10 Score:-75.60202133058011 State:80
Episode:11 Score:-112.7758136752528 State:80
Episode:12 Score:-1076.5818218628758 State:60
Episode:13 Score:-1004.3857344701737 State:60
Episode:14 Score:-24.397916355524657 State:80
Episode:15 Score:-973.7960962872053 State:60
Episode:16 Score:-75.5458184686619 State:80
Episode:17 Score:-1020.7761140931725 State:60
Episode:18 Score:-132.267766261639 State:80
Episode:19 Score:-1039.902113788764 State:60
Episode:20 Score:-75.77987030708076 State:80
Episode:21 Score:-1059.8876611610276 State:60
Episode:22 Score:-70.20587352138462 State:80
Episode:23 Scor

In [137]:
states = env.observation_space.shape
actions = env.action_space.n

In [138]:
actions

3

In [139]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [140]:
# Apply when 'Sequential' object has no attribute '_compile_time_distribution_strategy'
del model 

In [141]:
model = build_model(states, actions)

In [142]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 24)                48        
                                                                 
 dense_25 (Dense)            (None, 24)                600       
                                                                 
 dense_26 (Dense)            (None, 3)                 75        
                                                                 
Total params: 723
Trainable params: 723
Non-trainable params: 0
_________________________________________________________________


In [143]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [144]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-4), metrics=['mse']) #mean squared error mse, mean absloute error mae.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
434 episodes - episode_reward: -147.369 [-1090.880, -17.816] - loss: 2560.557 - mse: 4391.467 - mean_q: -34.112

Interval 2 (10000 steps performed)
435 episodes - episode_reward: -78.000 [-78.000, -78.000] - loss: 1250.667 - mse: 4645.794 - mean_q: -41.371

Interval 3 (20000 steps performed)
435 episodes - episode_reward: -78.000 [-78.000, -78.000] - loss: 725.002 - mse: 3930.915 - mean_q: -43.060

Interval 4 (30000 steps performed)
435 episodes - episode_reward: -77.977 [-97.384, -57.042] - loss: 445.725 - mse: 3735.909 - mean_q: -51.464

Interval 5 (40000 steps performed)
  315/10000 [..............................] - ETA: 1:38 - reward: -6.3318done, took 401.277 seconds


<keras.callbacks.History at 0x2557e5d1fa0>

In [145]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -78.000, steps: 23
Episode 2: reward: -78.000, steps: 23
Episode 3: reward: -78.000, steps: 23
Episode 4: reward: -78.000, steps: 23
Episode 5: reward: -78.000, steps: 23
Episode 6: reward: -78.000, steps: 23
Episode 7: reward: -78.000, steps: 23
Episode 8: reward: -78.000, steps: 23
Episode 9: reward: -78.000, steps: 23
Episode 10: reward: -78.000, steps: 23
Episode 11: reward: -78.000, steps: 23
Episode 12: reward: -78.000, steps: 23
Episode 13: reward: -78.000, steps: 23
Episode 14: reward: -78.000, steps: 23
Episode 15: reward: -78.000, steps: 23
Episode 16: reward: -78.000, steps: 23
Episode 17: reward: -78.000, steps: 23
Episode 18: reward: -78.000, steps: 23
Episode 19: reward: -78.000, steps: 23
Episode 20: reward: -78.000, steps: 23
Episode 21: reward: -78.000, steps: 23
Episode 22: reward: -78.000, steps: 23
Episode 23: reward: -78.000, steps: 23
Episode 24: reward: -78.000, steps: 23
Episode 25: reward: -78.000, steps: 23
Episo

In [None]:
#dqn.save_weights('dqn_weights.h5f', overwrite=True)