In [8]:
import tensorflow as tf
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.regularizers import l2

In [2]:
np.random.seed(42)

In [3]:
# Electricity price per unit for each hour (higher price during peak hours)
electricity_price_per_unit = np.random.randint(1, 6, size=12).tolist() + np.random.randint(6, 11, size=12).tolist()

# Solar power generation (0 at night, 1-10 units during the day)
solar_power_generation = np.concatenate([np.zeros(6), np.random.randint(1, 11, size=12), np.zeros(6)])

# Electricity load profile (higher demand during day/evening, lower at night)
electricity_demand = np.concatenate([np.random.randint(3, 12, size=6), np.random.randint(12, 20, size=12), np.random.randint(3, 12, size=6)])

# Print arrays to verify
print("Electricity Price Per Unit:", electricity_price_per_unit)
print("Solar Power Generation:", solar_power_generation)
print("Electricity Demand:", electricity_demand)

Electricity Price Per Unit: [4, 5, 3, 5, 5, 2, 3, 3, 3, 5, 4, 3, 10, 7, 9, 7, 9, 10, 6, 9, 7, 10, 9, 6]
Solar Power Generation: [ 0.  0.  0.  0.  0.  0.  1. 10.  3.  7.  4.  9.  3.  5.  3.  7.  5.  9.
  0.  0.  0.  0.  0.  0.]
Electricity Demand: [ 9  4  6 11  4 11 13 16 13 15 15 18 15 18 15 16 19 18  5  3  6  4 10  6]


In [48]:
init_state = 0
init_battery = 30
min_battery = 0
max_battery = 50
action_dim = [0,1]

In [62]:
# Define Q_online network for binary classification
Q_online = Sequential([
    Input(shape=(2,)),                          # Input layer with shape (24,)
    Dense(8, activation='relu', kernel_regularizer=l2(0.01)),  # Hidden Layer 1 with L2 regularization
    Dense(2, activation='linear')               # Output layer with 1 unit and sigmoid activation for binary classification
])

Q_target = Sequential([
    Input(shape=(2,)),                          # Input layer with shape (24,)
    Dense(8, activation='relu', kernel_regularizer=l2(0.01)),  # Hidden Layer 1 with L2 regularization
    Dense(2, activation='linear')              # Output layer with 1 unit and sigmoid activation for binary classification
])

# Compile the Q_online model for Q-learning
Q_online.compile(optimizer='adam', 
                 loss='mean_squared_error',  # MSE loss for Q-learning
                 metrics=['mae'])  # Mean Absolute Error can also be used to monitor learning

# Compile the Q_online model for Q-learning
Q_target.compile(optimizer='adam', 
                 loss='mean_squared_error',  # MSE loss for Q-learning
                 metrics=['mae'])  # Mean Absolute Error can also be used to monitor learning

In [56]:
# Hyperparameters
num_episodes = 1
batch_size = 32
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995
target_update_freq = 1000
buffer_size = 10000
learning_rate = 0.001
replay_buffer = deque(maxlen=buffer_size)

In [28]:
#Q_target.set_weights(Q_online.get_weights())

In [27]:
#Q_target.get_weights()

In [26]:
#Q_online.get_weights()

In [51]:
np.random.choice([0,1])  

1

In [61]:
q_values

array([[2.5924516e-08]], dtype=float32)

In [68]:
state[0]

0

In [91]:
for episode in range(num_episodes):
    state = np.array([0, init_battery])  # Initialize state: [hour, battery_level]
    reward = 0  # Reset reward at the start of each episode
    while state[0] < 24:  # Loop for 24 hours (states from 0 to 23)
        # Exploration vs Exploitation: Choose action based on epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = np.random.choice([0, 1])  # Random action (0: Charge, 1: Discharge)
        else:
            q_values = Q_online.predict(state[np.newaxis])  # Predict Q-values from current state
            action = np.argmax(q_values)  # Choose the action with the highest Q-value

        # Charging logic (action = 0)
        if action == 0:  # Charge the battery
            reward += -(electricity_price_per_unit[state[0]] * electricity_demand[state[0]])  # Cost of charging
            state[1] += solar_power_generation[state[0]]  # Add solar power to the battery
        else:  # Discharge the battery (action = 1)
            if state[1] >= electricity_demand[state[0]]:  # If there's enough battery power
                reward += 0  # No additional cost for discharging
                state[1] -= electricity_demand[state[0]]  # Discharge battery
            else:  # Not enough battery power to discharge
                # If not enough energy, the agent will need to buy extra electricity from the grid
                reward -= (electricity_price_per_unit[state[0]] * (electricity_demand[state[0]] - state[1]))  # Cost of extra electricity
                state[1] = 0  # Battery is empty after discharge

        # Ensure battery level is within valid range (0 to max_battery_capacity)
        state[1] = np.clip(state[1], 0, max_battery)
        print(f"Hour {state[0]}, State: {state}, Reward: {reward}")
        # Increment the hour
        state[0] += 1  # Move to the next hour

        
# Electricity Price Per Unit: [4, 5, 3, 5, 5, 2, 3, 3, 3, 5, 4, 3, 10, 7, 9, 7, 9, 10, 6, 9, 7, 10, 9, 6]
# Solar Power Generation: [ 0.  0.  0.  0.  0.  0.  1. 10.  3.  7.  4.  9.  3.  5.  3.  7.  5.  9.
#   0.  0.  0.  0.  0.  0.]
# Electricity Demand: [ 9  4  6 11  4 11 13 16 13 15 15 18 15 18 15 16 19 18  5  3  6  4 10  6]

Hour 0, State: [ 0 21], Reward: 0
Hour 1, State: [ 1 21], Reward: -20
Hour 2, State: [ 2 15], Reward: -20
Hour 3, State: [ 3 15], Reward: -75
Hour 4, State: [ 4 15], Reward: -95
Hour 5, State: [5 4], Reward: -95
Hour 6, State: [6 5], Reward: -134
Hour 7, State: [7 0], Reward: -167
Hour 8, State: [8 3], Reward: -206
Hour 9, State: [9 0], Reward: -266
Hour 10, State: [10  0], Reward: -326
Hour 11, State: [11  0], Reward: -380
Hour 12, State: [12  3], Reward: -530
Hour 13, State: [13  8], Reward: -656
Hour 14, State: [14 11], Reward: -791
Hour 15, State: [15 18], Reward: -903
Hour 16, State: [16 23], Reward: -1074
Hour 17, State: [17  5], Reward: -1074
Hour 18, State: [18  5], Reward: -1104
Hour 19, State: [19  2], Reward: -1104
Hour 20, State: [20  0], Reward: -1132
Hour 21, State: [21  0], Reward: -1172
Hour 22, State: [22  0], Reward: -1262
Hour 23, State: [23  0], Reward: -1298


Initialize environment:
    - Define electricity price per unit, solar power generation, electricity demand, and initial battery level
    - Set the current step to 0

Initialize Q-networks:
    - Q_online (the model being trained)
    - Q_target (the target model)

Initialize replay buffer (empty)

Initialize optimizer (e.g., Adam with learning rate)

Set hyperparameters:
    - gamma = discount factor
    - epsilon = exploration rate (starting value)
    - epsilon_min = minimum epsilon value
    - epsilon_decay = epsilon decay factor
    - num_episodes = number of episodes to train the model
    - batch_size = size of the batch sampled from the replay buffer
    - target_update_freq = frequency of updating the target network

Main Training Loop (for each episode):
    Reset environment
    - Set initial state (price, battery level)
    - Initialize total episode reward = 0

    While not done (until 24 steps):
        1. Select action (0 = charge, 1 = discharge):
            - With probability epsilon, select a random action (explore)
            - Otherwise, select the action with the highest Q-value (exploit) based on the current state

        2. Execute the action:
            - Apply the action in the environment (charge/discharge solar)
            - Get the reward (based on battery state and electricity price)
            - Get the next state (price, updated battery level)
            - Check if the episode is done (24 steps reached)

        3. Store the transition (state, action, reward, next_state, done) in the replay buffer

        4. Sample a minibatch of transitions from the replay buffer (size = batch_size)

        5. For each transition in the minibatch:
            - Calculate the target Q-value using the Bellman equation:
                Q_target = reward + gamma * max(Q_next_state) if not done else reward

        6. Perform a gradient descent step on Q_online:
            - Use the loss function (Mean Squared Error) between predicted Q-values and target Q-values
            - Update Q_online weights

        7. Periodically update the target network (Q_target) with Q_online weights

        8. Decay epsilon (epsilon = max(epsilon_min, epsilon * epsilon_decay))

    Print the total reward for the episode

    After all episodes:
        - Q_online model is trained to minimize the cost of electricity usage by optimizing charging/discharging decisions

End


In [97]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers

# Hyperparameters
num_episodes = 100
max_battery = 100  # Maximum battery capacity
epsilon = 1.0  # Exploration probability
epsilon_min = 0.1
epsilon_decay = 0.995
gamma = 0.99  # Discount factor
learning_rate = 0.001
batch_size = 32  # Batch size for replay buffer sampling

# Initialize replay buffer
class ReplayBuffer:
    def __init__(self, max_size=10000):
        self.buffer = []
        self.max_size = max_size
    
    def store(self, state, action, reward, next_state):
        # If the buffer is full, remove the oldest transition
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state))
    
    def sample(self, batch_size):
        # Randomly sample a batch of transitions from the buffer
        return random.sample(self.buffer, batch_size)
    
    def size(self):
        return len(self.buffer)

# Initialize Q-network (online and target networks)
class QNetwork(tf.keras.Model):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.dense1 = layers.Dense(8, activation='relu')
        self.dense2 = layers.Dense(8, activation='relu')
        self.dense3 = layers.Dense(2)  # Output layer for two actions (charge or discharge)

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        return self.dense3(x)

# Initialize Q-network (online and target networks)
Q_online = QNetwork()
Q_target = QNetwork()  # Target network
Q_online_optimizer = tf.keras.optimizers.Adam(learning_rate)

# Function to update Q-values using the Q-learning algorithm
def update_q_values(state, action, reward, next_state):
    with tf.GradientTape() as tape:
        # Predict Q-values for the current state using Q_online
        q_values = Q_online(state[np.newaxis])  # Shape: (1, 2) for two actions
        q_value = q_values[0, action]

        # Predict Q-values for the next state using Q_target
        next_q_values = Q_target(next_state[np.newaxis])  # Shape: (1, 2)
        next_q_value = np.max(next_q_values)  # Max Q-value for next state
        
        # Calculate the target Q-value
        target = reward + gamma * next_q_value

        # Compute the loss (mean squared error)
        loss = tf.reduce_mean(tf.square(target - q_value))

    # Compute gradients and apply them to the Q-online network
    grads = tape.gradient(loss, Q_online.trainable_variables)
    Q_online_optimizer.apply_gradients(zip(grads, Q_online.trainable_variables))

# Function for soft update of Q_target network
def update_target_network():
    Q_target.set_weights(Q_online.get_weights())

# Initialize replay buffer
replay_buffer = ReplayBuffer(max_size=10000)

# Main training loop
for episode in range(num_episodes):
    state = np.array([0, init_battery])  # Initialize state: [hour, battery_level]
    reward = 0  # Reset reward at the start of each episode
    while state[0] < 24:  # Loop for 24 hours (states from 0 to 23)
        # Exploration vs Exploitation: Choose action based on epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = np.random.choice([0, 1])  # Random action (0: Charge, 1: Discharge)
        else:
            q_values = Q_online.predict(state[np.newaxis])  # Predict Q-values from current state
            action = np.argmax(q_values)  # Choose the action with the highest Q-value

        # Charging logic (action = 0)
        if action == 0:  # Charge the battery
            reward += -(electricity_price_per_unit[state[0]] * electricity_demand[state[0]])  # Cost of charging
            state[1] += solar_power_generation[state[0]]  # Add solar power to the battery
        else:  # Discharge the battery (action = 1)
            if state[1] >= electricity_demand[state[0]]:  # If there's enough battery power
                reward += 0  # No additional cost for discharging
                state[1] -= electricity_demand[state[0]]  # Discharge battery
            else:  # Not enough battery power to discharge
                # If not enough energy, the agent will need to buy extra electricity from the grid
                reward -= (electricity_price_per_unit[state[0]] * (electricity_demand[state[0]] - state[1]))  # Cost of extra electricity
                state[1] = 0  # Battery is empty after discharge

        # Ensure battery level is within valid range (0 to max_battery_capacity)
        state[1] = np.clip(state[1], 0, max_battery)
        
        # Store experience in replay buffer
        replay_buffer.store(state, action, reward, state)

        # Increment the hour
        state[0] += 1  # Move to the next hour

        # Perform experience replay
        if replay_buffer.size() >= batch_size:
            minibatch = replay_buffer.sample(batch_size)
            for state_batch, action_batch, reward_batch, next_state_batch in minibatch:
                update_q_values(state_batch, action_batch, reward_batch, next_state_batch)

    # Epsilon decay
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Periodic target network update (every few episodes)
    if episode % 10 == 0:
        update_target_network()

    print(f"Episode {episode + 1}/{num_episodes}, Reward: {reward}, Epsilon: {epsilon}")



Episode 1/100, Reward: -1390, Epsilon: 0.995
Episode 2/100, Reward: -1295, Epsilon: 0.990025
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Episode 3/100, Reward: -1350, Epsilon: 0.985074875
Episode 4/100, Reward: -1213, Epsilon: 0.9801495006250001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Episode 5/100, Reward: -1177, Epsilon: 0.9752487531218751
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Episode 6/100, Reward: -1297, Epsilon: 0.9703725093562657
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Episode 7/100, Reward: -1225, Epsilon: 0.9655206468094844
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Episode 8/100, Reward: -1321, Epsilon: 0.960693043575437
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

KeyboardInterrupt: 

In [99]:
max_battery

100

In [101]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers

# Hyperparameters
num_episodes = 50  # Reduced number of episodes for faster testing
max_battery = 100  # Maximum battery capacity
epsilon = 1.0  # Exploration probability
epsilon_min = 0.1
epsilon_decay = 0.995
gamma = 0.99  # Discount factor
learning_rate = 0.005  # Increased learning rate for faster convergence
batch_size = 16  # Reduced batch size for faster updates

# Initialize replay buffer
class ReplayBuffer:
    def __init__(self, max_size=10000):
        self.buffer = []
        self.max_size = max_size
    
    def store(self, state, action, reward, next_state):
        # If the buffer is full, remove the oldest transition
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state))
    
    def sample(self, batch_size):
        # Randomly sample a batch of transitions from the buffer
        return random.sample(self.buffer, batch_size)
    
    def size(self):
        return len(self.buffer)

# Initialize Q-network (online and target networks)
class QNetwork(tf.keras.Model):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.dense1 = layers.Dense(4, activation='relu')  # Reduced layer size
        self.dense2 = layers.Dense(4, activation='relu')
        self.dense3 = layers.Dense(2)  # Output layer for two actions (charge or discharge)

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        return self.dense3(x)

# Initialize Q-network (online and target networks)
Q_online = QNetwork()
Q_target = QNetwork()  # Target network
Q_online_optimizer = tf.keras.optimizers.Adam(learning_rate)

# Function to update Q-values using the Q-learning algorithm
def update_q_values(states, actions, rewards, next_states):
    with tf.GradientTape() as tape:
        q_values = Q_online(states)  # Use batch processing directly
        q_value = tf.reduce_sum(q_values * tf.one_hot(actions, 2), axis=1)

        next_q_values = Q_target(next_states)  # Use batch processing for next states
        next_q_value = tf.reduce_max(next_q_values, axis=1)

        target = rewards + gamma * next_q_value
        loss = tf.reduce_mean(tf.square(target - q_value))

    grads = tape.gradient(loss, Q_online.trainable_variables)
    Q_online_optimizer.apply_gradients(zip(grads, Q_online.trainable_variables))

# Function for soft update of Q_target network
def update_target_network():
    Q_target.set_weights(Q_online.get_weights())

# Initialize replay buffer
replay_buffer = ReplayBuffer(max_size=10000)

# Simulated environment parameters (Replace with actual data)
electricity_price_per_unit = np.random.randint(1, 6, size=12).tolist() + np.random.randint(6, 11, size=12).tolist()

# Solar power generation (0 at night, 1-10 units during the day)
solar_power_generation = np.concatenate([np.zeros(6), np.random.randint(1, 11, size=12), np.zeros(6)])

# Electricity load profile (higher demand during day/evening, lower at night)
electricity_demand = np.concatenate([np.random.randint(3, 12, size=6), np.random.randint(12, 20, size=12), np.random.randint(3, 12, size=6)])

# Main training loop
for episode in range(num_episodes):
    state = np.array([0, max_battery])  # Initialize state: [hour, battery_level]
    reward = 0  # Reset reward at the start of each episode
    while state[0] < 24:
        # Exploration vs Exploitation: Choose action based on epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = np.random.choice([0, 1])  # Random action (0: Charge, 1: Discharge)
        else:
            q_values = Q_online.predict(state[np.newaxis])  # Predict Q-values from current state
            action = np.argmax(q_values)  # Choose the action with the highest Q-value

        # Charging logic (action = 0)
        if action == 0:  # Charge the battery
            reward += -(electricity_price_per_unit[state[0]] * electricity_demand[state[0]])  # Cost of charging
            state[1] += solar_power_generation[state[0]]  # Add solar power to the battery
        else:  # Discharge the battery (action = 1)
            if state[1] >= electricity_demand[state[0]]:  # If there's enough battery power
                reward += 0  # No additional cost for discharging
                state[1] -= electricity_demand[state[0]]  # Discharge battery
            else:  # Not enough battery power to discharge
                # If not enough energy, the agent will need to buy extra electricity from the grid
                reward -= (electricity_price_per_unit[state[0]] * (electricity_demand[state[0]] - state[1]))  # Cost of extra electricity
                state[1] = 0  # Battery is empty after discharge

        # Ensure battery level is within valid range (0 to max_battery_capacity)
        state[1] = np.clip(state[1], 0, max_battery)
        
        # Store experience in replay buffer
        replay_buffer.store(state, action, reward, state)

        # Increment the hour
        state[0] += 1  # Move to the next hour

        # Perform experience replay
        if replay_buffer.size() >= batch_size:
            minibatch = replay_buffer.sample(batch_size)
            states_batch = np.array([item[0] for item in minibatch])
            actions_batch = np.array([item[1] for item in minibatch])
            rewards_batch = np.array([item[2] for item in minibatch])
            next_states_batch = np.array([item[3] for item in minibatch])
            update_q_values(states_batch, actions_batch, rewards_batch, next_states_batch)

    # Epsilon decay
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Periodic target network update (every two episode)
    if episode % 2 == 0:
        update_target_network()

    print(f"Episode {episode + 1}/{num_episodes}, Reward: {reward}, Epsilon: {epsilon}")


Episode 1/50, Reward: -997, Epsilon: 0.995
Episode 2/50, Reward: -1144, Epsilon: 0.990025
Episode 3/50, Reward: -1043, Epsilon: 0.985074875
Episode 4/50, Reward: -922, Epsilon: 0.9801495006250001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Episode 5/50, Reward: -592, Epsilon: 0.9752487531218751
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Episode 6/50, Reward: -664, Epsilon: 0.9703725093562657
Episode 7/50, Reward: -1110, Epsilon: 0.9655206468094844
Episode 8/50, Reward: -935, Epsilon: 0.960693043575437
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Episode 9/50, Reward: -923, Epsilon: 0.9558895783575597
Episode 10/50, Reward: -1175, Epsilon: 0.9511101304657719
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Episode 11/50, Reward: -1042, Epsilon: 0.946354579813443
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Episode 12/50, Reward: -755, Epsilon: 0.941622806