In [3]:
import gymnasium as gym
from gym import spaces
import pygame
import numpy as np
import time
import matplotlib.pyplot as plt
import time
import random
import pandas as pd

#Set the seed for reproducibility
np.random.seed(7)
random.seed(7)

In [7]:
train = pd.read_excel("train.xlsx", parse_dates=['PRICES'])
train['month'] = train['PRICES'].dt.month - 1
train['day'] = train['PRICES'].dt.day - 1
train['year'] = train['PRICES'].dt.year
train.head()

Unnamed: 0,PRICES,Hour 01,Hour 02,Hour 03,Hour 04,Hour 05,Hour 06,Hour 07,Hour 08,Hour 09,...,Hour 18,Hour 19,Hour 20,Hour 21,Hour 22,Hour 23,Hour 24,month,day,year
0,2007-01-01,24.31,24.31,21.71,8.42,0.01,0.01,0.02,0.01,0.01,...,27.77,37.99,33.11,37.99,33.0,36.48,30.65,0,0,2007
1,2007-01-02,16.01,11.0,9.01,7.5,9.0,7.45,16.5,28.01,29.96,...,53.0,59.69,50.09,50.0,36.22,31.09,29.84,0,1,2007
2,2007-01-03,28.0,24.5,24.15,18.99,15.5,23.11,29.01,39.73,43.81,...,57.0,60.99,55.51,51.77,34.51,39.31,38.05,0,2,2007
3,2007-01-04,31.01,28.69,27.0,24.2,12.02,27.0,28.0,34.85,41.31,...,53.0,58.99,53.5,44.01,35.0,34.46,31.57,0,3,2007
4,2007-01-05,22.0,19.5,17.55,15.91,11.4,22.65,27.3,34.0,40.01,...,49.84,53.0,45.91,36.74,29.61,30.61,27.51,0,4,2007


In [41]:
class SmartGridEnv(gym.Env):
    def __init__(self, price_data, battery_capacity=50, max_power=25, efficiency=0.9):
        super(SmartGridEnv, self).__init__()

        self.price_data = price_data
        self.battery_capacity = battery_capacity
        self.max_power = max_power
        self.time_constraint = 7
        self.battery_time_constraint = 20
        self.efficiency = efficiency

        self.current_step = 0
        self.current_month = 0
        self.current_day = 0
        self.current_hour = 0
        self.current_battery = 50  # Initial battery level
        self.current_state = [self.current_step, self.current_month, self.current_day, self.current_hour, self.current_battery]
        self.done = False
        self.profit = 0  # Initial profit

        # Action space: 0 for buying, 1 for selling, 2 for holding
        self.action_space = gym.spaces.MultiDiscrete([101], start=[-50])

        # Observation space: current step, current hour, and current battery level
        self.observation_space = gym.spaces.MultiDiscrete([len(price_data), 24, battery_capacity])

    def reset(self):
        self.current_step = 0
        self.current_month = 0
        self.current_day = 0
        self.current_hour = 0
        self.current_battery = 50
        self.current_state = [self.current_step, self.current_month, self.current_day, self.current_hour, self.current_battery]
        self.done = False
        self.profit = 0  # Reset profit to 0

    def step(self, action):
        if self.done:
            raise ValueError("Episode is done. Call reset() to start a new episode.")
    
        # Execute the action
        charge_cost = action * 2
        actual_charge = self.efficiency * action

        self.current_battery += actual_charge
        self.profit += charge_cost
    
        # Update time step and hour
        self.current_step += 1
        self.current_month = int(self.price_data.iloc[self.current_step]['month'])
        self.current_day = int(self.price_data.iloc[self.current_step]['day'])
        self.current_hour = int(self.current_step % 24)
    
        # Update state
        self.current_state = [self.current_step, self.current_month, self.current_day, self.current_hour, self.current_battery]
    
        # Check if the episode is done (after 3 years of past electricity prices)
        if self.current_step >= len(self.price_data) - 1:
            self.done = True
    
        return self.current_state, charge_cost, self.done, {}

    def render(self, mode="human"):
        if mode == "human":
            if not self.done:
                print(f"Step: {self.current_step}, Hour: {self.current_hour}, Battery: {self.current_battery}, Profit: {self.profit}")
            else:
                print("Episode finished.")
        elif mode == "pygame":
            self._render_pygame()
        elif mode == "rgb_array":
            raise NotImplementedError("Rendering as rgb_array is not supported for this environment.")

    def close(self):
        pass

In [36]:
# Extracting price columns
price_data = train.iloc[:, 1:]

# Create an instance of the environment
env = SmartGridEnv(price_data)

# Run the environment for a few steps
for _ in range(24):
    action = env.action_space.sample()  # Replace with your RL agent's action
    state, profit, done, _ = env.step(action.item())
    env.render(mode="human")  # Print the current state (for demonstration)
    if done:
        print("Episode finished.")
        break

# Reset the environment for a new episode
env.reset()

# Optionally, you can visualize the state after resetting
env.render(mode="human")

# Close the environment
env.close()
print(env.action_space)

Step: 1, Hour: 1, Battery: 11.299999999999997, Profit: -86
Step: 2, Hour: 2, Battery: 8.599999999999998, Profit: -92
Step: 3, Hour: 3, Battery: 52.7, Profit: 6
Step: 4, Hour: 4, Battery: 77.9, Profit: 62
Step: 5, Hour: 5, Battery: 66.2, Profit: 36
Step: 6, Hour: 6, Battery: 28.4, Profit: -48
Step: 7, Hour: 7, Battery: -14.800000000000004, Profit: -144
Step: 8, Hour: 8, Battery: 0.49999999999999645, Profit: -110
Step: 9, Hour: 9, Battery: -15.700000000000003, Profit: -146
Step: 10, Hour: 10, Battery: -9.400000000000002, Profit: -132
Step: 11, Hour: 11, Battery: -13.000000000000002, Profit: -140
Step: 12, Hour: 12, Battery: -48.1, Profit: -218
Step: 13, Hour: 13, Battery: -86.80000000000001, Profit: -304
Step: 14, Hour: 14, Battery: -67.00000000000001, Profit: -260
Step: 15, Hour: 15, Battery: -30.100000000000016, Profit: -178
Step: 16, Hour: 16, Battery: -48.100000000000016, Profit: -218
Step: 17, Hour: 17, Battery: -35.500000000000014, Profit: -190
Step: 18, Hour: 18, Battery: -22.0000

Note 1: not sure if (self.max_power / self.efficiency) is the right way to factor in efficiency as this increases MW. \
Note 2: Electricity is sold at the spot market price 
but is bought at twice the price to pay for the transmission costs and various taxes so when buying cost should be 2x. \
Note 3: reward maybe shouldn't be self.profit. If agent holds and proft is up it will be counted as a reward.\
Note 4: in step function remove np.array() from reward does something else with multidiscrete\
Note 5: discretize in agent instead of step function.

In [36]:
class QAgent():
    def __init__(self, env, discount_rate = 0.95, learning_rate=0.01, epsilon=0.05):
        self.env = env
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.action_space = 3
        self.Qtable = np.zeros([12, 31, 24, self.action_space]) # month,day,hour,capacity, action_space
        self.rewards = []


    def mask(self, action_type):
        # 8 am constraint
        if env.current_hour == env.time_constraint and env.current_battery < env.battery_time_constraint:
            buy_range = env.battery_time_constraint - env.current_battery
            mask = [buy_range, env.max_power]
                        
        else:
            buy_range = min(env.battery_capacity - env.current_battery, env.max_power)
            sell_range = -min(env.current_battery, env.max_power)
            mask = [sell_range, buy_range]

        return mask
        

    def train(self, simulations = 1000, epsilon = 0.05):
        for i in range(simulations):
            while not self.env.done:
                current_state = self.env.current_state
                
                # pick random action
                _, month, day, hour,_ = current_state
                if np.random.uniform(0,1) > 1-self.epsilon:
                    action = self.env.action_space.sample().item() # discretize
                
                # pick greedy action
                else:
                    action = np.argmax(self.Qtable[month, day, hour, :])

                next_state, reward, done, _ = env.step(action)
                next_month, next_day, next_hour = next_state[1], next_state[2], next_state[3]
                # self.rewards.append(reward)
                
                # update Q_values
                Q_target = (reward + self.discount_rate*np.max(self.Qtable[next_month, next_day, next_hour, :]))
                delta = self.learning_rate * (Q_target - self.Qtable[month, day, hour, action])
                self.Qtable[month, day, hour, action] = self.Qtable[month, day, hour, action] + delta

            self.env.reset()

    def visulaize_rewards(self):
        # TODO
        pass

    def play_game(self):
        #TODO
        pass

env = SmartGridEnv(price_data)
agent = QAgent(env)
agent.train(simulations=100)
agent.Qtable[0, 0, :]

array([[  2.04225382, 558.67631863,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        ],
       [  0.        ,   0.     