In [325]:
import pandas as pd
import numpy as np

from dataset import get_dataset, add_derivatives
from env_discrete import BatteryDiscrete
from qlearning import QLearning

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from plot import display_profit, display_schedule
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import datetime
import warnings


In [326]:
# Load the full price table through the project-local helper. The cells below
# rely on it exposing a datetime `timestamp` column and a `price` column.
df = get_dataset()

In [327]:
# Chronological split: calendar year 2020 is used for training, 2021 for testing.
df_train = df.loc[df["timestamp"].dt.year == 2020].reset_index(drop=True)
df_test = df.loc[df["timestamp"].dt.year == 2021].reset_index(drop=True)

# Min-max scale the price. The scaler is fitted on the training year only and
# then re-used unchanged on the test year, so no test statistics leak into it.
scaler = MinMaxScaler()
df_train["scaled_price"] = scaler.fit_transform(df_train[["price"]].to_numpy())
df_test["scaled_price"] = scaler.transform(df_test[["price"]].to_numpy())

In [328]:





# Feature: scaled price divided by its trailing rolling mean, then quantile-binned.
# NOTE: despite the "daily" in the name, the window is
# `rolling_windows_price_over_mean` rows, not a calendar day.
price_over_daily_mean_bins = 5  # 5
rolling_windows_price_over_mean = 5

# Ratio of the current scaled price to its rolling mean. The first
# (window - 1) rows have no full window and are therefore NaN.
df_train["price_over_daily_mean"] = df_train["scaled_price"].div(
    df_train["scaled_price"].rolling(rolling_windows_price_over_mean).mean()
).to_numpy()
df_test["price_over_daily_mean"] = df_test["scaled_price"].div(
    df_test["scaled_price"].rolling(rolling_windows_price_over_mean).mean()
).to_numpy()

# Quantile edges are learnt on the training frame only ...
df_train["price_over_daily_mean"], price_over_daily_mean_quantiles = pd.qcut(
    df_train["price_over_daily_mean"],
    q=price_over_daily_mean_bins,
    retbins=True,
    labels=list(range(price_over_daily_mean_bins)),
)
# ... the outer edges are opened up so test values outside the training range
# still fall into the first / last bin ...
price_over_daily_mean_quantiles[[0, -1]] = [-np.inf, np.inf]
# ... and the same edges are applied to the test frame.
df_test["price_over_daily_mean"] = pd.cut(
    df_test["price_over_daily_mean"],
    bins=price_over_daily_mean_quantiles,
    labels=list(range(price_over_daily_mean_bins)),
)



In [334]:
# Two price-slope features produced by the project-local `add_derivatives`
# helper, each quantile-binned with edges learnt on the training frame.
price_slope_1_bins = 10  # 20
price_slope_2_bins = 10  # 20
rolling_windows_slope_1 = 3
rolling_windows_slope_2 = 1

df_train = add_derivatives(
    df_train, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1
)
df_train = add_derivatives(
    df_train, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0
)

df_test = add_derivatives(
    df_test, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1
)
df_test = add_derivatives(
    df_test, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0
)

# Slope 1: fit quantile edges on train, open the outer edges, apply to test.
df_train["price_slope_1"], price_slope_1_quantiles = pd.qcut(
    df_train["price_slope_1"],
    q=price_slope_1_bins,
    retbins=True,
    labels=list(range(price_slope_1_bins)),
)
price_slope_1_quantiles[[0, -1]] = [-np.inf, np.inf]
df_test["price_slope_1"] = pd.cut(
    df_test["price_slope_1"],
    bins=price_slope_1_quantiles,
    labels=list(range(price_slope_1_bins)),
)

# Slope 2: same procedure.
df_train["price_slope_2"], price_slope_2_quantiles = pd.qcut(
    df_train["price_slope_2"],
    q=price_slope_2_bins,
    retbins=True,
    labels=list(range(price_slope_2_bins)),
)
price_slope_2_quantiles[[0, -1]] = [-np.inf, np.inf]
df_test["price_slope_2"] = pd.cut(
    df_test["price_slope_2"],
    bins=price_slope_2_quantiles,
    labels=list(range(price_slope_2_bins)),
)





# derivatives_rolling_windows = [3, 3]
# df_train = add_derivatives(df_train, colname="price", nders=2, rolling_windows=derivatives_rolling_windows, shift=1)
# df_test = add_derivatives(df_test, colname="price", nders=2, rolling_windows=derivatives_rolling_windows, shift=1)


# # we repeat above process for first and second price derivatives (rolling differences)
# df_train["dprice_der1"], price_der1_quantiles = pd.qcut(df_train.price_der1, q=price_der1_bins, retbins=True, labels=[i for i in range(price_der1_bins)])
# price_der1_quantiles[0], price_der1_quantiles[-1] = -np.inf, np.inf
# df_test["dprice_der1"] = pd.cut(df_test.price_der1, bins=price_der1_quantiles, labels=[i for i in range(price_der1_bins)])

# df_train["dprice_der2"], price_der2_quantiles = pd.qcut(df_train.price_der2, q=price_der2_bins, retbins=True, labels=[i for i in range(price_der2_bins)])
# price_der2_quantiles[0], price_der2_quantiles[-1] = -np.inf, np.inf
# df_test["dprice_der2"] = pd.cut(df_test.price_der2, bins=price_der2_quantiles, labels=[i for i in range(price_der2_bins)])

# we repeat above process for first and second price derivatives (rolling differences)
# df_train["dprice_der1"], price_der1_quantiles = pd.qcut(df_train.price_der1, q=price_der1_bins, retbins=True, labels=[i for i in range(price_der1_bins)])
# price_der1_quantiles[0], price_der1_quantiles[-1] = -np.inf, np.inf
# df_test["dprice_der1"] = pd.cut(df_test.price_der1, bins=price_der1_quantiles, labels=[i for i in range(price_der1_bins)])

# df_train["dprice_der2"], price_der2_quantiles = pd.qcut(df_train.price_der2, q=price_der2_bins, retbins=True, labels=[i for i in range(price_der2_bins)])
# price_der2_quantiles[0], price_der2_quantiles[-1] = -np.inf, np.inf
# df_test["dprice_der2"] = pd.cut(df_test.price_der2, bins=price_der2_quantiles, labels=[i for i in range(price_der2_bins)])






# Get environments

In [335]:
# No raw-price bin edges are used in this run: `price_quantiles` is forwarded as
# None to the environments (`price_bins=`) and to the agent constructor.
# NOTE(review): `price_bins` is not read anywhere in the visible cells — confirm
# whether it is still needed.
price_bins = None
price_quantiles = None

def reward(env, action):
    """Custom shaped reward handed to the battery environments.

    Args:
        env: environment object exposing `scaled_price` (indexable and
            sliceable by hour), `hour` and `buying_price`.
        action: integer action index. Comments elsewhere in this notebook
            label 0 as discharge and 2 as charge — presumably 1 is hold.

    Returns:
        * action 2: scaled price at the current hour minus `env.buying_price`;
        * action 0: twice the gap between the maximum scaled price over the
          three preceding hours and the scaled price at the current hour;
        * any other action: 0.
    """
    hour = env.hour
    prices = env.scaled_price

    if action == 2:
        value = prices[hour] - env.buying_price
    elif action == 0:
        # Look-back window covers hours [hour - 3, hour), i.e. excludes `hour`.
        recent_peak = prices[hour - 3:hour].max()
        value = (recent_peak - prices[hour]) * 2
    else:
        value = 0
    return value
# reward=None


# (column name, number of bins) for every discretised feature that is part of
# the environment state, in state order.
discrete_cols = [
    ("price_slope_1", price_slope_1_bins),
    ("price_slope_2", price_slope_2_bins),
    ("price_over_daily_mean", price_over_daily_mean_bins),
]

# Both environments start after the longest rolling window (plus a margin of 2)
# so that the rolling features are defined at the first step.
train_env = BatteryDiscrete(
    df_train,
    k=5,
    discrete_cols=discrete_cols,
    start_hour=max(rolling_windows_slope_1, rolling_windows_slope_2, rolling_windows_price_over_mean) + 2,
    price_bins=price_quantiles,
    reward_function=reward,
)
test_env = BatteryDiscrete(
    df_test,
    k=5,
    discrete_cols=discrete_cols,
    start_hour=max(rolling_windows_slope_1, rolling_windows_slope_2, rolling_windows_price_over_mean) + 2,
    price_bins=price_quantiles,
    reward_function=reward,
)

# Q-Learning policy

In [336]:
# Sanity check: number of bins configured for the `price_over_daily_mean` feature.
price_over_daily_mean_bins

5

In [337]:
import numpy as np
import gym
import random


class QLearning:
    """Tabular epsilon-greedy TD-control agent (Q-learning and SARSA).

    The Q-table has one axis per discretised feature in `discrete_cols`, one
    axis for the last state component, and a final axis for the action.

    NOTE(review): the last state axis is sized with `nactions`. Cell comments
    in this notebook describe it as the state of charge with 3 levels, which
    only coincidentally equals the number of actions — confirm before changing
    either.

    Args:
        env: environment exposing `reset()`, `step(action)` returning
            `(next_state, reward, terminated, info)`, and
            `action_space.sample()`.
        discrete_cols: iterable of `(column_name, n_bins)` pairs; only the bin
            counts are used, to size the Q-table.
        price_quantiles: unused; accepted to keep the constructor signature.
        nactions: number of discrete actions.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: probability of taking an exploratory action.

    Attributes:
        Q: action-value table of shape `state_shape + (nactions,)`, zeros at start.
        Q_update_count: same shape as `Q`; number of updates applied per entry.
    """

    def __init__(
        self, env: "gym.Env", discrete_cols, price_quantiles, nactions, alpha=0.1, gamma=0.9, epsilon=0.1
    ):
        # The annotation is a string so the class can be defined without gym
        # being importable; gym is only needed for type checking here.
        self.env: "gym.Env" = env
        self.state_shape = tuple(n_bin for _, n_bin in discrete_cols) + (nactions,)
        self.nactions = nactions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        # initialize Q (and its per-entry update counter) to zeros
        self.Q = np.zeros(shape=self.state_shape + (nactions,))
        self.Q_update_count = np.zeros_like(self.Q)

    def _greedy_action(self, state):
        """Return an action with maximal Q-value in `state`, ties broken uniformly at random."""
        q_values = self.Q[self._get_index(state)]
        return np.random.choice(np.flatnonzero(q_values == np.max(q_values)))

    def _behaviour_action(self, state):
        """Epsilon-greedy action used while learning.

        With probability `epsilon` an exploratory action is drawn. When the
        last state component is 0 the draw is restricted to actions {1, 2},
        when it is 2 to actions {1, 0}; in both cases action 1 gets
        probability 0.1. Otherwise the action space is sampled directly.
        """
        if np.random.random() < self.epsilon:
            last_component = state[-1]
            if last_component == 0:
                return np.random.choice([1, 2], p=[0.1, 0.9])
            if last_component == 2:
                return np.random.choice([1, 0], p=[0.1, 0.9])
            return self.env.action_space.sample()
        return self._greedy_action(state)

    def learn(self, total_timesteps=None):
        """Run off-policy Q-learning on `self.env`.

        Args:
            total_timesteps: number of environment steps to take, restarting
                episodes as needed. `None` runs exactly one episode. `0` takes
                no step at all (it is no longer treated like `None`).

        The transition that terminates an episode is not used for an update;
        the environment is simply reset.
        """
        state = self.env.reset()
        t = 0
        while total_timesteps is None or t < total_timesteps:
            action = self._behaviour_action(state)
            next_state, reward, terminated, _ = self.env.step(action)
            t += 1

            if terminated:
                # reset and start again (or stop after a single episode)
                state = self.env.reset()
                if total_timesteps is None:
                    return
                continue

            index = self._get_index(state, action)
            target = reward + self.gamma * np.max(self.Q[self._get_index(next_state)])
            # update Q matrix
            self.Q[index] = (1 - self.alpha) * self.Q[index] + self.alpha * target
            self.Q_update_count[index] += 1

            state = next_state

    def learn_SARSA(self, total_timesteps=None):
        """Run on-policy SARSA on `self.env`.

        Same stepping and termination rules as `learn`, but the bootstrap term
        uses the Q-value of the action actually selected for the next state
        instead of the maximum over actions.

        Args:
            total_timesteps: number of environment steps to take, restarting
                episodes as needed. `None` runs exactly one episode.
        """
        state = self.env.reset()
        action = self._behaviour_action(state)
        t = 0
        while total_timesteps is None or t < total_timesteps:
            next_state, reward, terminated, _ = self.env.step(action)
            t += 1

            if terminated:
                # reset and start a fresh episode with a freshly chosen action
                state = self.env.reset()
                if total_timesteps is None:
                    return
                action = self._behaviour_action(state)
                continue

            next_action = self._behaviour_action(next_state)

            index = self._get_index(state, action)
            target = reward + self.gamma * self.Q[self._get_index(next_state, next_action)]
            # update Q matrix
            self.Q[index] = (1 - self.alpha) * self.Q[index] + self.alpha * target
            self.Q_update_count[index] += 1

            state, action = next_state, next_action

    def predict(self, state, deterministic=False):
        """Return `(action, None)` where `action` is greedy with respect to `Q`.

        `deterministic` is accepted for interface compatibility but is not
        used: ties between equally valued actions are always broken at random.
        """
        return self._greedy_action(state), None

    def _get_index(self, state, action=None):
        """Build the tuple index into `Q` for `state`, optionally extended by `action`."""
        if action is None:
            return tuple(state)
        return tuple(state) + (action,)


In [338]:
# Train the tabular Q-learning agent on the training environment, then report
# cumulative reward and realised profit on both the train and the test env.

# Seed NumPy's global RNG so the np.random draws made during training are
# repeatable across runs.
# NOTE(review): `env.action_space.sample()` may use its own RNG — seed it
# separately if full reproducibility is required.
np.random.seed(0)

model = QLearning(
    train_env, discrete_cols, price_quantiles, nactions=3, epsilon=0.1, gamma=0.1, alpha=0.01
)

print(model.Q.shape)

train_env.reset()

# Disabled experiments, kept for reference:
# model.Q[:,:,:,0,0] = - 1# if SOC == 0 (action 0), action 0 (discharge) is impossible
# model.Q[:,:,:,2,2] = - 1 # if SOC == 1 (index 2), action 2 (charge) is impossible
# model.Q[:,:,0,2,1] =  -1 # if super small price over mean, and charged, then force hold
# model.Q[:,:,price_over_daily_mean_bins-1,1,0] =  0.5 # if super small price over mean, and half charged, then force hold

# Q has layout (*feature_bins, last_state_component, action). Indexing with
# `...` addresses the trailing axes whatever the number of feature columns.

# Start action 1 (presumably "hold") slightly negative in every state.
model.Q[..., 1] = -0.1

# Pre-mark the (state 0, action 0) and (state 2, action 2) entries as updated;
# the comments above describe these pairs as impossible moves.
model.Q_update_count[..., 0, 0] = 1
model.Q_update_count[..., 2, 2] = 1

model.learn(total_timesteps=len(df_train) * 100)

# Same report for both environments: cumulative reward, then profit
# (-sum(price * schedule)) scaled down by 10**6.
for eval_env in (train_env, test_env):
    cum_reward, df_optim = eval_env.test(model)
    print(cum_reward)
    print(-(df_optim.price * df_optim.schedule).sum() / 10**6)

(10, 10, 5, 3, 3)


100%|█████████▉| 8775/8783 [00:00<00:00, 22733.66it/s]


344.0319663796736
676.549


100%|█████████▉| 8752/8760 [00:00<00:00, 25521.38it/s]

886.4885335234875
1721.5905





In [None]:
# FIXME: unfinished scratch cell. The statement below has no range bound, no
# colon and no body, so it is a SyntaxError as written. It is left disabled
# until the intended loop is filled in.
# for t in range()

# Test on train env:

In [242]:
# Evaluate the current `model` on the training environment (built from df_train)
# and show the cumulative reward returned by `test`.
cum_reward, df_optim = train_env.test(model)
print(cum_reward)
# display_profit(df_optim)


100%|█████████▉| 8775/8783 [00:00<00:00, 22500.70it/s]

215.4469961573





# SARSA policy

In [35]:
# Train a fresh agent with SARSA on the training environment.
# Requires `QLearning.learn_SARSA`; make sure the class in scope defines it
# before running this cell.

# Seed NumPy's global RNG so the np.random draws made during training are
# repeatable across runs.
np.random.seed(0)

model = QLearning(
    train_env, discrete_cols, price_quantiles, nactions=3, epsilon=0.1, gamma=0.95, alpha=0.1
)

print(model.Q.shape)

train_env.reset()

# Q has layout (*feature_bins, SOC, action). Indexing with `...` addresses the
# trailing (SOC, action) axes whatever the number of feature columns.
model.Q[..., 0, 0] = -np.inf  # if SOC == 0 (index 0), action 0 (discharge) is impossible
model.Q[..., 2, 2] = -np.inf  # if SOC == 1 (index 2), action 2 (charge) is impossible
# Pre-mark those impossible (SOC, action) entries as updated.
model.Q_update_count[..., 0, 0] = 1
model.Q_update_count[..., 2, 2] = 1

model.learn_SARSA(total_timesteps=len(df_train) * 10)
# Mean number of updates per Q-table entry.
print(model.Q_update_count.mean())

(20, 20, 10, 3, 3)
2.6616666666666666


In [36]:
# Evaluate the current `model` on the training environment and plot the
# resulting profit with the project-local `display_profit` helper.
cum_reward, df_optim = train_env.test(model)
display_profit(df_optim)

100%|█████████▉| 8756/8783 [00:00<00:00, 16779.29it/s]


# Test on test env

In [292]:
# Evaluate the current `model` on the test environment (built from df_test);
# `df_optim` is consumed by the plotting cells below.
cum_reward, df_optim = test_env.test(model)

100%|█████████▉| 8752/8760 [00:00<00:00, 20719.57it/s]


In [293]:
# Plot the schedule from the most recent `test` call (project-local helper).
display_schedule(df_optim)

In [294]:
# Plot the profit from the most recent `test` call (project-local helper).
display_profit(df_optim)