In [1]:
import pandas as pd
import numpy as np

from dataset import get_dataset, add_derivatives
from env_discrete import BatteryDiscrete
from qlearning import QLearning

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from plot import display_profit, display_schedule
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import datetime
import warnings


In [2]:
# Load the full dataset through the project helper. Later cells rely on its
# `timestamp` column (datetime-like, accessed through `.dt`) and its `price` column.
df = get_dataset()

In [3]:
# Split by calendar year: 2020 rows form the training set, 2021 rows the test set.
# Both frames get a fresh 0..n-1 index.
df_train = df.loc[df["timestamp"].dt.year == 2020].reset_index(drop=True)
df_test = df.loc[df["timestamp"].dt.year == 2021].reset_index(drop=True)

# The scaler is fitted on the training prices only and then re-applied, unchanged,
# to the test prices (scikit-learn expects a 2-D column, hence the reshape).
scaler = MinMaxScaler()
df_train["scaled_price"] = scaler.fit_transform(
    df_train["price"].to_numpy().reshape(-1, 1)
)
df_test["scaled_price"] = scaler.transform(
    df_test["price"].to_numpy().reshape(-1, 1)
)

In [4]:
# Number of quantile bins for the ratio feature, and the rolling-window length
# (in rows) used for the mean in its denominator.
price_over_daily_mean_bins = 5
rolling_windows_price_over_mean = 5

# Ratio of the scaled price to its rolling mean, computed the same way for both frames.
# The first `rolling_windows_price_over_mean - 1` rows of each frame are NaN because
# the rolling window is not yet full there.
for _frame in (df_train, df_test):
    _rolling_mean = _frame["scaled_price"].rolling(rolling_windows_price_over_mean).mean()
    _frame["price_over_daily_mean"] = (_frame["scaled_price"] / _rolling_mean).to_numpy()

# Quantile edges are learnt on the training ratio only; integer labels 0..bins-1.
_ratio_labels = list(range(price_over_daily_mean_bins))
df_train["price_over_daily_mean"], price_over_daily_mean_quantiles = pd.qcut(
    df_train["price_over_daily_mean"],
    q=price_over_daily_mean_bins,
    retbins=True,
    labels=_ratio_labels,
)

# Open the outermost edges so test values outside the training range still get a bin.
price_over_daily_mean_quantiles[0] = -np.inf
price_over_daily_mean_quantiles[-1] = np.inf

df_test["price_over_daily_mean"] = pd.cut(
    df_test["price_over_daily_mean"],
    bins=price_over_daily_mean_quantiles,
    labels=_ratio_labels,
)



In [5]:
# Bin counts and rolling-window lengths for the two price-slope features.
price_slope_1_bins = 20
price_slope_2_bins = 20
rolling_windows_slope_1 = 3
rolling_windows_slope_2 = 1


def _bin_by_train_quantiles(train_frame, test_frame, column, n_bins):
    """Discretise `column` in both frames using quantile edges learnt on `train_frame`.

    The column is overwritten in place in both frames with categorical labels
    0..n_bins-1. The outermost edges are widened to -inf/+inf before binning the
    test frame so that test values outside the training range still get a bin.

    Returns the (widened) array of bin edges.
    """
    labels = list(range(n_bins))
    train_frame[column], edges = pd.qcut(
        train_frame[column], q=n_bins, retbins=True, labels=labels
    )
    edges[0] = -np.inf
    edges[-1] = np.inf
    test_frame[column] = pd.cut(test_frame[column], bins=edges, labels=labels)
    return edges


# Add both slope columns to each frame with the project helper `add_derivatives`
# (its exact computation is defined in `dataset`, not in this notebook).
df_train = add_derivatives(
    df_train, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1
)
df_train = add_derivatives(
    df_train, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0
)

df_test = add_derivatives(
    df_test, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1
)
df_test = add_derivatives(
    df_test, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0
)

# Replace the raw slopes with their quantile-bin labels; keep the edges for reference.
price_slope_1_quantiles = _bin_by_train_quantiles(
    df_train, df_test, "price_slope_1", price_slope_1_bins
)
price_slope_2_quantiles = _bin_by_train_quantiles(
    df_train, df_test, "price_slope_2", price_slope_2_bins
)



# Get environments

In [6]:
# No price-level binning is configured: both names are None.
# `price_quantiles` is what gets forwarded (as `price_bins=`) to the environments
# and to the Q-learning model further down.
# NOTE(review): presumably None disables price binning there; confirm in BatteryDiscrete.
price_bins = price_quantiles = None

def reward(env, action):
    """Custom reward handed to ``BatteryDiscrete`` through ``reward_function``.

    Parameters
    ----------
    env
        Object exposing ``scaled_price`` (indexable and sliceable by step),
        ``hour`` (index of the current step) and ``buying_price``.
    action : int
        Action code chosen by the agent.

    Returns
    -------
    number
        * ``action == 2``: current scaled price minus ``env.buying_price``.
        * ``action == 0``: twice the gap between the highest scaled price seen
          in the (up to) three preceding steps and the current scaled price;
          ``0`` when there is no preceding step at all.
        * any other action: ``0``.

    NOTE(review): which physical operation (charge / discharge / hold) each
    action code stands for is defined by the environment and is not visible
    here; confirm the mapping against ``BatteryDiscrete``.
    """
    if action == 2:
        return env.scaled_price[env.hour] - env.buying_price

    if action == 0:
        # Clamp the window start at 0. With hour < 3 a negative start would be
        # interpreted relative to the END of the series, producing an empty or
        # unrelated window (and `.max()` on an empty array raises).
        window_start = max(env.hour - 3, 0)
        recent_prices = env.scaled_price[window_start:env.hour]
        if len(recent_prices) == 0:
            # Very first step: no earlier price to compare against -> neutral reward.
            return 0
        return (recent_prices.max() - env.scaled_price[env.hour]) * 2

    return 0
# reward=None


# (column name, number of bins) for every discretised feature handed to the
# environments and to the Q-learning model.
discrete_cols = [
    ("price_slope_1", price_slope_1_bins),
    ("price_slope_2", price_slope_2_bins),
    ("price_over_daily_mean", price_over_daily_mean_bins),
]

# Both environments start at the same step: longest rolling window + 2.
# NOTE(review): presumably this skips the leading rows where the rolling features
# are still undefined; confirm the "+ 2" margin against add_derivatives.
_env_start_hour = max(
    rolling_windows_slope_1,
    rolling_windows_slope_2,
    rolling_windows_price_over_mean,
) + 2

# Settings shared by the train and test environments; only the data frame differs.
_env_settings = dict(
    k=5,
    discrete_cols=discrete_cols,
    start_hour=_env_start_hour,
    price_bins=price_quantiles,
    reward_function=reward,
)

train_env = BatteryDiscrete(df_train, **_env_settings)
test_env = BatteryDiscrete(df_test, **_env_settings)

# Q-Learning policy

In [7]:
# Tabular Q-learning agent trained on the training environment.
model = QLearning(
    train_env,
    discrete_cols,
    price_quantiles,
    nactions=3,
    epsilon=0.1,
    gamma=0.1,
    alpha=0.01,
)

# Q-table shape: one axis per discretised feature, followed by two axes of size 3.
# NOTE(review): the disabled experiments below index those last two axes as
# (state of charge, action); confirm in QLearning.
print(model.Q.shape)

train_env.reset()

# Disabled experiments, kept for reference:
#   model.Q[:, :, :, 0, 0] = -1   # if SOC == 0 (index 0), action 0 (discharge) is impossible
#   model.Q[:, :, :, 2, 2] = -1   # if SOC == 1 (index 2), action 2 (charge) is impossible

# Initial Q-value of -0.1 for index 1 on the last axis, for every state.
model.Q[:, :, :, :, 1] = -0.1

# More disabled experiments:
#   model.Q[:, :, 0, 2, 1] = -1   # if super small price over mean, and charged, then force hold
#   model.Q[:, :, price_over_daily_mean_bins - 1, 1, 0] = 0.5   # if super small price over mean, and half charged, then force hold

# Pre-set the update counters of the (0, 0) and (2, 2) entries on the last two axes.
model.Q_update_count[:, :, :, 0, 0] = 1
model.Q_update_count[:, :, :, 2, 2] = 1

# Ten passes' worth of steps over the training data.
model.learn(total_timesteps=len(df_train) * 10)

# Evaluate on the training data first, then on the held-out data. For each, print
# the cumulative reward and then -(price * schedule).sum() / 10**6.
# After the loop `cum_reward` and `df_optim` hold the test-set results.
for _eval_env in (train_env, test_env):
    cum_reward, df_optim = _eval_env.test(model)
    print(cum_reward)
    print(-(df_optim.price * df_optim.schedule).sum() / 10**6)

(20, 20, 5, 3, 3)
325.45203337382884
692.482
823.2401181421232
1598.0055
