In [58]:
import pandas as pd
import numpy as np

from dataset import get_dataset, add_derivatives
from env_discrete import BatteryDiscrete
from qlearning import QLearning

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

from plot import display_profit, display_schedule
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import datetime
import warnings


# Parameters 

## Number of bins 

In [59]:
# Number of quantile bins used when each continuous feature is discretised
# with pd.qcut further down in the notebook.
price_over_daily_mean_bins = 5   # bins for the price / rolling-mean ratio
price_slope_1_bins = 20          # bins for the "price_slope_1" feature
price_slope_2_bins = 20          # bins for the "price_slope_2" feature



## Rolling-window sizes

In [60]:
# Window lengths, expressed in rows of the price series.
rolling_windows_slope_1 = 3          # passed to add_derivatives for "price_slope_1"
rolling_windows_slope_2 = 1          # passed to add_derivatives for "price_slope_2"
rolling_windows_price_over_mean = 5  # rolling-mean window of the price / mean ratio
window_approx_selling_price = 3      # look-back window used inside reward()

## Reward scaling

In [61]:
# Multiplier applied to the reward that reward() returns for action 0.
scaling_buy = 2
# Value written into the Q-table slice for action index 1 before training.
initial_q_value_hold = -0.1

## Training hyperparameters

In [62]:
# Number of learn/evaluate iterations run by the training loop.
n_episodes = 100

# Exploration rate: starts at eps_start and is decayed towards eps_min.
eps_start, eps_min = 0.1, 0.001

# Learning rate: starts at alpha_start and is decayed towards alpha_min.
alpha_start, alpha_min = 0.01, 0.0001

# Discount factor handed to QLearning.
gamma = 0.1

In [63]:
# Load the full dataset; the cells below read its `timestamp` and `price` columns.
df = get_dataset()

In [64]:
# Split by calendar year: 2020 rows are used for training, 2021 rows for testing.
is_train_year = df.timestamp.dt.year == 2020
is_test_year = df.timestamp.dt.year == 2021
df_train = df.loc[is_train_year].reset_index(drop=True)
df_test = df.loc[is_test_year].reset_index(drop=True)

# Fit the min-max scaler on the training prices only and reuse it for the
# test prices, so no information from the test year enters the scaling.
scaler = MinMaxScaler()
train_price_column = df_train.price.to_numpy().reshape(-1, 1)
test_price_column = df_test.price.to_numpy().reshape(-1, 1)
df_train["scaled_price"] = scaler.fit_transform(train_price_column)
df_test["scaled_price"] = scaler.transform(test_price_column)

In [65]:


# Ratio of the scaled price to its rolling mean. The first
# (rolling_windows_price_over_mean - 1) rows have no full window and are NaN.
# NOTE(review): despite the "daily" in the name, the window is
# rolling_windows_price_over_mean rows, not a calendar day.
train_rolling_mean = df_train["scaled_price"].rolling(rolling_windows_price_over_mean).mean()
test_rolling_mean = df_test["scaled_price"].rolling(rolling_windows_price_over_mean).mean()
df_train["price_over_daily_mean"] = (df_train["scaled_price"] / train_rolling_mean).to_numpy()
df_test["price_over_daily_mean"] = (df_test["scaled_price"] / test_rolling_mean).to_numpy()

# Discretise: quantile edges are learned on the training data ...
price_over_daily_mean_labels = list(range(price_over_daily_mean_bins))
df_train["price_over_daily_mean"], price_over_daily_mean_quantiles = pd.qcut(
    df_train.price_over_daily_mean,
    q=price_over_daily_mean_bins,
    retbins=True,
    labels=price_over_daily_mean_labels,
)
# ... the outer edges are opened up so that test values outside the training
# range still fall into the first / last bin ...
price_over_daily_mean_quantiles[0] = -np.inf
price_over_daily_mean_quantiles[-1] = np.inf
# ... and the same edges are applied to the test data.
df_test["price_over_daily_mean"] = pd.cut(
    df_test.price_over_daily_mean,
    bins=price_over_daily_mean_quantiles,
    labels=price_over_daily_mean_labels,
)



In [66]:




def bin_with_train_quantiles(train_values, test_values, n_bins):
    """Discretise a feature using quantile edges learned on the training data.

    Parameters
    ----------
    train_values : pandas.Series
        Feature values of the training set; the quantile edges are fitted here.
    test_values : pandas.Series
        Feature values of the test set; binned with the training edges.
    n_bins : int
        Number of quantile bins; bins are labelled 0 .. n_bins - 1.

    Returns
    -------
    (train_binned, test_binned, edges)
        The two binned series and the array of bin edges, whose first and
        last entries have been replaced by -inf / +inf.
    """
    labels = list(range(n_bins))
    train_binned, edges = pd.qcut(train_values, q=n_bins, retbins=True, labels=labels)
    # Open the outer edges so test values outside the training range still
    # land in the first / last bin instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    test_binned = pd.cut(test_values, bins=edges, labels=labels)
    return train_binned, test_binned, edges


# Add the two slope features to both frames (add_derivatives is a project
# helper; only the window and shift arguments differ between the features).
df_train = add_derivatives(df_train, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1)
df_train = add_derivatives(df_train, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0)

df_test = add_derivatives(df_test, "price", "price_slope_1", rolling_windows=rolling_windows_slope_1, shift=1)
df_test = add_derivatives(df_test, "price", "price_slope_2", rolling_windows=rolling_windows_slope_2, shift=0)

# Replace each continuous slope column by its bin index.
df_train["price_slope_1"], df_test["price_slope_1"], price_slope_1_quantiles = bin_with_train_quantiles(
    df_train.price_slope_1, df_test.price_slope_1, price_slope_1_bins
)
df_train["price_slope_2"], df_test["price_slope_2"], price_slope_2_quantiles = bin_with_train_quantiles(
    df_train.price_slope_2, df_test.price_slope_2, price_slope_2_bins
)



# Build the training and test environments

In [67]:
# No price discretisation is configured: price_quantiles (None) is what gets
# handed to BatteryDiscrete(price_bins=...) and to QLearning below.
price_bins = price_quantiles = None

def reward(env, action):
    """Reward for taking ``action`` at the environment's current hour.

    Parameters
    ----------
    env : object
        Must expose ``scaled_price`` (indexable and sliceable by hour, with a
        ``.max()`` on slices), ``hour`` (int) and ``buying_price``.
    action : int
        Action code. Presumably 0 = buy, 1 = hold, 2 = sell, judging by the
        names ``scaling_buy`` and ``initial_q_value_hold`` -- TODO confirm
        against BatteryDiscrete.

    Returns
    -------
    number
        * action 2: current scaled price minus ``env.buying_price``;
        * action 0: (highest scaled price over the previous
          ``window_approx_selling_price`` hours, current hour excluded, minus
          the current scaled price) times ``scaling_buy``;
        * any other action: 0.
    """
    if action == 2:
        return env.scaled_price[env.hour] - env.buying_price

    if action == 0:
        # Clamp the window start at 0. A negative start would be interpreted
        # by slicing as "count from the end", which yields an empty window
        # whenever env.hour < window_approx_selling_price.
        window_start = max(0, env.hour - window_approx_selling_price)
        recent_peak = env.scaled_price[window_start:env.hour].max()
        return (recent_peak - env.scaled_price[env.hour]) * scaling_buy

    return 0



# (column name, number of bins) for every discretised state feature.
discrete_cols = [
    ("price_slope_1", price_slope_1_bins),
    ("price_slope_2", price_slope_2_bins),
    ("price_over_daily_mean", price_over_daily_mean_bins),
]

# First hour the environments start from: the largest rolling window plus 2
# (presumably so that every windowed feature is already defined -- confirm
# against BatteryDiscrete).
first_hour = max(rolling_windows_slope_1, rolling_windows_slope_2, rolling_windows_price_over_mean) + 2

# Both environments share every setting except the data frame they run on.
env_settings = dict(
    k=5,
    discrete_cols=discrete_cols,
    start_hour=first_hour,
    price_bins=price_quantiles,
    reward_function=reward,
)
train_env = BatteryDiscrete(df_train, **env_settings)
test_env = BatteryDiscrete(df_test, **env_settings)

# Q-Learning policy

In [68]:


# Geometric per-episode decay factors: multiplying the start value by the
# factor n_episodes times lands on the corresponding minimum value.
eps_decay = np.exp(np.log(eps_min / eps_start) / (1. * n_episodes))
alpha_decay = np.exp(np.log(alpha_min / alpha_start) / (1. * n_episodes))


def evaluate_policy(env, policy):
    """Run ``policy`` through ``env.test`` and derive the profit figure.

    Returns
    -------
    (cum_reward, schedule_df, profit)
        ``cum_reward`` and ``schedule_df`` are the two values returned by
        ``env.test(policy)``; ``profit`` is minus the sum of
        ``price * schedule`` over ``schedule_df``, divided by 10**6.
    """
    cum_reward, schedule_df = env.test(policy)
    profit = -(schedule_df.price * schedule_df.schedule).sum() / 10**6
    return cum_reward, schedule_df, profit


def plot_learning_curves(rewards, profits, split):
    """Build a two-axis figure of per-episode reward and profit.

    ``rewards`` is drawn on the primary y-axis and ``profits`` on the
    secondary one; ``split`` ("train" / "test") is used in the trace names
    and in the figure title.
    """
    episodes = list(range(len(rewards)))
    figure = make_subplots(specs=[[{"secondary_y": True}]])
    figure.add_trace(
        go.Scatter(x=episodes, y=rewards, name=f"reward {split}")
    )
    figure.add_trace(
        go.Scatter(x=episodes, y=profits, name=f"profit {split}"), secondary_y=True
    )
    figure.update_layout(title_text=f"Learning curves - {split}")
    figure.update_xaxes(title_text="episode")
    figure.update_yaxes(title_text="cumulative reward", secondary_y=False)
    figure.update_yaxes(title_text="profit / 1e6", secondary_y=True)
    return figure


model = QLearning(train_env, discrete_cols, price_quantiles, nactions=3, epsilon=eps_start, gamma=gamma, alpha=alpha_start)
train_env.reset()
# Seed the Q-values of action index 1 with initial_q_value_hold
# (presumably the "hold" action, going by the constant's name).
model.Q[:, :, :, :, 1] = initial_q_value_hold

cum_reward_list = []
profit_list = []
cum_reward_test_list = []
profit_test_list = []

eps = eps_start
alpha = alpha_start

for episode in tqdm(range(n_episodes)):

    model.learn()

    # Decay exploration and learning rate, never going below their floors,
    # and push the new values into the model before it is evaluated.
    eps = max(eps_min, eps * eps_decay)
    alpha = max(alpha_min, alpha * alpha_decay)
    model.epsilon = eps
    model.alpha = alpha

    # Evaluate the current policy on the training year ...
    cum_reward, df_optim, profit = evaluate_policy(train_env, model)
    cum_reward_list.append(cum_reward)
    profit_list.append(profit)

    # ... and on the held-out test year.
    cum_reward_test, df_optim_test, profit_test = evaluate_policy(test_env, model)
    cum_reward_test_list.append(cum_reward_test)
    profit_test_list.append(profit_test)


fig = plot_learning_curves(cum_reward_list, profit_list, "train")
fig2 = plot_learning_curves(cum_reward_test_list, profit_test_list, "test")

fig.show()
fig2.show()


100%|██████████| 100/100 [01:54<00:00,  1.14s/it]
