In [None]:
import optuna
import numpy as np
import matplotlib.pyplot as plt
from bcmix import *

In [None]:
# Lower and upper bound of the interval from which `trial.suggest_float`
# samples candidate actions.
ACTION_RANGE = (-5.0, 5.0)
# Number of Optuna trials per action-selection step (`n_trials` in `study.optimize`).
N_TRIALS = 50
# Number of time steps in each simulated run (rows of every result array).
DATA_LEN = 81

In [None]:
# Ground-truth parameters; they are passed to `env_response` during the
# rollout and enter the regret formula in the plotting section.
alpha = -1.8
beta = 2.2

# Prior belief, stored as a (canonical vector, precision matrix) pair:
# a 2x1 zero vector and the 2x2 identity matrix.  The mean is recovered
# later as inv(precision) @ canonical.
canonical_0 = np.zeros((2, 1))
precision_0 = np.eye(2)

# Show the true parameters next to whatever `myopic` returns for the prior.
print(alpha, beta, myopic(canonical_0, precision_0))

### Rollout

In [None]:
def objective(trial):
    """Optuna objective: score one candidate action under the current belief.

    Draws an action from ``ACTION_RANGE`` under the parameter name ``'a'`` and
    returns ``q_myopic_without_change(canonical_i, precision_i, action)``.

    Note: ``canonical_i`` and ``precision_i`` are not arguments.  They are
    looked up in the notebook's global namespace when the function is called,
    so they must have been assigned (by the simulation loop) before a study
    is optimised with this objective.
    """
    low, high = ACTION_RANGE
    action = trial.suggest_float('a', low, high)
    return q_myopic_without_change(canonical_i, precision_i, action)

In [None]:
# Run 10 independent simulations of DATA_LEN steps each and save one result
# array per run.  Columns of each result array:
#   0-1 : current belief mean (two components)
#   2-4 : current belief covariance entries [0, 0], [0, 1], [1, 1]
#   5   : action selected by the Optuna study
#   6   : best objective value found by that study
#   7   : reward obtained for the selected action
for i in range(10):
    # initialize
    simresult_i = np.full((DATA_LEN, 8), np.nan)
    # Start every run from a private copy of the prior, so the shared prior
    # stays untouched even if `update_without_change` were to modify its
    # arguments in place (its implementation is not visible in this notebook).
    # These two names must stay global: `objective` reads them directly.
    canonical_i, precision_i = canonical_0.copy(), precision_0.copy()
    for j in range(DATA_LEN):
        # current state: convert (canonical, precision) to (mean, covariance)
        covm_i = np.linalg.inv(precision_i)
        mean_i = covm_i @ canonical_i
        simresult_i[j, 0] = mean_i[0, 0]
        simresult_i[j, 1] = mean_i[1, 0]
        simresult_i[j, 2] = covm_i[0, 0]
        simresult_i[j, 3] = covm_i[0, 1]
        simresult_i[j, 4] = covm_i[1, 1]
        # select action: a fresh study per step, maximising `objective`
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=N_TRIALS)
        a = study.best_trial.params['a']
        simresult_i[j, 5] = a
        simresult_i[j, 6] = study.best_trial.value
        # update state with the observed response to the chosen action
        y = env_response(a, alpha, beta)[0]
        simresult_i[j, 7] = reward(a, y)
        canonical_i, precision_i = update_without_change(canonical_i, precision_i, a, y)
    # The backslash is written as an explicit escape ("\\"): the bare "\s" used
    # before is an invalid escape sequence and triggers a SyntaxWarning on
    # recent Python versions.  The resulting file name is unchanged, so it
    # still matches the name the plotting section loads.
    # NOTE(review): the backslash separator is Windows-specific.
    np.save("simulations\\sim_nocp_" + str(i) + ".npy", simresult_i)

### Plots

In [None]:
# Average the per-step Q values and discounted regrets over the saved runs.
n_runs = 10  # number of result files written by the rollout section
qvalues = np.zeros(DATA_LEN)
regrets = np.zeros(DATA_LEN)

# Discount factor GAMMA ** j for every step j.  It is the same for all runs,
# so it is computed once.  GAMMA is not defined in this notebook; presumably
# it is provided by `from bcmix import *` -- TODO confirm.
discounts = GAMMA ** np.arange(DATA_LEN)

for i in range(n_runs):
    # Explicit "\\" instead of the invalid escape "\s"; the file name itself
    # is unchanged and matches what the rollout section saves.
    simresult_i = np.load("simulations\\sim_nocp_" + str(i) + ".npy")
    # column 6: best objective value per step
    qvalues += simresult_i[:, 6]
    # column 5: selected action; per-step regret is the discounted squared
    # value of (alpha + action * beta)
    regrets += discounts * (alpha + simresult_i[:, 5] * beta) ** 2

qvalues /= n_runs
regrets /= n_runs

In [None]:
# plot Q values: best objective value per step, averaged over the simulated runs
fig, ax = plt.subplots()
ax.plot(qvalues)
ax.set_xlabel("step")
ax.set_ylabel("mean best Q value")
ax.set_title("Average Q value of the selected action")
plt.show()

In [None]:
# plot regret: running total of the run-averaged, discounted per-step regret
fig, ax = plt.subplots()
ax.plot(np.cumsum(regrets))
ax.set_xlabel("step")
ax.set_ylabel("cumulative discounted regret")
ax.set_title("Average cumulative regret")
plt.show()