In [1]:
import numpy as np
import optuna
import copy
import matplotlib.pyplot as plt
from bcmix import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ACTION_RANGE = (-5.0, 5.0)
N_TRIALS = 5
DATA_LEN = 80
P = 0.04

In [3]:
# true value
alpha, beta = 1.8, -2.4
mean_true = np.array([[0.0], [0.0]])
covm_true = np.array([[2.0, 0.0], [0.0, 2.0]])

# prior
canonical_0 = np.array([[0.0], [0.0]])
precision_0 = np.array([[1.0, 0.0], [0.0, 1.0]])
logcon_0 = (np.linalg.slogdet(precision_0)[1] - (canonical_0.T @ np.linalg.inv(precision_0) @ canonical_0).item()) / 2
states = {0: {"can": canonical_0, "pre": precision_0, "log": logcon_0, "pit": 0.0}}

print(alpha, beta, myopic(canonical_0, precision_0))

1.8 -2.4 -0.0


### Rollout with BCMIX

In [4]:
def objective_bcmix(trial):
    a = trial.suggest_float('a', ACTION_RANGE[0], ACTION_RANGE[1])
    q = q_myopic_with_change(states_i, a, alpha_i, beta_i, mean_true, covm_true, p=P)
    return q

In [5]:
for i in range(1):
    # initialize
    states_i, alpha_i, beta_i = copy.deepcopy(states), alpha, beta
    for j in range(DATA_LEN):
        # current state
        # select action
        study = optuna.create_study(direction="maximize")
        study.optimize(objective_bcmix, n_trials=N_TRIALS)
        a = study.best_trial.params['a']
        # update state
        y, alpha_i, beta_i = env_response(a, alpha_i, beta_i, mean_true, covm_true, p=P)
        states_i = update_with_change(states_i, a, y, p=P)


[I 2025-07-26 22:47:05,739] A new study created in memory with name: no-name-719636e9-b414-417f-b1b0-063d6f67f498
[I 2025-07-26 22:47:07,180] Trial 0 finished with value: -151.09932160953224 and parameters: {'a': -4.112018280980046}. Best is trial 0 with value: -151.09932160953224.
[I 2025-07-26 22:47:08,613] Trial 1 finished with value: -92.43882079982032 and parameters: {'a': 4.243277292051314}. Best is trial 1 with value: -92.43882079982032.
[I 2025-07-26 22:47:10,051] Trial 2 finished with value: -20.93918423783714 and parameters: {'a': 1.1749111214548833}. Best is trial 2 with value: -20.93918423783714.
[I 2025-07-26 22:47:11,491] Trial 3 finished with value: -25.448148805106438 and parameters: {'a': 0.540641690347961}. Best is trial 2 with value: -20.93918423783714.
[I 2025-07-26 22:47:12,939] Trial 4 finished with value: -41.87349474119731 and parameters: {'a': -1.3235481847066}. Best is trial 2 with value: -20.93918423783714.
[I 2025-07-26 22:47:12,940] A new study created in m

### Simulate data

In [None]:
xs = np.random.uniform(ACTION_RANGE[0], ACTION_RANGE[1], 80)
ys = [env_response(x, alpha, beta)[0] for x in xs[:40]]
alpha_new, beta_new = np.random.multivariate_normal(mean_true.flatten(), covm_true)
ys = np.array(ys + [env_response(x, alpha_new, beta_new)[0] for x in xs[40:80]])

In [None]:
states_i = copy.deepcopy(states)
for i in range(len(xs)):
    states_i = update_with_change(states_i, xs[i], ys[i], p=P)
    print(i)
    print([s["pit"] for _, s in states_i.items()])