In [5]:
import numpy as np
# from scipy.optimize import root_scalar
# import sympy as sp
import time
from tqdm import tqdm
import confseq as cs

#nsims = 500 # stopping time testing
nsims = 100 # runtime testing
num_arm = 4
total_time_array = np.zeros(nsims)

def arm_selector(S_list, alpha):
  n = len(S_list)
  upp_bounds = np.ones(n)
  mus = np.ones(n) * 0.5

  for i in range(n):
    mus[i] = np.mean(S_list[i])
    t = len(S_list[i])
    rad = np.sqrt(np.log(405.5 * n * t ** (1.1) / alpha * np.log(405.5 * n * t ** (1.1) / alpha) ) / (2*t) )
    upp_bounds[i] = mus[i] + rad

  arm_1 = np.argmax(mus)
  upp_bounds[arm_1] = -1E10
  arm_2 = np.argmax(upp_bounds)

  return (arm_1, arm_2)



# Confidence Intervals to test Against

def conf_int_hedged(S, var_hat, alpha, npts=100):
  '''
  S is a number

  '''
  m = np.arange(npts) / npts + 1 / (2 * npts)
  theta = 1 / 2
  t = len(S)
  lambda_val = np.sqrt(2 * np.log(2 / alpha) / (var_hat * (np.arange(t) + 1) * np.log(1 + (np.arange(t) + 1)))) 

  conf_int = []
  for mean in m:
    lambda_m_plus = np.minimum(lambda_val, 0.5 / mean)
    lambda_m_minus = np.minimum(lambda_val, 0.5 / (1 - mean))
    k_plus = theta * np.prod(1 + lambda_m_plus * (S - mean))
    k_minus = (1 - theta) * np.prod( 1 - lambda_m_minus * (S - mean))

    if max(k_plus, k_minus) < 1 / alpha:
      conf_int.append(mean)

  return (min(conf_int), max(conf_int))


# termination condition for hedge only
def term_condition(S_list, var_hats, alpha, npts):
  term = 0
  best = -1
  upp_bounds = np.ones(num_arm)
  lower_bounds = np.zeros(num_arm)

  for i in range(num_arm):
    S = S_list[i]
    var_hat = var_hats[i]

    bounds = conf_int_hedged(S, var_hat, alpha/num_arm, npts)
    upp_bounds[i] = bounds[1]
    lower_bounds[i] = bounds[0]

  best_low_bounds = np.argmax(lower_bounds)
  if (lower_bounds[best_low_bounds] >= max(upp_bounds[np.arange(num_arm) != best_low_bounds])):
    term = 1
    best = best_low_bounds

  return (term, best)


horizon_len = 1000

mu = [0.29, 0.43, 0.57, 0.71]
# mu = np.arange(4) + 1
# mu = (0.71 - 0.14*(mu - 1))/(0.29 + 0.14*(mu - 1))
alpha = 0.05

# simulation for best arm identification using our approach
best_arm_list = []
stop_times = []


for i in tqdm(range(nsims)):
  np.random.seed(i)
  # S_list = [[np.random.beta(1, p, 1)] for p in mu]
  S_list = [[np.random.binomial(1, p, 1)] for p in mu]
  t = num_arm
  var_hats = [[0.25]] * num_arm
  term = 0
  best = -1

  for j in range(horizon_len):
  #while(term==0):
    which_arm = arm_selector(S_list, alpha)
    h = which_arm[0]
    l = which_arm[1]

    var_hats[h] = np.append(var_hats[h], (1/4 + sum( (S_list[h] - np.mean(S_list[h])) ** 2 ) )/ (len(S_list[h]) + 1))
    var_hats[l] = np.append(var_hats[l], (1/4 + sum( (S_list[l] - np.mean(S_list[l])) ** 2 ) )/ (len(S_list[l]) + 1))

    # S_list[h] = np.append(S_list[h], np.random.beta(1, mu[h], 1))
    S_list[h] = np.append(S_list[h], np.random.binomial(1, mu[h], 1))
    S_list[l] = np.append(S_list[l], np.random.binomial(1, mu[l], 1))

    # S_list[l] = np.append(S_list[l], np.random.beta(1, mu[l], 1))

    start_time = time.time()
    terms = term_condition(S_list, var_hats, alpha, npts = 400) # change npts here to change fidelity
    end_time = time.time()

    total_time_array[i] += end_time - start_time

    term = terms[0]
    best = terms[1]

    t += 2

  best_arm_list.append(best)
  stop_times.append(t)


print(np.mean(stop_times))
print(np.std(stop_times))
print(np.mean(np.equal(best_arm_list,3)))
print(np.mean(total_time_array))
print(np.std(total_time_array))




100%|█████████████████████████████████████████| 100/100 [24:18<00:00, 14.59s/it]

2004.0
0.0
0.56
14.500433225631713
0.6529745869041194





In [6]:
print(np.mean(total_time_array))
print(np.std(total_time_array))

14.500433225631713
0.6529745869041194
