# 2.6 Optimistic Initial Values

In [None]:
import numpy as np
import os
from tqdm import tqdm_notebook

# Seed NumPy's global random generator so repeated executions of the
# notebook draw the same random numbers.
np.random.seed(77)

import matplotlib
import matplotlib.pyplot as plt
# Global matplotlib styling applied to every figure drawn in this notebook.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'figure.figsize': (11, 4),
    # Turn off matplotlib's 'axes.unicode_minus' option for tick labels.
    'axes.unicode_minus': False,
})

# Path components combined by save_fig() to locate the output image folder.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = '10_armed_testbed'

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The image is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``. The target
    directory is created when it does not exist yet, so the first call in a
    fresh checkout no longer fails with ``FileNotFoundError``.

    Args:
        fig_id: File name of the figure, without the ``.png`` extension.
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            the figure is saved.
    """
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    # savefig does not create missing parent directories on its own.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [None]:
def bandit(action):
    """Sample a noisy reward for pulling arm ``action``.

    The reward is drawn from a normal distribution whose mean is
    ``q_rewards[action]`` (a module-level sequence) and whose standard
    deviation is 1.
    """
    mean_reward = q_rewards[action]
    return np.random.normal(loc=mean_reward, scale=1)

def action_selection(t, q, epsilon=0):
    """Choose an action with an epsilon-greedy policy.

    Args:
        t: Current time step. On the first step (``t <= 0``) the action is
            always drawn uniformly at random.
        q: Sequence of current action-value estimates, one entry per action.
        epsilon: Probability of exploring (picking a uniformly random action)
            on any later step. ``0`` means purely greedy.

    Returns:
        Index into ``q`` of the selected action.
    """
    # Drawn unconditionally so the amount of randomness consumed per step
    # does not depend on epsilon.
    rand = np.random.rand(1)[0]

    # rand lies in [0, 1): the strict comparison makes the exploration
    # probability exactly epsilon and guarantees epsilon == 0 never explores
    # (the former `epsilon >= rand` could still explore when rand == 0.0).
    if t <= 0 or rand < epsilon:
        # The number of actions comes from q itself rather than a global.
        return np.random.choice(len(q), 1)[0]

    return argmax(q)
    
def argmax(array):
    """Return the index of the largest element, breaking ties at random.

    Every index whose value equals the maximum is collected; the list of
    candidates is then shuffled with NumPy's global random generator and its
    first entry is returned.

    Args:
        array: Non-empty indexable sequence of comparable values.

    Returns:
        Index of a maximal element of ``array``.
    """
    best_value = array[0]
    candidates = [0]

    for idx, value in enumerate(array[1:], start=1):
        if value == best_value:
            # Another element tied with the current maximum.
            candidates.append(idx)
        elif value > best_value:
            # Strictly better: earlier candidates are discarded.
            best_value = value
            candidates = [idx]

    np.random.shuffle(candidates)

    return candidates[0]

In [None]:
## 10-armed testbed: greedy with optimistic initial estimates (Q = 5)
## versus ε-greedy (ε = 0.1) with zero initial estimates, both using the
## simple bandit algorithm with a constant step-size parameter, α = 0.1

STEP = 1000           # time steps per run
RUN = 2000            # number of runs that are averaged
NUM_OF_ACTION = 10    # number of arms

# True mean reward of each arm, drawn from a standard normal distribution.
# NOTE(review): these are sampled once, before the run loop, so every run
# shares the same bandit problem — confirm that resampling per run is not
# what was intended.
q_rewards = [np.random.normal(0, 1) for _ in range(NUM_OF_ACTION)]

optimal_action = np.argmax(q_rewards)

# Per-step accumulators; each run adds its 1/RUN share, so after the loop
# they hold the average reward and the percentage of optimal actions.
g_AVERAGE_REWARDS = [0]*STEP
g_OPTIMAL_ACTIONS = [0]*STEP

e01_AVERAGE_REWARDS = [0]*STEP
e01_OPTIMAL_ACTIONS = [0]*STEP

step_size = 0.1
percent_per_run = 1/RUN*100

# (initial estimate, epsilon, reward accumulator, optimal-action accumulator)
settings = (
    (5, 0, g_AVERAGE_REWARDS, g_OPTIMAL_ACTIONS),        # greedy
    (0, 0.1, e01_AVERAGE_REWARDS, e01_OPTIMAL_ACTIONS),  # ε = 0.1
)

for i in tqdm_notebook(range(RUN)):

    # Within a run, the greedy learner is simulated first, then ε = 0.1.
    for initial_estimate, epsilon, average_rewards, optimal_actions in settings:
        Q = [initial_estimate]*NUM_OF_ACTION

        for t in range(STEP):
            A = action_selection(t, Q, epsilon)
            R = bandit(A)
            # Constant step-size update of the chosen action's estimate.
            Q[A] = Q[A] + step_size*(R - Q[A])

            average_rewards[t] += R/RUN
            if A == optimal_action:
                optimal_actions[t] += percent_per_run