In [20]:
import matplotlib.pyplot as plt
from multi_armed_bandit import MultiArmBandit
import numpy as np
from tqdm import tqdm

In [21]:
def simulate(bandit, runs, time):
    """Run ``bandit`` for ``runs`` independent runs of ``time`` steps and average over the runs.

    Parameters
    ----------
    bandit : object
        Must expose the attributes ``arms``, ``best_arm``, ``q_true``,
        ``q_estimate`` and ``time`` and the methods ``reset()``, ``act()`` and
        ``step(action)``.  Its ``rewards`` and ``best_action_counts``
        attributes are overwritten by this function (first with per-run
        buffers, finally with the run-averaged curves).
    runs : int
        Number of independent runs; ``bandit.reset()`` is called at the start
        of each one.
    time : int
        Number of steps per run.

    Returns
    -------
    tuple of numpy.ndarray
        ``(rewards, best_action_counts, abs_estimation_error)`` where
        ``rewards`` has shape ``(time,)`` and holds the reward per step
        averaged over runs, ``best_action_counts`` has shape ``(time,)`` and
        holds the fraction of runs (0..1) in which the chosen action equalled
        ``bandit.best_arm``, and ``abs_estimation_error`` has shape
        ``(arms, time)`` and holds ``|q_estimate - q_true|`` per arm averaged
        over runs.
    """
    n_arms = bandit.arms
    abs_estimation_error = np.zeros((n_arms, runs, time))
    # The per-run buffers live on the bandit object so callers that inspect
    # bandit.rewards / bandit.best_action_counts keep seeing them.
    bandit.best_action_counts = np.zeros((runs, time))
    bandit.rewards = np.zeros((runs, time))
    for r in tqdm(range(runs)):
        bandit.reset()
        for t in range(time):
            # The step counter is advanced before the action is chosen.
            # NOTE(review): presumably reset() re-initialises bandit.time - confirm.
            bandit.time += 1
            action = bandit.act()
            if action == bandit.best_arm:
                bandit.best_action_counts[r, t] = 1
            bandit.rewards[r, t] = bandit.step(action)
            # Record the error of every arm's estimate after the update, in a
            # single vectorised assignment instead of a Python loop over arms.
            # Only the first `n_arms` entries are used, as before.
            q_estimate = np.asarray(bandit.q_estimate)[:n_arms]
            q_true = np.asarray(bandit.q_true)[:n_arms]
            abs_estimation_error[:, r, t] = np.abs(q_estimate - q_true)
    # Collapse the run axis: every returned curve is an average over all runs.
    bandit.rewards = bandit.rewards.mean(axis=0)
    bandit.best_action_counts = bandit.best_action_counts.mean(axis=0)
    abs_estimation_error = abs_estimation_error.mean(axis=1)
    return bandit.rewards, bandit.best_action_counts, abs_estimation_error


In [22]:
# Value forwarded as `var=` to every MultiArmBandit below
# (presumably the reward variance - confirm against multi_armed_bandit).
var = 1

# runs: independent runs averaged by simulate(); time: steps per run;
# arms: number of arms handed to each bandit and plotted per panel.
runs, time, arms = 2000, 1000, 10

# One bandit per epsilon value, all sharing the same arm count and `var`.
bandit0, bandit1, bandit2 = (
    MultiArmBandit(arms, epsilon=eps, var=var) for eps in (0, 0.01, 0.1)
)

In [23]:
# Seed NumPy's global RNG so re-running this cell reproduces the same curves.
# NOTE(review): this only helps if MultiArmBandit draws from np.random's
# global state - confirm against multi_armed_bandit.py.
np.random.seed(0)

# Each call returns (average reward, fraction of optimal actions,
# per-arm average absolute estimation error), all averaged over `runs`.
rewards0, best_action_counts0, abs_estimation_error0 = simulate(bandit0, runs, time)
rewards1, best_action_counts1, abs_estimation_error1 = simulate(bandit1, runs, time)
rewards2, best_action_counts2, abs_estimation_error2 = simulate(bandit2, runs, time)


In [24]:


# Five panels on a 3x2 grid: average reward, % optimal action, and one
# estimation-error panel per epsilon. The figure is written to ./q1.png.
fig = plt.figure(figsize=(20, 30))

# Parallel tuples: entry i of each belongs to the bandit with epsilon labels[i].
labels = ('epsilon = 0', 'epsilon = 0.01', 'epsilon = 0.1')
reward_curves = (rewards0, rewards1, rewards2)
optimal_curves = (best_action_counts0, best_action_counts1, best_action_counts2)
error_curves = (abs_estimation_error0, abs_estimation_error1, abs_estimation_error2)

# average reward vs steps
ax = fig.add_subplot(3, 2, 1)
ax.set_xlabel('steps')
ax.set_ylabel('average reward')
for label, curve in zip(labels, reward_curves):
    ax.plot(curve, label=label)
ax.legend()

# optimal action vs steps
ax = fig.add_subplot(3, 2, 2)
ax.set_xlabel('steps')
ax.set_ylabel('% optimal action')
for label, curve in zip(labels, optimal_curves):
    # simulate() averages a 0/1 indicator, i.e. returns a fraction in [0, 1];
    # scale it to a percentage so the data matches the axis label.
    ax.plot(100 * curve, label=label)
ax.legend()

# average absolute error in the estimate vs steps, one panel per epsilon
for panel, (label, errors) in enumerate(zip(labels, error_curves), start=3):
    ax = fig.add_subplot(3, 2, panel)
    ax.set_xlabel('steps')
    ax.set_ylabel('average absolute error in the estimate for %s' % label)
    for arm in range(arms):
        ax.plot(errors[arm], label='arm = %s' % arm)
    ax.legend()

fig.savefig('./q1.png')
# Close explicitly so the large figure does not stay alive in the kernel.
plt.close(fig)


In [25]:
# Second experiment: same setup with `var=2` forwarded to every MultiArmBandit
# (presumably the reward variance - confirm against multi_armed_bandit).
var = 2

# runs: independent runs averaged by simulate(); time: steps per run;
# arms: number of arms handed to each bandit and plotted per panel.
runs, time, arms = 2000, 1000, 10

# One bandit per epsilon value, all sharing the same arm count and `var`.
bandit0, bandit1, bandit2 = (
    MultiArmBandit(arms, epsilon=eps, var=var) for eps in (0, 0.01, 0.1)
)


In [27]:
# Seed NumPy's global RNG so re-running this cell reproduces the same curves.
# NOTE(review): this only helps if MultiArmBandit draws from np.random's
# global state - confirm against multi_armed_bandit.py.
np.random.seed(0)

# Each call returns (average reward, fraction of optimal actions,
# per-arm average absolute estimation error), all averaged over `runs`.
rewards0, best_action_counts0, abs_estimation_error0 = simulate(bandit0, runs, time)
rewards1, best_action_counts1, abs_estimation_error1 = simulate(bandit1, runs, time)
rewards2, best_action_counts2, abs_estimation_error2 = simulate(bandit2, runs, time)


In [28]:
# Five panels on a 3x2 grid: average reward, % optimal action, and one
# estimation-error panel per epsilon. The figure is written to ./q2.png.
fig = plt.figure(figsize=(20, 30))

# Parallel tuples: entry i of each belongs to the bandit with epsilon labels[i].
labels = ('epsilon = 0', 'epsilon = 0.01', 'epsilon = 0.1')
reward_curves = (rewards0, rewards1, rewards2)
optimal_curves = (best_action_counts0, best_action_counts1, best_action_counts2)
error_curves = (abs_estimation_error0, abs_estimation_error1, abs_estimation_error2)

# average reward vs steps
ax = fig.add_subplot(3, 2, 1)
ax.set_xlabel('steps')
ax.set_ylabel('average reward')
for label, curve in zip(labels, reward_curves):
    ax.plot(curve, label=label)
ax.legend()

# optimal action vs steps
ax = fig.add_subplot(3, 2, 2)
ax.set_xlabel('steps')
ax.set_ylabel('% optimal action')
for label, curve in zip(labels, optimal_curves):
    # simulate() averages a 0/1 indicator, i.e. returns a fraction in [0, 1];
    # scale it to a percentage so the data matches the axis label.
    ax.plot(100 * curve, label=label)
ax.legend()

# average absolute error in the estimate vs steps, one panel per epsilon
for panel, (label, errors) in enumerate(zip(labels, error_curves), start=3):
    ax = fig.add_subplot(3, 2, panel)
    ax.set_xlabel('steps')
    ax.set_ylabel('average absolute error in the estimate for %s' % label)
    for arm in range(arms):
        ax.plot(errors[arm], label='arm = %s' % arm)
    ax.legend()

fig.savefig('./q2.png')
# Close explicitly so the large figure does not stay alive in the kernel.
plt.close(fig)
