**Multiarmed Bandits: Gradient Bandit algorithm**

In this notebook, we use OpenAI Gym to implement the Gradient Bandit algorithm.

In [None]:
import gym

Unfortunately, Gym does not provide a bandit environment, so we need to install one. Let's do so with the commands below:

In [None]:
# Clone the gym-bandits repository into the current working directory.
# stdout and stderr are both redirected to /dev/null, so a failed clone is silent.
!git clone https://github.com/JKCooper2/gym-bandits.git > /dev/null 2>&1
# Install the cloned package. The hard-coded /content path only matches the
# clone above when the working directory is /content
# (presumably a Google Colab runtime -- TODO confirm).
!pip install /content/gym-bandits/. > /dev/null 2>&1

Let's import needed packages

In [None]:
import gym_bandits
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

Define the softmax function

In [None]:
def softmax(x):
    """Return the softmax of ``x`` as a vector of probabilities.

    The maximum of ``x`` is subtracted before exponentiating so that large
    entries do not overflow; the shift cancels out in the normalisation.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

We define the action selection: an action is sampled from the distribution given by the softmax function.

In [None]:
def get_action(h):
    """Sample an action from the softmax distribution over preferences ``h``.

    Parameters
    ----------
    h : array-like
        One preference value per arm.

    Returns
    -------
    tuple
        ``(action, probs)`` where ``action`` is the sampled arm index and
        ``probs`` is the softmax probability vector it was drawn from.
    """
    probs = softmax(h)
    # Use the number of preferences as the number of arms instead of a
    # hard-coded 10, so the function works for any bandit size.
    return np.random.choice(len(probs), p=probs), probs

We define a function that computes, for each step, the average of the rewards obtained over the repeated experiments.

In [None]:
def average(tot_rewards):
    """Average the rewards step by step across runs.

    ``tot_rewards`` is a sequence of per-run reward sequences. Entry ``i`` of
    the returned list is the mean of step ``i`` over all runs; the number of
    steps is taken from the first run.
    """
    n_runs = len(tot_rewards)
    n_steps = len(tot_rewards[0])
    return [
        sum(run[step] for run in tot_rewards) / n_runs
        for step in range(n_steps)
    ]

Main code: define an environment for the 10-armed bandit whose rewards are normally distributed with standard deviation 1 and a mean drawn from a normal distribution N(0, 1), the same setup as in the Sutton & Barto book.

In [None]:
k = 10  # Number of actions (arms)
# NOTE: `iter` shadows the built-in iter(); the name is kept because the
# experiment cells below read it.
iter = 1000  # Number of steps
repeat = 2000  # Number of repetitions

# Placeholders for the averaged reward curves; each one is overwritten by the
# corresponding experiment cell.
o0 = []   # No baseline, a = 0.4
o00 = []  # No baseline, a = 0.1
e01 = []  # Baseline, a = 0.4
e00 = []  # Baseline, a = 0.1

np.random.seed(42)  # seed NumPy's global RNG (used by np.random.choice in get_action)
env = gym.make("BanditTenArmedGaussian-v0")  # define Environment

Run the Gradient Bandit algorithm with alpha = 0.4 and no baseline several times to average the results

In [None]:
alpha = 0.4  # step size of the preference update

env.reset()
tot_rewards = []
for _ in tqdm(range(repeat)):
    H = np.zeros(shape=(k))  # action preferences, restarted for every run
    rewards = []
    for i in range(iter):
        action, probs = get_action(H)
        state, reward, done, info = env.step(action)
        rewards.append(reward)
        # Gradient-bandit update without a baseline:
        #   chosen arm: H[a] += alpha * reward * (1 - probs[a])
        #   other arms: H[b] -= alpha * reward * probs[b]
        # Every arm must use its own probability, so the update is applied
        # to the whole vector at once rather than slice by slice.
        grad = -probs
        grad[action] += 1.0
        H += alpha * reward * grad
    env.reset()
    tot_rewards.append(rewards)

o0 = average(tot_rewards)

Run the Gradient Bandit algorithm with alpha = 0.4 and a baseline several times to average the results

In [None]:
alpha = 0.4  # set here so the cell does not depend on which cell ran before it

env.reset()
tot_rewards = []
for _ in tqdm(range(repeat)):
    H = np.zeros(shape=(k))  # action preferences, restarted for every run
    R = 0  # baseline: running mean of the rewards seen so far in this run
    rewards = []
    for i in range(iter):
        action, probs = get_action(H)
        state, reward, done, info = env.step(action)
        rewards.append(reward)
        # Gradient-bandit update with a baseline:
        #   chosen arm: H[a] += alpha * (reward - R) * (1 - probs[a])
        #   other arms: H[b] -= alpha * (reward - R) * probs[b]
        # Every arm must use its own probability, so the update is applied
        # to the whole vector at once rather than slice by slice.
        grad = -probs
        grad[action] += 1.0
        H += alpha * (reward - R) * grad
        # Incremental mean; updated after H so the current reward is compared
        # against the average of the earlier rewards only.
        R = R + 1/(i + 1) * (reward - R)
    env.reset()
    tot_rewards.append(rewards)

e01 = average(tot_rewards)

Run the Gradient Bandit algorithm with alpha = 0.1 and no baseline several times to average the results

In [None]:
alpha = 0.1  # step size of the preference update

env.reset()
tot_rewards = []
for _ in tqdm(range(repeat)):
    H = np.zeros(shape=(k))  # action preferences, restarted for every run
    rewards = []
    for i in range(iter):
        action, probs = get_action(H)
        state, reward, done, info = env.step(action)
        rewards.append(reward)
        # Gradient-bandit update without a baseline:
        #   chosen arm: H[a] += alpha * reward * (1 - probs[a])
        #   other arms: H[b] -= alpha * reward * probs[b]
        # Every arm must use its own probability, so the update is applied
        # to the whole vector at once rather than slice by slice.
        grad = -probs
        grad[action] += 1.0
        H += alpha * reward * grad
    env.reset()
    tot_rewards.append(rewards)

o00 = average(tot_rewards)

Run the Gradient Bandit algorithm with alpha = 0.1 and a baseline several times to average the results

In [None]:
alpha = 0.1  # set here so the cell does not depend on which cell ran before it

env.reset()
tot_rewards = []
for _ in tqdm(range(repeat)):
    H = np.zeros(shape=(k))  # action preferences, restarted for every run
    R = 0  # baseline: running mean of the rewards seen so far in this run
    rewards = []
    for i in range(iter):
        action, probs = get_action(H)
        state, reward, done, info = env.step(action)
        rewards.append(reward)
        # Gradient-bandit update with a baseline:
        #   chosen arm: H[a] += alpha * (reward - R) * (1 - probs[a])
        #   other arms: H[b] -= alpha * (reward - R) * probs[b]
        # Every arm must use its own probability, so the update is applied
        # to the whole vector at once rather than slice by slice.
        grad = -probs
        grad[action] += 1.0
        H += alpha * (reward - R) * grad
        # Incremental mean; updated after H so the current reward is compared
        # against the average of the earlier rewards only.
        R = R + 1/(i + 1) * (reward - R)
    env.reset()
    tot_rewards.append(rewards)

e00 = average(tot_rewards)

Finally, we plot the results to compare the Gradient Bandit Algorithm with and without Baseline

In [None]:
# One (series, colour, legend label) triple per experiment, drawn in this order.
curves = [
    (o0, 'blue', 'No Baseline, a = 0.4'),
    (e01, 'olive', 'Baseline, a = 0.4'),
    (o00, 'red', 'No Baseline, a = 0.1'),
    (e00, 'green', 'Baseline, a = 0.1'),
]
for series, colour, caption in curves:
    plt.plot(series, color=colour, label=caption)

# Axis labels and title so the figure can be read on its own.
plt.xlabel("Steps")
plt.ylabel("Average Reward")
plt.title("Average Reward vs. Steps on 10 Armed Bandit")
plt.legend()
plt.show()
