Imports

Bandit class

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import trange

# Select the non-interactive Agg backend: every figure in this file is written
# to disk with plt.savefig() and never shown in a window.
matplotlib.use('Agg')

class Bandit:
    """A k-armed bandit environment bundled with a configurable learner.

    The true mean reward of each arm is drawn from a unit-variance normal
    distribution centred on ``expected_reward``; every pull returns that
    mean plus unit-variance normal noise.

    The learner can act epsilon-greedily, with an upper-confidence-bound
    (UCB) bonus, or by sampling from a softmax over preferences (gradient
    bandit). ``q_estimated`` holds action-value estimates for the first two
    and the action preferences for the gradient method.

    ``reset()`` must be called before the first ``selectAction()`` /
    ``takeAction()``; ``__init__`` only stores the configuration.

    Parameters
    ----------
    eps : float
        Probability of picking a uniformly random arm on each step.
    arm : int
        Number of arms.
    step_size : float
        Constant step size used by the constant-step and gradient updates.
    init_value : float
        Initial value for every entry of ``q_estimated``.
    UCB_val : float or None
        Exploration coefficient ``c`` for UCB selection; ``None`` disables UCB.
    sample_average : bool
        Use the incremental sample-average update. Takes precedence over
        ``gradient`` inside ``takeAction``.
    gradient : bool
        Use the gradient-bandit (softmax preference) algorithm.
    baseline : bool
        For the gradient method, subtract the running average reward as a
        baseline; otherwise the baseline is 0.
    expected_reward : float
        Offset added to the true mean reward of every arm.
    """

    def __init__(self, eps=0.0, arm=10, step_size=0.1, init_value=0.0, UCB_val=None,
                 sample_average=False, gradient=False, baseline=False, expected_reward=0.0):
        self.arm = arm
        self.initial_value = init_value
        self.step_size = step_size
        self.epsilon = eps
        self.time = 0
        self.sample_average = sample_average
        self.UCB_val = UCB_val
        self.average_reward = 0
        self.expected_reward = expected_reward
        self.gradient = gradient
        self.gradient_baseline = baseline

    def reset(self):
        """Draw a fresh problem instance and clear all learned state."""
        self.q_expected = np.random.randn(self.arm) + self.expected_reward
        self.q_estimated = np.zeros(self.arm) + self.initial_value
        self.action_count = np.zeros(self.arm)
        self.time = 0
        self.average_reward = 0
        self.best_action = np.argmax(self.q_expected)

    def _action_probabilities(self):
        """Return the softmax of ``q_estimated`` as a probability vector.

        The maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps ``np.exp`` from overflowing
        to ``inf`` (and the probabilities from becoming NaN) when the
        preferences grow large.
        """
        weights = np.exp(self.q_estimated - np.max(self.q_estimated))
        return weights / np.sum(weights)

    def selectAction(self):
        """Choose an arm index according to the configured strategy.

        The epsilon exploration check always runs first; after that UCB is
        used if ``UCB_val`` is set, then softmax sampling if ``gradient`` is
        set, and plain greedy selection otherwise.
        """
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.arm)

        if self.UCB_val is not None:
            # The tiny constant avoids division by zero for untried arms and
            # gives them a very large bonus once time > 0.
            bonus = self.UCB_val * np.sqrt(
                np.log(self.time + 1) / (self.action_count + 1e-10))
            return np.argmax(self.q_estimated + bonus)
        if self.gradient:
            self.action_probability = self._action_probabilities()
            return np.random.choice(self.arm, p=self.action_probability)

        return np.argmax(self.q_estimated)

    def takeAction(self, action):
        """Pull ``action``, update the learner and return the sampled reward."""
        self.time += 1
        self.action_count[action] += 1
        reward = np.random.randn() + self.q_expected[action]
        # Running mean of all rewards received so far (includes this one).
        self.average_reward += (reward - self.average_reward) / self.time
        if self.sample_average:
            self.q_estimated[action] += (reward - self.q_estimated[action]) / self.action_count[action]
        elif self.gradient:
            # Recompute the policy from the current preferences instead of
            # relying on selectAction() having stored it: the attribute is
            # missing (or stale) when the action came from the epsilon branch
            # or when takeAction() is called directly.
            self.action_probability = self._action_probabilities()
            one_hot = np.zeros(self.arm)
            one_hot[action] = 1
            baseline = self.average_reward if self.gradient_baseline else 0
            self.q_estimated += self.step_size * (reward - baseline) * (one_hot - self.action_probability)
        else:
            self.q_estimated[action] += self.step_size * (reward - self.q_estimated[action])
        return reward


def run_experiment(runs, time_steps, bandits):
    """Simulate every bandit for ``runs`` independent runs of ``time_steps`` steps.

    Each run starts with ``bandit.reset()``. For every step the selected
    action's reward is recorded, together with a 0/1 flag telling whether the
    action equals ``bandit.best_action``.

    Returns a tuple ``(mean_best_action_counts, mean_rewards)``; both arrays
    have shape ``(len(bandits), time_steps)`` and are averaged over the runs.
    """
    reward_log = np.zeros((len(bandits), runs, time_steps))
    print('Running experiment...')
    optimal_hits = np.zeros(reward_log.shape)

    for bandit_idx, agent in enumerate(bandits):
        for run_idx in trange(runs):
            agent.reset()
            for step in range(time_steps):
                chosen = agent.selectAction()
                reward_log[bandit_idx, run_idx, step] = agent.takeAction(chosen)
                if chosen == agent.best_action:
                    optimal_hits[bandit_idx, run_idx, step] = 1

    # Collapse the run axis so each row is a per-step average for one bandit.
    return optimal_hits.mean(axis=1), reward_log.mean(axis=1)

def generateEpsGreedyFigure(runs=2000, time_steps=1000,
                            output_path='G:/rubel/HDILab/figure_2_2.png'):
    """Compare sample-average epsilon-greedy learners for several epsilons.

    Plots, for epsilon in {0, 0.1, 0.01}, the per-step average reward (top
    panel) and the per-step rate of choosing the best arm (bottom panel),
    then saves the figure.

    Parameters
    ----------
    runs : int
        Number of independent runs averaged per bandit.
    time_steps : int
        Number of steps in each run.
    output_path : str
        Destination of the saved image. Defaults to the location that was
        previously hard-coded; its directory must already exist.
    """
    epsilons = [0, 0.1, 0.01]
    bandits = [Bandit(eps=eps, sample_average=True) for eps in epsilons]
    best_action_counts, rewards = run_experiment(runs, time_steps, bandits)

    plt.subplot(2, 1, 1)
    # A distinct loop name avoids rebinding ``rewards`` while iterating it.
    for eps, reward_curve in zip(epsilons, rewards):
        plt.plot(reward_curve, label='epsilon = %.02f' % (eps))
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    # NOTE: the plotted values are fractions in [0, 1] (run-averaged 0/1
    # indicators), even though the axis label says '%'.
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label='epsilon = %.02f' % (eps))
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig(output_path)
    plt.close()
    print('Generated Eps greedy..')
    
def generateOptimisticInitialValueFigure(runs=2000, time_steps=1000,
                                         output_path='G:/rubel/HDILab/figure_2_3.png'):
    """Compare optimistic greedy against realistic epsilon-greedy learning.

    Both learners use a constant step size of 0.1. The first is purely greedy
    with initial estimates of 5; the second uses epsilon = 0.1 with initial
    estimates of 0. The per-step rate of choosing the best arm is plotted
    and saved.

    Parameters
    ----------
    runs : int
        Number of independent runs averaged per bandit.
    time_steps : int
        Number of steps in each run.
    output_path : str
        Destination of the saved image. Defaults to the location that was
        previously hard-coded; its directory must already exist.
    """
    bandits = [Bandit(eps=0, init_value=5, step_size=0.1),
               Bandit(eps=0.1, init_value=0, step_size=0.1)]
    best_action_counts, _ = run_experiment(runs, time_steps, bandits)

    plt.plot(best_action_counts[0], label='epsilon = 0, q = 5')
    plt.plot(best_action_counts[1], label='epsilon = 0.1, q = 0')
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.savefig(output_path)
    plt.close()
    print('Generated optimistic initial..')

def generateUCBFigure(runs=2000, time_steps=1000,
                      output_path='G:/rubel/HDILab/figure_2_4.png'):
    """Compare UCB (c = 2) against epsilon-greedy (epsilon = 0.1).

    Both learners use sample-average updates. The per-step average reward of
    each is plotted and the figure is saved.

    Parameters
    ----------
    runs : int
        Number of independent runs averaged per bandit.
    time_steps : int
        Number of steps in each run.
    output_path : str
        Destination of the saved image. Defaults to the location that was
        previously hard-coded; its directory must already exist.
    """
    bandits = [Bandit(eps=0, UCB_val=2, sample_average=True),
               Bandit(eps=0.1, sample_average=True)]
    _, avg_rewards = run_experiment(runs, time_steps, bandits)

    plt.plot(avg_rewards[0], label='UCB c = 2')
    plt.plot(avg_rewards[1], label='eps greedy, eps = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(output_path)
    plt.close()
def generateGradientFigure(runs=2000, time_step=1000,
                           output_path='G:/rubel/HDILab/figure_2_5.png'):
    """Compare gradient bandits with and without a reward baseline.

    Four gradient learners are run on problems whose true rewards are offset
    by 4: step sizes 0.1 and 0.4, each with and without the average-reward
    baseline. The per-step rate of choosing the best arm is plotted and saved.

    Parameters
    ----------
    runs : int
        Number of independent runs averaged per bandit.
    time_step : int
        Number of steps in each run.
    output_path : str
        Destination of the saved image. Defaults to the location that was
        previously hard-coded; its directory must already exist.
    """
    bandits = [Bandit(gradient=True, step_size=0.1, baseline=True, expected_reward=4),
               Bandit(gradient=True, step_size=0.1, baseline=False, expected_reward=4),
               Bandit(gradient=True, step_size=0.4, baseline=True, expected_reward=4),
               Bandit(gradient=True, step_size=0.4, baseline=False, expected_reward=4)]

    best_action_counts, _ = run_experiment(runs, time_step, bandits)
    # Labels are listed in the same order as ``bandits``.
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']

    for counts, label in zip(best_action_counts, labels):
        plt.plot(counts, label=label)
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    plt.savefig(output_path)
    plt.close()

def generateSummaryFigure(runs=2000, time_steps=1000,
                          output_path='G:/rubel/HDILab/figure_2_6.png'):
    """Plot a parameter study of the four bandit algorithms.

    For each algorithm a range of exponents ``x`` is swept and a bandit is
    built with its parameter set to ``2 ** x`` (epsilon, step size alpha,
    UCB coefficient, or initial value respectively). The reward averaged over
    all runs and all steps is plotted against ``x`` and the figure is saved.

    Parameters
    ----------
    runs : int
        Number of independent runs averaged per bandit.
    time_steps : int
        Number of steps in each run.
    output_path : str
        Destination of the saved image. Defaults to the location that was
        previously hard-coded; its directory must already exist.
    """
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda eps: Bandit(eps=eps, sample_average=True),
                  lambda alpha: Bandit(gradient=True, step_size=alpha, baseline=True),
                  lambda coef: Bandit(eps=0, UCB_val=coef, sample_average=True),
                  lambda initial: Bandit(eps=0, init_value=initial, step_size=0.1)]
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy (1.24+); the builtin yields the same float64 arrays.
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    bandits = []
    for generator, parameter in zip(generators, parameters):
        for param in parameter:
            bandits.append(generator(pow(2, param)))

    _, average_rewards = run_experiment(runs, time_steps, bandits)
    # One scalar per bandit: the mean over all time steps.
    rewards = np.mean(average_rewards, axis=1)

    # ``bandits`` is a flat list, so each algorithm owns a consecutive slice
    # of ``rewards`` as long as its parameter sweep.
    start = 0
    for label, parameter in zip(labels, parameters):
        count = len(parameter)
        plt.plot(parameter, rewards[start:start + count], label=label)
        start += count
    plt.xlabel('Parameter(2^x)')
    plt.ylabel('Average reward')
    plt.legend()

    plt.savefig(output_path)
    plt.close()
    
if __name__ == "__main__":
    # Script entry point. Only the parameter-study summary figure is produced;
    # uncomment any of the other generators to render its figure as well.
    # generateEpsGreedyFigure()
    # generateOptimisticInitialValueFigure()
    # generateUCBFigure()
    # generateGradientFigure()
    generateSummaryFigure()

Running experiment...


100%|██████████| 2000/2000 [00:25<00:00, 79.17it/s]
 64%|██████▎   | 1274/2000 [00:17<00:08, 80.73it/s]