In [None]:
import os
import pandas as pd
import random
import matplotlib.pyplot as plt

import warnings
# Suppress every warning raised through the `warnings` module for the rest of the session.
warnings.filterwarnings('ignore')
# Remove the cap on how many columns pandas shows when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# Seed Python's global `random` generator so the stochastic draws below are repeatable.
random.seed(4014)

In [None]:
## define Bernoulli arm class
class BernoulliArm():
    """A bandit arm that pays 1.0 with probability ``p`` and 0.0 otherwise.

    Parameters
    ----------
    p : float
        Probability that a single draw returns 1.0. Intended to lie in
        [0, 1]; the value is stored as given and is not validated.
    """

    def __init__(self, p):
        self.p = p

    def draw(self):
        """Sample one reward from the arm.

        Consumes exactly one value from Python's global ``random`` generator.

        Returns
        -------
        float
            1.0 on success, 0.0 on failure.
        """
        # random.random() is uniform on the half-open interval [0.0, 1.0), so
        # the strict ``<`` makes the boundaries exact: an arm with p == 0 can
        # never pay out (even when the draw is exactly 0.0) and an arm with
        # p == 1 always pays out.
        if random.random() < self.p:
            return 1.0
        return 0.0

In [None]:
class EpsilonGreedy():
    """Epsilon-greedy policy for a multi-armed bandit.

    Attributes
    ----------
    epsilon : float
        Threshold compared against a uniform draw in ``select_arm``; a draw
        that is not greater than ``epsilon`` triggers a uniformly random arm.
    counts : list
        Number of times each arm has been passed to ``update``.
    values : list
        Running average reward recorded for each arm.
    """

    def __init__(self, epsilon, counts, values):
        # The lists are stored as given (not copied).
        self.epsilon, self.counts, self.values = epsilon, counts, values

    def initialize(self, n_arms):
        """Reset the per-arm statistics to zero for ``n_arms`` arms."""
        arm_slots = range(n_arms)
        self.counts = [0 for _ in arm_slots]
        self.values = [0.0 for _ in arm_slots]

    def select_arm(self):
        """Return the index of the arm to play next.

        One uniform draw decides between exploiting and exploring. When
        exploiting, the first arm holding the maximum estimated value is
        returned; when exploring, an index is drawn with ``random.randrange``.
        """
        exploit = random.random() > self.epsilon
        if not exploit:
            return random.randrange(len(self.values))
        best_estimate = max(self.values)
        return self.values.index(best_estimate)

    def update(self, chosen_arm, reward):
        """Fold ``reward`` into the running average of ``chosen_arm``."""
        pulls = self.counts[chosen_arm] + 1
        self.counts[chosen_arm] = pulls
        previous = self.values[chosen_arm]
        # Weighted combination of the old average and the new observation.
        keep, gain = (pulls - 1) / float(pulls), 1 / float(pulls)
        self.values[chosen_arm] = keep * previous + gain * reward

In [None]:
## Simulation harness: run a bandit algorithm against a set of arms and record every step
def test_algorithm(algo, arms, num_sims, horizon):
    chosen_arms = [0 for i in range(num_sims*horizon)]
    rewards = [0 for i in range(num_sims*horizon)]
    cumulative_rewards = [0 for i in range(num_sims*horizon)]
    sim_rouns = [0 for i in range(num_sims*horizon)]
    time_steps = [0 for i in range(num_sims*horizon)]
    
    for sim in range(num_sims):
        sim = sim + 1
        algo.initialize(len(arms))
        
        for t in range(horizon):
            t = t + 1
            index = (sim-1)*horizon + t - 1
            sim_rouns[index] = sim
            time_steps[index] = t
            
            chosen_arm = algo.select_arm()
            chosen_arms[index] = chosen_arm
            reward = arms[chosen_arm].draw()
            rewards[index] = reward
            
            if t == 1:
               cumulative_rewards[index] = reward
            else:
               cumulative_rewards[index] = cumulative_rewards[index-1] + reward 
            
            algo.update(chosen_arm, reward)
    return(sim_rouns, time_steps, chosen_arms, rewards, cumulative_rewards)