In [None]:
from random import random
from random import randrange
from random import seed as seed_random
from typing import List

import numpy as np


def _first_argmax(value_estimates: List) -> int:
  """Return the index of the largest entry in `value_estimates`.

  Ties are resolved in favour of the lowest index, because `max` returns the
  first maximal item it encounters.

  Raises:
    ValueError: if `value_estimates` is empty.
  """
  if len(value_estimates) == 0:
    raise ValueError("value_estimates must contain at least one entry")
  return max(range(len(value_estimates)), key=lambda i: value_estimates[i])


class Policy(object):
  """Baseline policy: picks an arm uniformly at random, ignoring the estimates."""

  def choose(self, value_estimates: List) -> int:
    """Return a uniformly random index into `value_estimates`.

    Raises:
      ValueError: if `value_estimates` is empty (raised by `randrange`).
    """
    return randrange(len(value_estimates))

  def __str__(self) -> str:
    return "Random"


class GreedyPolicy(Policy):
  """Always picks the arm with the highest current value estimate."""

  def choose(self, value_estimates: List) -> int:
    """Return the index of the highest estimate (lowest index wins ties).

    Raises:
      ValueError: if `value_estimates` is empty.
    """
    return _first_argmax(value_estimates)

  def __str__(self) -> str:
    return "Greedy"


class EpsilonGreedyPolicy(Policy):
  """Explores a random arm with probability `e`, otherwise acts greedily."""

  def __init__(self, e: float):
    """Store the exploration probability.

    Args:
      e: probability, in [0, 1], of choosing a uniformly random arm.

    Raises:
      ValueError: if `e` is outside [0, 1] (this also rejects NaN).
    """
    if not 0 <= e <= 1:
      raise ValueError("e must be a probability in [0, 1], got {}".format(e))
    self.e = e

  def choose(self, value_estimates: List) -> int:
    """Return a random index with probability `e`, else the greedy index.

    Raises:
      ValueError: if `value_estimates` is empty.
    """
    # random() is in [0, 1), so e == 0 never explores and e == 1 always does.
    if random() < self.e:
      return randrange(len(value_estimates))

    return _first_argmax(value_estimates)

  def __str__(self) -> str:
    return "EpsilonGreedy-{}".format(self.e)


class Agent:
  """Sample-average value learner for a k-armed bandit.

  Keeps one running-mean reward estimate and one pull counter per arm, and
  delegates arm selection to the supplied policy.
  """

  def __init__(self, k: int, policy: "Policy"):
    """Create an agent with `k` arms, all estimates and counters at zero.

    Args:
      k: number of arms.
      policy: object whose `choose(value_estimates)` returns an arm index.
    """
    self.policy = policy
    self.value_estimates = [0] * k
    self.action_attempts = [0] * k
    # -1 marks "no action chosen yet"; observe() refuses to run in that state.
    self.last_action = -1

  def choose(self) -> int:
    """Ask the policy for an arm, remember it, and return its index."""
    self.last_action = self.policy.choose(self.value_estimates)
    return self.last_action

  def observe(self, reward: float):
    """Fold `reward` into the running mean of the most recently chosen arm.

    Raises:
      RuntimeError: if no arm has been chosen yet. Without this guard the
        -1 sentinel would index the last arm and silently corrupt it.
    """
    if self.last_action < 0:
      raise RuntimeError("observe() called before choose(): no action to update")

    arm = self.last_action
    self.action_attempts[arm] += 1
    # Incremental mean: Q <- Q + (1/n) * (reward - Q).
    prediction_error = reward - self.value_estimates[arm]
    step_size = 1 / float(self.action_attempts[arm])
    self.value_estimates[arm] += prediction_error * step_size


class GaussianBandit:
  """k-armed bandit whose arm means and rewards are normally distributed."""

  def __init__(self, mean: float, variance: float, k: int):
    """Draw the true mean reward of each of the `k` arms.

    Args:
      mean: mean of the distribution the arm means are drawn from.
      variance: variance of that distribution, and also of the reward noise
        added by `pull`.
      k: number of arms.

    Raises:
      ValueError: if `variance` is negative.
    """
    if variance < 0:
      raise ValueError("variance must be non-negative, got {}".format(variance))
    # np.random.normal takes a standard deviation as its scale argument,
    # so the variance has to be converted before it is passed in.
    self.arms = np.random.normal(mean, np.sqrt(variance), k)
    self.variance = variance

  def pull(self, arm: int) -> float:
    """Return one noisy reward sampled around the true mean of `arm`."""
    return np.random.normal(self.arms[arm], np.sqrt(self.variance))

  def __str__(self):
    return "{}".format(self.arms)


# Seed both random sources so a fresh-kernel "Run All" reproduces the same
# bandit and the same exploration choices: the policies draw from Python's
# `random` module, while the bandit draws from NumPy's global generator.
SEED = 42
seed_random(SEED)
np.random.seed(SEED)

# 10-armed bandit with arm means drawn around 0 with unit variance.
bandit = GaussianBandit(0, 1, 10)
print(bandit)


In [None]:
import pandas as pd

# Policies under comparison; each one drives its own agent on the same bandit.
policies = [
  Policy(),
  GreedyPolicy(),
  EpsilonGreedyPolicy(0.2),
  EpsilonGreedyPolicy(0.1),
  EpsilonGreedyPolicy(0.01),
]
policy_names = [str(p) for p in policies]

# Each horizon n is a separate run of exactly n steps; n goes from 1 to 999.
MAX_HORIZON = 1000
horizons = range(1, MAX_HORIZON)

global_total_rewards = []
global_avg_rewards = []
for n in horizons:
  # Fresh agents for every horizon: reusing them would let each run start
  # from value estimates learned in all previous runs, so the row for n
  # steps would not describe n steps of learning.
  agents = [Agent(len(bandit.arms), p) for p in policies]
  total_rewards = [0] * len(agents)
  avg_rewards = [0] * len(agents)

  for step in range(n):
    for i, agent in enumerate(agents):
      arm = agent.choose()
      reward = bandit.pull(arm)
      total_rewards[i] += reward
      # Incremental mean over the step + 1 rewards seen so far in this run.
      avg_rewards[i] = (avg_rewards[i] * step + reward) / (step + 1)
      agent.observe(reward)

  global_total_rewards.append(total_rewards)
  global_avg_rewards.append(avg_rewards)

# Index rows by the number of steps actually run, so the row for an n-step
# run is labelled n rather than n - 1.
total_rewards_df = pd.DataFrame(
  data=global_total_rewards, index=horizons, columns=policy_names
)
avg_rewards_df = pd.DataFrame(
  data=global_avg_rewards, index=horizons, columns=policy_names
)

display(total_rewards_df)
display(avg_rewards_df)


In [None]:
import plotly.express as px

# One line chart per metric, with one trace per policy column.
for rewards_frame, y_axis_title in (
  (total_rewards_df, "Total Reward"),
  (avg_rewards_df, "Average Reward"),
):
  figure = px.line(rewards_frame, labels={"index": "Steps", "value": y_axis_title})
  figure.show()
