In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import matplotlib.pyplot as plt

class MultiArmedBandit:
    """Bernoulli bandit: every arm pays 1 with its own fixed probability, else 0."""

    def __init__(self, probabilities):
        """Remember the per-arm success probabilities.

        Args:
            probabilities: Sized, indexable collection holding one success
                probability per arm.
        """
        self.n_arms = len(probabilities)
        self.probabilities = probabilities

    def pull(self, arm):
        """Draw a single reward from ``arm``.

        Args:
            arm: Index into ``self.probabilities``.

        Returns:
            1 when a uniform draw from NumPy's global RNG is strictly below the
            arm's probability, otherwise 0.
        """
        success = np.random.rand() < self.probabilities[arm]
        if success:
            return 1
        return 0

In [None]:
class EpsilonGreedy:
    """Epsilon-greedy policy that tracks a running mean reward for each arm."""

    def __init__(self, n_arms, epsilon):
        """Start with zero pulls and zero estimated value for every arm.

        Args:
            n_arms: Number of arms to choose between.
            epsilon: Threshold compared against a uniform draw; draws below it
                trigger a uniformly random arm choice.
        """
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.values = np.zeros(n_arms)  # running mean reward per arm
        self.counts = np.zeros(n_arms)  # number of updates seen per arm

    def select_arm(self):
        """Return a random arm when exploring, else the arm with the highest estimate."""
        exploring = np.random.rand() < self.epsilon
        if not exploring:
            # np.argmax resolves ties in favour of the lowest index.
            return np.argmax(self.values)
        return np.random.randint(self.n_arms)

    def update(self, chosen_arm, reward):
        """Fold ``reward`` into the running mean of ``chosen_arm``.

        Uses the incremental form ``mean += (reward - mean) / n`` so no reward
        history has to be stored.
        """
        self.counts[chosen_arm] += 1
        pulls = self.counts[chosen_arm]
        step = (reward - self.values[chosen_arm]) / pulls
        self.values[chosen_arm] += step

In [None]:
class UCB:
    """Upper-confidence-bound policy with a ``sqrt(2 ln t / n_i)`` exploration bonus."""

    def __init__(self, n_arms):
        """Start with no pulls recorded and zero estimated value for every arm."""
        self.n_arms = n_arms
        self.total_counts = 0           # updates seen across all arms
        self.values = np.zeros(n_arms)  # running mean reward per arm
        self.counts = np.zeros(n_arms)  # updates seen per arm

    def select_arm(self):
        """Return the first never-updated arm, else the arm with the largest bound."""
        # Every arm must be tried once before the bonus term is defined
        # (it divides by the per-arm count).
        first_untried = next(
            (idx for idx in range(self.n_arms) if self.counts[idx] == 0),
            None,
        )
        if first_untried is not None:
            return first_untried

        exploration_bonus = np.sqrt((2 * np.log(self.total_counts)) / self.counts)
        return np.argmax(self.values + exploration_bonus)

    def update(self, chosen_arm, reward):
        """Record one pull of ``chosen_arm`` and refresh its running mean reward."""
        self.counts[chosen_arm] += 1
        self.total_counts += 1
        pulls = self.counts[chosen_arm]
        step = (reward - self.values[chosen_arm]) / pulls
        self.values[chosen_arm] += step

In [None]:
class ThompsonSampling:
    """Thompson sampling with one Beta(successes, failures) distribution per arm."""

    def __init__(self, n_arms):
        """Initialise both Beta parameters of every arm to 1."""
        self.n_arms = n_arms
        self.successes = np.ones(n_arms)
        self.failures = np.ones(n_arms)

    def select_arm(self):
        """Sample once from each arm's Beta distribution and return the top arm."""
        return np.argmax(np.random.beta(self.successes, self.failures))

    def update(self, chosen_arm, reward):
        """Increment the success tally when ``reward == 1``, else the failure tally."""
        # Any reward other than exactly 1 is counted as a failure.
        tally = self.successes if reward == 1 else self.failures
        tally[chosen_arm] += 1

In [None]:
def run_algorithm(algo_class, bandit, horizon, **kwargs):
    """Play one policy against ``bandit`` for ``horizon`` steps and track regret.

    Args:
        algo_class: Callable invoked as ``algo_class(bandit.n_arms, **kwargs)``;
            the result must provide ``select_arm()`` and ``update(arm, reward)``.
        bandit: Object exposing ``n_arms``, ``probabilities`` and ``pull(arm)``.
        horizon: Number of select/pull/update rounds to play.
        **kwargs: Extra keyword arguments forwarded to ``algo_class``.

    Returns:
        A ``(regret, total_regret, best_arm)`` tuple. ``regret`` is a list with
        the cumulative regret after each round, where a round's regret is the
        largest arm probability minus the chosen arm's probability.
        ``total_regret`` is the final cumulative value (0 when ``horizon`` is
        0). ``best_arm`` is the argmax of the policy's ``values`` if it has
        that attribute, otherwise the argmax of
        ``successes / (successes + failures)`` if it has ``successes``,
        otherwise ``None``.
    """
    policy = algo_class(bandit.n_arms, **kwargs)
    best_mean = max(bandit.probabilities)

    cumulative = 0
    history = []
    for _ in range(horizon):
        choice = policy.select_arm()
        payoff = bandit.pull(choice)
        policy.update(choice, payoff)
        # Regret uses the true arm probabilities, not the sampled payoff.
        cumulative += best_mean - bandit.probabilities[choice]
        history.append(cumulative)

    best_arm = None
    if hasattr(policy, 'values'):
        best_arm = np.argmax(policy.values)
    elif hasattr(policy, 'successes'):
        success_rate = policy.successes / (policy.successes + policy.failures)
        best_arm = np.argmax(success_rate)

    return history, cumulative, best_arm

In [None]:
def evaluate_all_algos(probabilities, horizon=5000, seed=None):
    """Compare epsilon-greedy, UCB and Thompson sampling on one set of arms.

    Each policy is run through ``run_algorithm`` against its own freshly
    constructed ``MultiArmedBandit(probabilities)``. The cumulative-regret
    curves are drawn on a single figure and the final regret and best arm of
    every policy are printed.

    Args:
        probabilities: Per-arm success probabilities passed to
            ``MultiArmedBandit``.
        horizon: Number of rounds each policy plays.
        seed: Optional seed. When given, NumPy's global random state is reset
            with ``np.random.seed(seed)`` before the first run so the whole
            comparison is repeatable. When ``None`` (the default) the global
            random state is left untouched.

    Returns:
        None. The function's results are the displayed figure and printed text.
    """
    if seed is not None:
        # The policies and the bandit draw from NumPy's global RNG, so seeding
        # it here is what makes the comparison repeatable.
        np.random.seed(seed)

    # (plot/print label, policy class, constructor keyword arguments)
    contenders = [
        ('Epsilon-Greedy', EpsilonGreedy, {'epsilon': 0.1}),
        ('UCB', UCB, {}),
        ('Thompson Sampling', ThompsonSampling, {}),
    ]

    results = []
    for label, algo_class, algo_kwargs in contenders:
        bandit = MultiArmedBandit(probabilities)
        regret, total, best_arm = run_algorithm(algo_class, bandit, horizon, **algo_kwargs)
        results.append((label, regret, total, best_arm))

    fig, ax = plt.subplots(figsize=(10, 6))
    for label, regret, _total, _best_arm in results:
        ax.plot(regret, label=label)
    ax.set_xlabel('Timesteps')
    ax.set_ylabel('Total Regret')
    ax.set_title('Total Regret vs Timesteps')
    ax.legend()
    ax.grid(True)
    plt.show()

    print("\nFinal Regret:")
    for label, _regret, total, best_arm in results:
        print(f"{label}: {total:.2f}, Best Arm: {best_arm}")

In [None]:
# Try your bandit here: edit the arm success probabilities and the horizon.
# NumPy's global RNG is seeded immediately before the run so that re-running
# this cell reproduces the same regret curves; change or remove the seed to
# see run-to-run variation.
np.random.seed(42)
evaluate_all_algos(probabilities=[0.3, 0.5, 0.8, 0.6], horizon=5000)