In [7]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np


class Bandit(object):
    """
    Multi-armed bandit whose arms pay out gaussian rewards.

    Attributes set by the constructor:
      bandits_mu -- array holding the mean of each arm, sampled from N(0, 1)
      best_arm   -- index of the arm with the largest mean
      arms       -- list of frozen scipy normal distributions, one per arm,
                    each centred on its own mean with a scale of 1
    """

    def __init__(self, n_arms):
        # the mean of every arm is an independent draw from a standard normal
        prior = stats.norm(0, 1)
        self.bandits_mu = prior.rvs(size=n_arms)

        # keep track of which arm has the highest expected reward
        self.best_arm = np.argmax(self.bandits_mu)

        # build one unit-scale gaussian per sampled mean
        self.arms = []
        for arm_mean in self.bandits_mu:
            self.arms.append(stats.norm(arm_mean, 1))

    def play(self, arm):
        """
        Draws a single reward from the arm at index `arm` and returns it
        """
        chosen = self.arms[arm]
        draw = chosen.rvs(size=1)
        return draw[0]



In [10]:
class UniformAgent(object):
    """
    Agent that plays a bandit following a fixed, randomly drawn policy.

    At construction one weight per arm is drawn from U(0, 1) and the weights
    are normalized into a probability distribution over the arms. Nothing in
    this class updates the distribution afterwards.

    Attributes:
      bandit        -- the bandit object passed to the constructor
      probabilities -- list with one selection probability per arm
      cumprobs      -- numpy array with the running sum of `probabilities`
    """

    def __init__(self, bandit):
        """
        Draws the random policy for `bandit`.

        Only `len(bandit.arms)` is read from the bandit here. Raises
        AssertionError if the normalized probabilities do not sum to 1
        (which is what happens for a bandit with no arms).
        """
        self.bandit = bandit

        # assigns uniformly random weights to arms
        weights = stats.uniform(0, 1).rvs(len(bandit.arms))

        # normalizes in one vectorized step; stored as a list, as callers expect
        self.probabilities = list(weights / weights.sum())

        # checks
        assert np.isclose(1, sum(self.probabilities)), \
            "arm probabilities must sum to 1"

        # saves cumulative probabilities:
        self.cumprobs = np.cumsum(self.probabilities)

    def choose_arm(self):
        """
        Selects an arm in proportion with the probabilities
        """
        # adapted from OpenAI Gym gym/envs/toy_text/discrete.py
        threshold = np.random.rand()

        # index of the first cumulative probability strictly above the threshold
        arm = np.searchsorted(self.cumprobs, threshold, side='right')

        # Rounding can leave cumprobs[-1] a hair below 1; a threshold at or
        # above it would then match no arm at all (the plain
        # `(cumprobs > threshold).argmax()` silently returns arm 0 in that
        # case). Clamp to the last arm instead.
        return np.minimum(arm, len(self.cumprobs) - 1)

    def greedy(self):
        """
        Returns the arm with highest probability
        """
        return np.argmax(self.probabilities)

In [14]:
# build a 10-armed bandit and an agent with a fixed random policy over its arms
b = Bandit(10)
a = UniformAgent(b)
# index of the arm the agent assigns the highest probability (shown as the cell output)
a.greedy()

1

In [32]:
# sample one arm according to the agent's probabilities; no seed is set in the
# visible cells, so the value shown can differ on every run
a.choose_arm()

6