In [9]:
#import modules

import matplotlib.pyplot as plt
import numpy
import numpy as np
import pandas as pd
%matplotlib inline

In [1]:
class Eps_greedy:
    """Epsilon-greedy agent for a k-armed Gaussian bandit.

    At every step the agent explores (picks a uniformly random arm) with
    probability ``eps`` and otherwise exploits the arm with the highest
    estimated reward. Rewards are drawn from a normal distribution with
    unit standard deviation centred on the chosen arm's mean.

    inputs
    --------------------------------------
    k: number of bandit's arms (int)
    eps: Epsilon value. Normally, 0<eps<1
    iters: number of iterations
    mu: average of rewards for k-arms. One of
        'random'   -> means drawn from a standard normal distribution,
        'sequence' -> means 0, 1, ..., k-1,
        a list / tuple / numpy array of user-supplied means.

    raises
    --------------------------------------
    ValueError: if ``mu`` is a string other than 'random' or 'sequence'
    TypeError: if ``mu`` is neither a string nor a sequence of means
    """

    def __init__(self, k, eps, iters, mu='random'):
        # number of arms
        self.k = k
        # value for the epsilon
        self.eps = eps
        # number of iterations in a cycle
        self.iters = iters

        # Test for strings first: comparing a numpy array with ``==`` against
        # a string is element-wise (or ambiguous), so the string options must
        # never be checked on array input.
        if isinstance(mu, str):
            if mu == 'random':
                # means drawn from a standard normal distribution
                self.mu = np.random.normal(0, 1, k)
            elif mu == 'sequence':
                # means form the sequence 0, 1, ..., k-1
                self.mu = np.linspace(0, k - 1, k)
            else:
                raise ValueError(
                    "mu must be 'random', 'sequence' or a sequence of means, "
                    "got %r" % (mu,)
                )
        elif isinstance(mu, (list, tuple, np.ndarray)):
            # user-supplied means; np.array copies, so the caller's data is
            # never shared with (or mutated through) this instance
            self.mu = np.array(mu, dtype=float)
        else:
            raise TypeError(
                "mu must be a string or a sequence of means, got "
                + type(mu).__name__
            )

        # initialise all running statistics
        self.reset()

    def calculate(self):
        """Play a single step and update the running reward estimates."""
        # uniform draw in [0, 1) deciding between exploring and exploiting
        prob = np.random.rand()

        # With eps == 0 the very first pull is still random: every estimate
        # is 0 at that point, so argmax would always pick arm 0.
        if (self.eps == 0 and self.n == 0) or prob < self.eps:
            a = np.random.choice(self.k)
        else:
            a = np.argmax(self.k_reward)

        reward = np.random.normal(self.mu[a], 1)

        # update counts
        self.n += 1
        self.k_n[a] += 1

        # incremental mean of all rewards seen so far
        self.mean_reward += (reward - self.mean_reward) / self.n

        # incremental mean of the rewards obtained from arm a
        self.k_reward[a] += (reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        """Play ``iters`` steps, recording the running mean reward per step."""
        for i in range(self.iters):
            self.calculate()
            self.reward[i] = self.mean_reward

    def reset(self):
        """Clear all statistics; the arm means ``mu`` are kept unchanged."""
        # step count
        self.n = 0
        # step count for each of the k arms
        self.k_n = np.zeros(self.k)
        # running mean of all rewards received
        self.mean_reward = 0
        # running mean reward recorded after each iteration
        self.reward = np.zeros(self.iters)
        # estimated mean reward of each arm
        self.k_reward = np.zeros(self.k)


# The guard is true both in a notebook and when run as a script, while
# keeping an import of this file free of the (slow) experiment and the plot.
if __name__ == "__main__":

    k = 20
    iters = 1000
    episodes = 1000

    eps0_rewards = np.zeros(iters)
    eps01_rewards = np.zeros(iters)
    eps001_rewards = np.zeros(iters)

    # Run experiments
    for i in range(episodes):

        # all three agents of an episode share the same arm means so that
        # only the exploration rate differs between them
        eps0 = Eps_greedy(k, 0, iters)
        eps01 = Eps_greedy(k, 0.1, iters, eps0.mu.copy())
        eps001 = Eps_greedy(k, 0.01, iters, eps0.mu.copy())

        # Run
        eps0.run()
        eps01.run()
        eps001.run()

        # update the rewards: incremental average over the episodes so far
        eps0_rewards = eps0_rewards + (eps0.reward - eps0_rewards) / (i + 1)
        eps01_rewards = eps01_rewards + (eps01.reward - eps01_rewards) / (i + 1)
        eps001_rewards = eps001_rewards + (eps001.reward - eps001_rewards) / (i + 1)

    # plotting the values (raw strings keep the TeX backslashes intact)
    plt.figure(figsize=(12, 8))
    plt.plot(eps0_rewards, label=r"$\epsilon=0$ (greedy)")
    plt.plot(eps001_rewards, label=r"$\epsilon=0.01$")
    plt.plot(eps01_rewards, label=r"$\epsilon=0.1$")

    plt.legend(bbox_to_anchor=(1.3, 0.5))
    plt.xlabel("Iterations")
    plt.ylabel("Average Reward")
    plt.title(r"Average $\epsilon-greedy$ Rewards after " + str(episodes)
              + " Episodes")
    plt.show()