# In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

class eps_decay_bandit:
    """k-armed bandit agent with a decaying exploration probability.

    On each step the agent explores (picks a uniformly random arm) with
    probability ``1 / (1 + n / k)``, where ``n`` is the number of steps
    taken so far, and otherwise exploits the arm with the highest
    estimated reward. Rewards are drawn from a unit-variance normal
    distribution centred on the chosen arm's true mean.

    Parameters
    ----------
    k : int
        Number of arms.
    iters : int
        Number of steps performed by :meth:`run`.
    mu : str or array-like, default "sequence"
        True mean reward of each arm. Either an explicit sequence of
        means (list, tuple or ndarray) or one of the presets:

        * ``"random"``        - means drawn from a standard normal.
        * ``"sequence"``      - means ``0, 1, ..., k - 1``.
        * ``"rndm_binomial"`` - means drawn via ``np.random.binomial``.

    Raises
    ------
    ValueError
        If ``mu`` is a string that is not one of the presets above.
    """

    def __init__(self, k, iters, mu="sequence"):
        # Number of arms.
        self.k = k
        # Number of iterations performed by run().
        self.iters = iters
        # Running estimate of the mean reward of each arm.
        self.k_reward = np.zeros(k)
        # Overall running mean reward recorded after each step of run().
        self.reward = np.zeros(iters)
        # Overall running mean reward so far.
        self.mean_reward = 0
        # Total number of steps taken.
        self.n = 0
        # Number of times each arm has been chosen.
        self.k_n = np.zeros(k)

        # Resolve the true arm means. The presets are matched against the
        # *argument* (self.mu does not exist yet), and only when it is a
        # string, so that array-like input is never compared with "==".
        if isinstance(mu, str):
            if mu == "random":
                self.mu = np.random.normal(0, 1, k)
            elif mu == "sequence":
                self.mu = np.linspace(0, k - 1, k)
            elif mu == "rndm_binomial":
                # NOTE(review): with n=0 trials this always yields zeros,
                # i.e. all arms are identical. Kept as written because the
                # intended parameters are unclear - confirm (n=1, p=0.5?).
                self.mu = np.random.binomial(0, 1, k)
            else:
                raise ValueError(
                    "mu must be a sequence of means or one of "
                    "'random', 'sequence', 'rndm_binomial'; got %r" % (mu,)
                )
        else:
            # User-supplied means; np.array copies, so the caller's data
            # is never shared with this instance.
            self.mu = np.array(mu)

    def calculate(self):
        """Perform one step: choose an arm, sample a reward, update estimates."""
        # Explore with probability 1 / (1 + n / k). This equals 1 on the
        # first step and shrinks as the step count grows.
        p = np.random.rand()
        if p < 1 / (1 + self.n / self.k):
            # Randomly select an action.
            a = np.random.choice(self.k)
        else:
            # Take the greedy action.
            a = np.argmax(self.k_reward)

        reward = np.random.normal(self.mu[a], 1)

        # Update counts.
        self.n += 1
        self.k_n[a] += 1

        # Incremental update of the overall mean reward.
        self.mean_reward = self.mean_reward + (
            reward - self.mean_reward) / self.n

        # Incremental update of the chosen arm's estimated reward.
        self.k_reward[a] = self.k_reward[a] + (
            reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        """Perform ``iters`` steps, recording the running mean after each."""
        for i in range(self.iters):
            self.calculate()
            self.reward[i] = self.mean_reward

    def reset(self):
        """Reset all learned results while keeping ``k``, ``iters`` and ``mu``."""
        self.n = 0
        self.k_n = np.zeros(self.k)
        self.mean_reward = 0
        self.reward = np.zeros(self.iters)
        self.k_reward = np.zeros(self.k)


def run_eps_decay_experiment(k=10, iters=1000, episodes=1000):
    """Compare the decaying-epsilon bandit with an epsilon=0.1 bandit and plot.

    For each episode a fresh pair of bandits sharing the same arm means is
    run for ``iters`` steps; the per-step running mean rewards are averaged
    over all episodes and plotted.

    Parameters
    ----------
    k : int
        Number of arms.
    iters : int
        Steps per episode.
    episodes : int
        Number of independent episodes to average over.
    """
    eps_decay_rewards = np.zeros(iters)
    eps_1_rewards = np.zeros(iters)

    # Run experiments
    for i in range(episodes):
        # Initialize bandits. eps_bandit is not defined in this block; it is
        # presumably the fixed-epsilon bandit from elsewhere in the
        # notebook, called as (k, eps, iters, mu) - verify.
        eps_decay = eps_decay_bandit(k, iters)
        eps_1 = eps_bandit(k, 0.1, iters, eps_decay.mu.copy())

        # Run experiments
        eps_decay.run()
        eps_1.run()

        # Update long-term averages (incremental mean over episodes).
        eps_decay_rewards = eps_decay_rewards + (
            eps_decay.reward - eps_decay_rewards) / (i + 1)
        eps_1_rewards = eps_1_rewards + (
            eps_1.reward - eps_1_rewards) / (i + 1)

    # Raw strings keep the LaTeX backslashes without invalid-escape warnings.
    plt.figure(figsize=(12, 8))
    plt.plot(eps_decay_rewards, label=r"$\epsilon-decay$")
    plt.plot(eps_1_rewards, label=r"$\epsilon=0.1$")
    plt.legend(bbox_to_anchor=(1.2, 0.5))
    plt.xlabel("Iterations")
    plt.ylabel("Average Reward")
    plt.title(r"Average $\epsilon-decay$ and $\epsilon-greedy$ Rewards after "
              + str(episodes) + " Episodes")
    plt.show()


if __name__ == "__main__":
    run_eps_decay_experiment()



