<a href="https://colab.research.google.com/github/ahassanzadeh/Multi-Armed-Bandits/blob/master/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import modules 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
%matplotlib inline

# **Epsilon Greedy** 


In [0]:
class greedy_bandit:
    def __init__(self, k, epsilon, iterations, rew):
        # Number of arms(actions)
        self.k = k
        # Search probability
        self.epsilon = epsilon
        # Number of iterations
        self.iterations = iterations
        # Step count
        self.n = 0
        # Step count for each arm
        self.k_n = np.zeros(k)
        # Total mean reward
        self.mean_reward = 0
        self.reward = np.zeros(iterations)
        
        if rew == "greedy":
          # Reward per step 
          self.rew = np.random.normal(0, 1, k)
        elif rew == 'Optimistic_Initial_Value':
          # Select initial values
          self.rew = np.repeat(5., k)
        # Mean reward for each arm
        self.k_reward = np.zeros(k)

                                 
    def choose_action(self):
        # Generate random number
        p = np.random.rand()
        if self.epsilon == 0 and self.n == 0:
            a = np.random.choice(self.k)
        elif p < self.epsilon:
            # Randomly select an action
            a = np.random.choice(self.k)
        else:
            # Take greedy action
            a = np.argmax(self.k_reward)
            
        reward = np.random.normal(self.rew[a], 1)
        
        # Update counts
        self.n += 1
              
        # Update total
        self.mean_reward = self.mean_reward + (
            reward - self.mean_reward) / self.n

        # Update results for a_k
        self.k_reward[a] = self.k_reward[a] + (
            reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        for i in range(self.iterations):
            self.choose_action()
            self.reward[i] = self.mean_reward
    def reset(self):
        # Resets results while keeping settings
        self.n = 0
        self.k_n = np.zeros(k)
        self.mean_reward = 0
        self.reward = np.zeros(iterations)
        self.k_reward = np.zeros(k)

In [0]:
k = 10
iterations = 1000

eps_0_rewards = np.zeros(iterations)
eps_01_rewards = np.zeros(iterations)
eps_1_rewards = np.zeros(iterations)

episodes = 1000
# Run experiments
for i in range(episodes):
    # Initialize bandits
    eps_0  = greedy_bandit(k, 0, iterations,"greedy")
    eps_01 = greedy_bandit(k, 0.01, iterations, eps_0.rew.copy())
    eps_1  = greedy_bandit(k, 0.1, iterations, eps_0.rew.copy())
    # Run experiments
    eps_0.run()
    eps_01.run()
    eps_1.run()
    
    # Update long-term averages
    eps_0_rewards = eps_0_rewards + (
        eps_0.reward - eps_0_rewards) / (i + 1)
    eps_01_rewards = eps_01_rewards + (
        eps_01.reward - eps_01_rewards) / (i + 1)
    eps_1_rewards = eps_1_rewards + (
        eps_1.reward - eps_1_rewards) / (i + 1)

plt.figure(figsize=(16,12))
plt.plot(eps_0_rewards, label="$\epsilon=0$ (greedy)")
plt.plot(eps_01_rewards, label="$\epsilon=0.01$")
plt.plot(eps_1_rewards, label="$\epsilon=0.1$")
plt.legend(bbox_to_anchor=(1.3, 1))
plt.xlabel("Iterations")
plt.ylabel("Average Reward")
plt.title("Average $\epsilon-greedy$ Rewards after " + str(episodes) + " Episodes")
plt.show()



AttributeError: ignored

# **Optimistic Initial Value(OIV)**

In [0]:
# k = 10
# iters = 1000

# eps_01_rewards = np.zeros(iters)



# # Run experiments
# for i in range(iters):
#     # Initialize bandits
#     eps_1 = eps_bandit(k, 0.1, iters)
#     # Run experiments
#     eps_1.run()
    
#     # Update long-term averages
#     eps_01_rewards = eps_01_rewards + (
#         eps_01.reward - eps_01_rewards) / (i + 1)
#     oiv_rewards = oiv_rewards + (
#         oiv_bandit.reward - oiv_rewards) / (i + 1)