In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random 
from random import randint
%matplotlib notebook

# 1.1

In [3]:
# For the question 1.1, we find the expected reward for actions chosen uniformly.
# We defined an array for 6 arms bandit values.
six_armed_bandit = np.array([[1,3],[-3,8],[2,5],[-2,6],[3,4],[-2,2]])
# Expected rewards calculated one by one for each arm.
rewards_value = np.mean(six_armed_bandit,axis=1)
# Sum of Expected rewards for all arms.
Total_expected_reward = np.sum(rewards_value)
print('Sum of Expected reward :',Total_expected_reward)

Sum of Expected reward : 13.5


# 1.2

In [4]:
# First start with a class for passing six armed bandit parameters to the methods defined below of that class
class Environment(object):
     #  __init__ method is defined with self variable
     #initialize bandits
    def __init__(self, params):
        self.six_armed_bandit = len(params[:,0]) #Len function for the number of the arms
        self.params = params #Self variable passes the parameters of the arms
    # Return a random uniform number from the selected arm    
    def select_action(self, bandit_val):
        value_estimate = np.random.uniform(self.params[bandit_val,0],self.params[bandit_val,1])
        return value_estimate
    
    # get_reward is defined for computing sample average reward according the uniformly chosen actions 
    def get_reward(self,actions):
        self.Q = np.zeros(self.six_armed_bandit)# Q value for average reward of the arm
        self.values = np.zeros(self.six_armed_bandit)
        # For 10 uniform  actions each arm rewards computing and best arm and highest reward is calculated
        for i in range(10):
            b_arm = np.random.randint(0,self.six_armed_bandit)
            self.values[b_arm] = self.values[b_arm] + 1
            reward_a = self.select_action(b_arm)
            self.Q[b_arm] += (reward_a - self.Q[b_arm])/(self.values[b_arm]+1)
        choose_arm = np.argmax(self.Q)
        choose_rew = self.Q[choose_arm]
        total_rewards = np.sum(self.Q)
        print("Chosen best arm , reward = ",choose_arm ,",", choose_rew)
        print("Total rewards = ",total_rewards)
        print("Expected Rewards of the each arm:",self.Q)#Expected Rewards
        print("Action values: ",self.values)
    
   
  

In [5]:
n=10 #definition for 10 actions.
counts = Environment(six_armed_bandit)
print('Sample average reward for 10 uniformly chosen actions:')
counts.get_reward(n)

Sample average reward for 10 uniformly chosen actions:
Chosen best arm , reward =  3 , 2.57332394128811
Total rewards =  7.246450869393876
Expected Rewards of the each arm: [ 0.87125767 -0.27085348  1.77488223  2.57332394  2.44661128 -0.14877077]
Action values:  [2. 1. 2. 1. 2. 2.]


#  1.3

In [6]:
# Observations are  made from 4000 actions on the test bed
# And each action is repeated 100 times to produce an outcome
iters = 4000
times = 100
epsilon = 0.1
class Bandit(object):
    def __init__(self, params):
        self.params = params #Receives the intervals for each arm
        self.six_armed_bandit = len(params[:,0]) #Number of six_armed_bandit
  # Return a random uniform number from the selected arm
    def select_action(self, bandit_val):
        value_estimate = np.random.uniform(self.params[bandit_val,0],self.params[bandit_val,1])
        return value_estimate
#Choose an arm and calculate the percentage of being chosen of that arm
    def select_arm(self):
        per_selection= 100*(self.values/np.sum(self.values[:]))
        print("\nThe selected arm percentage: ",per_selection)
        print("Each arm has rewards: ",self.Q)
        print("Action values: ",self.values)
            
    #Update the arm with reward values and use epsilon policy
    def update_values(self, actions, cur_reward, epsilon):
        self.Q = np.zeros(self.six_armed_bandit)
        self.values = np.zeros(self.six_armed_bandit)
        for i in range(actions):
            #choose best arm
            if np.random.random() > epsilon:
                b_arm = np.argmax(self.Q)
            else:
                #otherwise testing all arms for the selection
                b_arm = np.random.randint(0,self.six_armed_bandit)
            #Updating  rewards values for the chosen best arm     
            self.values[b_arm] =  self.values[b_arm] + 1
            reward = self.select_action(b_arm)
            self.Q[b_arm] += (reward - self.Q[b_arm])/(self.values[b_arm]+1)
            
            if ((i + 1) % cur_reward) == 0:
                self.select_arm()

#Show the expected values wiht using epsilon greedy policy              
counts = Bandit(six_armed_bandit)
counts.update_values(iters,times,epsilon)             


The selected arm percentage:  [10.  2. 79.  4.  5.  0.]
Each arm has rewards:  [1.94307563 1.24158781 3.33089824 1.75812043 2.74541289 0.        ]
Action values:  [10.  2. 79.  4.  5.  0.]

The selected arm percentage:  [ 5.   1.  86.   3.5  3.5  1. ]
Each arm has rewards:  [ 1.94307563  1.24158781  3.41293103  1.4170426   2.9165833  -0.05734874]
Action values:  [ 10.   2. 172.   7.   7.   2.]

The selected arm percentage:  [ 4.          0.66666667 89.          2.33333333  3.          1.        ]
Each arm has rewards:  [2.07196797 1.24158781 3.45786575 1.4170426  3.04698617 0.37995936]
Action values:  [ 12.   2. 267.   7.   9.   3.]

The selected arm percentage:  [ 3.25  1.25 90.    2.25  2.5   0.75]
Each arm has rewards:  [2.05177694 2.7627659  3.48378006 1.59135454 3.13044977 0.37995936]
Action values:  [ 13.   5. 360.   9.  10.   3.]

The selected arm percentage:  [ 3.4  1.4 90.   2.4  2.   0.8]
Each arm has rewards:  [2.04966773 2.38770117 3.52501036 1.84984312 3.13044977 0.386422

#  1.4

In [7]:
#intialize values
iters = 4000
times = 100
epsilon = 0.1
alpha = 0.01

class Action(object):
    def __init__(self, params):
        self.params = params #Receives the intervals for each arm
        self.six_armed_bandit = len(params[:,0]) #Number of six_armed_bandit
 # Return a random uniform number from the selected arm
    def select_action(self, bandit_val):
        value_estimate = np.random.uniform(self.params[bandit_val,0],self.params[bandit_val,1])
        return value_estimate
#Choose an arm and calculate the percentage of being chosen of that arm
    def select_arm(self):
        per_selection= 100*(self.values/np.sum(self.values[:]))
        print("\nThe selected arm percentage: ",per_selection)
        print("Each arm has rewards: ",self.Q)
        print("Action values: ",self.values)
#Computes expectation of the reward with learning rate  
    def update_sample(self,actions,cur_reward,epsilon,alpha):
        self.Q = np.zeros(self.six_armed_bandit)
        self.values = np.zeros(self.six_armed_bandit)
        
        for i in range(actions):
            if np.random.random() > epsilon:
                # Select highest rate
                b_arm = np.argmax(self.Q)
            else:
                b_arm = np.random.randint(0,self.six_armed_bandit)
            self.values[b_arm] =  self.values[b_arm] + 1
            reward = self.select_action(b_arm)
    
             #Update expected reward with using alpha to learn the bandits
            self.Q[b_arm] += alpha * (reward - self.Q[b_arm])
            if ((i + 1) % cur_reward) == 0:
                self.select_arm()
            
            # do the experiment for 2000 steps for the fourth arm
            if i == 2000:
                self.params[2] = np.array([5,7])
                
#Show the values with using constant learning rate
counts = Action(six_armed_bandit)
counts.update_sample(iters,times,epsilon,alpha) 


The selected arm percentage:  [93.  1.  1.  0.  4.  1.]
Each arm has rewards:  [ 1.15184547  0.00471174  0.02819378  0.          0.13750244 -0.01133868]
Action values:  [93.  1.  1.  0.  4.  1.]

The selected arm percentage:  [90.5  2.5  1.5  1.   3.   1.5]
Each arm has rewards:  [ 1.68861701  0.12608289  0.12608817  0.00989437  0.20040206 -0.0278864 ]
Action values:  [181.   5.   3.   2.   6.   3.]

The selected arm percentage:  [90.          3.          1.66666667  1.          3.          1.33333333]
Each arm has rewards:  [ 1.84398740e+00  1.75995280e-01  1.91135441e-01 -3.98655568e-05
  2.94128594e-01 -4.16255628e-02]
Action values:  [270.   9.   5.   3.   9.   4.]

The selected arm percentage:  [90.5   2.25  2.    1.25  3.    1.  ]
Each arm has rewards:  [ 1.94948836  0.17599528  0.2653775   0.05226913  0.39311858 -0.04162556]
Action values:  [362.   9.   8.   5.  12.   4.]

The selected arm percentage:  [91.2  2.2  2.2  1.   2.6  0.8]
Each arm has rewards:  [ 1.91396804  0.19376


The selected arm percentage:  [63.43589744  1.71794872 29.8974359   1.71794872  1.64102564  1.58974359]
Each arm has rewards:  [1.92263759 1.01968634 5.96842069 0.95344771 1.657127   0.04216041]
Action values:  [2474.   67. 1166.   67.   64.   62.]

The selected arm percentage:  [61.9    1.725 31.45   1.75   1.625  1.55 ]
Each arm has rewards:  [1.92261605 1.11849169 6.0809134  0.96195116 1.67423021 0.04216041]
Action values:  [2476.   69. 1258.   70.   65.   62.]


#  1.5

In [8]:
#initialize values
iters = 4000
times = 100
epsilon = 0.1
alpha = 0.01

class Greedy(object):
    #initlize bandits
    def __init__(self, params):
        self.params = params #Receives the intervals for each arm
        self.six_armed_bandit = len(params[:,0]) #Number of six_armed_bandit
 # Return a random uniform number from the selected arm
    def select_action(self, bandit_val):
        value_estimate = np.random.uniform(self.params[bandit_val,0],self.params[bandit_val,1])
        return value_estimate

 #Computes expected Q value with using greedy strategy
    def greedy_expect(self):
        per_selection= 100*(self.values/np.sum(self.values[:]))
        print("\nThe selected arm percentage: ",per_selection)
        print("Each arm has rewards: ",self.new_Q)
        print("Action values:  ",self.values)
           
 #Define the Q value to 5 and update rewards wrt to Q.
    def update_greedy(self,actions,cur_reward,epsilon,alpha):
        self.new_Q = np.ones(self.six_armed_bandit)*5
        self.values = np.zeros(self.six_armed_bandit)
        #Run experiments
        for i in range(actions):
            if np.random.random() > epsilon:
                b_arm = np.argmax(self.new_Q)
            else:
                b_arm = np.random.randint(0,self.six_armed_bandit)
            self.values[b_arm] = self.values[b_arm] + 1
            reward = self.select_action(b_arm)
            # Add alpha  to Q
            self.new_Q[b_arm] += alpha * (reward - self.new_Q[b_arm])
            
            if ((i + 1) % cur_reward) == 0:
                self.greedy_expect()
            
            # do the experiment for 2000 steps for the fourth arm
            if i == 2000:
                self.params[2] = np.array([5,7])
                
#Show the expected values wiht using greedy policy
counts = Greedy(six_armed_bandit)
counts.update_greedy(iters,times,epsilon,alpha) 


The selected arm percentage:  [ 2.  1. 91.  1.  2.  3.]
Each arm has rewards:  [4.93495571 4.99684133 5.55142616 4.98585057 4.96175803 4.87650071]
Action values:   [ 2.  1. 91.  1.  2.  3.]

The selected arm percentage:  [ 2.   2.  88.   2.5  2.   3.5]
Each arm has rewards:  [4.86169204 4.88298495 5.83351962 4.87101918 4.9382364  4.70500385]
Action values:   [  4.   4. 176.   5.   4.   7.]

The selected arm percentage:  [ 1.66666667  1.66666667 88.33333333  2.          3.33333333  3.        ]
Each arm has rewards:  [4.84057183 4.83137957 6.04223646 4.85592071 4.86214034 4.58472086]
Action values:   [  5.   5. 265.   6.  10.   9.]

The selected arm percentage:  [ 1.5   1.25 89.25  2.    2.75  3.25]
Each arm has rewards:  [4.81132389 4.83137957 6.04014417 4.76369179 4.84893372 4.42201402]
Action values:   [  6.   5. 357.   8.  11.  13.]

The selected arm percentage:  [ 1.8  1.2 89.8  1.6  2.2  3.4]
Each arm has rewards:  [4.71583865 4.77389251 6.00387212 4.76369179 4.84893372 4.26490538