In [1]:
import numpy as np
import tut_basics as tb

# 1 Multi-Armed Bandits

In [2]:
# activate a particular bandit and collect reward
def arm_pull(bandit_id):
    """Activate one of the three bandits and return the reward it gives.

    Parameters
    ----------
    bandit_id : int
        Which bandit to pull: 1, 2, or 3.

    Returns
    -------
    The value returned by the matching ``tb.bandit1`` / ``tb.bandit2`` /
    ``tb.bandit3`` call.

    Raises
    ------
    ValueError
        If ``bandit_id`` is not 1, 2, or 3.
    """
    if bandit_id == 1:
        return tb.bandit1()
    if bandit_id == 2:
        return tb.bandit2()
    if bandit_id == 3:
        return tb.bandit3()
    # An unknown id used to print a message and then fail on an unassigned
    # `reward`; fail loudly with the offending value instead.
    raise ValueError(
        "No bandit corresponding to {!r}; expected 1, 2, or 3".format(bandit_id)
    )

In [13]:
## try a single pull: bandit_id may be 1, 2, or 3
reward = arm_pull(bandit_id=3)
print(reward)

1


### Goal: find the arm that will give the greatest return

#### 1.1 Collect data samples for each arm. 

In [14]:
n_samples = 1000

# R[k] accumulates every reward observed from bandit k+1
R = [[], [], []]
# each round pulls bandits 1, 2, 3 in turn, so the draws are interleaved
for _ in range(n_samples):
    for bandit_id, arm_log in enumerate(R, start=1):
        arm_log.append(arm_pull(bandit_id))

# show the collected rewards: one row per round, one column per bandit
np.array(R).T

array([[ 0,  1,  1],
       [ 1,  1,  1],
       [ 1,  1,  1],
       ...,
       [ 0, 10,  1],
       [ 1, 10, 10],
       [ 1,  1,  1]])

#### 1.2 Compute the expected return for each arm

Recall $E(R) = \sum_r r \cdot p(r)$

In [15]:
def _reward_distribution(samples):
    """Return ``(values, proportions)`` for one arm's logged rewards.

    ``values`` is the sorted array of distinct rewards found in ``samples``;
    ``proportions`` is a list of floats giving the fraction of samples equal
    to each value, in the same order. Works for any number of distinct
    rewards rather than assuming exactly two.
    """
    values, counts = np.unique(samples, return_counts=True)
    # .tolist() keeps the proportions a plain list of Python floats
    return values, (counts / len(samples)).tolist()


# get the distinct reward values each arm gave, and how often each one
# occurred out of that arm's total pulls; the relative frequency is used
# as an estimate of the probability of receiving that reward
rewards1, props1 = _reward_distribution(R[0])
rewards2, props2 = _reward_distribution(R[1])
rewards3, props3 = _reward_distribution(R[2])
print("Reward Values: ", rewards1, rewards2, rewards3)
print("Reward Freqencies: ", props1, props2, props3)

Reward Values:  [0 1] [ 1 10] [ 1 10]
Reward Freqencies:  [0.489, 0.511] [0.708, 0.292] [0.904, 0.096]


In [16]:
## Expected return of each arm: sum over reward values of value * estimated probability
E1, E2, E3 = (
    np.dot(values, probs)
    for values, probs in (
        (rewards1, props1),
        (rewards2, props2),
        (rewards3, props3),
    )
)
print("Expected Vaule of Arm 1: ", E1)
print("Expected Vaule of Arm 2: ", E2)
print("Expected Vaule of Arm 3: ", E3)

Expected Vaule of Arm 1:  0.511
Expected Vaule of Arm 2:  3.628
Expected Vaule of Arm 3:  1.8639999999999999


#### What happens if you increase your number of samples? 

----------------------------------------------------------------