## Multi_armed bandit

note by myself:
Notice this problem is simpler than other RL problem, since it has no delayed reward. In other words, it is state-free. "the best action now is the best action always"

In [53]:
import tensorflow as tf 
import numpy as np

The lower the bandit number, the more likely a positive reward will be returned. We want our agent to learn to always choose the bandit that will give that positive reward.

### This rule is defined by ourselves and we can define any bandit to get higher reward given specific rule.

In [99]:
#bandits = [0.2,0,-0.2,-5]
bandits = [-0.9,-0.3,0,0.1,0.4]
num_bandits = len(bandits)
def pullBandit(bandit):
    #Get a random number.
    result = np.random.randn(1)
    if result > bandit:
        #return a positive reward.
        return 1
    else:
        #return a negative reward.
        return -1

In [100]:
tf.reset_default_graph()

#These two lines established the feed-forward part of the network. This does the actual choosing.
weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights,0)

#The next six lines establish the training proceedure. We feed the reward and chosen action into the network
#to compute the loss, and use it to update the network.
reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
responsible_weight = tf.slice(weights,action_holder,[1]) ## action_holder will be assigned the action from previous cal
## pick the slot in weight that slot = action
loss = -(tf.log(responsible_weight)*reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
update = optimizer.minimize(loss)

In [101]:
weight = tf.Variable(tf.ones(num_bandits))
ini = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(ini)
    #print weights.eval()
    a = sess.run(weights)
    print a 


[ 1.  1.  1.  1.  1.]


In [103]:
total_episodes = 100 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        if np.random.randn(1)<e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        reward = pullBandit(bandits[action])

        #update the network
        _,resp,ww = sess.run([update,responsible_weight,weights], feed_dict={reward_holder:[reward],action_holder:[action]})
        
        total_reward[action] += reward
        if i % 50 == 0:
            print "Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward)
        i+=1

print "The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising...."
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print "...and it was right!"
else:
    print "...and it was wrong!"

Running reward for the 5 bandits: [ 0.  0.  1.  0.  0.]
Running reward for the 5 bandits: [ 18.   0.   1.   4.   0.]
The agent thinks bandit 1 is the most promising....
...and it was right!


In [97]:
bandits = [0.2,0,-0.2,-5]
num_bandits = len(bandits)

init = tf.initialize_all_variables()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        
        #Choose either a random action or one from our network.
        if np.random.rand(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        
        reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.
        
        #Update the network.
        _,resp,ww = sess.run([update,responsible_weight,weights], feed_dict={reward_holder:[reward],action_holder:[action]})
        
        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print "Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward)
        i+=1
print "The agent thinks bandit " + str(np.argmax(ww)+1) + " is the most promising...."
if np.argmax(ww) == np.argmax(-np.array(bandits)):
    print "...and it was right!"
else:
    print "...and it was wrong!"

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Running reward for the 4 bandits: [-1.  0.  0.  0.]
Running reward for the 4 bandits: [ -3.  -1.   1.  12.]
Running reward for the 4 bandits: [ -5.  -3.   2.  57.]
Running reward for the 4 bandits: [  -4.   -4.    3.  104.]
Running reward for the 4 bandits: [  -3.   -4.    2.  150.]
Running reward for the 4 bandits: [  -4.   -5.    2.  198.]
Running reward for the 4 bandits: [  -4.   -4.    1.  240.]
Running reward for the 4 bandits: [  -3.   -5.    3.  284.]
Running reward for the 4 bandits: [  -3.   -5.    5.  328.]
Running reward for the 4 bandits: [  -3.   -5.    5.  376.]
Running reward for the 4 bandits: [  -3.   -5.    5.  426.]
Running reward for the 4 bandits: [  -3.   -6.    5.  475.]
Running reward for the 4 bandits: [  -2.   -6.    6.  523.]
Running reward for the 4 bandits: [  -3.   -7.    5.  568.]
Running reward for the 4 bandits: [  -3.   -6.    5.  615.]
Running reward for the 4 bandits: [  -2.  