In [73]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Input, Dense, Embedding
import numpy as np

In [133]:
# Reward thresholds, one per bandit (arm). pullBandit pays +1 when a
# standard-normal draw exceeds the threshold, so a lower threshold means a
# more generous arm. Bandit 4 (index 3) has the lowest threshold and is
# therefore the one that most often provides a positive reward.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
num_bandits = len(bandits)
def pullBandit(bandit):
    """Pull one bandit arm and return its reward.

    Args:
        bandit: Numeric threshold of the arm being pulled.

    Returns:
        1 when a fresh standard-normal draw is greater than ``bandit``,
        otherwise -1.
    """
    # A single draw from the standard normal distribution decides the outcome.
    draw = np.random.randn(1)
    # Positive reward if the draw beats the threshold, negative reward otherwise.
    return 1 if draw > bandit else -1

In [134]:
tf.keras.backend.clear_session()

# The whole "network" is a lookup table: feeding a bandit index returns the
# single learned weight of that bandit.
action_holder = Input(shape=(1,))
# Not wired into the model; kept so the name stays defined. Rewards reach the
# loss as the `y` argument of train_on_batch instead.
reward_holder = Input(shape=(1,))
# Start every weight at 1 so that log(weight) in the loss is finite. The
# default uniform initializer draws values around zero, including negative
# ones whose log is NaN.
W = Embedding(num_bandits, 1, embeddings_initializer="ones")(action_holder)

@tf.function
def m_loss(y_true, y_pred):
    """Policy-gradient loss ``-log(weight) * reward`` for the pulled bandit.

    Keras invokes a custom loss as ``loss(y_true, y_pred)``, so the first
    argument is the target passed as ``y`` (the reward) and the second is the
    model output (the weight looked up for the chosen action).

    Args:
        y_true: Rewards, one per sample.
        y_pred: Model outputs, one weight per sample.

    Returns:
        A 1-D tensor with one loss value per sample.
    """
    # Flatten both tensors to (batch,) so each reward multiplies only the
    # weight of its own sample instead of a sum over the whole batch.
    reward = tf.reshape(tf.cast(y_true, y_pred.dtype), [-1])
    responsible_weight = tf.reshape(y_pred, [-1])
    return -(tf.math.log(responsible_weight) * reward)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
sgd = optimizers.SGD(learning_rate=0.001)

model = tf.keras.Model(inputs=action_holder, outputs=W)
model.summary()
model.compile(loss=m_loss, optimizer=sgd)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 1, 1)              4         
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________


In [135]:
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_bandits) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.


def bandit_weights():
    """Return the model's current weight for every bandit as a 1-D float array.

    All bandit indices are scored in one batched forward pass. NaN weights are
    replaced by -1.0 so that a NaN entry cannot win the argmax against
    weights greater than -1.0.
    """
    indices = np.arange(num_bandits).reshape(-1, 1)
    raw = np.asarray(model.predict_on_batch(indices), dtype=float).reshape(-1)
    # `nan` must be given by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, not the replacement value.
    return np.nan_to_num(raw, nan=-1.0)


for i in range(total_episodes):

    #Choose either a random action or one from our network.
    if np.random.rand(1) < e:
        action = np.random.randint(num_bandits)
    else:
        action = int(np.argmax(bandit_weights()))

    reward = pullBandit(bandits[action]) #Get our reward from picking one of the bandits.

    #Update the network with this single (action, reward) sample.
    model.train_on_batch(x=np.array([[action]]), y=np.array([reward], dtype=np.float64))
    #Update our running tally of scores.
    total_reward[action] += reward
    if i % 50 == 0:
        print("Running reward for the " + str(num_bandits) + " bandits: " + str(total_reward))

#Judge the agent by its final learned weights rather than by the last action
#taken, which may have been a random exploratory pull.
weights = bandit_weights()
best_bandit = int(np.argmax(weights))
print("The agent thinks bandit " + str(best_bandit+1) + " is the most promising....")

#The truly best bandit is the one with the lowest threshold.
if best_bandit == np.argmax(-np.array(bandits)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Running reward for the 4 bandits: [0. 0. 1. 0.]
Running reward for the 4 bandits: [ 1. -1.  1. 44.]
Running reward for the 4 bandits: [ 2. -2.  1. 92.]
Running reward for the 4 bandits: [  1.  -1.   2. 139.]
Running reward for the 4 bandits: [  1.  -2.   3. 181.]
Running reward for the 4 bandits: [  2.  -1.   2. 228.]
Running reward for the 4 bandits: [  1.  -1.   3. 276.]
Running reward for the 4 bandits: [  1.  -3.   3. 324.]
Running reward for the 4 bandits: [  1.  -6.   2. 370.]
Running reward for the 4 bandits: [  0.  -6.   3. 418.]
Running reward for the 4 bandits: [ -1.  -5.   4. 463.]
Running reward for the 4 bandits: [ -2.  -6.   5. 508.]
Running reward for the 4 bandits: [ -3.  -7.   8. 551.]
Running reward for the 4 bandits: [ -4.  -9.   7. 597.]
Running reward for the 4 bandits: [ -5.  -7.   7. 642.]
Running reward for the 4 bandits: [ -4.  -6.   8. 685.]
Running reward for the 4 bandits: [ -6.  -7.   7. 729.]
Running reward for the 4 bandits: [ -5.  -7.   6. 777.]
Running 