In [718]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Reshape
from tensorflow.keras import optimizers
import numpy as np

In [719]:
class contextual_bandit():
    """Contextual bandit environment with several bandits (states) and arms.

    ``bandits`` is indexed as ``[bandit, arm]``. Pulling an arm draws one
    standard-normal sample and pays +1 when the sample exceeds the stored
    value for that arm, otherwise -1, so lower stored values pay out more often.
    """

    def __init__(self):
        # Per-arm values, one row per bandit. The lowest value in each row marks
        # the best arm: arms 4, 2 and 1 for bandits 1, 2 and 3 respectively.
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        self.num_bandits, self.num_actions = self.bandits.shape
        # Index of the bandit currently in play; refreshed by getBandit().
        self.state = 0

    def getBandit(self):
        """Select a bandit uniformly at random, store it as the current state and return its index."""
        chosen = np.random.randint(0, len(self.bandits))
        self.state = chosen
        return chosen

    def pullArm(self,action):
        """Pull arm ``action`` of the current bandit and return the reward (+1 or -1)."""
        threshold = self.bandits[self.state,action]
        draw = np.random.randn(1)
        # A draw above the arm's value is a win, anything else a loss.
        return 1 if draw > threshold else -1

In [720]:
cBandit = contextual_bandit() #Load the bandits.

lr=0.0001 #Learning rate handed to the optimizer.
s_size=cBandit.num_bandits #Number of states (one per bandit).
a_size=cBandit.num_actions #Number of arms per bandit.

# Drop any graph/layer state left over from a previous run of this cell.
tf.keras.backend.clear_session()

# The training loop feeds each state as an already one-hot encoded float row of
# length s_size, so the input is declared as float32 (it goes straight into a
# Dense layer) rather than as an integer index.
state_in = Input(batch_shape=[1, s_size], dtype=tf.float32)
# NOTE(review): reward_holder is not connected to the model in the visible cells;
# kept only in case a later cell still refers to it.
reward_holder = Input(batch_shape=[1], dtype=tf.float32)

# One sigmoid-squashed value per arm for the given state. Without a bias and with
# a one-hot input, each kernel row holds the pre-sigmoid values for one bandit.
W = Dense(a_size, kernel_initializer='random_normal', activation='sigmoid', use_bias=False)(state_in)
# Greedy arm according to the network (the training loop recomputes this with NumPy).
chosen_action = tf.keras.backend.argmax(W, 1)
# Model output: log of the largest per-arm value. The loss multiplies this by the reward.
expected_reward = tf.math.log(tf.reduce_max(W, 1))

@tf.function
def agent_loss(expected_reward, reward_holder):
    """Return the negated element-wise product of the two arguments as float32.

    Keras calls a loss as ``loss(y_true, y_pred)``. When this function is given
    to ``model.compile``, the first argument therefore receives the training
    target (the reward) and the second the model output, which is the reverse of
    what the parameter names suggest. The product is commutative, so the value
    is the same either way.
    """
    first = tf.cast(expected_reward, tf.float32)
    second = tf.cast(reward_holder, tf.float32)
    return -(first * second)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
sgd = optimizers.SGD(learning_rate=lr)
# The model maps a one-hot state to the log of its largest per-arm value.
model = tf.keras.Model(inputs=state_in, outputs=expected_reward)
model.summary()
model.compile(loss=agent_loss,  optimizer=sgd)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(1, 3)]                  0         
_________________________________________________________________
dense (Dense)                (1, 4)                    12        
_________________________________________________________________
tf_op_layer_Max_1 (TensorFlo [(1,)]                    0         
_________________________________________________________________
tf_op_layer_Log (TensorFlowO [(1,)]                    0         
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________


In [721]:
total_episodes = 20000 #Set total number of episodes to train agent on.
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

# One-hot rows for every state, built once instead of on every episode.
state_one_hots = np.identity(s_size)

for i in range(total_episodes):
    s = cBandit.getBandit() #Get a state from the environment.
    s_oh = state_one_hots[s:s+1] #Shape (1, s_size): a batch holding the single current state.

    #Choose either a random action or one from our network.
    #Both branches yield a plain int so `action` has one consistent type downstream.
    if np.random.rand() < e:
        action = np.random.randint(cBandit.num_actions)
    else:
        ww = model.get_weights()[0]
        action = int(np.argmax(np.matmul(s_oh, ww)))
    reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.

    #Update the network.
    # NOTE(review): the model output is the log of the max over arms, so this step
    # always adjusts the currently greedy arm, even when `action` was an exploratory
    # pull of a different arm. Crediting the pulled arm needs the action passed into
    # the loss, which means changing the model definition as well as this loop.
    l = model.train_on_batch(x=s_oh, y=np.array([reward]))

    #Update our running tally of scores.
    total_reward[s, action] += reward
    if i % 500 == 0:
        # The tally is cumulative, so the value printed here (the per-bandit average
        # over arms of the summed rewards) grows with the number of episodes.
        print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))

# Learned kernel: one row per bandit, one column per arm.
ww = model.get_weights()[0]

# Compare the arm the agent rates highest with the truly best arm (the lowest
# value in the environment's row) for every bandit.
for bandit_idx, (learned_row, true_row) in enumerate(zip(ww, cBandit.bandits)):
    favourite = np.argmax(learned_row)
    print("The agent thinks action " + str(favourite+1) + " for bandit " + str(bandit_idx+1) + " is the most promising....")
    verdict = "...and it was right!" if favourite == np.argmin(true_row) else "...and it was wrong!"
    print(verdict)

Mean reward for each of the 3 bandits: [ 0.    0.   -0.25]
Mean reward for each of the 3 bandits: [ -0.75  35.75 -44.75]
Mean reward for each of the 3 bandits: [ -5.5   70.   -83.75]
Mean reward for each of the 3 bandits: [  -8.5   106.75 -122.5 ]
Mean reward for each of the 3 bandits: [-17.25 140.5  -97.5 ]
Mean reward for each of the 3 bandits: [-21.75 176.75 -61.75]
Mean reward for each of the 3 bandits: [-19.5  219.5  -28.75]
Mean reward for each of the 3 bandits: [-24.5  256.25  11.5 ]
Mean reward for each of the 3 bandits: [-29.75 294.25  45.75]
Mean reward for each of the 3 bandits: [-31.5  329.    84.75]
Mean reward for each of the 3 bandits: [-41.25 364.75 118.25]
Mean reward for each of the 3 bandits: [-42.   401.5  153.75]
Mean reward for each of the 3 bandits: [-49.   436.25 191.  ]
Mean reward for each of the 3 bandits: [-55.5  472.   228.25]
Mean reward for each of the 3 bandits: [-56.25 507.75 262.75]
Mean reward for each of the 3 bandits: [-59.75 543.25 298.25]
Mean rew