# The Contexual Bandits Problem

### Importing the libraries

In [1]:
import numpy as np

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow.python.framework import ops

Instructions for updating:
non-resource variables are not supported in the long term


### Creating the contextual bandits

In [18]:
class contextual_bandit():
    def __init__(self):
        self.state = 0
        
        # Defining the bandits
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits))
        return self.state
    
    def pullArm(self, action):
        
        bandit = self.bandits[self.state, action]
        
        result = np.random.rand(1)
        
        if result > bandit:
            # return a positive reward
            return 0.1
        else:
            # return a negative reward
            return -1

### Creating the policy based agent

In [19]:
class agent():
    def __init__(self, lr, s_size,a_size):
        # Feed forward network
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        state_encoded = tf.one_hot(self.state_in, s_size)
        
        output = tf.layers.dense(state_encoded,\
                                 a_size,\
                                 bias_initializer=None,\
                                 activation=tf.nn.sigmoid,\
                                 kernel_initializer=tf.ones_initializer())
        
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output,0)
        
        # Back propogation and updating the network
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)

        self.responsible_weight = tf.slice(self.output, self.action_holder,[1])
        self.loss = -(tf.log(self.responsible_weight) * self.reward_holder)
        
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)
        
        

### Training the agent

In [20]:
tf.reset_default_graph()

In [21]:
cBandit = contextual_bandit()
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions)
weights = tf.trainable_variables()[0]

In [22]:
total_episodes = 2000
total_reward = np.zeros([cBandit.num_bandits, cBandit.num_actions])
e = 0.1

In [23]:
init = tf.global_variables_initializer()

In [24]:
with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        
        s = cBandit.getBandit()
        
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action, feed_dict={myAgent.state_in:[s]})
            
        reward = cBandit.pullArm(action)
        
        # Update the network
        feed_dict={myAgent.reward_holder:[reward],\
                   myAgent.action_holder:[action],\
                   myAgent.state_in:[s]}
        
        _, ww = sess.run([myAgent.update, weights], feed_dict=feed_dict)
        
        # Update the running scores
        total_reward[s,action] += reward
        
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))
        i+=1
        

Mean reward for each of the 3 bandits: [0.    0.025 0.   ]
Mean reward for each of the 3 bandits: [  3.5    2.5  -38.85]
Mean reward for each of the 3 bandits: [  7.15    5.25  -85.275]
Mean reward for each of the 3 bandits: [  11.2      8.175 -123.475]


In [25]:
for a in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
          
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

The agent thinks action 2 for bandit 1 is the most promising....
...and it was wrong!
The agent thinks action 2 for bandit 2 is the most promising....
...and it was right!
The agent thinks action 1 for bandit 3 is the most promising....
...and it was right!
