In [3]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [23]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [31]:
class ContextualBandit():
    def __init__(self):
        self.state = 0
        
        # Each bandit has 4 arms. We use 3 bandits.
        # The highest value in each column is the best arm for that bandit
        self.bandits = np.array([
                [0.2, 0.0, 0.0, -5.0],
                [0.1, -5.0, 1.0, 0.25],
                [-5.0, 5.0, 5.0, 5.0]
            ])
        
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
        
    def getBandit(self):
        # Return a random bandit. The bandit represents the state.
        # Basically we shut our eyes and wander over to a random
        # bandit which we will play.
        self.state = np.random.randint(0, self.num_bandits)
        return self.state

    def pullArm(self, action):
        chance = self.bandits[self.state, action]
        reward = 1 if np.random.randn(1) > chance else -1
        return reward
        
        
class agent():
    def __init__(self, learn_rate, state_size, action_size):
        # Create the feed-forward part of the neural network.
        # The agent is given a state as input and will produce
        # an action.
        
        # The network takes as input the state, which is an integer
        # The slim.one_hot_encoding function maps the input state
        # to a 1 in at the integer position in a one-hot vector
        self.state_in = tf.placeholder(dtype=tf.int32,
                                       shape=[1])
        state_in_one_hot = slim.one_hot_encoding(labels=self.state_in,
                                                 num_classes=state_size)
        
        output = slim.fully_connected(inputs=state_in_one_hot,
                                      num_outputs=action_size,
                                      biases_initializer=None,
                                      activation_fn=tf.nn.sigmoid,
                                      weights_initializer=tf.ones_initializer())
        
        self.output = tf.reshape(tensor=output,
                                 shape=[-1])
        
        self.chosen_action = tf.argmax(input=self.output,
                                       axis=0)
        
        # The next six lines establish the training procedure.
        # We feed the reward and chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(dtype=tf.float32,
                                            shape=[1])
        self.action_holder = tf.placeholder(dtype=tf.int32,
                                            shape=[1])
        self.responsible_weight = tf.slice(input_=self.output,
                                           begin=self.action_holder,
                                           size=[1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate)
        self.update = optimizer.minimize(self.loss)
        

In [41]:
tf.reset_default_graph()
# Create a contextual bandit
context_bandit = ContextualBandit()
# Load the agent
my_agent = agent(learn_rate=0.001,
                 state_size=context_bandit.num_bandits,
                 action_size=context_bandit.num_actions)

# Weights that we can use to look into the network
weights = tf.trainable_variables()[0]

# Total number of episodes to perform training
total_episodes = 10000

# Total rewards for each (bandit, arm)
total_reward = np.zeros([context_bandit.num_bandits,
                         context_bandit.num_actions])

# Chance of taking a random action
e = 0.1

# Get a TF Op that will initialise all the variables when run
init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    # Run initialiser Op
    sess.run(init)
    
    for ep in range(total_episodes):
        # Get a state from the environment. Here this means the
        # enumeration of a bandit
        state = context_bandit.getBandit()
        
        # Choose either a random action or use our network to produce
        # one
        if np.random.rand(1) < e:
            action = np.random.randint(context_bandit.num_actions)
        else:
            action = sess.run(my_agent.chosen_action,
                              feed_dict={my_agent.state_in:[state]})
            
        # Get the reward for the chosen action
        reward = context_bandit.pullArm(action)
        
        # Update the agent network
        feed_dict = {my_agent.reward_holder:[reward],
                     my_agent.action_holder:[action],
                     my_agent.state_in:[state]}
        _, ww = sess.run([my_agent.update, weights],
                         feed_dict=feed_dict)
        
        # Update our running score
        total_reward[state, action] += reward
        if ep % 500 == 0:
            print(np.mean(total_reward, axis=1))

[ 0.   -0.25  0.  ]
[43.5  35.25 34.  ]
[80.5  75.25 69.5 ]
[116.   112.5  107.25]
[154.5  149.25 145.  ]
[190.75 188.75 183.25]
[223.   228.25 215.  ]
[263.75 268.5  248.  ]
[297.25 309.25 285.25]
[337.25 344.75 321.25]
[377.25 378.5  361.5 ]
[417.5  416.75 401.  ]
[460.   451.25 435.5 ]
[499.75 487.   470.  ]
[538.5  522.25 509.5 ]
[576.75 560.25 546.25]
[614.   600.   583.25]
[650.75 640.25 617.75]
[692.25 676.   656.  ]
[731.25 714.75 692.75]


In [30]:
# Create a contextual bandit
context_bandit = ContextualBandit()

In [38]:
weights.value

<bound method Variable.value of <tf.Variable 'fully_connected/weights:0' shape=(3, 4) dtype=float32_ref>>