# Policy/P-Value/Q network 


In this notebook, we'll build a neural network that can learn to play games through reinforcement learning. More specifically, we'll use Q-learning to train an agent to play a game called [Cart-Pole](https://gym.openai.com/envs/CartPole-v0). In this game, a freely swinging pole is attached to a cart. The cart can move to the left and right, and the goal is to keep the pole upright as long as possible.

![Cart-Pole](assets/cart-pole.jpg)

We can simulate this game using [OpenAI Gym](https://gym.openai.com/). First, let's check out how OpenAI Gym works. Then, we'll get into training an agent to play the Cart-Pole game.

In [53]:
# In this one we should define and detect GPUs for tensorflow
# GPUs or CPU
import tensorflow as tf

# Check TensorFlow Version
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.7.1
Default GPU Device: 


>**Note:** Make sure you have OpenAI Gym cloned into the same directory with this notebook. I've included `gym` as a submodule, so you can run `git submodule --init --recursive` to pull the contents into the `gym` repo.

##### >**Note:** Make sure you have OpenAI Gym cloned. Then run this command `pip install -e gym/[all]`.

In [54]:
import numpy as np
import gym

## Create the Cart-Pole game environment
env = gym.make('CartPole-v0')
# env = gym.make('CartPole-v1')
# env = gym.make('Acrobot-v1')
# env = gym.make('MountainCar-v0')
# env = gym.make('Pendulum-v0')
# env = gym.make('Blackjack-v0')
# env = gym.make('FrozenLake-v0')
# env = gym.make('AirRaid-ram-v0')
# env = gym.make('AirRaid-v0')
# env = gym.make('BipedalWalker-v2')
# env = gym.make('Copy-v0')
# env = gym.make('CarRacing-v0')
# env = gym.make('Ant-v2') #mujoco
# env = gym.make('FetchPickAndPlace-v1') # mujoco required!

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m




In [57]:
# Data of the model
def model_input(state_size):
    states = tf.placeholder(tf.float32, [None, state_size], name='states')
    actions = tf.placeholder(tf.int32, [None], name='actions')
    rewards = tf.placeholder(tf.float32, [None], name='rewards') # env rewards
    return states, actions, rewards

In [58]:
# Generator/Controller: Generating/prediting the actions
def generator(states, action_size, hidden_size, reuse=False, alpha=0.1, training=False):
    with tf.variable_scope('generator', reuse=reuse):
        # First fully connected layer
        h1 = tf.layers.dense(inputs=states, units=hidden_size)
        bn1 = tf.layers.batch_normalization(h1, training=training)        
        nl1 = tf.maximum(alpha * bn1, bn1)
        
        # Second fully connected layer
        h2 = tf.layers.dense(inputs=nl1, units=hidden_size)
        bn2 = tf.layers.batch_normalization(h2, training=training)        
        nl2 = tf.maximum(alpha * bn2, bn2)
        
        # Output layer
        logits = tf.layers.dense(inputs=nl2, units=action_size)        
        #predictions = tf.nn.softmax(logits)

        # return actions logits
        return logits

In [47]:
def model_loss(action_size, hidden_size, # model init
               states, actions, rewards): # model input 
    # policy learning/gradient
    actions_logits = generator(states=states, hidden_size=hidden_size, action_size=action_size)
    actions_labels = tf.one_hot(indices=actions, depth=action_size, dtype=actions_logits.dtype)
    log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=actions_logits,labels=actions_labels)        
    g_loss = tf.reduce_mean(-log_prob * rewards)
    # total_reward = tf.reduce_sum(rewards)
    # #g_loss = tf.reduce_mean(-log_prob * total_reward)
    # g_loss = -tf.reduce_mean(log_prob) * total_reward
    
    # Returning/outputing actions, sum of rewards, rewards, and loss
    return actions_logits, g_loss

In [48]:
# Optimizating/training/learning G & D
def model_opt(g_loss, learning_rate):
    """
    Get optimization operations in order
    :param g_loss: Generator loss Tensor for action prediction
    :param learning_rate: Learning Rate Placeholder
    :return: A tuple of (qfunction training, generator training, discriminator training)
    """
    # Get weights and bias to update
    t_vars = tf.trainable_variables()
    g_vars = [var for var in t_vars if var.name.startswith('generator')]

    # Optimize
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)): # Required for batchnorm (BN)
        g_opt = tf.train.AdamOptimizer(learning_rate).minimize(g_loss, var_list=g_vars)

    return g_opt

In [49]:
class Model:
    def __init__(self, state_size, action_size, hidden_size, learning_rate):

        # Data of the Model: make the data available inside the framework
        self.states, self.actions, self.rewards = model_input(state_size=state_size)
        
        # Create the Model: calculating the loss and forwad pass
        self.actions_logits, self.g_loss = model_loss(
            action_size=action_size, hidden_size=hidden_size, # model init 
            actions=self.actions, states=self.states, rewards=self.rewards) # model input
        
        # Update the model: backward pass and backprop
        self.g_opt = model_opt(g_loss=self.g_loss, learning_rate=learning_rate)

In [50]:
# Training parameters
train_episodes = 1000          # max number of episodes to learn from
max_steps = 300000000          # max steps in an episode
learning_rate = 0.001          # learning rate for adam

# Network parameters
state_size = 4                 # number of units for the input state/observation -- simulation
action_size = 2                # number of units for the output actions -- simulation
hidden_size = 64               # number of units in each Q-network hidden layer -- simulation

In [51]:
# Reset/init the graph/session
tf.reset_default_graph()

# Init the model
model = Model(action_size=action_size, hidden_size=hidden_size, state_size=state_size, learning_rate=learning_rate)

In [52]:
# Now train with experiences
saver = tf.train.Saver()

# Env reward, action logits reward, action labels/onehot reward
rewards_list, g_loss_list = [], []

# TF session for training
with tf.Session() as sess:
    
    # Rstoring/loading the trained/learned controller SA/SM
    sess.run(tf.global_variables_initializer())
    
    # Training episodes/epochs
    for ep in range(train_episodes):
        state = env.reset() # env first state
        batch = [] # every data batch
        total_reward = 0 # reward env

        # Training steps/batches
        for _ in range(max_steps): # start=0, step=1, stop=max_steps/done/reward
            action_logits = sess.run([model.actions_logits], feed_dict={model.states: np.reshape(state, [1, -1])})
            action = np.argmax(action_logits)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward # env
            #reward *= 1- float(done)
            #action *= 1- float(done)
            batch.append([state, action, reward])
            state = next_state
            if done is True:
                break
                
        #batch = memory.buffer
        states = np.array([each[0] for each in batch])
        actions = np.array([each[1] for each in batch])
        rewards = np.array([each[2] for each in batch])
        g_loss, _ = sess.run([model.g_loss, model.g_opt], feed_dict = {model.states: states, 
                                                                       model.actions: actions, 
                                                                       model.rewards:rewards})
        print('Episode: {}'.format(ep),
              'total_reward: {}'.format(total_reward),
              'g_loss: {:.4f}'.format(g_loss))
        rewards_list.append([ep, total_reward])
        g_loss_list.append([ep, g_loss])
        
    # Save the trained/learned D using G/SM/SA
    # At the end of all training episodes/epochs
    saver.save(sess, 'checkpoints/model-pqa.ckpt')
    #saver.save(sess, tf.train.latest_checkpoint('checkpoints'))

Episode: 0 total_reward: 57.0 g_loss: -39.3278
Episode: 1 total_reward: 16.0 g_loss: -10.7552
Episode: 2 total_reward: 10.0 g_loss: -6.7420
Episode: 3 total_reward: 52.0 g_loss: -35.5407
Episode: 4 total_reward: 9.0 g_loss: -6.1677
Episode: 5 total_reward: 10.0 g_loss: -6.7558
Episode: 6 total_reward: 52.0 g_loss: -35.4415
Episode: 7 total_reward: 62.0 g_loss: -42.2541
Episode: 8 total_reward: 13.0 g_loss: -8.9347
Episode: 9 total_reward: 15.0 g_loss: -10.2967
Episode: 10 total_reward: 23.0 g_loss: -15.7841
Episode: 11 total_reward: 11.0 g_loss: -7.5391
Episode: 12 total_reward: 11.0 g_loss: -7.5236
Episode: 13 total_reward: 9.0 g_loss: -6.1566
Episode: 14 total_reward: 9.0 g_loss: -6.1648
Episode: 15 total_reward: 9.0 g_loss: -6.1987
Episode: 16 total_reward: 13.0 g_loss: -8.9696
Episode: 17 total_reward: 13.0 g_loss: -8.9781
Episode: 18 total_reward: 12.0 g_loss: -8.2777
Episode: 19 total_reward: 14.0 g_loss: -9.6631
Episode: 20 total_reward: 31.0 g_loss: -21.3199
Episode: 21 total_r

Episode: 172 total_reward: 83.0 g_loss: -57.3418
Episode: 173 total_reward: 21.0 g_loss: -14.4621
Episode: 174 total_reward: 17.0 g_loss: -11.7058
Episode: 175 total_reward: 19.0 g_loss: -13.0811
Episode: 176 total_reward: 17.0 g_loss: -11.6812
Episode: 177 total_reward: 11.0 g_loss: -7.5610
Episode: 178 total_reward: 12.0 g_loss: -8.2200
Episode: 179 total_reward: 9.0 g_loss: -6.1486
Episode: 180 total_reward: 9.0 g_loss: -6.1416
Episode: 181 total_reward: 12.0 g_loss: -8.2260
Episode: 182 total_reward: 13.0 g_loss: -8.9213
Episode: 183 total_reward: 11.0 g_loss: -7.5635
Episode: 184 total_reward: 16.0 g_loss: -11.0138
Episode: 185 total_reward: 21.0 g_loss: -14.4865
Episode: 186 total_reward: 14.0 g_loss: -9.6492
Episode: 187 total_reward: 13.0 g_loss: -8.9614
Episode: 188 total_reward: 14.0 g_loss: -9.6622
Episode: 189 total_reward: 14.0 g_loss: -9.6833
Episode: 190 total_reward: 10.0 g_loss: -6.8624
Episode: 191 total_reward: 47.0 g_loss: -32.4691
Episode: 192 total_reward: 22.0 g_

Episode: 345 total_reward: 39.0 g_loss: -26.9898
Episode: 346 total_reward: 40.0 g_loss: -27.6560
Episode: 347 total_reward: 45.0 g_loss: -31.1134
Episode: 348 total_reward: 50.0 g_loss: -34.5112
Episode: 349 total_reward: 17.0 g_loss: -11.7367
Episode: 350 total_reward: 10.0 g_loss: -6.8965
Episode: 351 total_reward: 14.0 g_loss: -9.6521
Episode: 352 total_reward: 11.0 g_loss: -7.5836
Episode: 353 total_reward: 14.0 g_loss: -9.6552
Episode: 354 total_reward: 15.0 g_loss: -10.3463
Episode: 355 total_reward: 18.0 g_loss: -12.4233
Episode: 356 total_reward: 14.0 g_loss: -9.6696
Episode: 357 total_reward: 23.0 g_loss: -15.8773
Episode: 358 total_reward: 23.0 g_loss: -15.8827
Episode: 359 total_reward: 20.0 g_loss: -13.8155
Episode: 360 total_reward: 8.0 g_loss: -5.5079
Episode: 361 total_reward: 9.0 g_loss: -6.1893
Episode: 362 total_reward: 19.0 g_loss: -13.1337
Episode: 363 total_reward: 26.0 g_loss: -17.9593
Episode: 364 total_reward: 28.0 g_loss: -19.3406
Episode: 365 total_reward: 35

Episode: 516 total_reward: 180.0 g_loss: -124.4924
Episode: 517 total_reward: 139.0 g_loss: -96.2525
Episode: 518 total_reward: 9.0 g_loss: -6.2227
Episode: 519 total_reward: 11.0 g_loss: -7.5868
Episode: 520 total_reward: 10.0 g_loss: -6.8858
Episode: 521 total_reward: 12.0 g_loss: -8.3012
Episode: 522 total_reward: 149.0 g_loss: -103.1240
Episode: 523 total_reward: 11.0 g_loss: -7.6116
Episode: 524 total_reward: 10.0 g_loss: -6.9003
Episode: 525 total_reward: 8.0 g_loss: -5.5246
Episode: 526 total_reward: 9.0 g_loss: -6.2217
Episode: 527 total_reward: 11.0 g_loss: -7.6152
Episode: 528 total_reward: 14.0 g_loss: -9.6934
Episode: 529 total_reward: 43.0 g_loss: -29.7670
Episode: 530 total_reward: 43.0 g_loss: -29.7599
Episode: 531 total_reward: 22.0 g_loss: -15.2300
Episode: 532 total_reward: 15.0 g_loss: -10.3900
Episode: 533 total_reward: 12.0 g_loss: -8.3104
Episode: 534 total_reward: 10.0 g_loss: -6.9199
Episode: 535 total_reward: 11.0 g_loss: -7.6147
Episode: 536 total_reward: 22.0

Episode: 694 total_reward: 32.0 g_loss: -22.1238
Episode: 695 total_reward: 61.0 g_loss: -42.2383
Episode: 696 total_reward: 29.0 g_loss: -20.0571
Episode: 697 total_reward: 22.0 g_loss: -15.1812
Episode: 698 total_reward: 25.0 g_loss: -17.2531
Episode: 699 total_reward: 21.0 g_loss: -14.4743
Episode: 700 total_reward: 20.0 g_loss: -13.7896
Episode: 701 total_reward: 15.0 g_loss: -10.3395
Episode: 702 total_reward: 17.0 g_loss: -11.7213
Episode: 703 total_reward: 12.0 g_loss: -8.2719
Episode: 704 total_reward: 11.0 g_loss: -7.5891
Episode: 705 total_reward: 12.0 g_loss: -8.2720
Episode: 706 total_reward: 10.0 g_loss: -6.8915
Episode: 707 total_reward: 10.0 g_loss: -6.8919
Episode: 708 total_reward: 9.0 g_loss: -6.2029
Episode: 709 total_reward: 11.0 g_loss: -7.5962
Episode: 710 total_reward: 12.0 g_loss: -8.2882
Episode: 711 total_reward: 11.0 g_loss: -7.6038
Episode: 712 total_reward: 16.0 g_loss: -11.0672
Episode: 713 total_reward: 9.0 g_loss: -6.2097
Episode: 714 total_reward: 9.0 g

Episode: 873 total_reward: 9.0 g_loss: -6.1806
Episode: 874 total_reward: 12.0 g_loss: -8.2982
Episode: 875 total_reward: 9.0 g_loss: -6.1963
Episode: 876 total_reward: 10.0 g_loss: -6.9186
Episode: 877 total_reward: 12.0 g_loss: -8.2863
Episode: 878 total_reward: 12.0 g_loss: -8.2867
Episode: 879 total_reward: 9.0 g_loss: -6.2153
Episode: 880 total_reward: 11.0 g_loss: -7.6068
Episode: 881 total_reward: 13.0 g_loss: -8.9940
Episode: 882 total_reward: 17.0 g_loss: -11.7753
Episode: 883 total_reward: 49.0 g_loss: -33.9224
Episode: 884 total_reward: 36.0 g_loss: -24.9142
Episode: 885 total_reward: 28.0 g_loss: -19.3659
Episode: 886 total_reward: 33.0 g_loss: -22.8190
Episode: 887 total_reward: 17.0 g_loss: -11.7396
Episode: 888 total_reward: 18.0 g_loss: -12.4415
Episode: 889 total_reward: 19.0 g_loss: -13.1258
Episode: 890 total_reward: 13.0 g_loss: -8.9822
Episode: 891 total_reward: 13.0 g_loss: -8.9807
Episode: 892 total_reward: 11.0 g_loss: -7.5984
Episode: 893 total_reward: 14.0 g_l

In [30]:
import matplotlib.pyplot as plt
%matplotlib inline

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / N 

In [None]:
eps, arr = np.array(rewards_list).T
smoothed_arr = running_mean(arr, 10)
plt.plot(eps[-len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Total env rewards')

In [None]:
eps, arr = np.array(g_loss_list).T
smoothed_arr = running_mean(arr, 10)
plt.plot(eps[-len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Gloss')