In [5]:
import tensorflow as tf
import numpy as np
from collections import deque
import gym

In [6]:
# Start from an empty default graph so this cell can be re-run in the same
# kernel; without it, a second run fails in tf.get_variable because a variable
# named "W1" already exists in the graph.
tf.reset_default_graph()

env = gym.make('CartPole-v0')

# Constants defining our neural network
learning_rate = 1e-1
input_size = env.observation_space.shape[0]   # length of one observation vector
output_size = env.action_space.n              # number of discrete actions

# Batch of observations fed to the network, one row per state
X = tf.placeholder(tf.float32, [None, input_size], name="input_x")

# First (and only) layer of weights: a linear map from a state to one Q-value
# per action. tf.glorot_uniform_initializer() is the core-TensorFlow
# equivalent of the default tf.contrib.layers.xavier_initializer() and avoids
# depending on the deprecated contrib module.
W1 = tf.get_variable("W1", shape=[input_size, output_size],
                     initializer=tf.glorot_uniform_initializer())
Qpred = tf.matmul(X, W1)

# Target Q-values that the prediction is regressed towards
Y = tf.placeholder(shape=[None, output_size], dtype=tf.float32)

# Loss function: sum of squared errors between target and predicted Q-values
loss = tf.reduce_sum(tf.square(Y - Qpred))
# Learning: plain gradient descent on that loss
train = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

# Values for q learning
max_episodes = 1000   # upper bound on the number of training episodes
dis = 0.9             # discount applied to the next state's best Q-value
step_history = []     # number of steps taken in each finished episode


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.


In [7]:
# Setting up our environment
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# CartPole-v0 is registered with a 200-step episode limit, so an episode can
# finish either because the pole fell or because the limit was reached.
# Older gym releases may not expose the limit; in that case every finished
# episode is treated as a failure, as before.
step_limit = getattr(env.spec, "max_episode_steps", None)

# Average steps over the last 10 episodes that counts as "good enough".
# The previous threshold of 500 could never be reached under the 200-step
# limit, so the early exit was dead code. 195 is CartPole-v0's documented
# "solved" reward threshold (one reward point per step).
solved_avg_steps = 195

for episode in range(max_episodes):
    # Exploration rate: starts at 1 and decays as episodes go by
    e = 1. / ((episode / 10) + 1)
    step_count = 0
    state = env.reset()
    done = False

    # The Q-Network training
    while not done:
        step_count += 1
        x = np.reshape(state, [1, input_size])
        # Choose an action greedily (with e chance of a random action) from
        # the Q-network
        Q = sess.run(Qpred, feed_dict={X: x})
        if np.random.rand() < e:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q))

        # Get new state and reward from environment
        next_state, reward, done, _ = env.step(action)

        # Reaching the step limit is the best possible outcome, not a failure,
        # so it must not receive the terminal penalty.
        timed_out = step_limit is not None and step_count >= step_limit
        if done and not timed_out:
            # The pole fell (or the cart left the track): strongly penalise
            # the action that led here.
            Q[0, action] = -100
        else:
            x_next = np.reshape(next_state, [1, input_size])
            # Obtain the Q' values by feeding the new state through our network
            Q_next = sess.run(Qpred, feed_dict={X: x_next})
            Q[0, action] = reward + dis * np.max(Q_next)

        # Train the network on this single transition: only the entry for the
        # chosen action differs from the current prediction.
        sess.run(train, feed_dict={X: x, Y: Q})
        state = next_state

    step_history.append(step_count)
    print("Episode: {}  steps: {}".format(episode, step_count))
    # Stop early once the last 10 episodes average above the solved threshold
    if len(step_history) > 10 and np.mean(step_history[-10:]) > solved_avg_steps:
        break

# See our trained network in action: play one episode, always taking the
# action with the highest predicted Q-value (no exploration).
observation = env.reset()
reward_sum = 0
try:
    while True:
        env.render()

        x = np.reshape(observation, [1, input_size])
        Q = sess.run(Qpred, feed_dict={X: x})
        action = int(np.argmax(Q))

        observation, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            print("Total score: {}".format(reward_sum))
            break
finally:
    # Let the environment clean up whatever env.render() opened, even if the
    # episode is interrupted or rendering fails.
    env.close()

Episode: 0  steps: 75
Episode: 1  steps: 23
Episode: 2  steps: 12
Episode: 3  steps: 26
Episode: 4  steps: 14
Episode: 5  steps: 10
Episode: 6  steps: 11
Episode: 7  steps: 66
Episode: 8  steps: 76
Episode: 9  steps: 51
Episode: 10  steps: 37
Episode: 11  steps: 21
Episode: 12  steps: 100
Episode: 13  steps: 56
Episode: 14  steps: 12
Episode: 15  steps: 75
Episode: 16  steps: 27
Episode: 17  steps: 32
Episode: 18  steps: 57
Episode: 19  steps: 11
Episode: 20  steps: 46
Episode: 21  steps: 81
Episode: 22  steps: 22
Episode: 23  steps: 76
Episode: 24  steps: 19
Episode: 25  steps: 12
Episode: 26  steps: 34
Episode: 27  steps: 61
Episode: 28  steps: 26
Episode: 29  steps: 56
Episode: 30  steps: 36
Episode: 31  steps: 30
Episode: 32  steps: 63
Episode: 33  steps: 29
Episode: 34  steps: 21
Episode: 35  steps: 10
Episode: 36  steps: 44
Episode: 37  steps: 76
Episode: 38  steps: 74
Episode: 39  steps: 20
Episode: 40  steps: 10
Episode: 41  steps: 37
Episode: 42  steps: 40
Episode: 43  steps: 

Episode: 348  steps: 38
Episode: 349  steps: 47
Episode: 350  steps: 96
Episode: 351  steps: 75
Episode: 352  steps: 37
Episode: 353  steps: 31
Episode: 354  steps: 9
Episode: 355  steps: 10
Episode: 356  steps: 41
Episode: 357  steps: 187
Episode: 358  steps: 42
Episode: 359  steps: 45
Episode: 360  steps: 31
Episode: 361  steps: 25
Episode: 362  steps: 30
Episode: 363  steps: 10
Episode: 364  steps: 11
Episode: 365  steps: 10
Episode: 366  steps: 74
Episode: 367  steps: 53
Episode: 368  steps: 78
Episode: 369  steps: 147
Episode: 370  steps: 45
Episode: 371  steps: 33
Episode: 372  steps: 39
Episode: 373  steps: 10
Episode: 374  steps: 37
Episode: 375  steps: 52
Episode: 376  steps: 66
Episode: 377  steps: 44
Episode: 378  steps: 74
Episode: 379  steps: 83
Episode: 380  steps: 45
Episode: 381  steps: 39
Episode: 382  steps: 9
Episode: 383  steps: 61
Episode: 384  steps: 25
Episode: 385  steps: 56
Episode: 386  steps: 23
Episode: 387  steps: 10
Episode: 388  steps: 55
Episode: 389  st

Episode: 693  steps: 60
Episode: 694  steps: 37
Episode: 695  steps: 36
Episode: 696  steps: 71
Episode: 697  steps: 46
Episode: 698  steps: 10
Episode: 699  steps: 10
Episode: 700  steps: 52
Episode: 701  steps: 61
Episode: 702  steps: 76
Episode: 703  steps: 87
Episode: 704  steps: 50
Episode: 705  steps: 95
Episode: 706  steps: 45
Episode: 707  steps: 60
Episode: 708  steps: 70
Episode: 709  steps: 47
Episode: 710  steps: 57
Episode: 711  steps: 29
Episode: 712  steps: 39
Episode: 713  steps: 20
Episode: 714  steps: 38
Episode: 715  steps: 47
Episode: 716  steps: 55
Episode: 717  steps: 37
Episode: 718  steps: 31
Episode: 719  steps: 36
Episode: 720  steps: 33
Episode: 721  steps: 9
Episode: 722  steps: 58
Episode: 723  steps: 39
Episode: 724  steps: 51
Episode: 725  steps: 40
Episode: 726  steps: 63
Episode: 727  steps: 55
Episode: 728  steps: 84
Episode: 729  steps: 51
Episode: 730  steps: 54
Episode: 731  steps: 52
Episode: 732  steps: 45
Episode: 733  steps: 10
Episode: 734  ste