# Deep Q Learning

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from collections import deque
import random

import gym

In [2]:
RANDOM_SEED = 40

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)

In [3]:
env = gym.make("CartPole-v0")

a_size = env.action_space.n
s_size = env.observation_space.shape[0]
print("Action space size: {}".format(a_size))
print("State space size: {}".format(s_size))

possible_actions = np.identity(a_size)

Action space size: 2
State space size: 4


In [4]:
class DQNetwork(object):
    def __init__(self, s_size, a_size):
        self.s_size = s_size
        self.a_size = a_size
        
        self.states = tf.placeholder(shape=[None, self.s_size], dtype=tf.float32)
        self.dense = tf.layers.dense(inputs=self.states, units=20, activation=tf.nn.tanh)
        self.Qout = tf.layers.dense(inputs=self.dense, units=self.a_size)
        self.predict = tf.argmax(self.Qout, 1)

        self.Qtarget = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None ,self.a_size], dtype=tf.float32)
        Q = tf.reduce_sum(tf.multiply(self.Qout, self.action), axis=1)
        self.loss = tf.reduce_mean(tf.square(self.Qtarget - Q))
        trainer = tf.train.AdamOptimizer(learning_rate=0.01)
        self.optimize = trainer.minimize(self.loss)

In [5]:
tf.reset_default_graph()

network = DQNetwork(s_size, a_size)
init = tf.global_variables_initializer()

In [6]:
sess = tf.Session()
sess.run(init)

In [7]:
gamma = 0.99
n_steps = 2000
e = 1
e_decay = 0.995
e_min = 0.1
num_episodes = 300
batch_size = 40

rlist = []
experience = deque(maxlen=2000)

for episode in range(num_episodes):
    s = env.reset()
    r_total = 0
    done = False
    
    while not done:
        if np.random.rand(1) < e:
            a_ind = env.action_space.sample()
        else:
            a_ind = sess.run(network.predict, feed_dict={network.states: [s]})[0]
        s1, r, done, _ = env.step(a_ind)
        
        experience.append((s, possible_actions[a_ind], r, s1, done))
        
        r_total += r
        s = s1
        if done:
            if e > e_min:
                e *= e_decay
            if episode % 10 == 0:
                print("EPIDOSE {:0>5}: {}".format(episode, np.mean(rlist[-10:-1])))
        
        if len(experience) > batch_size:
            minibatch = random.sample(experience, batch_size)
            states_mb = np.array([i[0] for i in minibatch])
            actions_mb = np.array([i[1] for i in minibatch])
            rewards_mb = np.array([i[2] for i in minibatch])
            next_states_mb = np.array([i[3] for i in minibatch])
            dones_mb = np.array([i[4] for i in minibatch])
            
            Qtarget = []
            Qnext_state = sess.run(network.Qout, feed_dict={network.states: next_states_mb})
            for i in range(batch_size):
                target = rewards_mb[i]
                if not dones_mb[i]:
                    target += gamma * np.max(Qnext_state[i])
                Qtarget.append(target)
            loss, _ = sess.run([network.loss, network.optimize], feed_dict={network.states: states_mb,
                                                                           network.Qtarget: Qtarget,
                                                                           network.action: actions_mb})

    rlist.append(r_total)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


EPIDOSE 00000: nan
EPIDOSE 00010: 16.444444444444443
EPIDOSE 00020: 19.333333333333332
EPIDOSE 00030: 36.55555555555556
EPIDOSE 00040: 33.888888888888886
EPIDOSE 00050: 45.0
EPIDOSE 00060: 42.0
EPIDOSE 00070: 40.77777777777778
EPIDOSE 00080: 61.44444444444444
EPIDOSE 00090: 45.666666666666664
EPIDOSE 00100: 47.888888888888886
EPIDOSE 00110: 43.77777777777778
EPIDOSE 00120: 71.66666666666667
EPIDOSE 00130: 105.44444444444444
EPIDOSE 00140: 132.44444444444446
EPIDOSE 00150: 128.33333333333334
EPIDOSE 00160: 94.55555555555556
EPIDOSE 00170: 55.333333333333336
EPIDOSE 00180: 78.22222222222223
EPIDOSE 00190: 107.0
EPIDOSE 00200: 67.88888888888889
EPIDOSE 00210: 51.111111111111114
EPIDOSE 00220: 114.66666666666667
EPIDOSE 00230: 129.66666666666666
EPIDOSE 00240: 119.0
EPIDOSE 00250: 108.44444444444444
EPIDOSE 00260: 95.0
EPIDOSE 00270: 89.88888888888889
EPIDOSE 00280: 144.44444444444446
EPIDOSE 00290: 130.55555555555554


In [10]:
s = env.reset()
# for i in range(3): env.step(0)
r_total = 0
done = False
while True:
    env.render()
    a = sess.run(network.predict, feed_dict={network.states: [s]})[0]
    s, r, done, _ = env.step(a)
    r_total += r
    #print(done)
    if done == True:
        print(r_total)
        break

167.0


In [11]:
env.close()