In [1]:
import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer as xinit

In [2]:
import numpy as np
from collections import deque
import random

In [3]:
import gym

## Build Q(s,a)

- input : state
- output : Q-values for each action

In [4]:
state_size = 4
action_size = 2
hdim = 24

In [5]:
class DQNAgent():
    
    def __init__(self, state_size=4, action_size=2, hdim=24, epsilon=1.):
        
        self.epsilon = epsilon
        self.action_size = action_size
        self.experience = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.01
        
        tf.reset_default_graph()
        self.state_ = tf.placeholder(tf.float32, shape=[state_size], name='states')
        self.target_ = tf.placeholder(tf.float32, shape=[action_size], name='targets')
        W1 = tf.get_variable('W1', dtype=tf.float32,
                             shape=[state_size, hdim], initializer=xinit() )
        b1 = tf.get_variable('b1', dtype=tf.float32,
                             shape=[hdim], initializer=xinit() )
        h = tf.nn.relu(tf.matmul(
            tf.expand_dims(self.state_, axis=0), W1) + b1)
        W2 = tf.get_variable('W2', dtype=tf.float32,
                             shape=[hdim, action_size], initializer=xinit() )
        b2 = tf.get_variable('b2', dtype=tf.float32,
                             shape=[action_size], initializer=xinit() )
        self.q_out = tf.reshape(tf.matmul(h, W2) + b2, [action_size])
        self.loss = tf.reduce_mean(tf.pow(self.q_out - self.target_, 2))
        self.update = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(self.loss)
        
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        

    def get_action(self, s):
        if np.random.rand() <= self.epsilon:
            return np.random.randint(0, self.action_size)
        
        q_values = self.sess.run(self.q_out, feed_dict = {
            self.state_ : s.reshape([state_size])
        })
        
        return np.argmax(q_values)
    
    def remember(self, s0, a0, r0, s1, done):
        self.experience.append((s0,a0,r0,s1,done))
        
    def exp_replay(self, batch_size):
        batch = random.sample(self.experience, batch_size)
        for s,a,r,s1,done in batch:
            s = s.reshape([state_size])
            target = r
            if not done:
                q1 = self.sess.run(self.q_out, feed_dict = {
                    self.state_ : s1
                })
                q1_max = np.max(q1)
                target = r + (self.gamma * q1_max)
            q = self.sess.run(self.q_out, feed_dict = {
                self.state_ : s
            })
            q[a] = target
            self.sess.run(self.update, feed_dict = {
                self.state_ : s,
                self.target_ : q
            })
            
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

In [6]:
env = gym.make('CartPole-v0')

[2017-06-05 15:14:20,218] Making new env: CartPole-v0


In [7]:
agent = DQNAgent()

In [8]:
episodes = 2500
for e in range(episodes):
    
    s = env.reset()
    
    for t in range(500):
        a = agent.get_action(s)
        s1, r, done, _ = env.step(a)
        # remember
        agent.remember(s,a,r,s1,done)
        # update state
        s = s1
        
        if done:
            print("episode: {}/{}, score: {}".format(e, episodes, t))
            break
    
    batch_size = 32
    if len(agent.experience) > batch_size :
        agent.exp_replay(32)        

episode: 0/2500, score: 10
episode: 1/2500, score: 18
episode: 2/2500, score: 8
episode: 3/2500, score: 26
episode: 4/2500, score: 13
episode: 5/2500, score: 14
episode: 6/2500, score: 15
episode: 7/2500, score: 9
episode: 8/2500, score: 14
episode: 9/2500, score: 9
episode: 10/2500, score: 10
episode: 11/2500, score: 156
episode: 12/2500, score: 9
episode: 13/2500, score: 9
episode: 14/2500, score: 8
episode: 15/2500, score: 17
episode: 16/2500, score: 9
episode: 17/2500, score: 28
episode: 18/2500, score: 11
episode: 19/2500, score: 79
episode: 20/2500, score: 21
episode: 21/2500, score: 17
episode: 22/2500, score: 41
episode: 23/2500, score: 9
episode: 24/2500, score: 8
episode: 25/2500, score: 15
episode: 26/2500, score: 11
episode: 27/2500, score: 11
episode: 28/2500, score: 10
episode: 29/2500, score: 8
episode: 30/2500, score: 7
episode: 31/2500, score: 8
episode: 32/2500, score: 107
episode: 33/2500, score: 20
episode: 34/2500, score: 8
episode: 35/2500, score: 11
episode: 36/2

episode: 290/2500, score: 136
episode: 291/2500, score: 109
episode: 292/2500, score: 88
episode: 293/2500, score: 104
episode: 294/2500, score: 124
episode: 295/2500, score: 106
episode: 296/2500, score: 114
episode: 297/2500, score: 68
episode: 298/2500, score: 119
episode: 299/2500, score: 199
episode: 300/2500, score: 113
episode: 301/2500, score: 133
episode: 302/2500, score: 184
episode: 303/2500, score: 118
episode: 304/2500, score: 147
episode: 305/2500, score: 11
episode: 306/2500, score: 108
episode: 307/2500, score: 69
episode: 308/2500, score: 102
episode: 309/2500, score: 97
episode: 310/2500, score: 176
episode: 311/2500, score: 109
episode: 312/2500, score: 165
episode: 313/2500, score: 102
episode: 314/2500, score: 148
episode: 315/2500, score: 121
episode: 316/2500, score: 20
episode: 317/2500, score: 65
episode: 318/2500, score: 92
episode: 319/2500, score: 85
episode: 320/2500, score: 24
episode: 321/2500, score: 48
episode: 322/2500, score: 41
episode: 323/2500, sco

episode: 574/2500, score: 39
episode: 575/2500, score: 37
episode: 576/2500, score: 41
episode: 577/2500, score: 94
episode: 578/2500, score: 85
episode: 579/2500, score: 37
episode: 580/2500, score: 58
episode: 581/2500, score: 70
episode: 582/2500, score: 109
episode: 583/2500, score: 57
episode: 584/2500, score: 55
episode: 585/2500, score: 77
episode: 586/2500, score: 35
episode: 587/2500, score: 49
episode: 588/2500, score: 149
episode: 589/2500, score: 87
episode: 590/2500, score: 58
episode: 591/2500, score: 78
episode: 592/2500, score: 58
episode: 593/2500, score: 100
episode: 594/2500, score: 70
episode: 595/2500, score: 97
episode: 596/2500, score: 114
episode: 597/2500, score: 132
episode: 598/2500, score: 199
episode: 599/2500, score: 78
episode: 600/2500, score: 75
episode: 601/2500, score: 81
episode: 602/2500, score: 89
episode: 603/2500, score: 109
episode: 604/2500, score: 103
episode: 605/2500, score: 169
episode: 606/2500, score: 123
episode: 607/2500, score: 199
epi

episode: 853/2500, score: 140
episode: 854/2500, score: 99
episode: 855/2500, score: 88
episode: 856/2500, score: 154
episode: 857/2500, score: 145
episode: 858/2500, score: 171
episode: 859/2500, score: 114
episode: 860/2500, score: 199
episode: 861/2500, score: 199
episode: 862/2500, score: 132
episode: 863/2500, score: 114
episode: 864/2500, score: 139
episode: 865/2500, score: 199
episode: 866/2500, score: 174
episode: 867/2500, score: 141
episode: 868/2500, score: 152
episode: 869/2500, score: 77
episode: 870/2500, score: 110
episode: 871/2500, score: 199
episode: 872/2500, score: 145
episode: 873/2500, score: 86
episode: 874/2500, score: 113
episode: 875/2500, score: 145
episode: 876/2500, score: 16
episode: 877/2500, score: 63
episode: 878/2500, score: 81
episode: 879/2500, score: 94
episode: 880/2500, score: 14
episode: 881/2500, score: 124
episode: 882/2500, score: 102
episode: 883/2500, score: 199
episode: 884/2500, score: 104
episode: 885/2500, score: 135
episode: 886/2500, 

episode: 1131/2500, score: 62
episode: 1132/2500, score: 74
episode: 1133/2500, score: 71
episode: 1134/2500, score: 15
episode: 1135/2500, score: 106
episode: 1136/2500, score: 12
episode: 1137/2500, score: 116
episode: 1138/2500, score: 96
episode: 1139/2500, score: 12
episode: 1140/2500, score: 105
episode: 1141/2500, score: 11
episode: 1142/2500, score: 12
episode: 1143/2500, score: 199
episode: 1144/2500, score: 67
episode: 1145/2500, score: 89
episode: 1146/2500, score: 78
episode: 1147/2500, score: 92
episode: 1148/2500, score: 114
episode: 1149/2500, score: 108
episode: 1150/2500, score: 97
episode: 1151/2500, score: 119
episode: 1152/2500, score: 97
episode: 1153/2500, score: 9
episode: 1154/2500, score: 14
episode: 1155/2500, score: 91
episode: 1156/2500, score: 82
episode: 1157/2500, score: 89
episode: 1158/2500, score: 13
episode: 1159/2500, score: 80
episode: 1160/2500, score: 78
episode: 1161/2500, score: 67
episode: 1162/2500, score: 12
episode: 1163/2500, score: 77
epis

episode: 1405/2500, score: 104
episode: 1406/2500, score: 194
episode: 1407/2500, score: 9
episode: 1408/2500, score: 16
episode: 1409/2500, score: 7
episode: 1410/2500, score: 20
episode: 1411/2500, score: 82
episode: 1412/2500, score: 13
episode: 1413/2500, score: 199
episode: 1414/2500, score: 108
episode: 1415/2500, score: 108
episode: 1416/2500, score: 123
episode: 1417/2500, score: 131
episode: 1418/2500, score: 15
episode: 1419/2500, score: 19
episode: 1420/2500, score: 115
episode: 1421/2500, score: 20
episode: 1422/2500, score: 16
episode: 1423/2500, score: 84
episode: 1424/2500, score: 11
episode: 1425/2500, score: 8
episode: 1426/2500, score: 199
episode: 1427/2500, score: 138
episode: 1428/2500, score: 154
episode: 1429/2500, score: 11
episode: 1430/2500, score: 150
episode: 1431/2500, score: 179
episode: 1432/2500, score: 10
episode: 1433/2500, score: 106
episode: 1434/2500, score: 197
episode: 1435/2500, score: 121
episode: 1436/2500, score: 121
episode: 1437/2500, score:

episode: 1674/2500, score: 87
episode: 1675/2500, score: 199
episode: 1676/2500, score: 150
episode: 1677/2500, score: 132
episode: 1678/2500, score: 160
episode: 1679/2500, score: 183
episode: 1680/2500, score: 163
episode: 1681/2500, score: 142
episode: 1682/2500, score: 134
episode: 1683/2500, score: 172
episode: 1684/2500, score: 199
episode: 1685/2500, score: 199
episode: 1686/2500, score: 130
episode: 1687/2500, score: 92
episode: 1688/2500, score: 92
episode: 1689/2500, score: 125
episode: 1690/2500, score: 101
episode: 1691/2500, score: 140
episode: 1692/2500, score: 101
episode: 1693/2500, score: 199
episode: 1694/2500, score: 127
episode: 1695/2500, score: 162
episode: 1696/2500, score: 134
episode: 1697/2500, score: 118
episode: 1698/2500, score: 138
episode: 1699/2500, score: 160
episode: 1700/2500, score: 199
episode: 1701/2500, score: 112
episode: 1702/2500, score: 135
episode: 1703/2500, score: 102
episode: 1704/2500, score: 105
episode: 1705/2500, score: 111
episode: 17

episode: 1941/2500, score: 157
episode: 1942/2500, score: 199
episode: 1943/2500, score: 133
episode: 1944/2500, score: 184
episode: 1945/2500, score: 188
episode: 1946/2500, score: 199
episode: 1947/2500, score: 15
episode: 1948/2500, score: 130
episode: 1949/2500, score: 91
episode: 1950/2500, score: 116
episode: 1951/2500, score: 155
episode: 1952/2500, score: 125
episode: 1953/2500, score: 199
episode: 1954/2500, score: 177
episode: 1955/2500, score: 119
episode: 1956/2500, score: 128
episode: 1957/2500, score: 135
episode: 1958/2500, score: 110
episode: 1959/2500, score: 101
episode: 1960/2500, score: 199
episode: 1961/2500, score: 129
episode: 1962/2500, score: 166
episode: 1963/2500, score: 199
episode: 1964/2500, score: 162
episode: 1965/2500, score: 173
episode: 1966/2500, score: 150
episode: 1967/2500, score: 199
episode: 1968/2500, score: 199
episode: 1969/2500, score: 199
episode: 1970/2500, score: 148
episode: 1971/2500, score: 167
episode: 1972/2500, score: 142
episode: 1

episode: 2211/2500, score: 199
episode: 2212/2500, score: 52
episode: 2213/2500, score: 50
episode: 2214/2500, score: 116
episode: 2215/2500, score: 11
episode: 2216/2500, score: 67
episode: 2217/2500, score: 40
episode: 2218/2500, score: 23
episode: 2219/2500, score: 55
episode: 2220/2500, score: 62
episode: 2221/2500, score: 99
episode: 2222/2500, score: 24
episode: 2223/2500, score: 65
episode: 2224/2500, score: 62
episode: 2225/2500, score: 12
episode: 2226/2500, score: 78
episode: 2227/2500, score: 15
episode: 2228/2500, score: 12
episode: 2229/2500, score: 114
episode: 2230/2500, score: 20
episode: 2231/2500, score: 116
episode: 2232/2500, score: 11
episode: 2233/2500, score: 57
episode: 2234/2500, score: 11
episode: 2235/2500, score: 46
episode: 2236/2500, score: 23
episode: 2237/2500, score: 16
episode: 2238/2500, score: 15
episode: 2239/2500, score: 70
episode: 2240/2500, score: 59
episode: 2241/2500, score: 8
episode: 2242/2500, score: 103
episode: 2243/2500, score: 129
episo

KeyboardInterrupt: 

In [12]:
avg_reward = 0

for i in range(100):
    
    s = env.reset()
    
    for t in range(500):
        a = agent.get_action(s)
        s1, r, done, _ = env.step(a)
        # remember
        #agent.remember(s,a,r,s1,done)
        # update state
        s = s1
        if done:
            avg_reward += t
            break
print(avg_reward/100)

129.44
