In [18]:
# %matplotlib inline
import gym
import numpy as np
from IPython.display import clear_output
import time
import matplotlib as plt

In [19]:
import tensorflow as tf
from tensorflow.keras import layers, models
from collections import deque
import random

In [20]:
class DQN(tf.keras.Model):
    """Small fully connected network with one linear output per action.

    Architecture: Dense(24, relu) -> Dense(24, relu) -> Dense(action_size, linear).
    The model carries its own Adam optimizer (learning rate 0.001), which
    ``loss`` uses to update the weights.
    """

    def __init__(self, action_size):
        """Build the three dense layers and the optimizer.

        Args:
            action_size: Number of units in the final linear layer.
        """
        super().__init__()
        self.dense_1 = layers.Dense(24, activation="relu")
        self.dense_2 = layers.Dense(24, activation="relu")
        self.dense_3 = layers.Dense(action_size, activation="linear")
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    def call(self, inputs):
        """Forward pass: feed ``inputs`` through the three layers in order."""
        return self.dense_3(self.dense_2(self.dense_1(inputs)))

    def loss(self, inputs, target):
        """Run one optimisation step towards ``target`` and return the loss.

        Note that, despite its name, this method does not merely evaluate a
        loss: it also computes gradients and applies them with the optimizer.

        Args:
            inputs: Batch fed through the network.
            target: Values the network output is regressed onto; must be
                broadcast-compatible with the network output.

        Returns:
            The scalar loss evaluated before the weight update: squared error
            summed over axis 1, then averaged over the remaining axis.
        """
        with tf.GradientTape() as tape:
            predictions = self(inputs)
            squared_error = tf.square(predictions - target)
            per_row_error = tf.reduce_sum(squared_error, axis=1)
            batch_loss = tf.reduce_mean(per_row_error)
        weights = self.trainable_weights
        gradients = tape.gradient(batch_loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))
        return batch_loss

In [21]:
# Replay buffer: a bounded deque, so once 2000 transitions are stored each new
# append discards the oldest one.
memory = deque(maxlen=2000)

# `.env` takes the environment wrapped inside the object gym.make returns
# (NOTE(review): presumably to drop the episode step limit -- confirm).
env = gym.make("CartPole-v1").env

# One network output per discrete action.
action_size = env.action_space.n
model = DQN(action_size)

In [22]:
# Baseline: play a single episode with uniformly sampled actions and report
# how many steps it lasted before the environment signalled `done`.
# The per-step sleep / clear_output calls only served the (disabled) frame
# rendering, so they are dropped together with the commented-out render code.
done = False
steps = 0
state = env.reset()

while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    steps += 1
print("Steps", steps)

Steps 27


In [23]:
# Training hyperparameters.
# Number of training episodes run by the main loop.
EPISODES = 500
# Initial probability of taking a random action instead of the greedy one.
# NOTE(review): this equals EPSILON_MIN, so the `epsilon > EPSILON_MIN` check
# in the training loop is never true and EPSILON_DECAY has no effect --
# confirm whether a larger starting value was intended.
EPSILON = .1
# Multiplicative factor applied to epsilon after each environment step while
# epsilon is still above EPSILON_MIN.
EPSILON_DECAY = 0.995
# Floor below which epsilon is no longer decayed.
EPSILON_MIN = 0.1
# Transitions sampled per train() call; also the minimum replay-memory size
# before training starts.
BATCH_SIZE = 32
# Discount applied to the best predicted next-state value in the target.
GAMMA = 0.95

In [24]:
def train():
    """Fit the global ``model`` on a random sample of stored transitions.

    ``BATCH_SIZE`` transitions are drawn from ``memory`` without replacement
    and handled one at a time; each one triggers its own ``model.loss`` call
    (and therefore its own weight update).

    Returns:
        The value returned by ``model.loss`` for the last sampled transition.
    """
    for obs, obs_next, chosen_action, step_reward, terminal in random.sample(memory, BATCH_SIZE):
        if terminal:
            # Episode ended here: nothing to bootstrap from.
            td_target = step_reward
        else:
            # Reward plus the discounted best prediction for the next state.
            td_target = step_reward + GAMMA * np.max(model(obs_next)[0])

        # Start from the current predictions and overwrite only the entry of
        # the action that was actually taken.
        targets = model(obs).numpy()
        targets[0][chosen_action] = td_target
        last_loss = model.loss(obs, targets)
    return last_loss

In [17]:
# Main loop: act epsilon-greedily, store every transition in the replay
# memory, and fit the model on a sampled minibatch after each step.
epsilon = EPSILON
for i in range(1, EPISODES + 1):
    # New episode; the observation is reshaped to a single-row batch.
    state = np.reshape(env.reset(), [1, -1])
    done = False
    steps = 0
    while not done:
        p = np.random.rand()
        if p >= epsilon:
            # Greedy choice: action with the highest predicted value.
            q_values = model(state)[0]
            action = np.argmax(q_values)
        else:
            # Exploration: uniformly sampled action.
            action = env.action_space.sample()

        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, -1])
        memory.append((state, next_state, action, reward, done))
        state = next_state
        steps += 1

        if done:
            print("episodes : {}/{}, score: {}, e: {:.2}".format(i, EPISODES, steps, epsilon))

        # Decay is applied per environment step, not per episode.
        if epsilon > EPSILON_MIN:
            epsilon *= EPSILON_DECAY

        # Train only once the memory can supply a full minibatch.
        if len(memory) >= BATCH_SIZE:
            loss = train()

episodes : 1/500, score: 438, e: 0.1
episodes : 2/500, score: 108, e: 0.1
episodes : 3/500, score: 214, e: 0.1
episodes : 4/500, score: 160, e: 0.1
episodes : 5/500, score: 183, e: 0.1
episodes : 6/500, score: 84, e: 0.1
episodes : 7/500, score: 175, e: 0.1
episodes : 8/500, score: 11, e: 0.1
episodes : 9/500, score: 12, e: 0.1
episodes : 10/500, score: 34, e: 0.1
episodes : 11/500, score: 140, e: 0.1
episodes : 12/500, score: 226, e: 0.1
episodes : 13/500, score: 330, e: 0.1
episodes : 14/500, score: 260, e: 0.1
episodes : 15/500, score: 196, e: 0.1
episodes : 16/500, score: 281, e: 0.1
episodes : 17/500, score: 169, e: 0.1
episodes : 18/500, score: 176, e: 0.1
episodes : 19/500, score: 179, e: 0.1
episodes : 20/500, score: 152, e: 0.1
episodes : 21/500, score: 141, e: 0.1
episodes : 22/500, score: 129, e: 0.1
episodes : 23/500, score: 144, e: 0.1
episodes : 24/500, score: 130, e: 0.1
episodes : 25/500, score: 108, e: 0.1
episodes : 26/500, score: 64, e: 0.1
episodes : 27/500, score: 

KeyboardInterrupt: 