In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.callbacks import TensorBoard

from time import sleep, time
import numpy as np
import random
import gym

In [None]:
# Environment
# Take the `.env` attribute of what gym.make returns
# (presumably the raw environment without gym's wrappers -- TODO confirm).
env = gym.make('CartPole-v1').env
inputCount = env.observation_space.shape[0]  # size of one observation vector
actionsCount = env.action_space.n            # number of discrete actions

# Neural Network
# Maps an observation to one value per action: two hidden ReLU layers of
# 24 units each, followed by a linear output layer.
model = keras.Sequential()
model.add(keras.layers.Dense(24, input_dim=inputCount, activation='relu'))
model.add(keras.layers.Dense(24, activation='relu'))
model.add(keras.layers.Dense(actionsCount, activation='linear'))

# TensorBoard callback writing to a directory named after the current timestamp.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

# Mean-squared-error loss, Adam optimizer, mean absolute error as a metric.
model.compile(
    loss='mse',
    optimizer=tf.train.AdamOptimizer(),
    metrics=['mae'],
)
# Load weights
#model.load_weights("weights.h5")

# Hyperparameters
gamma = 1.0            # weight applied to the best predicted next-state value
# epsilon: probability of taking a random action; it is multiplied by
# epsilonDecay after every episode while it is still above epsilonMin.
epsilon, epsilonMin, epsilonDecay = 1.0, 0.01, 0.999
episodes = 5000        # number of training episodes

# Memory (Remember & Replay)
memory = []            # stored (s, a, r, newS, done) transitions
batch_size = 64        # transitions sampled for each replay pass
memoryMax = 50000      # size at which the oldest transitions get dropped

# Training
# Every episode: act epsilon-greedily, apply a one-sample online Q update per
# step and store the transition; afterwards decay epsilon and replay a random
# minibatch drawn from `memory`.
for e in range(episodes):
    s = env.reset()
    s = np.array([s])  # add a leading batch axis for model.predict / model.fit

    # The counter is named `step` rather than `time`: binding `time` here would
    # overwrite the `time` function imported from the `time` module, so the
    # `time()` call that builds the TensorBoard log dir would fail on a re-run.
    for step in range(500):
        # Explore with probability epsilon, otherwise take the greedy action
        if np.random.rand() <= epsilon:
            a = random.randrange(actionsCount)
        else:
            a = np.argmax(model.predict(s))

        newS, r, done, _ = env.step(a)
        newS = np.array([newS])

        # Bootstrap from the next state only while the episode is still running;
        # a finished episode has no future value to add. This is the same rule
        # the replay pass below applies.
        target = r
        if not done:
            target = r + gamma * np.max(model.predict(newS))
        target_f = model.predict(s)[0]
        target_f[a] = target  # only the taken action's value is moved
        model.fit(s, target_f.reshape(-1, actionsCount), epochs=1, verbose=0, callbacks=[tensorboard])
        memory.append((s, a, r, newS, done))
        s = newS

        # free first items in memory (>= so an oversized buffer is trimmed too)
        if len(memory) >= memoryMax:
            del memory[:5000]

        if done:
            print("episode: {}/{}, score: {}".format(e, episodes, step))
            break

    if epsilon > epsilonMin:
        epsilon *= epsilonDecay

    # Replay memory
    if len(memory) > batch_size:
        minibatch = random.sample(memory, batch_size)
        # `terminal` (not `done`) so the episode loop's flag is not rebound here
        for state, action, reward, next_state, terminal in minibatch:
            target = reward
            if not terminal:
                target = reward + gamma * np.max(model.predict(next_state))

            target_f = model.predict(state)[0]
            target_f[action] = target
            model.fit(state, target_f.reshape(-1, actionsCount), epochs=1, verbose=0)


# Save weights
model.save_weights("weights.h5")

# Play game
print("\nPlaying Game...")
sleep(1)

# Render one episode acting greedily on the model's predictions.
# The rollout is capped at the same 500-step horizon the training loop uses:
# `env` is the `.env` attribute of gym.make's result (presumably without the
# time-limit wrapper -- TODO confirm), so a policy that never fails would
# otherwise keep this loop, and the cell, running forever.
maxPlaySteps = 500
playSteps = 0

s = env.reset()
done = False
while not done and playSteps < maxPlaySteps:
    env.render()
    # model.predict expects a batch, so wrap the single observation
    a = np.argmax(model.predict(np.array([s])))
    newS, r, done, _ = env.step(a)
    s = newS
    playSteps += 1

episode: 0/5000, score: 19
episode: 1/5000, score: 30
episode: 2/5000, score: 43
episode: 3/5000, score: 13
episode: 4/5000, score: 23
episode: 5/5000, score: 21
episode: 6/5000, score: 14
episode: 7/5000, score: 15
episode: 8/5000, score: 10
episode: 9/5000, score: 12
episode: 10/5000, score: 31
episode: 11/5000, score: 17
episode: 12/5000, score: 19
episode: 13/5000, score: 17
episode: 14/5000, score: 20
episode: 15/5000, score: 13
episode: 16/5000, score: 10
episode: 17/5000, score: 11
episode: 18/5000, score: 12
episode: 19/5000, score: 23
episode: 20/5000, score: 12
episode: 21/5000, score: 13
episode: 22/5000, score: 12
episode: 23/5000, score: 36
episode: 24/5000, score: 41
episode: 25/5000, score: 11
episode: 26/5000, score: 31
episode: 27/5000, score: 15
episode: 28/5000, score: 8
episode: 29/5000, score: 26


In [7]:
# Close the environment once the rendered episode is over
# (presumably this also tears down the render window -- confirm for the installed gym version).
env.close()