In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
import gym
import numpy as np

# Game

In [38]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # one seed shared by every RNG seeded in this cell

# Create the environment, then seed NumPy's global RNG and the environment
# with the same value so the two cannot silently drift apart.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Network input shape: a leading axis of size 1 followed by the shape of a
# single observation. NOTE(review): the leading 1 presumably has to match the
# window_length=1 given to the replay memory further down -- confirm.
shape_observations = (1,) + env.observation_space.shape

# Number of discrete actions; used as the width of the network's output layer.
n_actions = env.action_space.n

[2017-11-14 17:24:30,344] Making new env: CartPole-v0


# Model - Keras

In [39]:
# Q-network: flatten the (1, obs_dim) input, pass it through three ReLU hidden
# layers of 16 units each, and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=shape_observations),
    Dense(16, activation="relu"),
    Dense(16, activation="relu"),
    Dense(16, activation="relu"),
    Dense(n_actions, activation="linear"),
])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_8 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_29 (Dense)             (None, 16)                80        
_________________________________________________________________
dense_30 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_31 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 34        
Total params: 658
Trainable params: 658
Non-trainable params: 0
_________________________________________________________________
None


# Q-learning agent (DQN)

In [50]:
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Action-selection policy used by the agent.
# NOTE(review): going by the class name this is Boltzmann (softmax-style)
# exploration over Q-values, unrelated to the Bellman equation -- confirm
# against the keras-rl documentation.
policy = BoltzmannQPolicy()

# Replay memory; `limit` is the size of the memory.
# NOTE(review): window_length=1 presumably means each state is built from a
# single observation, which lines up with the leading 1 in the model's input
# shape -- confirm against the keras-rl documentation.
memory = SequentialMemory(window_length=1, limit=50000)

dqn = DQNAgent(
    nb_actions=n_actions,      # size of the action space
    model=model,               # the Keras network defined above
    policy=policy,
    memory=memory,
    target_model_update=0.01,  # soft update: (1 - target_model_update) * old + target_model_update * new
    nb_steps_warmup=10,        # number of steps before training starts
)

# Adam optimizer; 'mae' (mean absolute error) is tracked as an extra metric.
dqn.compile(optimizer=Adam(lr=0.001), metrics=['mae'])

In [51]:
# Train the agent on the environment for 50000 steps (verbose=1), then write
# the weights to a file named after the environment, replacing any existing
# file with that name.
dqn.fit(env=env, nb_steps=50000, verbose=1)
dqn.save_weights(filepath='dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

Training for 50000 steps ...
Interval 1 (0 steps performed)
   11/10000 [..............................] - ETA: 197s - reward: 1.0000



65 episodes - episode_reward: 151.969 [12.000, 200.000] - loss: 8.980 - mean_absolute_error: 39.763 - mean_q: 79.758

Interval 2 (10000 steps performed)
52 episodes - episode_reward: 193.885 [151.000, 200.000] - loss: 5.213 - mean_absolute_error: 41.832 - mean_q: 83.811

Interval 3 (20000 steps performed)
50 episodes - episode_reward: 199.200 [177.000, 200.000] - loss: 7.448 - mean_absolute_error: 39.446 - mean_q: 78.826

Interval 4 (30000 steps performed)
51 episodes - episode_reward: 195.627 [78.000, 200.000] - loss: 8.280 - mean_absolute_error: 38.534 - mean_q: 76.913

Interval 5 (40000 steps performed)
done, took 403.190 seconds


In [53]:
# Run the trained agent for 100 episodes with rendering turned off.
# The trailing semicolon keeps the returned History object's repr out of the
# cell output; the per-episode lines are still printed by test() itself.
dqn.test(env, nb_episodes=100, visualize=False);


Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

<keras.callbacks.History at 0x1a1bcc1d30>

In [None]:
dqn.