In [1]:
from DumbGame import DumbGameEnv
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [2]:
env = DumbGameEnv()
states = env.observation_space.shape
actions = env.action_space.n
print(f"States:{states} Actions:{actions}")

States:(1,) Actions:5


In [3]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [4]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=20000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=20, target_model_update=0.1)
    return dqn

In [5]:
model = build_model(states, actions)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 24)                48        
_________________________________________________________________
dense_1 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 125       
Total params: 773
Trainable params: 773
Non-trainable params: 0
_________________________________________________________________


In [6]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=0.1))#, metrics=['mae'])
history = dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)

Training for 5000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
    1/10000 [..............................] - ETA: 8:06 - reward: -1.0000





In [7]:
print(history.params)
print(history.history)

{'nb_steps': 5000}
{'episode_reward': [-16.0, -10.0, -8.0, -24.0, -26.0, -2.0, -8.0, -21.0, -1.0, -2.0, -16.0, 0.0, -31.0, -38.0, -23.0, -38.0, -21.0, -22.0, -4.0, -9.0, -19.0, -19.0, -20.0, -21.0, -8.0, -29.0, -26.0, -18.0, -22.0, -10.0, -17.0, -17.0, -19.0, -50.0, -31.0, -16.0, -4.0, -13.0, -23.0, -29.0, -18.0, -22.0, -21.0, -9.0, -9.0, 0.0, -20.0, -28.0, -20.0, -26.0, -13.0, -25.0, -3.0, -11.0, -9.0, -7.0, -18.0, -11.0, -20.0, -15.0, -14.0, -15.0, -5.0, 0.0, -27.0, -46.0, -13.0, -7.0, -16.0, -16.0, -16.0, -12.0, -9.0, -16.0, -3.0, -13.0, -15.0, -5.0, -44.0, -11.0, 1.0, -26.0, -21.0, -18.0, -27.0, -33.0, -24.0, -12.0, -13.0, -6.0, -22.0, -1.0, -11.0, -2.0, -24.0, -1.0, -14.0, -7.0, -8.0, -27.0, -14.0, -18.0, -31.0, -20.0, -9.0, -3.0, 0.0, -27.0, -7.0, -16.0, -16.0, -10.0, -31.0, -14.0, -7.0, -16.0, -3.0, -24.0, -16.0, -39.0, -44.0, -20.0, -8.0, -2.0, -25.0, -8.0, -24.0, -36.0, -6.0, -20.0, -11.0, -21.0, -27.0, -33.0, -8.0, -12.0, -28.0, -8.0, -35.0, -4.0, -8.0, -27.0, 1.0, -8.0, -9.0

In [9]:
scores = dqn.test(env, nb_episodes=1, visualize=False)

Testing for 1 episodes ...


KeyboardInterrupt: 

In [None]:
#print(np.mean(scores.history['episode_reward']))
#dqn.get_config()
#scores = dqn.test(env, nb_episodes=1, visualize=False, verbose=1)
#test(self, env, nb_episodes=1, action_repetition=1, callbacks=None, visualize=True, nb_max_episode_steps=None, nb_max_start_steps=0, start_step_policy=None, verbose=1)
#print(np.mean(scores.history['episode_reward']))callbacks = callbacks[:]