# Deep Q-Learning

In [1]:
import time
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
from gym import wrappers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

Hiperparâmetros

In [32]:
# Training hyperparameters shared by the cells below
GAMMA = 0.9         # Discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 256    # Transitions sampled from the replay buffer for each update
MAX_EPISODES = 5    # Number of episodes played by the training loop

## Environment


In [33]:
# Instantiate the CartPole environment used by every later cell
env = gym.make('CartPole-v1')

# Show the type and shape of the observations and of the possible actions
for label, space in (('States:', env.observation_space),
                     ('Actions:', env.action_space)):
    print(label, space)

States: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Actions: Discrete(2)


Rodar uma partida

In [None]:
# Play one episode with uniformly random actions, rendering every step.
env.reset() # Set env to initial state
done = False
try:
    while not done:
        env.render()
        time.sleep(0.1) # Pause between frames so the animation can be followed
        # step() returns (observation, reward, done, info); only `done` is needed here.
        # Indexing instead of unpacking into `_` avoids overwriting IPython's `_`.
        done = env.step(env.action_space.sample())[2] # Perform a random action
finally:
    # Always close the environment, even if the cell is interrupted mid-episode
    env.close()

## Q-Network


In [27]:
# Q-network: maps an observation to one output per discrete action.
# Two hidden ReLU layers of 512 units, then a linear output layer.
model = Sequential([
    Dense(512, activation='relu', name='fc1', input_shape=env.observation_space.shape),
    Dense(512, activation='relu', name='fc2'),
    Dense(env.action_space.n, name='fc3'),
])
model.summary()

# Mean-squared error between predicted and target values, optimised with Adam.
# NOTE(review): 'accuracy' is a classification metric and is presumably
# uninformative here, but the training cell unpacks two values from
# train_on_batch, so a metric has to stay configured.
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['accuracy'],
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 512)               2560      
_________________________________________________________________
fc2 (Dense)                  (None, 512)               262656    
_________________________________________________________________
fc3 (Dense)                  (None, 2)                 1026      
Total params: 266,242
Trainable params: 266,242
Non-trainable params: 0
_________________________________________________________________


## Experience Replay



In [30]:
# Replay buffer: a bounded deque that drops the oldest transition once
# 500000 entries are stored.
buffer = deque(maxlen=500000)

# Play random episodes until the buffer holds at least one batch, so the
# training loop can sample BATCH_SIZE transitions from the very first episode.
while len(buffer) < BATCH_SIZE:
    state = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        transition = (state, action, reward, new_state, done)
        buffer.append(transition) # Store transition in the buffer

        # Advance to the next observation; without this every transition of
        # the episode would record the initial observation as its state.
        state = new_state

## Training


In [31]:
# Epsilon-greedy schedule
exploration_decay = 0.995 # Multiplicative decay applied once per episode
exploration_rate = 1.0    # Initial exploration rate
exploration_min = 0.01    # Exploration rate never drops below 1%

rewards_list = [] # Total score of each episode
loss_list = []    # Loss of the single training step run after each episode

for episode in range(1, MAX_EPISODES+1):
    state = env.reset()
    done = False
    score = 0.0

    # --- Play one episode, storing every transition in the replay buffer ---
    while not done:
        # Uncomment the next line to show the graphical interface while training
        # env.render()

        if np.random.rand() < exploration_rate:
            # Explore: pick a random action
            action = env.action_space.sample()
        else:
            # Exploit: pick the action with the highest predicted Q-value
            action = np.argmax(model.predict(state[None]))

        new_state, reward, done, info = env.step(action) # Perform action
        buffer.append((state, action, reward, new_state, done))

        state = new_state
        score += reward # Update episode total score

    # Decay the exploration rate, clamped so it is never below the 1% floor
    exploration_rate = max(exploration_min, exploration_rate * exploration_decay)

    # --- Experience replay: one training step on a random batch ---
    indexes = np.random.choice(len(buffer), BATCH_SIZE, replace=True)
    batch = [buffer[i] for i in indexes]
    states, actions, rewards, new_states, terminals = (
        np.array(column) for column in zip(*batch)
    )

    # Q(s, ., theta) for the sampled states; only the taken action's entry is
    # overwritten below, so the other outputs contribute zero error.
    predictions = model.predict(states)

    # max_a' Q(s', a', theta) for the whole batch in a single forward pass
    # (instead of one predict call per sample).
    next_q_max = model.predict(new_states).max(axis=1)

    # yj = rj                                  if the transition is terminal
    # yj = rj + gamma * max_a' Q(s', a')       otherwise
    targets = np.where(terminals, rewards, rewards + GAMMA * next_q_max)
    predictions[np.arange(len(batch)), actions] = targets

    # Train model with the batch. The model is compiled with one metric, so
    # train_on_batch returns (loss, metric); the metric is discarded.
    loss, _ = model.train_on_batch(states, predictions)
    loss_list.append(loss)
    rewards_list.append(score)
    print('Episode: {}, Score: {}'.format(episode, score))


Episode: 1, Score: 18.0
Episode: 2, Score: 28.0
Episode: 3, Score: 30.0
Episode: 4, Score: 42.0
Episode: 5, Score: 28.0
Episode: 6, Score: 19.0
Episode: 7, Score: 10.0
Episode: 8, Score: 23.0
Episode: 9, Score: 17.0
Episode: 10, Score: 14.0
Episode: 11, Score: 17.0


KeyboardInterrupt: 

In [18]:
# Save Keras model
# Writes the Q-network to 'cart_pole.h5'; the path is relative, so the file
# lands in the kernel's current working directory.
model.save('cart_pole.h5')

## Test

In [26]:
# Play one rendered episode acting greedily on the Q-network and print its score.
done = False
state = env.reset()
score = 0.0

try:
    while not done:
        env.render()
        time.sleep(0.01) # Short pause between rendered frames
        # Greedy policy: always take the action with the highest predicted Q-value
        action = np.argmax(model.predict(state[None]))
        state, reward, done, _ = env.step(action)
        score += reward

    print(score)
finally:
    # Always close the environment, even if the cell is interrupted mid-episode
    env.close()
    

10.0
