# Breakout-v0 — DQN agent

In [1]:
import gym
from keras.models import Sequential
from keras.layers import Input, merge, Permute, Conv2D
from keras.models import load_model, Sequential, Model
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
from keras.layers.core import Activation, Dropout, Flatten, Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import History
import keras
import os

from collections import deque
import itertools
import numpy as np
import random

random.seed(42)

Using TensorFlow backend.


In [2]:
class DQNAgent():
    """Double-DQN agent: convolutional Q-network, target network, replay memory.

    The agent keeps two identically shaped Keras models. ``model`` is trained
    on replayed transitions; ``target_model`` is a periodically refreshed copy
    used to evaluate the action that ``model`` selects for the next state.

    Args:
        state_size: Shape of a single observation, e.g. ``(210, 160, 3)``.
            It is used both as the network input shape and to add the leading
            batch axis to every observation.
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are evicted first
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        # NOTE(review): with a floor of 0.8 the agent keeps acting randomly at
        # least 80% of the time; the original inline comment hints that 0.01
        # was used before. Confirm which value is intended.
        self.epsilon_min = 0.8
        self.epsilon_decay = 0.995
        self._learning_rate = 0.001
        self.model_name = 'Breakout-DQN.h5'
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Start with both networks in agreement; otherwise the first training
        # pass would bootstrap from an unrelated random initialisation.
        self.update_target_model()

    def _build_model(self, hidden_size=24):
        """Build and compile the Q-network, loading saved weights if present.

        Three strided convolutions (32@8x8/4, 64@4x4/2, 64@3x3/1) feed a
        512-unit dense layer and a linear head with one output per action.

        Args:
            hidden_size: Unused; kept so existing callers do not break.

        Returns:
            A compiled ``Sequential`` model (MSE loss, Adam optimizer).
        """
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), input_shape=self._state_size))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (4, 4), strides=(2, 2)))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1)))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(self._action_size))
        model.add(Activation('linear'))

        opt = Adam(lr=self._learning_rate)
        model.compile(loss='mean_squared_error', optimizer=opt)
        # summary() prints by itself and returns None, so call it exactly once
        # and do not wrap it in print().
        model.summary()

        if os.path.exists(self.model_name):
            model.load_weights(self.model_name)
        return model

    def _as_batch(self, frame):
        """Return ``frame`` with a leading batch axis of size one."""
        return np.asarray(frame).reshape((1,) + tuple(self._state_size))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Both observations are stored with a leading batch axis so they can be
        passed straight to ``predict``/``fit`` later.
        """
        self.memory.append(
            (self._as_batch(state), action, reward, self._as_batch(next_state), done)
        )

    def update_target_model(self):
        """Copy the online network's weights into the target network.

        The weights are copied rather than rebinding ``self.target_model`` to
        ``self.model``: sharing a single object would make the target network
        follow every gradient step of the online network.
        """
        self.target_model.set_weights(self.model.get_weights())

    def train_model(self, batch_size=20):
        """Fit the online network on a uniform random sample of the memory.

        For each sampled transition the target for the taken action is
        ``reward`` when the episode ended there, otherwise
        ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``.
        Epsilon is decayed once per call while it is above ``epsilon_min``.

        Args:
            batch_size: Maximum number of transitions to replay; fewer are
                used when the memory holds fewer. Nothing happens (and
                epsilon is left untouched) when the memory is empty.
        """
        if not self.memory:
            return

        # Uniform sampling over individual transitions breaks the temporal
        # correlation between consecutive frames.
        transitions = list(self.memory)
        minibatch = random.sample(transitions, min(batch_size, len(transitions)))

        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                # Double DQN: the online network picks the action, the target
                # network supplies its value.
                best_next = np.argmax(self.model.predict(next_state))
                next_q = self.target_model.predict(next_state)
                target[0][action] = reward + self.gamma * next_q[0][best_next]

            self.model.fit(state, target, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def takeAction(self, state):
        """Choose an action epsilon-greedily for a single observation.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        # Decide first so that no forward pass is spent on a random action.
        if np.random.rand() < self.epsilon:
            return random.randrange(self._action_size)
        return np.argmax(self.model.predict(self._as_batch(state)))


In [5]:
# Training configuration.
episodes = 10000
render = True   # drawing every frame slows training considerably
shows = 10      # reporting / target-sync / checkpoint period, in episodes
total_reward = list()

env = gym.make('Breakout-v0')
agent = DQNAgent(env.observation_space.shape, env.action_space.n)

try:
    for e in range(episodes):
        state = env.reset()
        rewards = 0  # undiscounted return of the current episode

        while True:
            if render:
                env.render()
            action = agent.takeAction(state)
            next_state, reward, done, _ = env.step(action)
            rewards += reward

            if done:
                total_reward.append(rewards)
                # NOTE(review): the terminal transition is stored only for
                # episodes scoring <= 1, and its reward is replaced by the
                # episode return minus the mean of the last `shows` returns.
                # Terminal transitions of better episodes are never replayed;
                # confirm this shaping is intentional.
                if rewards <= 1:
                    agent.remember(state, action, rewards - np.mean(total_reward[-shows:]), next_state, done)
                break

            agent.remember(state, action, reward, next_state, done)
            state = next_state

        # One replay pass per finished episode.
        agent.train_model(32)

        if e % shows == 0:
            print("Episode: {}, Total Reward: {}".format(e, np.mean(total_reward[-shows:])))
            agent.update_target_model()
            agent.model.save_weights(agent.model_name)
finally:
    # Release the emulator and render window even when the run is interrupted
    # from the keyboard.
    env.close()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 51, 39, 32)        6176      
_________________________________________________________________
activation_21 (Activation)   (None, 51, 39, 32)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 24, 18, 64)        32832     
_________________________________________________________________
activation_22 (Activation)   (None, 24, 18, 64)        0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 22, 16, 64)        36928     
_________________________________________________________________
activation_23 (Activation)   (None, 22, 16, 64)        0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 22528)             0         
__________

KeyboardInterrupt: 