# SpaceInvaders-v0

In [7]:
import gym
from keras.models import Sequential
from keras.layers import Input, merge, Permute, Conv2D
from keras.models import load_model, Sequential, Model
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
from keras.layers.core import Activation, Dropout, Flatten, Dense
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import History
import keras
import os

from collections import deque
import itertools
import numpy as np
import random

random.seed(42)

In [12]:
class DQNAgent():
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size        
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self._learning_rate = 0.001
        self.model_name = 'SpaceInvaders-DQN.h5'
        self.model = self._build_model()
        self.target_model = self._build_model()
    
    
    def _build_model(self, hidden_size=24):
            
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), input_shape=self._state_size))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (4, 4), strides=(2, 2)))
        model.add(Activation('relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1)))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(self._action_size))
        model.add(Activation('linear'))
        print(model.summary())

        opt = Adam(lr=self._learning_rate)
        model.compile(loss='mean_squared_error', optimizer=opt)
        model.summary()
        
        if os.path.exists(self.model_name):
            model.load_weights(self.model_name)
        return model

    def remember(self, state, action, reward, next_state, done):
        state = state.reshape(1,210, 160, 3)
        next_state = next_state.reshape(1,210, 160, 3)
        self.memory.append((state, action, reward, next_state, done))
        
    def update_target_model(self):
        self.target_model = self.model
        
    def train_model(self, batch_size=20):
        def batch(iterable, n=1):
            l = len(iterable)
            for ndx in range(0, l, n):
                yield list(itertools.islice(iterable, ndx, min(ndx + n, l)))
        
        history = History()
        batch_data = list(batch(self.memory, batch_size))
        selected_batch = random.sample(batch_data, 1)
        
        for state, action, reward, next_state, done in selected_batch[0]:
            target = self.model.predict(state)
            target_val = self.model.predict(next_state)
            target_val_ = self.target_model.predict(next_state)
            if done:
                target[0][action] = reward
            else:
                a = np.argmax(target_val)
                target[0][action] = reward + self.gamma * target_val_[0][a]
            
            self.model.fit(state, target, epochs=1, verbose=0, callbacks=[history])
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay    

    def takeAction(self, state):
        state = state.reshape(1,210, 160, 3)
        act = np.argmax(self.model.predict(state))
        if np.random.rand() < self.epsilon:
            act = random.randrange(self._action_size)
        return act


In [14]:
episodes = 500
render = True
shows = 10
total_reward = list()

env = gym.make('SpaceInvaders-v0')
agent = DQNAgent(env.observation_space.shape, env.action_space.n)

for e in range(episodes):
    state = env.reset()
    
    rewards = 0
    while True:
        
        if render:
            env.render()
        action = agent.takeAction(state)
        next_state, reward, done, _ = env.step(action)
        rewards += reward
        
        if done:
            total_reward.append(rewards)
            agent.remember(state, action, rewards, next_state, done)
            state = next_state
#             print("Episode: {}, Total Reward: {}".format(e, rewards))
            break
        else:
            agent.remember(state, action, 0, next_state, done)
            state = next_state
            
    agent.train_model(32)
    
    if e % shows == 0:
        print("Episode: {}, Total Reward: {}".format(e, np.mean(total_reward[-shows:])))
        agent.update_target_model()
        agent.model.save_weights(agent.model_name)

Episode: 0, Total Reward: 115.0
Episode: 10, Total Reward: 140.5
Episode: 20, Total Reward: 198.5
Episode: 30, Total Reward: 260.0
Episode: 40, Total Reward: 203.5
Episode: 50, Total Reward: 245.0
Episode: 60, Total Reward: 159.0
Episode: 70, Total Reward: 231.0
Episode: 80, Total Reward: 147.5
Episode: 90, Total Reward: 253.0
Episode: 100, Total Reward: 214.5
Episode: 110, Total Reward: 288.5
Episode: 120, Total Reward: 226.0
Episode: 130, Total Reward: 136.5
Episode: 140, Total Reward: 202.5
Episode: 150, Total Reward: 201.0
Episode: 160, Total Reward: 205.0
Episode: 170, Total Reward: 160.0
Episode: 180, Total Reward: 192.0
Episode: 190, Total Reward: 214.5
Episode: 200, Total Reward: 175.0
Episode: 210, Total Reward: 122.0
Episode: 220, Total Reward: 223.0
Episode: 230, Total Reward: 216.0
Episode: 240, Total Reward: 153.0
Episode: 250, Total Reward: 196.0
Episode: 260, Total Reward: 178.5
Episode: 270, Total Reward: 178.0
Episode: 280, Total Reward: 176.5
Episode: 290, Total Rewar