In [14]:
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from keras.optimizers import Adam

import numpy as np

import gym

In [7]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        super(ReplayBuffer, self).__init__()
        self.mem_size = max_size
        self.discrete = discrete
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        self.terminal_memory = np.zeros(self.mem_size, dtypes=np.float32)
        
    def store_transition(self, state, action, reward, new_state, done):
        index = self.mem_counter % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - int(done)
        
        if self.discrete:
            actions = np.zeros(self.action_memory.shape[1])
            actions=[action] = 1.0
            self.action_memory[index] = actions
        else:
            self.action_memory[index] = action
        self.mem_counter += 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_counter, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)
        
        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, terminal

In [9]:
def build_network(learning_rate, n_actions, input_dims, fc1_dims, fc2_dims):
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims,)), 
        Activation('relu'),
        Dense(fc2_dims),
        Activation('relu'),
        Dense(n_actions)
    ])
    
    model.compile(optimizer=Adam(lr=learning_rate, loss="mse"))
    
    return model

In [12]:
class Agent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims,
                fn, epsilon_decrease=0.996, espilon_end=0.01, meme_size=1000000,
                fname="dqn_model"):
        super(Agent, self).__init__()
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = espilon_decrease
        self.espilon_min = espilon_end
        self.batch_size = batch_size
        self.model_file = fname
        
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, 
                                   discrete=True)
        self.q_eval = fn(alpha, n_actions, input_dims, 256, 256)
    
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    def choose_action(self, state):
        state = state[np.newaxis,:]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.actions_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
            
        return action
    
    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        action_values = np.array(self.action_space, dtype=np.int8)
        action_indices = np.dot(action, action_values)
        
        q_eval = self.q_eval.predict(state)
        q_next = self.q_eval.predict(new_state)
        
        q_target = q_eval.copy()
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        q_target[batch_index, action_indices] = reward + \
            self.gamma*np.max(q_next, axis=1)*done
        
        _ = self.q_eval.fit(state, q_target, verbose=0)
        
        self.epsilon = self.epsilon*self.espilon_dec if self.epsilon > \
            self.espilons_min else self.espilon_min
        
        def save_model(self):
            self.q_eval.save(self.model_file)
        def load_model(self):
            self.q_eval = load_model(self.model_file)

In [16]:
env = gym.make("LunarLander-v2")

AttributeError: module 'gym.envs.box2d' has no attribute 'LunarLander'