In [9]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [13]:
class ReplayBuffer(object):
    """Fixed-capacity circular store of agent transitions.

    Each transition is written into pre-allocated NumPy arrays at position
    ``mem_counter % mem_size``, so once the buffer is full the oldest entries
    are overwritten.
    """

    def __init__(self, max_size, input_dim):
        """Allocate the storage arrays.

        Args:
            max_size: Maximum number of transitions kept at once.
            input_dim: Shape of a single state, either as an iterable of ints
                (e.g. ``(8,)``) or as a bare int for a flat state vector.
        """
        self.mem_size = max_size
        self.mem_counter = 0

        # A bare integer cannot be star-unpacked, so normalise to a tuple.
        if isinstance(input_dim, (int, np.integer)):
            state_shape = (int(input_dim),)
        else:
            state_shape = tuple(input_dim)

        self.state_memory = np.zeros((self.mem_size, *state_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *state_shape), dtype=np.float32)

        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)

    def store_transition(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full.

        Args:
            state: State observed before the action.
            action: Integer action that was taken.
            reward: Reward received for the action.
            new_state: State observed after the action.
            done: Truthy when the episode ended on this step.
        """
        index = self.mem_counter % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        # Stored inverted (0 for terminal, 1 otherwise) so it can be used
        # directly as a multiplicative mask on the bootstrapped value.
        self.terminal_memory[index] = 1 - int(done)
        self.mem_counter += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Args:
            batch_size: Number of transitions to draw; must not exceed the
                number currently stored, because sampling is without
                replacement.

        Returns:
            Tuple ``(states, new_states, actions, rewards, terminal)`` of
            arrays indexed by the same random batch. ``terminal`` holds the
            inverted done flag written by ``store_transition``.
        """
        # Only the filled part of the buffer is eligible for sampling.
        max_mem = min(self.mem_counter, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        reward = self.reward_memory[batch]
        action = self.action_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, states_, action, reward, terminal


def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile a three-layer dense Q-network.

    Defined at module level so that it can be called as ``build_dqn(...)``.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        n_actions: Number of units in the linear output layer.
        input_dims: Accepted for interface compatibility; not used when
            constructing the layers.
        fc1_dims: Units in the first ReLU hidden layer.
        fc2_dims: Units in the second ReLU hidden layer.

    Returns:
        A compiled ``keras.Sequential`` model using mean-squared-error loss.
    """
    model = keras.Sequential([
        keras.layers.Dense(fc1_dims, activation="relu"),
        keras.layers.Dense(fc2_dims, activation="relu"),
        keras.layers.Dense(n_actions, activation=None)
    ])
    model.compile(optimizer=Adam(learning_rate=lr), loss="mean_squared_error")
    return model
    

In [14]:
class Agent():
    """Epsilon-greedy deep Q-learning agent using a single Keras network.

    The agent stores transitions in a replay buffer, picks actions with an
    epsilon-greedy policy, and fits its Q-network on randomly sampled batches.
    """

    def __init__(self, lr, gamma, n_actions, epsilon, batch_size, input_dims,
                 epsilon_dec=1e-3, epsilon_end=0.01, mem_size=1000000, fname="dqn_model.h5"):
        """Set up hyperparameters, the replay buffer and the Q-network.

        Args:
            lr: Learning rate for the network optimizer.
            gamma: Discount factor applied to the next-state value.
            n_actions: Number of discrete actions.
            epsilon: Initial exploration probability.
            batch_size: Number of transitions sampled per learning step.
            input_dims: State shape forwarded to the buffer and the network builder.
            epsilon_dec: Amount subtracted from epsilon after each learning step.
            epsilon_end: Lower bound for epsilon.
            mem_size: Capacity of the replay buffer.
            fname: Path used by ``save_model`` and ``load_model``.
        """
        # Named ``action_space`` because that is what choose_action samples from.
        self.action_space = list(range(n_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256)

    def store_transition(self, state, action, reward, new_state, done):
        """Forward one transition to the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a random action is drawn from
        ``action_space``; otherwise the action with the largest predicted
        Q-value for ``observation`` is returned.
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        # The network expects a batch dimension, so wrap the single observation.
        state = np.array([observation])
        actions = self.q_eval.predict(state)
        return np.argmax(actions)

    def learn(self):
        """Run one training step on a sampled batch and decay epsilon.

        Does nothing until the buffer has recorded at least ``batch_size``
        transitions.
        """
        if self.memory.mem_counter < self.batch_size:
            return

        # ``dones`` is the buffer's inverted terminal flag: 0 on terminal
        # transitions, 1 otherwise.
        states, states_, actions, rewards, dones = \
            self.memory.sample_buffer(self.batch_size)

        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)

        # Start from the current predictions so that only the entries for the
        # actions actually taken contribute to the loss.
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        q_target[batch_index, actions] = rewards + \
            self.gamma * np.max(q_next, axis=1) * dones

        self.q_eval.train_on_batch(states, q_target)

        # Linear decay, clamped so epsilon never drops below its floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def save_model(self):
        """Save the Q-network to ``model_file``."""
        self.q_eval.save(self.model_file)

    def load_model(self):
        """Replace the Q-network with the model stored at ``model_file``."""
        self.q_eval = load_model(self.model_file)
            

In [43]:
# Install the Box2D bindings and Gym via shell escapes.
!pip install Box2D
!pip3 install  gym
#!pip install gym[all]
# NOTE(review): the wheel below is read from the current working directory and
# its filename tags it as a CPython 3.8 / win_amd64 build. The recorded output
# of this cell shows pip rejecting it as unsupported on the platform it ran on.
# TODO confirm whether this line is still needed alongside `pip install Box2D`.
!pip install box2d_py-2.3.8-cp38-cp38m-win_amd64.whl



ERROR: box2d_py-2.3.8-cp38-cp38m-win_amd64.whl is not a supported wheel on this platform.


In [37]:
import gym 
import utils
import tensorflow as tf

In [40]:
if __name__ == "__main__":
    # Train a DQN agent on LunarLander and plot the learning curve.

    # Switch TensorFlow to graph (non-eager) execution before any model is built.
    tf.compat.v1.disable_eager_execution()
    env = gym.make("LunarLander-v2")
    lr = 0.001
    n_games = 250
    # Pass the full observation shape tuple: the replay buffer unpacks it to
    # size its state arrays, which a bare integer would not allow.
    agent = Agent(gamma=0.99, epsilon=1.0, lr=lr,
                  input_dims=env.observation_space.shape,
                  n_actions=env.action_space.n, mem_size=1000000, batch_size=64,
                  epsilon_end=0.01)

    scores = []
    eps_history = []

    for i in range(n_games):
        done = False
        score = 0
        # NOTE(review): this loop assumes reset() returns only the observation
        # and step() returns a 4-tuple — confirm against the installed gym version.
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            # The chosen action must be handed to the environment.
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()

        eps_history.append(agent.epsilon)
        scores.append(score)

        # Average over (up to) the last 100 episodes.
        avg_score = np.mean(scores[-100:])
        print("episode: ", i, "score %.2f" % score,
              "average_score %.2f" % avg_score,
              "epsilon %.2f" % agent.epsilon)

    # Plot once after training, when there is exactly one score and one
    # epsilon value per x position.
    filename = "lunalander_tf2.png"
    x = [i + 1 for i in range(n_games)]
    utils.plot_learning_curve(x, scores, eps_history, filename)

AttributeError: module 'gym.envs.box2d' has no attribute 'LunarLander'