In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [2]:
class ReplayBuffer():
    def __init__(self,mem_size,input_dim):
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size,*input_dim),dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size,*input_dim),dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size,dtype=np.int32)
        self.reward_memory =  np.zeros(self.mem_size,dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size,dtype=np.int32)
        
    def store_transition(self,state,action,reward,next_state,done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = next_state
        self.action_memory[index] = action 
        self.reward_memory[index] = reward 
        self.terminal_memory[index] = 1-int(done)
        self.mem_cntr +=1
        
    def sample_buffer(self,batch_size):
        max_mem = min(self.mem_cntr,self.mem_size)
        batch = np.random.choice(max_mem,batch_size,replace=False)
        
        states = self.state_memory[batch] 
        next_states = self.new_state_memory[batch] 
        actions = self.action_memory[batch] 
        rewards = self.reward_memory[batch] 
        terminals = self.terminal_memory[batch]
        return states,actions,rewards,next_states,terminals
    
        
        
        

In [3]:
def build_dqn(lr,n_actions,input_dims,fc1_dims,fc2_dims):
    model = keras.Sequential()
    model.add(keras.layers.Dense(fc1_dims,activation='relu'))
    model.add(keras.layers.Dense(fc2_dims,activation='relu'))
    model.add(keras.layers.Dense(n_actions,activation=None))
    model.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')
    
    return model

In [6]:
class Agent():

    def __init__(self,lr,gamma,n_actions,epsilone,batch_size,input_dims,epsilone_dec=1e-3
                ,epsilone_end=0.01,mem_size=1000000,fname='dqn_LunarLande_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.gamma =gamma
        self.epsilone = epsilone
        self.eps_min = epsilone_end
        self.eps_dec = epsilone_dec
        self.batch_size = batch_size
        self.model_file =fname
        self.memory = ReplayBuffer(mem_size,input_dims)
        self.q_eval = build_dqn(lr,n_actions,input_dims,256,256)
        
    def store_transition(self,state,action,reward,next_state,done):
        self.memory.store_transition(state,action,reward,next_state,done)
        
    def choose_action(self,observation):
        if np.random.random() < self.epsilone:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])#add dimention
            actions = self.q_eval.predict([state])
            action = np.argmax(actions)
            
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        
        states,actions,rewards,next_states,terminals=\
        self.memory.sample_buffer(self.batch_size)
        
        q_eval =self.q_eval.predict(states)
        q_next = self.q_eval.predict(next_states)
        
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size,dtype=np.int32)
        q_target[batch_index,actions]=reward+terminals*self.gamma*np.max(q_next,axis=1)
        
        self.q_eval.train_on_batch(states,q_target)
        self.epsilone = self.epsilone-self.eps_dec if self.epsilone > self.eps_min else self.eps_min
        
    def save_model(self):
        self.q_eval.save(self.model_file)
        
    def load_model(self):
        self.q_eval = load_model(self.model_file)
        

In [7]:
# !pip install mujoco_py==2.0.2.8
# !pip install 'gym[all]'
import gym

if __name__ == '__main__':
    tf.compat.v1.disable_eager_execution()
    env = gym.make('LunarLander-v2')
    lr = 0.001
    n_games = 500
    agent=Agent(gamma = 0.99,epsilone=1,lr=lr,input_dims=env.observation_space.shape,
               n_actions=env.action_space.n,mem_size=1000000,batch_size=64,epsilone_end=0.01)
    
    scores = []
    eps_history = []
    for i in range(n_games):
        done=False
        score =0
        observation = env.reset()
        while not done:
            action =agent.choose_action(observation)
            observation_,reward,done,info = env.step(action)
            score +=reward
            agent.store_transition(observation,action,reward,observation_,done)
            
            observation = observation_
            
            agent.learn()
            
        eps_history.append(agent.epsilone)
        scores.append(score)
        
        avg_score = np.mean(scores[-100:])
        print('episode: ',i,'score %.2f' % score,
             'average_score %.2f' % avg_score,
             'epsilone %.2f' % agent.epsilone)
        if i% 20 ==0:
            agent.save_model()
        

episode:  0 score -75.41 average_score -75.41 epsilone 0.99
episode:  1 score -151.22 average_score -113.31 epsilone 0.88
episode:  2 score -217.18 average_score -147.94 epsilone 0.82
episode:  3 score -84.72 average_score -132.13 epsilone 0.75
episode:  4 score -305.36 average_score -166.78 epsilone 0.67
episode:  5 score -245.75 average_score -179.94 epsilone 0.55
episode:  6 score -400.11 average_score -211.39 epsilone 0.46
episode:  7 score -66.91 average_score -193.33 epsilone 0.40
episode:  8 score -253.00 average_score -199.96 epsilone 0.30
episode:  9 score -256.57 average_score -205.62 epsilone 0.22
episode:  10 score -159.56 average_score -201.43 epsilone 0.04
episode:  11 score -377.04 average_score -216.07 epsilone 0.01
episode:  12 score -46.46 average_score -203.02 epsilone 0.01
episode:  13 score -293.13 average_score -209.46 epsilone 0.01
episode:  14 score -345.94 average_score -218.56 epsilone 0.01
episode:  15 score -37.76 average_score -207.26 epsilone 0.01
episode:

episode:  130 score -351.89 average_score -344.16 epsilone 0.01
episode:  131 score -111.07 average_score -341.41 epsilone 0.01
episode:  132 score -261.22 average_score -342.85 epsilone 0.01
episode:  133 score -230.76 average_score -343.65 epsilone 0.01
episode:  134 score -99.49 average_score -339.81 epsilone 0.01
episode:  135 score -537.48 average_score -339.79 epsilone 0.01
episode:  136 score -460.14 average_score -341.61 epsilone 0.01
episode:  137 score -727.17 average_score -344.83 epsilone 0.01
episode:  138 score -193.00 average_score -344.32 epsilone 0.01
episode:  139 score -186.15 average_score -342.02 epsilone 0.01
episode:  140 score -539.03 average_score -344.08 epsilone 0.01
episode:  141 score -520.03 average_score -344.21 epsilone 0.01
episode:  142 score -370.48 average_score -343.48 epsilone 0.01
episode:  143 score -353.20 average_score -344.68 epsilone 0.01
episode:  144 score -596.74 average_score -346.07 epsilone 0.01
episode:  145 score -678.95 average_score

episode:  259 score -519.00 average_score -425.61 epsilone 0.01
episode:  260 score -321.80 average_score -425.00 epsilone 0.01
episode:  261 score -877.80 average_score -430.16 epsilone 0.01
episode:  262 score -151.12 average_score -430.04 epsilone 0.01
episode:  263 score -715.56 average_score -432.38 epsilone 0.01
episode:  264 score -586.01 average_score -433.64 epsilone 0.01
episode:  265 score -627.66 average_score -435.12 epsilone 0.01
episode:  266 score -454.35 average_score -437.85 epsilone 0.01
episode:  267 score -375.27 average_score -440.79 epsilone 0.01
episode:  268 score -303.74 average_score -438.72 epsilone 0.01
episode:  269 score -344.13 average_score -436.21 epsilone 0.01
episode:  270 score -110.42 average_score -436.21 epsilone 0.01
episode:  271 score -493.93 average_score -439.84 epsilone 0.01
episode:  272 score -560.49 average_score -444.70 epsilone 0.01
episode:  273 score -360.56 average_score -445.85 epsilone 0.01
episode:  274 score -537.30 average_scor

episode:  388 score -739.45 average_score -467.48 epsilone 0.01
episode:  389 score -646.89 average_score -469.29 epsilone 0.01
episode:  390 score -468.42 average_score -471.60 epsilone 0.01
episode:  391 score -344.80 average_score -469.87 epsilone 0.01
episode:  392 score -313.96 average_score -471.98 epsilone 0.01
episode:  393 score -290.90 average_score -470.70 epsilone 0.01
episode:  394 score -336.90 average_score -468.67 epsilone 0.01
episode:  395 score -282.03 average_score -465.26 epsilone 0.01
episode:  396 score -417.02 average_score -464.29 epsilone 0.01
episode:  397 score -299.26 average_score -464.68 epsilone 0.01
episode:  398 score -488.55 average_score -463.92 epsilone 0.01
episode:  399 score -486.47 average_score -464.81 epsilone 0.01
episode:  400 score -424.75 average_score -464.69 epsilone 0.01
episode:  401 score -397.42 average_score -465.90 epsilone 0.01
episode:  402 score -134.69 average_score -462.40 epsilone 0.01
episode:  403 score -272.80 average_scor