In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model,Sequential
from tensorflow.keras.layers import Activation,Dense,Conv2D,Flatten

In [2]:
class ReplayBuffer():
    """Fixed-capacity circular store of agent transitions.

    Each slot holds one (state, action, reward, next state, done) tuple in
    pre-allocated NumPy arrays. Once ``mem_size`` transitions have been
    written, new ones overwrite the oldest slots.
    """

    def __init__(self,mem_size,input_dim):
        """Allocate zero-filled storage.

        Args:
            mem_size: number of transitions the buffer can hold.
            input_dim: iterable giving the shape of a single state.
        """
        self.mem_size = mem_size
        self.cntr = 0  # total number of transitions ever stored

        frames_shape = (mem_size, *input_dim)
        self.state_memory = np.zeros(frames_shape, dtype=np.float32)
        self.new_state_memory = np.zeros(frames_shape, dtype=np.float32)
        self.action_memory = np.zeros(mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(mem_size, dtype=np.uint8)

    def store_exprience(self,state,action,reward,state_,done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.cntr % self.mem_size

        columns = (
            (self.state_memory, state),
            (self.new_state_memory, state_),
            (self.action_memory, action),
            (self.reward_memory, reward),
            (self.terminal_memory, done),
        )
        for column, value in columns:
            column[slot] = value

        self.cntr += 1

    def sample_exprience(self,batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of arrays
            indexed by the same random selection.

        Raises:
            ValueError: raised by ``np.random.choice`` when ``batch_size``
                exceeds the number of filled slots (sampling is without
                replacement).
        """
        # Only slots that have actually been written are eligible.
        filled = min(self.cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size, replace=False)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )
                           
    

In [3]:
def build_dqn(lr,n_actions,input_dims,fc1_dims):
    """Build and compile the convolutional Q-network.

    The network is three ReLU convolutions (32 filters 8x8 stride 4,
    64 filters 4x4 stride 2, 64 filters 3x3 stride 1), a flatten, one ReLU
    dense layer and a linear output layer with one unit per action.

    Args:
        lr: learning rate handed to the Adam optimizer.
        n_actions: number of output units (one Q-value per action).
        input_dims: shape of a single input; since every convolution uses
            ``data_format='channels_first'`` it is interpreted as
            (channels, height, width).
        fc1_dims: width of the hidden dense layer.

    Returns:
        A compiled Keras ``Sequential`` model with a mean-squared-error loss.
    """
    model = Sequential()
    model.add(Conv2D(filters=32,kernel_size=8,strides=4,activation='relu',
                     input_shape=(*input_dims,),data_format='channels_first'))
    model.add(Conv2D(filters=64,kernel_size=4,strides=2,activation='relu',data_format='channels_first'))
    model.add(Conv2D(filters=64,kernel_size=3,strides=1,activation='relu',data_format='channels_first'))
    model.add(Flatten())
    model.add(Dense(fc1_dims,activation='relu'))
    # No activation on the output: Q-values are unbounded regression targets.
    model.add(Dense(n_actions))

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(optimizer=Adam(learning_rate=lr),loss='mean_squared_error')
    return model

In [4]:
class Agent(object):
    """Epsilon-greedy DQN agent with an online and a target network.

    ``q_eval`` is the network being trained and used to pick actions;
    ``q_next`` supplies the bootstrap values and is overwritten with the
    weights of ``q_eval`` every ``replace`` learning steps.
    """

    def __init__(self,alpha,gamma,n_actions,epsilon,batch_size,replace,input_dims,
                eps_dec=1e-5,eps_min=0.01,mem_size=1000000,q_eval_fname='q_eval.h5',
                 q_target_fname='q_target.h5'):
        """Create the replay memory and both networks.

        Args:
            alpha: learning rate passed to ``build_dqn``.
            gamma: discount factor applied to the bootstrapped value.
            n_actions: number of discrete actions.
            epsilon: initial probability of taking a random action.
            batch_size: number of transitions sampled per learning step.
            replace: target network is synced every ``replace`` learning
                steps; a falsy value disables syncing.
            input_dims: shape of a single observation.
            eps_dec: amount subtracted from epsilon per learning step.
            eps_min: lower bound for epsilon.
            mem_size: capacity of the replay buffer.
            q_eval_fname: file used to save/load the online network.
            q_target_fname: file used to save/load the target network.
        """
        self.action_space = list(range(n_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.batch_size = batch_size
        self.replace = replace
        self.q_target_model_file = q_target_fname
        self.q_eval_model_file = q_eval_fname
        self.learn_step = 0
        self.memory = ReplayBuffer(mem_size,input_dims)
        self.q_eval = build_dqn(alpha,n_actions,input_dims,512)
        self.q_next = build_dqn(alpha,n_actions,input_dims,512)

    def replace_target_network(self):
        """Copy the online weights into the target network when a sync is due."""
        # A falsy `replace` (0 or None) disables syncing and avoids a modulo error.
        if self.replace and self.learn_step % self.replace == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

    def store_transition(self,state,action,reward,state_,done):
        """Record one transition in the replay memory."""
        self.memory.store_exprience(state,action,reward,state_,done)

    def choose_action(self,observation):
        """Pick an action: random with probability epsilon, otherwise greedy.

        Args:
            observation: a single observation (no batch dimension).

        Returns:
            The chosen action index.
        """
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            # Add the batch axis without `copy=False`, which raises under
            # NumPy 2 whenever a copy turns out to be necessary.
            state = np.asarray(observation,dtype=np.float32)[np.newaxis, ...]
            Q = self.q_eval.predict(state)
            action = np.argmax(Q)

        return action

    def learn(self):
        """Run one training step on a sampled batch, if enough data is stored.

        Nothing happens until the memory counter exceeds ``batch_size``.
        Otherwise the target network is synced when due, the online network
        is trained on one batch, epsilon is decayed and ``learn_step`` is
        incremented.
        """
        if self.memory.cntr <= self.batch_size:
            return

        state,action,reward,new_state,done = self.memory.sample_exprience(self.batch_size)

        self.replace_target_network()
        q_eval = self.q_eval.predict(state)
        q_next = self.q_next.predict(new_state)

        indices = np.arange(self.batch_size)
        # Real copy (a `[:]` slice of an ndarray is only a view). Only the
        # entry of the action actually taken is replaced, so the loss is
        # zero for every other action.
        q_target = np.copy(q_eval)
        not_done = 1.0 - np.asarray(done,dtype=np.float32)
        # Terminal transitions contribute no bootstrapped future value.
        q_target[indices,action] = reward + self.gamma*np.max(q_next,axis=1)*not_done

        self.q_eval.train_on_batch(state,q_target)

        # Clamp so the decay can never step below the configured floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

        self.learn_step += 1

    def save_model(self):
        """Save both networks to their configured files."""
        self.q_eval.save(self.q_eval_model_file)
        self.q_next.save(self.q_target_model_file)

    def load_model(self):
        """Load both networks from their configured files."""
        # Inside this method `load_model` resolves to the module-level Keras
        # function, not to this method.
        self.q_eval = load_model(self.q_eval_model_file)
        self.q_next = load_model(self.q_target_model_file)

In [5]:
# !pip install mujoco_py==2.0.2.8
# !pip install 'gym[all]'
import gym

In [6]:
class SkipEnv(gym.Wrapper):
    """Wrapper that repeats each action over several consecutive env steps."""

    def __init__(self, env=None, skip=4):
        """Wrap ``env`` and remember how many steps each action is repeated."""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times and sum the rewards.

        Stops early as soon as the wrapped env reports ``done``.

        Returns:
            ``(obs, total_reward, done, info)`` where ``obs``, ``done`` and
            ``info`` come from the last inner step taken.
        """
        accumulated = 0.0
        done = False
        for _repeat in range(self._skip):
            obs, step_reward, done, info = self.env.step(action)
            accumulated += step_reward
            if done:
                break
        return obs, accumulated, done, info

    def reset(self):
        """Reset the wrapped env and return its first observation."""
        # The buffer is refreshed here but not read anywhere else in this class.
        self._obs_buffer = []
        first_obs = self.env.reset()
        self._obs_buffer.append(first_obs)
        return first_obs


In [7]:
class PreProcessFrame(gym.ObservationWrapper):
    """Observation wrapper that turns each frame into an 80x80x1 uint8 image."""

    def __init__(self, env=None):
        """Wrap ``env`` and advertise the (80, 80, 1) uint8 observation space."""
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(80,80,1), dtype=np.uint8)

    def observation(self, obs):
        """Convert one raw observation using :meth:`process`."""
        return PreProcessFrame.process(obs)

    @staticmethod
    def process(frame):
        """Grayscale, crop and downsample a frame.

        The first three channels are combined with weights 0.299 / 0.587 /
        0.114 (presumably R, G, B - verify against the environment), rows
        35..194 are kept, and every second row and column is taken. The
        final reshape requires exactly 80x80 values to remain.

        Returns:
            Array of shape (80, 80, 1) and dtype uint8.
        """
        # Shape-preserving reshape kept as in the original; work in float32.
        pixels = np.reshape(frame, frame.shape).astype(np.float32)

        first, second, third = pixels[:,:,0], pixels[:,:,1], pixels[:,:,2]
        gray = 0.299*first + 0.587*second + 0.114*third

        cropped = gray[35:195:2, ::2]
        return cropped.reshape(80,80,1).astype(np.uint8)

In [8]:
class ScaleFrame(gym.ObservationWrapper):
    """Observation wrapper that converts to float32 and divides by 255."""

    def observation(self,obs):
        """Return a new float32 array holding ``obs / 255.0``."""
        as_float = np.array(obs).astype(np.float32)
        return as_float / 255.0

In [9]:
class BufferWrapper(gym.ObservationWrapper):
    """Observation wrapper that keeps a rolling stack of recent observations.

    The observation space is the wrapped space with its bounds repeated
    ``n_steps`` times along axis 0.
    """

    def __init__(self, env, n_steps):
        """Wrap ``env`` and build the stacked observation space."""
        super().__init__(env)
        wrapped_space = env.observation_space
        self.observation_space = gym.spaces.Box(
            wrapped_space.low.repeat(n_steps, axis=0),
            wrapped_space.high.repeat(n_steps, axis=0),
            dtype=np.float32)

    def reset(self):
        """Zero the stack, reset the wrapped env and push its first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32)
        first_obs = self.env.reset()
        return self.observation(first_obs)

    def observation(self, observation):
        """Shift the stack by one slot and store the newest observation last.

        Returns:
            The internal buffer itself (not a copy), so later calls modify
            the array returned here.
        """
        # Drop the oldest entry by moving everything one slot towards index 0.
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1,:,:] = observation
        return self.buffer
    

In [10]:
class MoveImgChannel(gym.ObservationWrapper):
    """Observation wrapper that moves axis 2 of each observation to the front."""

    def __init__(self, env):
        """Wrap ``env`` and declare the reordered float32 space in [0.0, 1.0]."""
        super().__init__(env)
        # Still the wrapped env's space at this point: last axis goes first.
        old_shape = self.observation_space.shape
        reordered = (old_shape[-1], old_shape[0], old_shape[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=reordered, dtype=np.float32)

    def observation(self, observation):
        """Return ``observation`` with axis 2 moved to position 0."""
        return np.moveaxis(observation, 2, 0)

In [11]:
def make_env(env_name):
    """Create ``env_name`` and apply the preprocessing wrappers.

    Wrapping order, innermost first: SkipEnv, PreProcessFrame,
    MoveImgChannel, BufferWrapper (4 stacked observations), ScaleFrame.

    Returns:
        The fully wrapped environment.
    """
    base = gym.make(env_name)
    skipped = SkipEnv(base)
    gray = PreProcessFrame(skipped)
    channel_first = MoveImgChannel(gray)
    stacked = BufferWrapper(channel_first,4)
    return ScaleFrame(stacked)

In [12]:
# Wrapped Pong environment shared by the agent-setup and training cells.
env = make_env('PongNoFrameskip-v4')

In [13]:
# Run configuration: number of episodes and the score a new average must beat.
num_games = 250
# NOTE(review): load_checkpoint is not read by any cell visible here.
load_checkpoint = False
best_score = -21

# Agent hyper-parameters; input_dims is passed through to the networks and
# the replay buffer.
agent = Agent(
    alpha=0.0001,
    gamma=0.99,
    n_actions=6,
    epsilon=1.0,
    batch_size=32,
    replace=1000,
    input_dims=(4,80,80),
    eps_dec=1e-5,
    eps_min=0.02,
    mem_size=25000,
)

In [14]:
# Training loop: play `num_games` episodes, learning after every step.
scores = []       # total reward of each finished episode
eps_history = []  # epsilon at the end of each episode
n_steps = 0       # environment steps taken across all episodes
for i in range(num_games):
    score = 0
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        observation_,reward,done,info = env.step(action)
        n_steps += 1
        score += reward

        agent.store_transition(observation,action,reward,observation_,int(done))
        agent.learn()

        observation = observation_
    scores.append(score)
    # Previously this list was created but never filled.
    eps_history.append(agent.epsilon)
    # Running average over at most the last 100 episodes.
    avg_score = np.mean(scores[-100:])
    print('episode: ',i,'score %.2f' % score,
             'average_score %.2f' % avg_score,
             'epsilone %.2f' % agent.epsilon)
    # Checkpoint whenever the running average reaches a new best.
    if avg_score > best_score:
        agent.save_model()
        best_score = avg_score
    
    

episode:  0 score -19.00 average_score -19.00 epsilone 0.99
episode:  1 score -18.00 average_score -18.50 epsilone 0.98
episode:  2 score -21.00 average_score -19.33 epsilone 0.97
episode:  3 score -19.00 average_score -19.25 epsilone 0.96
episode:  4 score -21.00 average_score -19.60 epsilone 0.95
episode:  5 score -20.00 average_score -19.67 epsilone 0.94
episode:  6 score -21.00 average_score -19.86 epsilone 0.93
episode:  7 score -20.00 average_score -19.88 epsilone 0.92
episode:  8 score -19.00 average_score -19.78 epsilone 0.91
episode:  9 score -19.00 average_score -19.70 epsilone 0.91
episode:  10 score -21.00 average_score -19.82 epsilone 0.90
episode:  11 score -20.00 average_score -19.83 epsilone 0.89
episode:  12 score -19.00 average_score -19.77 epsilone 0.88
episode:  13 score -21.00 average_score -19.86 epsilone 0.87
episode:  14 score -21.00 average_score -19.93 epsilone 0.86
episode:  15 score -19.00 average_score -19.88 epsilone 0.85
episode:  16 score -21.00 average_

episode:  136 score 19.00 average_score -0.24 epsilone 0.02
episode:  137 score 20.00 average_score 0.16 epsilone 0.02
episode:  138 score 21.00 average_score 0.58 epsilone 0.02
episode:  139 score 19.00 average_score 0.98 epsilone 0.02
episode:  140 score 17.00 average_score 1.36 epsilone 0.02
episode:  141 score 21.00 average_score 1.77 epsilone 0.02
episode:  142 score 20.00 average_score 2.18 epsilone 0.02
episode:  143 score 21.00 average_score 2.57 epsilone 0.02
episode:  144 score 7.00 average_score 2.84 epsilone 0.02
episode:  145 score 12.00 average_score 3.17 epsilone 0.02
episode:  146 score 12.00 average_score 3.49 epsilone 0.02
episode:  147 score 7.00 average_score 3.76 epsilone 0.02
episode:  148 score 12.00 average_score 4.07 epsilone 0.02
episode:  149 score 13.00 average_score 4.39 epsilone 0.02
episode:  150 score 21.00 average_score 4.80 epsilone 0.02
episode:  151 score 19.00 average_score 5.17 epsilone 0.02
episode:  152 score 5.00 average_score 5.39 epsilone 0.02