In [ ]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import gym
import numpy as np
import random
import math

In [ ]:
def buildModel(state_dim=8, num_actions=4):
    """Build the fully connected Q-network.

    The network maps a state vector to one output per action through three
    ReLU hidden layers of 32, 64 and 32 units. The output layer is linear
    (no activation).

    Args:
        state_dim: Length of the input state vector. Defaults to 8.
        num_actions: Number of outputs of the final layer. Defaults to 4.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # `shape` must be a tuple: `(8)` is just the integer 8, which newer Keras
    # versions reject. The trailing comma makes it a 1-tuple.
    inputs = tf.keras.Input(shape=(state_dim,))
    x = layers.Dense(32, activation='relu')(inputs)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(num_actions)(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [ ]:
class Memory():
    """Fixed-capacity replay buffer that discards the oldest entry when full.

    Entries are kept in ``self.memory`` (a list). Once ``size`` entries are
    stored, each new entry overwrites the oldest one in place, so adding is
    O(1) instead of shifting the whole list. As a consequence the list is not
    kept in insertion order after the buffer has wrapped around.
    """

    def __init__(self, size):
        """Create an empty buffer holding at most ``size`` entries."""
        self.size = size
        self.memory = []
        # Index of the oldest entry, i.e. the next slot to overwrite once full.
        self._oldest = 0

    def sample(self, n):
        """Return up to ``n`` entries drawn uniformly *with replacement*.

        ``n`` is capped at the buffer capacity. An empty list is returned when
        the buffer holds nothing or when ``n`` is not positive.
        """
        k = min(n, self.size)
        if k <= 0 or not self.memory:
            return []
        return random.choices(self.memory, k=k)

    def add(self, sample):
        """Store ``sample``, evicting the oldest entry if the buffer is full."""
        if self.size <= 0:
            # A buffer without capacity never retains anything.
            return
        if len(self.memory) < self.size:
            self.memory.append(sample)
            return
        self.memory[self._oldest] = sample
        self._oldest = (self._oldest + 1) % self.size

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.memory)
        

In [ ]:
# Environment the agent interacts with.
env = gym.make('LunarLander-v2')

# Two networks with the same architecture: `online` receives the gradient
# updates, `target` is blended towards it in small steps during training.
online = buildModel()
target = buildModel()
# Start the target network as an exact copy of the online network; otherwise
# the first bootstrap targets come from an unrelated random initialization.
target.set_weights(online.get_weights())

optimizer=tf.keras.optimizers.Adam()
loss = tf.keras.losses.MeanSquaredError()

# Replay buffer of (state, action, reward, next_state, done) tuples.
memory = Memory(200000)

In [ ]:
# Exploration probability at step 0 of the epsilon schedule.
MAX_EPSILON = 1
# Value the exploration probability decays towards.
MIN_EPSILON = 0.01
# Rate of the exponential epsilon decay, applied per environment step.
LAMBDA = 0.00005
# Discount factor applied to the next-state value in the training target.
GAMMA = 0.95
# Number of transitions sampled per training step; training only starts once
# the replay buffer holds more than this many entries.
BS = 128
# Blend factor of the target-network update: target = (1 - TAU) * target + TAU * online.
TAU = 0.08

In [ ]:
def train(memory, online, target):
    """Run one Double-DQN update on a batch sampled from ``memory``.

    The next action is chosen with the online network and evaluated with the
    target network. The online network is then fitted to
    ``reward + GAMMA * Q_target(next_state, next_action)``, with the bootstrap
    term dropped for terminal transitions. Finally the target network is moved
    towards the online network by a factor of ``TAU``.

    Uses the module-level ``BS``, ``GAMMA``, ``TAU``, ``optimizer`` and
    ``loss``.

    Args:
        memory: Replay buffer whose ``sample(n)`` returns a sequence of
            ``(state, action, reward, next_state, done)`` tuples.
        online: Model being trained; called on batches of states.
        target: Model providing the bootstrap values; updated in place.

    Returns:
        The scalar loss of this update.
    """
    batch = memory.sample(BS)
    states, actions, rewards, next_states, dones = (
        np.array(column) for column in zip(*batch)
    )

    # The bootstrap target is a constant with respect to the optimized
    # weights, so it is computed outside the gradient tape.
    next_actions = tf.argmax(online(next_states), axis=1)
    next_values = tf.squeeze(
        tf.gather(target(next_states), next_actions[:, None], batch_dims=1),
        axis=1,
    )
    # 1.0 for non-terminal transitions, 0.0 for terminal ones.
    not_done = 1.0 - tf.cast(dones, tf.float32)
    target_q = tf.cast(rewards, tf.float32) + GAMMA * not_done * next_values

    with tf.GradientTape() as tape:
        q = online(states)
        # Q-value of the action actually taken, shape (batch,) to match target_q.
        action_q = tf.squeeze(
            tf.gather(q, actions[:, None], batch_dims=1), axis=1
        )
        loss_value = loss(target_q, action_q)

    grads = tape.gradient(loss_value, online.trainable_variables)
    optimizer.apply_gradients(zip(grads, online.trainable_variables))

    # Soft update of the target network towards the online network.
    for online_var, target_var in zip(online.trainable_variables,
                                      target.trainable_variables):
        target_var.assign(target_var * (1 - TAU) + online_var * TAU)

    return loss_value

In [ ]:
# Current exploration probability; the training loop recomputes it after every step.
e = MAX_EPSILON
# Count of environment steps taken so far across all episodes; drives the epsilon decay.
step = 0

In [ ]:
# When True, the training loop calls env.render() before every step.
render = True

In [ ]:
# Epsilon-greedy training loop: act, store the transition, train on a sampled
# batch, and decay epsilon after every environment step.
try:
    for episode in range(1000):

        done = False
        state = np.array(env.reset())
        totalReward = 0

        while not done:
            if render:
                env.render()

            if np.random.rand() < e:
                # Explore: uniformly random action.
                action = np.random.randint(low=0, high=env.action_space.n)
            else:
                # Exploit: greedy action of the online network. The forward
                # pass is only run when its result is actually used.
                q = online(state[None, :])
                action = tf.argmax(q, axis=1).numpy()[0]

            next_state, reward, done, info = env.step(action)

            memory.add((state, action, reward, next_state, done))

            # Train only once the buffer holds more than one batch of entries.
            if len(memory) > BS:
                train(memory, online, target)

            state = next_state

            step += 1
            # Exponential decay from MAX_EPSILON towards MIN_EPSILON.
            e = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * step)
            totalReward += reward

        print(episode, totalReward)
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()