In [ ]:
import tensorflow as tf
import tensorflow.keras.layers as layers
import gym
import numpy as np
import random
import math

In [ ]:
def buildActor(state_dim=8, n_actions=4):
    """Build the policy (actor) network.

    A fully connected network that maps an observation vector to one
    unnormalised logit per discrete action. No softmax is applied here;
    callers are expected to sample from / normalise the logits themselves.

    Args:
        state_dim: Length of the observation vector. Defaults to 8, the
            value that was previously hard-coded.
        n_actions: Number of output logits. Defaults to 4, the value that
            was previously hard-coded.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # ``(8)`` is just the int 8, not a tuple; pass a real 1-tuple so the
    # input shape does not depend on Keras coercing a bare int.
    inputs = tf.keras.Input(shape=(state_dim,))
    x = layers.Dense(64, activation='relu')(inputs)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    logits = layers.Dense(n_actions)(x)
    return tf.keras.Model(inputs=inputs, outputs=logits)

In [ ]:
def buildCritic(state_dim=8):
    """Build the state-value (critic) network.

    A fully connected network that maps an observation vector to a single
    scalar output with no activation.

    Args:
        state_dim: Length of the observation vector. Defaults to 8, the
            value that was previously hard-coded.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    # ``(8)`` is just the int 8, not a tuple; pass a real 1-tuple so the
    # input shape does not depend on Keras coercing a bare int.
    inputs = tf.keras.Input(shape=(state_dim,))
    x = layers.Dense(32, activation='relu')(inputs)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32, activation='relu')(x)
    value = layers.Dense(1)(x)

    return tf.keras.Model(inputs=inputs, outputs=value)

In [ ]:
# Environment the agent is trained on.
env = gym.make('LunarLander-v2')

# Policy and value networks (actor is built first, then the critic).
actor, critic = buildActor(), buildCritic()

# Each network gets its own Adam optimizer with default settings.
actor_optimizer, critic_optimizer = (
    tf.keras.optimizers.Adam(),
    tf.keras.optimizers.Adam(),
)

# Mean-squared-error loss object used for the critic update.
critic_loss = tf.keras.losses.MeanSquaredError()

In [ ]:
# Hyperparameters.
# Only GAMMA is read by the code visible in this file.
# NOTE(review): the remaining names look like leftovers from another agent
# (presumably an epsilon schedule, batch size and soft-update rate) --
# confirm nothing else depends on them before removing.

# Discount factor applied to the critic's next-state estimate.
GAMMA = 0.95

MIN_EPSILON = 0.01
MAX_EPSILON = 1
LAMBDA = 5e-05
TAU = 0.08
BS = 128

In [ ]:
# Counter initialised to zero.
# NOTE(review): nothing in the visible code reads or updates it -- confirm
# whether it is still needed.
step = 0

In [ ]:
# When True, the training loop calls env.render() on every environment step.
render = True

In [ ]:
class Memory:
    """Append-only store of (state, action, reward) records.

    The three lists are kept in lockstep: index ``i`` of each list belongs
    to the same ``store`` call.
    """

    def __init__(self):
        self.states, self.actions, self.rewards = [], [], []

    def store(self, state, action, reward):
        """Append one record to the end of each list."""
        for bucket, item in (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
        ):
            bucket.append(item)

    def clear(self):
        """Drop all records by rebinding each attribute to a fresh list."""
        self.states, self.actions, self.rewards = [], [], []
        

In [ ]:
# One-step (online) actor-critic training loop.
#
# Every environment step performs one actor update and one critic update:
#   * critic: regress V(s) towards the TD target r + GAMMA * V(s')
#   * actor:  policy-gradient step weighted by the TD error (target - V(s))
#
# The loop uses the classic Gym API: ``env.reset()`` returns the observation
# and ``env.step()`` returns ``(next_state, reward, done, info)``.
try:
    for episode in range(1000):

        done = False
        state = np.array(env.reset())
        totalReward = 0
        loss_value = 0

        while not done:
            if render:
                env.render()

            # Only the forward passes and losses are recorded. The tape is
            # persistent because two separate gradients are taken from it.
            with tf.GradientTape(persistent=True) as t:

                logits = actor(state[None, :])
                action = tf.random.categorical(
                    logits, 1, dtype=tf.int32).numpy()[0, 0]

                next_state, reward, done, info = env.step(action)
                totalReward += reward

                # A terminal state has no future return, so do not bootstrap
                # from the critic's estimate once the episode has ended.
                not_terminal = 0.0 if done else 1.0
                # The target is treated as a constant (semi-gradient TD):
                # gradients must flow only through V(s), not through V(s').
                target = tf.stop_gradient(
                    reward + GAMMA * not_terminal * critic(next_state[None, :]))
                value = critic(state[None, :])
                td_error = target - value

                prob = tf.nn.softmax(logits)
                # The small constant keeps log() finite when the sampled
                # action's probability underflows to zero.
                actor_loss = (-tf.math.log(prob[0, action] + 1e-5)
                              * tf.stop_gradient(td_error))

                loss_value = critic_loss(target, value)

            # Take gradients outside the tape context so the backward pass
            # itself is not recorded, then release the persistent tape.
            actor_grads = t.gradient(actor_loss, actor.trainable_variables)
            critic_grads = t.gradient(loss_value, critic.trainable_variables)
            del t

            actor_optimizer.apply_gradients(
                zip(actor_grads, actor.trainable_variables))
            critic_optimizer.apply_gradients(
                zip(critic_grads, critic.trainable_variables))

            state = next_state
        print(episode, totalReward)
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()