<a href="https://colab.research.google.com/github/abhisheksuran/Atari_DQN/blob/master/A2C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf 
import gym
import tensorflow_probability as tfp

In [2]:
# Shell escape: install the box2d-py package with pip.
# NOTE(review): presumably required by the LunarLander environment created later — confirm.
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 18.3MB/s eta 0:00:01[K     |█▌                              | 20kB 3.8MB/s eta 0:00:01[K     |██▏                             | 30kB 4.9MB/s eta 0:00:01[K     |███                             | 40kB 5.1MB/s eta 0:00:01[K     |███▋                            | 51kB 5.1MB/s eta 0:00:01[K     |████▍                           | 61kB 5.8MB/s eta 0:00:01[K     |█████▏                          | 71kB 6.0MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.0MB/s eta 0:00:01[K     |██████▋                         | 92kB 6.1MB/s eta 0:00:01[K     |███████▎                        | 102kB 6.2MB/s eta 0:00:01[K     |████████                        | 112kB 6.2MB/s eta 0:00:01[K     |████████▊                       | 12

In [3]:
# Create the LunarLander environment and keep the lower/upper bounds of its
# observation space for later inspection.
env = gym.make("LunarLander-v2")
low, high = env.observation_space.low, env.observation_space.high

In [4]:
class critic(tf.keras.Model):
  """Value network: two ReLU hidden layers (2048, 1536 units) and one linear output unit."""

  def __init__(self):
    super().__init__()
    dense = tf.keras.layers.Dense
    self.d1 = dense(2048, activation='relu')
    self.d2 = dense(1536, activation='relu')
    # Linear head (no activation): one unbounded scalar per input row.
    self.v = dense(1, activation=None)

  def call(self, input_data):
    """Feed `input_data` through d1 -> d2 -> v and return the output of `v`."""
    return self.v(self.d2(self.d1(input_data)))
    

class actor(tf.keras.Model):
  """Policy network: two ReLU hidden layers followed by a softmax over actions.

  Args:
    n_actions: number of discrete actions, i.e. the width of the softmax
      output layer. Defaults to 4, the value that used to be hard-coded, so
      `actor()` builds exactly the same network as before.
  """

  def __init__(self, n_actions=4):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(2048,activation='relu')
    self.d2 = tf.keras.layers.Dense(1536,activation='relu')
    # Softmax head: each output row is a probability distribution over actions.
    self.a = tf.keras.layers.Dense(n_actions,activation='softmax')

  def call(self, input_data):
    """Return the action probabilities for a batch of inputs.

    Args:
      input_data: batch of inputs fed to the first dense layer.

    Returns:
      Output of the softmax layer, one row of `n_actions` probabilities per
      input row.
    """
    x = self.d1(input_data)
    x = self.d2(x)
    return self.a(x)
    

In [5]:
class agent():
    """Actor-critic agent that is updated after every single transition.

    The actor outputs action probabilities and the critic outputs a value
    estimate; both are trained from the one-step TD error
    `reward + gamma * V(next_state) * (1 - done) - V(state)`.

    Args:
        gamma: discount factor applied to the next-state value.
    """

    def __init__(self, gamma = 0.99):
        self.gamma = gamma
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=5e-6)
        self.c_opt = tf.keras.optimizers.Adam(learning_rate=5e-6)
        self.actor = actor()
        self.critic = critic()
        # Not read by any method of this class; kept so that existing code
        # accessing the attribute keeps working.
        self.log_prob = None

    def act(self,state):
        """Sample an action for a single state from the actor's distribution.

        Args:
            state: one un-batched observation; it is wrapped into a batch of
                size one before being passed to the actor.

        Returns:
            The sampled action index as a Python int.
        """
        prob = self.actor(np.array([state]))
        # The probabilities tensor can be passed straight to the distribution;
        # no detour through numpy is needed.
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])

    def actor_loss(self, prob, action, td):
        """Policy-gradient loss `-log pi(action) * td`.

        Args:
            prob: action probabilities produced by the actor.
            action: the action that was taken.
            td: TD error used as the weight of the log-probability. It is
                treated as a constant, so no gradient flows through it.

        Returns:
            The loss tensor (log-probability broadcast against `td`).
        """
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        # The TD error only scales the policy gradient; it must not be
        # differentiated itself.
        return -log_prob * tf.stop_gradient(td)

    def learn(self, state, action, reward, next_state, done):
        """Run one actor update and one critic update from a single transition.

        Args:
            state: observation before the action (un-batched).
            action: action that was taken in `state`.
            reward: reward received for the transition.
            next_state: observation after the action (un-batched).
            done: truthy if the transition ended the episode; the next-state
                value is then excluded from the target.

        Returns:
            Tuple `(a_loss, c_loss)` of the actor and critic loss tensors.
        """
        state = np.array([state])
        next_state = np.array([next_state])
        not_terminal = 1 - int(done)
        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            p = self.actor(state, training=True)
            v = self.critic(state, training=True)
            vn = self.critic(next_state, training=True)
            # Semi-gradient TD: the bootstrap target is a fixed regression
            # target. Without stop_gradient the critic update would also
            # differentiate through V(next_state) and move the target
            # together with the prediction.
            target = tf.stop_gradient(reward + self.gamma*vn*not_terminal)
            td = target - v
            a_loss = self.actor_loss(p, action, td)
            c_loss = td**2
        grads1 = tape1.gradient(a_loss, self.actor.trainable_variables)
        grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
        return a_loss, c_loss

In [None]:
# Online training: a fresh agent is updated once per environment step.
agentoo7 = agent()
steps = 10000  # number of outer iterations; each one runs a full reset-to-done episode
for s in range(steps):
  # Per-episode bookkeeping.
  state = env.reset()
  done = False
  total_reward = 0
  all_aloss, all_closs = [], []

  while not done:
    action = agentoo7.act(state)
    next_state, reward, done, _ = env.step(action)
    aloss, closs = agentoo7.learn(state, action, reward, next_state, done)
    all_aloss.append(aloss)
    all_closs.append(closs)
    total_reward += reward
    state = next_state

  # The loop above only exits once `done` is set, so this reports the finished episode.
  print("total reward after {} steps is {}".format(s, total_reward))

total reward after 0 steps is -265.3146739973837
total reward after 1 steps is -164.54026395057352
total reward after 2 steps is -292.58368525671295
total reward after 3 steps is -169.9464951206346
total reward after 4 steps is -89.54967405103599
total reward after 5 steps is -96.27658794860051
total reward after 6 steps is -146.28894348036465
total reward after 7 steps is -120.01497235023736
total reward after 8 steps is -134.00506031965483
total reward after 9 steps is -368.92938353995595
total reward after 10 steps is -106.84171341454652
total reward after 11 steps is -337.81389062157854
total reward after 12 steps is -160.0976083204911
total reward after 13 steps is -191.5539615256103
total reward after 14 steps is -167.3316617927939
total reward after 15 steps is -247.38520710613622
total reward after 16 steps is -87.2054730629381
total reward after 17 steps is -248.50373841208412
total reward after 18 steps is -206.089520535602
total reward after 19 steps is -231.00469720370455
t

In [None]:
# Draw one sample from the environment's action space; as the last expression
# in the cell, its value is what the notebook displays.
env.action_space.sample()