<a href="https://colab.research.google.com/github/atulgupta01/EVA_Group_Assignment/blob/master/P2S9/evap2assignment9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import random
import time
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot as plt
from torch.autograd import Variable

import gym
import pybullet_envs
from gym import wrappers

**Step 1:**

In [0]:
#====================================
# ReplayBuffer class
# __init__ takes max_size (default 1e6), the maximum number of stored transitions
# transitions are appended to the replay memory until it holds max_size entries
# once the buffer is full, each new transition overwrites the oldest one
# the sample function takes a batch size and picks that many random transitions from storage;
# it collects the states, next states, actions, rewards and dones separately and returns them
#====================================

In [0]:
class ReplayBuffer(object):
    """Fixed-capacity store of transitions with uniform random sampling.

    Each transition is a ``(state, next_state, action, reward, done)`` tuple.
    Transitions are appended until ``max_size`` entries are held; after that
    every new transition overwrites the oldest stored one.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. Floats such as
                ``1e6`` are accepted and truncated to an integer.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        # Keep the capacity as an int so the "is full" comparison and the
        # write-pointer arithmetic are exact and usable as list indices.
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        """Store one ``(state, next_state, action, reward, done)`` tuple."""
        if len(self.storage) >= self.max_size:
            # Buffer is full: overwrite the oldest slot and advance the
            # circular write pointer.
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A 5-tuple of NumPy arrays ``(states, next_states, actions,
            rewards, dones)``. ``rewards`` and ``dones`` are reshaped to
            column vectors of shape ``(batch_size, 1)``.

        Raises:
            ValueError: If the buffer is empty (raised by
                ``np.random.randint``).
        """
        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch_states, batch_next_states = [], []
        batch_actions, batch_rewards, batch_dones = [], [], []
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            # np.asarray avoids a copy when the item is already an ndarray
            # and still works when a copy is unavoidable.
            batch_states.append(np.asarray(state))
            batch_next_states.append(np.asarray(next_state))
            batch_actions.append(np.asarray(action))
            batch_rewards.append(np.asarray(reward))
            batch_dones.append(np.asarray(done))
        return (
            np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards).reshape(-1, 1),
            np.array(batch_dones).reshape(-1, 1),
        )

In [0]:

#====================================
# Actor Model Class
# inputs --> state dimensions, action dimensions, max action limit (to limit action predicted by some limit)
# output --> returns predicted action (max action is multiplied at the last layer output to limit the action taken)
#====================================

In [0]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to bounded actions.

    Three fully connected layers (``state_dims -> 400 -> 300 -> action_dim``)
    with ReLU on the hidden layers. The final output goes through ``tanh``
    and is scaled by ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Build the layers.

        Args:
            state_dims: Number of input features of the first linear layer.
            action_dim: Number of output features of the last linear layer.
            max_action: Scale applied to the ``tanh`` output in ``forward``.
        """
        super(Actor, self).__init__()
        self.layer_1 = nn.Linear(state_dims, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        """Return the action predicted for state batch ``x``."""
        x = F.relu(self.layer_1(x))
        x = F.relu(self.layer_2(x))
        # tanh squashes to [-1, 1]; scaling by max_action bounds the action
        # to the range given at construction time.
        return self.max_action * torch.tanh(self.layer_3(x))

In [0]:
#====================================
# Critic Model Class
# Two critic networks are defined inside this one class
# the forward function predicts a Q-value from each critic for a given state and action
# only the first critic is used to update the actor model (somewhat like GAN theory), so
# Q1 runs just the first critic network; its output is what we use to update the actor network
#====================================

In [0]:
class Critic(nn.Module):
    """Twin Q-networks that score a (state, action) pair.

    Both critics have the same shape
    (``state_dims + action_dim -> 400 -> 300 -> 1``) but separate weights:
    ``layer_1..layer_3`` form the first critic and ``layer_4..layer_6`` the
    second. The output layers are linear (no activation), so Q-values may
    be negative.
    """

    def __init__(self, state_dims, action_dim):
        """Build both critic networks.

        Args:
            state_dims: Number of state features.
            action_dim: Number of action features; states and actions are
                concatenated before the first layer of each critic.
        """
        super(Critic, self).__init__()  # activate the inheritance
        # First Critic Network
        self.layer_1 = nn.Linear(state_dims + action_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # Second Critic Network
        self.layer_4 = nn.Linear(state_dims + action_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def forward(self, x, u):  # x -> state, u -> action
        """Return ``(q1, q2)``, the Q-value from each critic."""
        xu = torch.cat([x, u], 1)  # concat along axis 1
        # forward propagation on the first critic
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        # forward propagation on the second critic; the head is layer_6 and
        # stays linear, mirroring the first critic
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        x2 = self.layer_6(x2)

        return x1, x2

    def Q1(self, x, u):  # x -> state, u -> action
        """Return only the first critic's Q-value (used for the actor update)."""
        xu = torch.cat([x, u], 1)  # concat along axis 1
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        x1 = self.layer_3(x1)
        return x1

In [0]:
#====================================
# Training Process
# we have 2 actors: the actor model (trained using backpropagation) and the actor target (updated using Polyak averaging)
# critic 1 and critic 2 have the same network structure
# we have 2 copies of critic 1: the critic 1 model (trained using backpropagation) and the critic 1 target (updated using Polyak averaging)
# we have 2 copies of critic 2: the critic 2 model (trained using backpropagation) and the critic 2 target (updated using Polyak averaging)
#====================================

In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Train Block
class T3D(object):
    """Twin-delayed actor-critic agent (actor, twin critic and their targets).

    The actor and critic are trained by gradient descent with Adam; the
    target copies start with identical weights and are moved towards the
    trained models by Polyak averaging.
    """

    def __init__(self, state_dims, action_dim, max_action):
        """Build the models, their targets and the optimizers.

        Args:
            state_dims: Number of state features.
            action_dim: Number of action features.
            max_action: Bound on each action component; also used to clamp
                the noisy target action during training.
        """
        # making sure our T3D class can work with any env

        # load actor and actor_target, initialize actor_target with actor weights
        self.actor = Actor(state_dims, action_dim, max_action).to(device)  # GD
        self.actor_target = Actor(state_dims, action_dim,
                                  max_action).to(device)  # Polyak Avg
        # state_dict must be *called*: load_state_dict expects the mapping
        # of parameters, not the bound method
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        # load critic and critic_target, initialize critic_target with critic weights
        self.critic = Critic(state_dims, action_dim).to(device)  # GD
        self.critic_target = Critic(state_dims,
                                    action_dim).to(device)  # Polyak Avg
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        """Return the actor's action for one state as a flat NumPy array."""
        # reshape to a batch of one before feeding the network
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        # convert to numpy
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self,
              replay_buffer,
              iterations,
              batch_size=100,
              discount=0.99,
              tau=0.005,
              policy_noise=0.2,
              noise_clip=0.5,
              policy_freq=2):
        """Run ``iterations`` training steps on batches from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: Number of gradient steps to perform.
            batch_size: Number of transitions sampled per step.
            discount: Discount factor applied to the target Q-value.
            tau: Polyak averaging coefficient for the target networks.
            policy_noise: Standard deviation of the Gaussian noise added to
                the target action.
            noise_clip: Absolute bound applied to that noise.
            policy_freq: The actor and the target networks are updated once
                every ``policy_freq`` iterations.
        """
        for it in range(iterations):
            ################ Step 4 ######################
            # sample a batch of transitions (s, s', a, r, done) from memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(
                batch_size)
            state = torch.Tensor(batch_states).to(device)
            next_state = torch.Tensor(batch_next_states).to(device)
            action = torch.Tensor(batch_actions).to(device)
            # force column vectors so reward/done combine element-wise with
            # the (batch, 1) Q-values instead of broadcasting
            reward = torch.Tensor(batch_rewards).to(device).reshape(-1, 1)
            done = torch.Tensor(batch_dones).to(device).reshape(-1, 1)

            ################# Step 5 ################
            # from the next state s', the actor target plays the next action a'
            next_action = self.actor_target.forward(next_state)

            ################# Step 6 ################
            # We add Gaussian noise to this next action a'
            # and we clamp it in the range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(
                0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action,
                                                      self.max_action)

            ################# Step 7 ################
            # The two critic targets each take the couple (s', a') as input
            # and return two Q-values
            target_Q1, target_Q2 = self.critic_target.forward(
                next_state, next_action)

            ################# Step 8 ################
            # We keep the minimum of these two Q-values
            target_Q = torch.min(target_Q1, target_Q2)

            ################# Step 9 ################
            # Bellman target; detached so no gradient flows into the targets
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            ################# Step 10 ################
            # The two critic models each take the couple (s, a) as input and
            # return two Q-values
            current_Q1, current_Q2 = self.critic.forward(state, action)

            ################# Step 11 ################
            # We compute the loss coming from the 2 critic models
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                current_Q2, target_Q)

            ################# Step 12 ################
            # Backpropagate the critic loss and update the parameters of the
            # 2 critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()  # clear previous gradients
            critic_loss.backward()  # computing the gradients
            self.critic_optimizer.step()  # performing the weight updates

            ################# Step 13 ################
            # Once every policy_freq iterations, we update our actor model by
            # performing gradient ascent on the output of the first critic
            if it % policy_freq == 0:
                # This is the DPG part. The optimizer minimises, so the loss
                # is the *negative* Q-value to obtain gradient ascent on Q.
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()  # clear previous gradients
                actor_loss.backward()  # computing the gradients
                self.actor_optimizer.step()  # performing the weight updates

                ################# Step 14 ################
                # Still once every policy_freq iterations, we update the
                # weights of the actor target by Polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

                ################# Step 15 ################
                # Still once every policy_freq iterations, we update the
                # weights of the critic target by Polyak averaging
                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)