In [6]:
import gym
import matplotlib.pyplot as plt
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as functions
import numpy as np
import random
from torch.distributions.normal import Normal
%matplotlib inline

In [7]:
class ActorNet(nn.Module):
    """Three-layer MLP that parameterises a Gaussian over a scalar action.

    Input is a 2-D batch of 3-feature rows. For every row the network
    returns two numbers: column 0 is consumed by callers as the mean of a
    ``Normal`` distribution and column 1 as its standard deviation, which is
    floored at 0.1.
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they must not change.
        self.fc1 = nn.Linear(3, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 2)    # column 0: mean, column 1: std dev

    def forward(self, x):
        """Return a ``(batch, 2)`` tensor of ``[mean, std]`` rows for ``x``."""
        hidden = functions.relu(self.fc1(x))
        hidden = functions.relu(self.fc2(hidden))
        raw = self.fc3(hidden)

        mean = raw[:, :1]
        # Floor the second column so it can be used as a Normal scale.
        std = torch.clamp(raw[:, 1:], min=0.1)
        return torch.cat((mean, std), dim=1)

In [8]:
from collections import deque
 
class ReplayBuffer:
    """Bounded FIFO store of transition tuples with uniform random sampling.

    Each stored item is expected to be a 5-tuple
    ``(ob, action, reward, newob, done)``; ``sample`` unpacks exactly five
    fields.
    """

    def __init__(self,capacity):
        """Create a buffer that keeps at most ``capacity`` transitions."""
        # deque(maxlen=...) drops the oldest entry automatically once full.
        self.memory = deque(maxlen=capacity)

    def push(self,tup_le):
        """Append one transition tuple, evicting the oldest one if full."""
        self.memory.append(tup_le)

    def sample(self,sample_size):
        """Draw up to ``sample_size`` distinct transitions uniformly at random.

        When fewer than ``sample_size`` transitions are stored, every stored
        transition is returned (in random order) instead of failing.

        Returns:
            Five tuples ``(ob, action, reward, newob, done)``, each holding
            one field of the sampled transitions.

        Raises:
            ValueError: if the buffer is empty or ``sample_size`` is less
                than 1.
        """
        if sample_size < 1:
            raise ValueError('sample_size must be at least 1, got {}'.format(sample_size))
        if not self.memory:
            raise ValueError('cannot sample from an empty ReplayBuffer')

        # Clamp explicitly rather than catching whatever random.sample raises,
        # so unrelated errors are no longer swallowed.
        count = min(sample_size, len(self.memory))
        batch = random.sample(self.memory, count)
        ob, action, reward, newob, done = zip(*batch)

        return ob,action,reward,newob,done

In [16]:
class Agent:
    """Actor-critic agent that rolls out episodes in a gym-style environment.

    The actor's output row is read as ``[mean, std]`` of a Normal distribution
    over a single scalar action. The critic is fitted to one-step TD targets
    that are evaluated with a separate ``fixed_net_Critic``.

    Tensors are moved to whatever device each network's parameters live on,
    so the class does not rely on a notebook-level ``device`` variable.
    """

    # Divisor applied to the summed episode reward in ``gen_episode``
    # (presumably the fixed episode length of the target env -- confirm).
    REWARD_SCALE = 200

    def __init__(self,env,Actor,Actor_optimizer,Critic,fixed_net_Critic,Critic_optimizer,buffercap,gamma = 0.99):
        """Store the networks, their optimizers, the environment and a replay buffer.

        Args:
            env: environment exposing ``reset()``, ``step(action)`` and ``render()``.
            Actor: policy network; ``Actor(obs)[0]`` is read as ``[mean, std]``.
            Actor_optimizer: optimizer over the actor's parameters.
            Critic: state-value network trained by ``train_critic``.
            fixed_net_Critic: network used to evaluate next-state values for
                the critic's targets.
            Critic_optimizer: optimizer over the critic's parameters.
            buffercap: capacity of the replay buffer.
            gamma: discount factor.
        """
        #Critic
        self.Critic = Critic
        self.fixed_net_Critic = fixed_net_Critic
        self.Critic_optimizer = Critic_optimizer

        #Actor
        self.Actor = Actor
        self.Actor_optimizer = Actor_optimizer

        #Replay Memory and ENV
        self.gamma = gamma
        self.buffer = ReplayBuffer(buffercap)
        self.env = env

    @staticmethod
    def _module_device(module):
        """Return the device of ``module``'s first parameter (CPU if it has none)."""
        first_param = next(module.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def gen_episode(self,render = False,not_training = False):
        """Run one episode, optionally rendering and/or training on every step.

        Args:
            render: when true, render ``self.env`` before the episode and
                before every step, pausing briefly so it can be watched.
            not_training: when true, only act; otherwise call ``train_actor``
                on each new transition and ``train_critic`` once per step.

        Returns:
            The summed episode reward divided by ``REWARD_SCALE``.

        Raises:
            ValueError: if the actor emits parameters ``Normal`` rejects.
        """
        actor_device = self._module_device(self.Actor)

        ob = self.env.reset()
        done = False
        reward_count = 0
        if render:
            # Render the agent's own environment, not a notebook global.
            self.env.render()
            time.sleep(2)

        while not done:
            if render:
                self.env.render()
                time.sleep(0.15)

            ## get action
            with torch.no_grad():
                out = self.Actor(torch.from_numpy(ob.reshape(1,-1)).float().to(actor_device))
                mean, std = out[0,0], out[0,1]
                try:
                    dist = Normal(mean,std)
                except ValueError as err:
                    # Surface the offending values instead of printing and retrying.
                    raise ValueError(
                        'Actor produced invalid Normal parameters: mean={}, std={}'.format(mean, std)
                    ) from err
                action = dist.sample().cpu().numpy().reshape(1)   # because environment requires output in shape of 1

            newob, rew, done, _ = self.env.step(action)
            transition = (ob,action,rew,newob,done)
            self.buffer.push(transition)
            if not not_training:
                self.train_actor(transition)
            ob = newob
            reward_count += rew
            if not not_training:
                self.train_critic()

        return reward_count/self.REWARD_SCALE

    def train_critic(self,batch_size = 64):
        """Perform one gradient step on the critic from a replay-buffer batch.

        Targets are ``reward + gamma * fixed_net_Critic(next_state) * (1 - done)``
        and the loss is the mean squared error against ``Critic(state)``.
        Assumes both critics return one value per row, shape ``(batch, 1)``
        -- TODO confirm against the critic definition.

        Args:
            batch_size: number of transitions requested from the buffer.
        """
        X,actions,rewards,Y,terminated = self.buffer.sample(batch_size)

        states = torch.from_numpy(np.array(X)).float()
        next_states = torch.from_numpy(np.array(Y)).float()
        rewards = np.array(rewards).reshape(-1,1)
        not_done = 1 - np.array(terminated).reshape(-1,1)

        with torch.no_grad():
            target_device = self._module_device(self.fixed_net_Critic)
            next_values = self.fixed_net_Critic(next_states.to(target_device)).cpu().numpy()
        targets = rewards + self.gamma*next_values*not_done

        critic_device = self._module_device(self.Critic)
        targets = torch.from_numpy(targets).float().to(critic_device)

        self.Critic_optimizer.zero_grad()
        output = self.Critic(states.to(critic_device))
        cost = nn.MSELoss()(output,targets)
        # Only one backward pass runs over this graph, so it need not be retained.
        cost.backward()
        self.Critic_optimizer.step()

    def train_actor(self,tup_le):
        """Perform one policy-gradient step on the actor from a single transition.

        The advantage is the one-step TD error
        ``reward + gamma * V(next_state) * (1 - done) - V(state)`` evaluated
        with ``self.Critic`` (no gradient flows into the critic); the terminal
        mask matches the one used for the critic's targets. The loss is
        ``-log_prob(action) * advantage``.

        Args:
            tup_le: one ``(state, action, reward, next_state, done)`` transition.
        """
        states,actions,rewards,state_dashes,terminated = tup_le
        actor_device = self._module_device(self.Actor)
        critic_device = self._module_device(self.Critic)

        state = torch.from_numpy(np.array(states).reshape(1,-1)).float()
        next_state = torch.from_numpy(np.array(state_dashes).reshape(1,-1)).float()
        reward = torch.from_numpy(np.array(rewards)).float()
        action = torch.from_numpy(np.array(actions)).float().to(actor_device)
        # Do not bootstrap from the value of a state that ended the episode.
        not_done = 1.0 - float(terminated)

        #define advantage function
        with torch.no_grad():
            V_s = self.Critic(state.to(critic_device))
            V_s_dash = self.Critic(next_state.to(critic_device))
            adv = reward.to(critic_device) + self.gamma*V_s_dash*not_done - V_s
            adv = adv.to(actor_device)

        self.Actor_optimizer.zero_grad()
        output = self.Actor(state.to(actor_device))
        dist = Normal(output[0,0],output[0,1])

        # .sum() collapses the single-element product to a true scalar for backward().
        loss = (-1*dist.log_prob(action)*adv).sum()
        loss.backward()
        self.Actor_optimizer.step()



In [10]:
# Use the GPU when one is present; otherwise fall back to the CPU so the
# rest of the notebook still runs on machines without CUDA.
if torch.cuda.is_available():
    print('CUDA Available')
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

CUDA Available


In [11]:
# Restore the saved actor weights and switch to evaluation mode.
Actor = ActorNet()
# map_location remaps the stored tensors onto `device`, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
Actor.load_state_dict(torch.load('InvPenduActor2.pth', map_location=device))
Actor.eval()
Actor.to(device)

ActorNet(
  (fc1): Linear(in_features=3, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=2, bias=True)
)

In [12]:
# Evaluation-only agent: no critic and no optimizers are supplied, and the
# replay buffer is given a capacity of one.
name = 'Pendulum-v0'
env = gym.make(name)
agent = Agent(
    env=env,
    Actor=Actor,
    Actor_optimizer=None,
    Critic=None,
    fixed_net_Critic=None,
    Critic_optimizer=None,
    buffercap=1,
)

In [25]:
# Roll out a single rendered episode without training and show the value
# gen_episode returns (the summed episode reward divided by 200).
checkward = agent.gen_episode(render=True, not_training=True)
print(checkward)

-1.4405231945576675


In [26]:
# Close the environment once the rollout is done (presumably this also
# tears down the window opened by env.render() -- confirm for this gym version).
env.close()