# Gym solving - TRPO or PPO
---
- TRPO(Trust Region Policy Optimization)

- PPO(Proximal Policy Optimization)

In [1]:
# 해당 셀은 필자의 Jupyter notebook 환경 문제로 인해 작성되었습니다
import os
try:
    os.environ["DISPLAY"]
except:
    os.environ["SDL_VIDEODRIVER"] = "dummy"
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gym

import sys
sys.path.append('../material')
from utils import moving_average

from IPython.display import clear_output
from IPython.display import Video

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)
np.random.seed(123)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
env = gym.make('CartPole-v0')
env._max_episode_steps=2000

  logger.warn(


# TRPO or PPO agent 생성

In [6]:
class PO_Agent(nn.Module):
    def __init__(self, state_shape, n_actions):
        super(PO_Agent,self).__init__()
        '''
        입력변수
            state_shape: state 차원 -> [위치, 속도, 각도, 각속도]
            output_dim: actor 차원 -> [왼쪽, 오른쪽]
                        critic 차원 -> 1
            device : cpu, cuda device정보 
        N.N 구조
            2 - hidden layers, 64 nodes
            Activation function -> Relu
        '''
        self.state_shape = state_shape
        self.n_actions = n_actions
        
        self.seq = nn.Sequential(
            nn.Linear(self.state_shape,128), 
            nn.ReLU(),
        )
        self.policy = nn.Sequential(
            nn.Linear(128,self.n_actions)
        )
            
        self.value = nn.Sequential(
            nn.Linear(128,1)
        )
        
    def forward(self, state_t):
        '''
        입력인자
            state_t : 상태([batch,state_shape]), torch.tensor
        출력인자
            policy : 정책([batch,n_actions]), torch.tensor
            value : 가치함수([batch]), torch.tensor
        '''
        policy = self.policy(self.seq(state_t))
        value = self.value(self.seq(state_t)).squeeze(dim=-1)
        return policy, value

    def sample_actions(self,state_t):
        '''
        입력인자
            state_t : 상태([1,state_shape]), torch.tensor
        출력인자
            action_t : 행동함수 using torch.multinomial
        '''
        policy, _ = self.forward(state_t)
        policy = torch.squeeze(policy)
        softmax_policy = F.softmax(policy,dim=0)
        action = torch.multinomial(softmax_policy, num_samples=1).item()
        return action

In [7]:
gamma=0.99
epsilon = 1e-03
delta = 1e-05

state = env.reset()
num_state = state.shape[0]
num_action = env.action_space.n
learning_rate = 5e-04
max_episode = 1000
update_per_episode=10

# Method 1 - TRPO vine
---
Objective function

$J = \mathbb{E}[\frac{\pi{(a \vert s)}}{\pi_{old}(a \vert s)}A(s,a)]-KL(\pi_{old}{( \cdot \vert s)}|\pi( \cdot \vert s)) $

where,

$KL(\pi_{old}{( \cdot \vert s)}|\pi( \cdot \vert s)) \leq \delta$

In [8]:
TRPO_agent = PO_Agent(num_state,num_action).to(device)
optimizer = optim.Adam(TRPO_agent.parameters(),lr=learning_rate)
torch_KL = nn.KLDivLoss(reduction='batchmean')
critic_loss = nn.MSELoss()

In [14]:
def discounted_reward(rewards,gamma=gamma):
    '''
    Return 계산함수
        target_value = R_t + gamma * R_{t+1} + gamma**2 * R_{t+2} ...
    '''
    target_value = []
    discounted_reward = 0
    for reward in reversed(rewards):
        discounted_reward = reward + discounted_reward * gamma
        target_value.insert(0, discounted_reward)
    return target_value

In [9]:


def TRPO_singleloss(transition,train_agent,env,gamma=gamma,delta=delta):
    '''
    TRPO singleloss함수 계산코드(One step ahead!!)
    - 1 step만 앞서 Policy sampling -->> old or current policy
    입력인자
        batch_sample - 리플레이로부터 받은 샘플(S,A,R,S',done)
        train_agent - 훈련에이전트
        env - 환경
        gamma - 할인율
        delta - KL-divergence 범위
    출력인자
        Total_loss
    목적함수 
              |Return|                  |Value|                       |Entropy| 
        (policy/policy_old)*advantage + (value_infer-value_target)**2 + policy*log(policy)
    '''
    #states,actions,rewards,next_state,done = transition
    
    #states = torch.Tensor(states).to(device).view(-1,num_state)
    #actions = torch.Tensor(actions).to(device).view(-1,num_action)
    #rewards = torch.Tensor(rewards[None]).to(device)
    #next_state = torch.Tensor([next_state]).to(device).view(-1,num_state)


    policies, values = train_agent(states)
    _, next_value = train_agent(next_state)
    if done:
        next_value = 0
    
    curr_probs = F.softmax(policies,dim=-1)
    curr_logprobs = F.log_softmax(policies,dim=-1)

    trpo_policy = torch.exp(curr_log_probs - old_logprobs)

    #target_values = rewards+gamma*next_value
    target_values = discounted_reward(rewards,gamma)
    
    advantages = target_values - values
    entropy = -torch.sum(curr_probs*curr_logprobs,dim=-1)

    actor_loss = -torch.mean(trpo_policy*advantages.detach() + epsilon*entropy)
    critic_loss = F.mse_loss(target_values.detach(),values)
    total_loss = actor_loss + critic_loss
    return total_loss, actor_loss, critic_loss

def TRPO_vineloss(rollout_transition,train_agent,env,gamma=gamma):
    '''
    TRPO vineloss함수 계산코드(Roll-out!!!)
    - 특정 배치 크기만큼 임의 행동 샘플링 --> old policy  
    입력인자
        batch_sample - 리플레이로부터 받은 샘플(S,A,R,S',done)
        train_agent - 훈련에이전트
        env - 환경
        gamma - 할인율
    출력인자
        Total_loss
    목적함수 
              |Return|                  |Value|                       |Entropy| 
        (policy/policy_old)*advantage + (value_infer-value_target)**2 + policy*log(policy)
    '''
    states,actions,rewards,next_state,done = rollout_transition
    


    #states = torch.Tensor(states).to(device).view(-1,num_state)
    #actions = torch.Tensor(actions).to(device).view(-1,num_action)
    #rewards = torch.Tensor(rewards[None]).to(device)
    #next_state = torch.Tensor([next_state]).to(device).view(-1,num_state)
    
    policies, values = #train_agent(states)
    _, next_value = #train_agent(next_state)
    if done:
        next_value = 0
    
    old_policies, old_values = train_agent(states)
    old_probs = F.softmax(old_policies,dim=-1)
    old_logprobs = F.log_softmax(old_policies,dim=-1)

    curr_probs = F.softmax(policies,dim=-1)
    curr_logprobs = F.log_softmax(policies,dim=-1)
    trpo_policy = torch.exp(curr_log_probs - old_probs)

    target_values = discounted_reward(rewards,gamma)
    
    advantages = target_values - values
    entropy = -torch.sum(curr_probs*curr_logprobs,dim=-1)

    actor_loss = -torch.mean(trpo_policy*advantages.detach() + epsilon*entropy)
    critic_loss = F.mse_loss(target_values.detach(),values)
    total_loss = actor_loss + critic_loss
    return total_loss, actor_loss, critic_loss

SyntaxError: invalid syntax (1679655290.py, line 82)

In [29]:
def TRPO_vineupdate(agent,optimizer,env, updates=10, gamma=0.99):
    # Vine rollout - env reset된 상태
    states, actions, rewards, next_states, dones = [], [], [], [], []
    state = env.reset()
    total_reward = 0
    for t in range(env._max_episode_steps):
        torch_state = torch.Tensor(state).to(device)
        torch_state = torch.unsqueeze(torch_state,0)
        action = agent.sample_actions(torch_state)
        next_state,reward,done,_ = env.step(action)
        #total_reward += reward
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        dones.append(done)
        
        total_reward += reward
        if done:
            break

    # Old policy calculation
    torch_states = torch.Tensor(states).to(device)
    old_policies, old_values = agent(torch_states)
    old_probs = F.softmax(old_policies,dim=-1)
    old_logprobs = F.log_softmax(old_policies,dim=-1)

    # Current policy -> update
    for iter in range(updates):
        curr_policies, curr_values = agent(torch_states)
        curr_probs = F.softmax(curr_policies,dim=-1)
        curr_logprobs = F.log_softmax(curr_policies,dim=-1)

        Advantage = torch.Tensor(discounted_reward(rewards,gamma)).to(device)-old_values
        trpo_goal = torch.exp(curr_logprobs-old_logprobs)
        entropy = -torch.sum(curr_probs*curr_logprobs,dim=-1)
        KL_div = torch_KL(old_probs, curr_probs)
        Critic = critic_loss(Advantage, curr_values)
        # Debug
        print(f'trpo shape: {trpo_goal.shape}')
        print(f'Advantege shape: {Advantage.shape}')
        print(f'KL_div shape: {KL_div.shape}')
        print(f'Entropy shape: {entropy.shape}')
        print(f'Critic shape: {Critic.shape}')
        total_loss = torch.mean(-trpo_goal.T*Advantage.detach() + KL_div + entropy) + Critic
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
    # log 용도 loss 출력
    return agent, Advantage, trpo_goal.mean(), Critic, total_reward

In [30]:
reward_record, TDloss_record, ACloss_record, CRloss_record = [], [], [], []
for ep in range(max_episode):
    done = False
    
    cnt = 0
    TRPO_agent, adv, trpo, critic, total_reward = TRPO_vineupdate(TRPO_agent,optimizer,env)
    print(f'ep: {ep}, total_reward: {total_reward}')

trpo shape: torch.Size([41, 2])
Advantege shape: torch.Size([41])
KL_div shape: torch.Size([])
Entropy shape: torch.Size([41])
Critic shape: torch.Size([])
trpo shape: torch.Size([41, 2])
Advantege shape: torch.Size([41])
KL_div shape: torch.Size([])
Entropy shape: torch.Size([41])
Critic shape: torch.Size([])


RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.