# REINFORCE

## 최소 형태의 REINFORCE

In [1]:
from torch.distributions import Categorical
import gym
import numpy as np
import torch
import torch.nn as nn 
import torch.optim as optim 

GAMMA = 0.9  # discount factor multiplied into the future return when train() accumulates returns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Pi(nn.Module):
    """Policy network for REINFORCE: maps a state vector to action logits.

    The network is a two-layer MLP (``in_dim -> 64 -> out_dim``) with a ReLU
    in between.  Besides its parameters, the module keeps two per-episode
    buffers, ``log_probs`` and ``rewards``, which ``onpolicy_reset`` clears.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(),
            nn.Linear(64, out_dim),
        )
        self.onpolicy_reset()
        self.train()  # put the module in training mode

    def onpolicy_reset(self):
        """Replace ``log_probs`` and ``rewards`` with fresh empty lists."""
        self.log_probs, self.rewards = [], []

    def forward(self, x):
        """Return the distribution parameters (logits) the model computes for ``x``."""
        return self.model(x)

    def act(self, state):
        """Sample an action for ``state`` and store its log-probability.

        Args:
            state: NumPy array holding the observation; it is cast to float32.

        Returns:
            The sampled action as a Python number (``action.item()``).
        """
        obs = torch.from_numpy(state.astype(np.float32))  # NumPy -> tensor
        logits = self.forward(obs)
        dist = Categorical(logits=logits)
        chosen = dist.sample()                            # a ~ pi(a|s)
        self.log_probs.append(dist.log_prob(chosen))      # kept for the training step
        return chosen.item()

In [4]:
def train(pi, optimizer):
    """Run one REINFORCE gradient step from the episode stored on ``pi``.

    Args:
        pi: policy object exposing ``rewards`` (one reward per step) and
            ``log_probs`` (one log-probability tensor per step).
        optimizer: optimizer whose ``zero_grad``/``step`` are called around
            the backward pass.

    Returns:
        The summed loss tensor, ``sum_t(-log_prob_t * return_t)``.
    """
    n_steps = len(pi.rewards)
    returns = np.zeros(n_steps, dtype=np.float32)
    running = 0.0
    # Walk the episode backwards so each discounted return reuses the next one.
    for step in range(n_steps - 1, -1, -1):
        running = pi.rewards[step] + GAMMA * running
        returns[step] = running

    returns_t = torch.from_numpy(returns)
    stacked = torch.stack(pi.log_probs)
    # Negated so that minimising the loss maximises the objective.
    loss = torch.sum(-stacked * returns_t)

    optimizer.zero_grad()
    loss.backward()      # backpropagate to compute the gradients
    optimizer.step()     # update the weights
    return loss

In [5]:
# Train a REINFORCE policy on CartPole with one gradient update per episode.
# NOTE(review): this uses the classic gym API (reset() returns the observation,
# step() returns a 4-tuple) — confirm against the installed gym version.
env = gym.make('CartPole-v0')
in_dim = env.observation_space.shape[0]         # 4
out_dim = env.action_space.n                    # 2
pi = Pi(in_dim, out_dim)                        # policy for REINFORCE
optimizer = optim.Adam(pi.parameters(), lr=0.01)
optmizer = optimizer                            # misspelled original name, kept as an alias

try:
    for epi in range(300):
        state = env.reset()
        for t in range(200):                    # CartPole allows at most 200 time steps
            action = pi.act(state)
            state, reward, done, _ = env.step(action)
            pi.rewards.append(reward)
            env.render()
            if done:
                break
        loss = train(pi, optimizer)             # train once per episode
        total_reward = sum(pi.rewards)
        solved = total_reward > 195.0
        pi.onpolicy_reset()                     # on-policy: clear the memory after training
        print(f'Episode {epi}, loss: {loss}, total_reward : {total_reward}, solved : {solved}')
finally:
    # Release the environment (and its render window) even if training fails.
    env.close()

Episode 0, loss: 35.8833122253418, total_reward : 11.0, solved : False
Episode 1, loss: 47.432167053222656, total_reward : 14.0, solved : False
Episode 2, loss: 27.8303165435791, total_reward : 11.0, solved : False
Episode 3, loss: 48.445430755615234, total_reward : 14.0, solved : False
Episode 4, loss: 20.594528198242188, total_reward : 10.0, solved : False
Episode 5, loss: 39.09783172607422, total_reward : 12.0, solved : False
Episode 6, loss: 37.69865036010742, total_reward : 13.0, solved : False
Episode 7, loss: 68.54659271240234, total_reward : 18.0, solved : False
Episode 8, loss: 101.47730255126953, total_reward : 22.0, solved : False
Episode 9, loss: 23.294645309448242, total_reward : 10.0, solved : False
Episode 10, loss: 80.08537292480469, total_reward : 20.0, solved : False
Episode 11, loss: 61.49821472167969, total_reward : 16.0, solved : False
Episode 12, loss: 77.72594451904297, total_reward : 19.0, solved : False
Episode 13, loss: 23.794321060180664, total_reward : 11.0, solved : False

## 파이토치로 정책 생성하기

### 이산 정책의 생성

In [6]:
from torch.distributions import Categorical
import torch

# Discrete policy example, assuming two actions.
# The policy network outputs one logit (unnormalised log-probability) per action.
policy_net_output = torch.tensor([-1.6094, -0.2231])        # arbitrary example values
# Used as logits, these pdparams are equivalent to probs = [0.2, 0.8].
pdparams = policy_net_output
pd = Categorical(logits=pdparams)

# Sample an action.
action = pd.sample()
# => e.g. tensor(1), "move right"; the draw is random, so tensor(0) is also possible

# Log-probability of the sampled action.
pd.log_prob(action)
# => tensor(-0.2231) when the action is 1, tensor(-1.6094) when it is 0

tensor(-1.6094)

In [5]:
# Natural logs of the probabilities 0.2 and 0.8; these are the values used as logits above
np.log(0.2), np.log(0.8)

(-1.6094379124341003, -0.2231435513142097)

### 연속 정책의 생성

In [7]:
from torch.distributions import Normal
import torch

# Continuous policy example, assuming a single action (Pendulum: torque).
# The policy network outputs the mean and standard deviation of the action.
policy_net_output = torch.tensor([1.0, 0.2])
# pdparams is (mean, standard deviation), i.e. (loc, scale)
pdparams = policy_net_output
pd = Normal(loc=pdparams[0], scale=pdparams[1])

# Sample an action.
action = pd.sample()
# => e.g. tensor(1.0295), the torque magnitude (the draw is random)

# Log-probability of the sampled action.
pd.log_prob(action)
# => e.g. tensor(0.6796), the log-probability of that torque


tensor(0.5852)