In [9]:
! pip install gym



In [10]:
import gym
import torch
import matplotlib.pyplot as plt
import torch.nn as nn 

from torch.distributions.categorical import Categorical

In [11]:
# Create the CartPole-v1 environment and read the sizes used to build the policy network.
env = gym.make("CartPole-v1")
s_dim = env.observation_space.shape[0]  # first dimension of the observation space shape
a_dim = env.action_space.n  # size of the action space (number of discrete actions)

In [73]:
#MLP
class MultiLayerPerceptron(nn.Module):
  """Fully connected feed-forward network.

  The network is a stack of ``nn.Linear`` layers, each hidden layer followed
  by the hidden activation, and a final ``nn.Linear`` followed by the output
  activation.

  Args:
    input_dim: number of input features.
    output_dim: number of output features.
    hidden_act: name of an activation class in ``torch.nn`` (e.g. ``"ReLU"``),
      instantiated with no arguments.
    out_act: name of the output activation class in ``torch.nn``
      (e.g. ``"Identity"``), instantiated with no arguments.
    hidden_dims: widths of the hidden layers, in order. Defaults to
      ``(16, 32, 64)``, the architecture that was previously hard-coded.
      An empty sequence yields a single linear layer plus ``out_act``.

  Raises:
    AttributeError: if ``hidden_act`` or ``out_act`` is not a name in ``torch.nn``.
  """

  def __init__(self,
               input_dim:int, output_dim:int,
               hidden_act:str, out_act: str,
               hidden_dims=(16, 32, 64)):

    super().__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dims = tuple(hidden_dims)
    # Activations are looked up by class name in torch.nn.
    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    self.layers = nn.ModuleList()
    in_features = self.input_dim
    for width in self.hidden_dims:
      self.layers.append(nn.Linear(in_features, width))
      # The same activation instance is reused after every hidden layer,
      # so an activation with learnable parameters would share them.
      self.layers.append(self.hidden_act)
      in_features = width
    self.layers.append(nn.Linear(in_features, self.output_dim))
    self.layers.append(self.out_act)

  def forward(self, xs):
    """Apply every layer in order to ``xs`` and return the result."""
    for layer in self.layers:
      xs = layer(xs)
    return xs

In [70]:
class REINFORCE(nn.Module):
  """Policy-gradient agent that updates a categorical policy from whole episodes.

  Args:
    policy: module mapping a state tensor to action logits.
    gamma: discount factor applied when accumulating returns.
    lr: learning rate of the Adam optimizer over the policy parameters.
  """

  def __init__(self, policy:nn.Module, gamma:float=1.0, lr:float=0.0002):
    super().__init__()
    self.policy = policy
    self.gamma = gamma
    self.lr = lr
    self.opt = torch.optim.Adam(params=self.policy.parameters(), lr=lr)
    # Kept so code that reads this attribute keeps working; the update no
    # longer needs it because log-probabilities are computed directly.
    self._eps = 1e-25

  def get_action(self, state):
    """Sample an action from the policy for ``state`` without tracking gradients."""
    with torch.no_grad():
      logits = self.policy(state)
      dist = Categorical(logits=logits)
      a = dist.sample()
    return a

  @staticmethod
  def _pre_process_inputs(episode:tuple):
    """Reverse states, actions and rewards along dim 0 (last step first).

    Args:
      episode: tuple ``(states, actions, rewards)`` of tensors.

    Returns:
      The three tensors, each flipped along its first dimension.
    """
    states, actions, rewards = episode

    states = states.flip(dims=[0])
    actions = actions.flip(dims=[0])
    rewards = rewards.flip(dims=[0])
    return states, actions, rewards

  def update_episode(self, episode):
    """Run one optimizer step per time step of ``episode``.

    The episode is traversed from its last step to its first so the
    discounted return ``g`` can be accumulated incrementally.

    Args:
      episode: tuple ``(states, actions, rewards)`` of tensors whose first
        dimension indexes time steps.
    """
    # sample-by-sample update (inefficient)
    states, actions, rewards = self._pre_process_inputs(episode)

    g = 0
    for s, a, r in zip(states, actions, rewards):
      g = r + self.gamma*g
      dist = Categorical(logits=self.policy(s))
      # Use the distribution's log-probability instead of log(probs + eps):
      # when a probability underflows to 0 the latter has zero gradient and
      # the policy can never recover, while log_prob stays differentiable.
      log_prob = dist.log_prob(a)
      pg_loss = -log_prob*g # minus for gradient ascent

      self.opt.zero_grad()
      pg_loss.backward()
      self.opt.step()

In [33]:
# Moving-average tracker used to evaluate the agent's performance
class EMA:
  """Exponential moving average of a stream of values.

  ``s`` holds the current average (``None`` until the first update) and
  ``alpha`` is the weight given to each new value.
  """

  def __init__(self, alpha:float = 0.5):
    self.alpha = alpha
    self.s = None

  def update(self, y):
    """Fold ``y`` into the average; the first value seeds the average as-is."""
    previous = self.s
    if previous is None:
      self.s = y
      return
    weight = self.alpha
    self.s = weight*y + (1-weight)*previous

In [34]:
# Tensor conversion
def to_tensor(data, size):
  """Build a float32 tensor from ``data`` and view it with shape ``size``."""
  as_float = torch.tensor(data).float()
  return as_float.view(size)

In [93]:
# Policy network: s_dim inputs, a_dim outputs. The Identity output activation
# leaves the final linear layer's values untouched.
net = MultiLayerPerceptron(input_dim=s_dim, 
                           output_dim=a_dim,
                           hidden_act="ReLU",
                           out_act="Identity")

# Wrap the network in a REINFORCE agent (default gamma and learning rate)
# and create the moving-average tracker with its default alpha.
agent = REINFORCE(net)
ema = EMA()

In [94]:
# Display the network's layer stack.
net.layers

ModuleList(
  (0): Linear(in_features=4, out_features=16, bias=True)
  (1): ReLU()
  (2): Linear(in_features=16, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=2, bias=True)
  (7): Identity()
)

In [95]:
n_eps = 2000        # number of training episodes
print_every = 500   # report the moving average every this many episodes

for ep in range(n_eps):
  # Newer gym releases return (observation, info) from reset(); older ones
  # return the bare observation. Accept both.
  reset_out = env.reset()
  s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
  cum_r = 0

  states = []
  actions = []
  rewards = []

  # Roll out one trajectory by following the current policy
  while True:
    # Use s_dim rather than a literal so the shape follows the environment
    s = to_tensor(s, size=(1, s_dim))
    a = agent.get_action(s)

    # Newer gym releases return (obs, reward, terminated, truncated, info);
    # older ones return (obs, reward, done, info). Accept both.
    step_out = env.step(a.item())
    if len(step_out) == 5:
      ns, r, terminated, truncated, info = step_out
      done = terminated or truncated
    else:
      ns, r, done, info = step_out

    states.append(s)
    actions.append(a)
    rewards.append(r)

    s = ns
    cum_r += r
    if done:
      break

  # Track the episode return and report periodically
  ema.update(cum_r)
  if ep % print_every == 0:
    print("Episode {} || EMA: {}".format(ep, ema.s))

  # Stack the per-step records into tensors whose first dimension is time
  states = torch.cat(states, dim=0)
  actions = torch.cat(actions, dim=0)
  rewards = torch.tensor(rewards)

  episode = (states, actions, rewards)
  agent.update_episode(episode)

Episode 0 || EMA: 20.0
Episode 500 || EMA: 326.1607551550861
Episode 1000 || EMA: 500.0
Episode 1500 || EMA: 141.9705440559444
