In [None]:
! pip install gym



In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn

In [None]:
# Create the CartPole environment
# NOTE(review): the original comment listed the state as (position, angle, velocity, angular velocity);
# Gym documents the observation order as (cart position, cart velocity, pole angle, pole angular velocity) - confirm.
env = gym.make("CartPole-v1")
env.reset()

array([ 0.01320257, -0.01019533,  0.03457361, -0.01318813])

In [None]:
# Advance the environment by one step using a random action
# NOTE(review): the action printed first and the action passed to env.step() are two independent samples.
print(env.action_space.sample())
print(env.step(env.action_space.sample())) # returns (ns, r, done, info)

1
(array([ 0.01299866, -0.20579561,  0.03430985,  0.29019958]), 1.0, False, {})


In [None]:
# Inspect the state space and the action space
# s_dim: length of the observation vector; a_dim: number of discrete actions
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

print("state space dimension: {}".format(s_dim))
print("action space dimension: {}".format(a_dim))

state space dimension: 4
action space dimension: 2


In [None]:
# Fetch the current state (the bare expression is shown as the cell output)
env.state

(0.012998663920133236,
 -0.20579561393908874,
 0.03430985168543008,
 0.2901995807180004)

In [None]:
# MLP implementation
class MultiLayerPerceptron(nn.Module):
  """Fully connected feed-forward network.

  The network is a chain of ``nn.Linear`` layers. Every Linear layer except
  the last is followed by the hidden activation; the last Linear layer is
  followed by the output activation.

  Args:
    input_dim: size of the input feature vector.
    output_dim: size of the output vector.
    num_neurons: hidden layer widths, in order. May be empty, in which case
      the network is a single Linear layer followed by the output activation.
    hidden_act: name of an activation class in ``torch.nn`` (e.g. "ReLU").
    out_act: name of an activation class in ``torch.nn`` (e.g. "Identity").

  Raises:
    AttributeError: if ``hidden_act`` or ``out_act`` is not a name in ``torch.nn``.
  """
  def __init__(self, 
               input_dim:int, output_dim:int,
               num_neurons:list, hidden_act:str, out_act: str):

    super().__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.num_neurons = num_neurons
    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    # Copy into a list so tuples (or other sequences) are accepted as well.
    hidden_dims = list(num_neurons)
    input_dims = [input_dim] + hidden_dims
    output_dims = hidden_dims + [output_dim]
    # enumerate() yields 0 .. len-1, so the final layer has index len-1.
    last_idx = len(input_dims) - 1

    self.layers = nn.ModuleList()
    for i, (in_dim, out_dim) in enumerate(zip(input_dims, output_dims)):
      self.layers.append(nn.Linear(in_dim, out_dim))
      # Only the final Linear layer is followed by the output activation.
      self.layers.append(self.out_act if i == last_idx else self.hidden_act)

  def forward(self, xs):
    """Apply every layer in order to ``xs`` and return the result."""
    for layer in self.layers:
      xs = layer(xs)
    return xs

In [None]:
# Naive DQN implementation
class NaiveDQN(nn.Module):
  """Single-sample Q-learning agent with an epsilon-greedy policy.

  There is no replay buffer and no target network: each call to
  ``update_sample`` performs one optimizer step on one transition, and the
  bootstrap target is computed with the same network that is being trained.

  Args:
    state_dim: size of the state vector (stored, not otherwise used here).
    action_dim: number of discrete actions.
    qnet: network mapping a state to one Q-value per action.
    lr: learning rate for the Adam optimizer.
    gamma: discount factor applied to the bootstrapped next-state value.
    epsilon: exploration probability used while the module is in training mode.
  """
  def __init__(self, 
               state_dim:int, action_dim:int,
               qnet:nn.Module, lr:float, gamma:float, epsilon:float):
    
    super(NaiveDQN, self).__init__()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.qnet = qnet
    self.lr = lr
    self.gamma = gamma
    self.epsilon = epsilon

    self.opt = torch.optim.Adam(params=self.qnet.parameters(), lr=lr)
    # The "epsilons" buffer holds the initial epsilon and is kept so the
    # state_dict keys stay unchanged; get_action reads self.epsilon instead.
    self.register_buffer("epsilons", torch.ones(1)*epsilon)
    self.criteria = nn.MSELoss()

  def get_action(self, state):
    """Choose an action for a single state.

    In training mode the action is epsilon-greedy: with probability
    ``self.epsilon`` a uniformly random action is returned, otherwise the
    argmax of the Q-values. In eval mode the action is always the argmax.

    Args:
      state: input accepted by ``qnet``; the argmax over the last dimension
        must contain exactly one element so it can be converted to ``int``.

    Returns:
      The chosen action index as a Python ``int``.
    """
    # Action selection needs no gradients.
    with torch.no_grad():
      qs = self.qnet(state) # qs is batch x action

    # epsilon-greedy; `self.training` is the mode flag toggled by train()/eval()
    # (`self.train` is a bound method and is always truthy).
    explore = self.training and np.random.uniform(0.0, 1.0) <= self.epsilon
    if explore:
      action = np.random.choice(range(self.action_dim))
    else:
      action = qs.argmax(dim=-1)
    return int(action)
  
  def update_sample(self, state, action, reward, next_state, done):
    """Run one Q-learning update on a single transition.

    Args:
      state: state tensor with a leading batch dimension of 1; the prediction
        is read from ``qnet(state)[0, action]``.
      action: index of the action that was taken.
      reward: scalar reward received.
      next_state: state reached after the action.
      done: truthy when the episode ended; zeroes the bootstrap term.
    """
    # Q-learning target; computed without gradient tracking so the loss only
    # backpropagates through the prediction for (state, action).
    with torch.no_grad():
      q_max, _ = self.qnet(next_state).max(dim=-1)
      q_target = reward + self.gamma * q_max * (1-done)

    q_pred = self.qnet(state)[0, action]
    # Align shapes when next_state carried a batch dimension of 1, so MSELoss
    # does not have to broadcast a (1,) target against a 0-dim prediction.
    if q_target.numel() == q_pred.numel():
      q_target = q_target.reshape(q_pred.shape)

    loss = self.criteria(q_pred, q_target)
    self.opt.zero_grad()
    loss.backward()
    self.opt.step()

In [None]:
# Q-network: maps an s_dim-dimensional state to a_dim action values
# through a single hidden layer of 128 units.
qnet = MultiLayerPerceptron(input_dim=s_dim,
                            output_dim=a_dim,
                            num_neurons=[128],
                            hidden_act="ReLU",
                            out_act="Identity")

# Agent wrapping the Q-network; epsilon starts at 1.0 and gamma is 1.0.
agent = NaiveDQN(state_dim=s_dim, 
                 action_dim=a_dim,
                 qnet=qnet,
                 lr=1e-4,
                 gamma=1.0,
                 epsilon=1.0)

In [None]:
# Moving-average tracker used to evaluate the agent's performance
class EMA:
  """Exponential moving average of a stream of values.

  ``s`` holds the current average and is ``None`` until the first update.
  ``alpha`` is the weight given to each new observation.
  """
  def __init__(self, alpha:float = 0.5):
    self.alpha = alpha
    self.s = None

  def update(self, y):
    """Fold a new observation ``y`` into the running average ``s``."""
    if self.s is not None:
      # Blend the new value with the previous average.
      y = self.alpha*y + (1-self.alpha)*self.s
    # The first observation is stored as-is.
    self.s = y

In [None]:
# Evaluate the agent by the sum of rewards per episode
n_eps = 10000
print_every = 500
ema_factor = 0.5
ema = EMA(ema_factor)

for ep in range(n_eps):
  env.reset()
  cum_r = 0 # cumulative reward for this episode
  while True:
    # The state is read from the env.state attribute, not from the value returned by reset()/step().
    s = env.state
    s = torch.tensor(s).float().view(1, 4) # must be float32, otherwise an error occurs
    a = agent.get_action(s)
    # NOTE(review): unpacks a 4-tuple from step(); confirm the installed gym version returns this shape.
    ns, r, done, info = env.step(a)

    # ns is converted without .view(), so it has no batch dimension here (s above is viewed as (1, 4)).
    ns = torch.tensor(ns).float()
    agent.update_sample(s, a, r, ns, done)
    cum_r += r
    if done:
      ema.update(cum_r)

      if ep % print_every == 0:
        print("Episode {} || EMA: {} || eps: {}".format(ep, ema.s, agent.epsilon))
    
      # From episode index 150 onward, multiply epsilon by 0.999 at the end of each episode.
      if ep>= 150:
        agent.epsilon *= 0.999
      break
env.close()

#### .detach()에 관하여

In [None]:
# Small MLP (4 inputs, hidden widths 64 and 32, 2 outputs) used by the gradient-tracking demos below.
mlp = MultiLayerPerceptron(input_dim=4,
                           output_dim=2,
                           num_neurons=[64, 32],
                           hidden_act="ReLU",
                           out_act="Identity")

In [None]:
# Random batch of 10 inputs with 4 features each, sampled with np.random.uniform(1.0, 10.0),
# then converted to a torch tensor and cast with .float().
xs = np.random.uniform(1.0, 10.0, size=(10, 4))
xs = torch.from_numpy( xs ).view(10, 4).float()
xs

tensor([[6.3921, 2.6298, 7.4136, 4.7489],
        [4.9340, 1.1022, 3.0610, 2.5916],
        [6.7262, 3.9927, 8.0141, 7.0119],
        [3.1749, 8.2022, 4.3721, 8.5152],
        [3.1802, 8.9127, 5.8001, 3.8845],
        [7.1603, 6.5441, 1.0765, 3.9276],
        [8.5353, 1.3522, 9.2767, 8.5346],
        [4.3007, 9.4684, 2.3827, 1.2168],
        [8.0778, 4.3005, 6.8617, 9.1942],
        [2.3993, 6.6922, 5.8740, 2.0673]])

In [None]:
# Turn off gradient tracking with .detach()
print(mlp(xs).requires_grad)
print(mlp(xs).detach().requires_grad)

True
False


In [None]:
# Turn off gradient tracking with a `with torch.no_grad()` block
with torch.no_grad():
  print(mlp(xs).requires_grad)

False


In [None]:
# Show the model parameters
mlp.state_dict()