In [1]:
! pip install gym



In [2]:
import numpy as np
import torch 
import torch.nn as nn
import gym
import random
import matplotlib.pyplot as plt

In [3]:
# Build the CartPole-v1 environment used for the rest of the notebook.
env = gym.make("CartPole-v1")
# Bare expression: displays the environment's repr as the cell output.
env

<TimeLimit<CartPoleEnv<CartPole-v1>>>

In [4]:
# State-vector length and number of available actions, read from the environment.
s_dim, a_dim = env.observation_space.shape[0], env.action_space.n

In [5]:
# Report the state-space size and the action-space size (labels are in Korean).
print(f"상태 공간 크기: {s_dim}")
print(f"행동 공간 크기: {a_dim}")

상태 공간 크기: 4
행동 공간 크기: 2


In [6]:
#Experience Replay (used instead of a deque)
class ReplayMemory:
  """Fixed-capacity ring buffer of experiences with uniform random sampling.

  Once `max_size` items have been pushed, each new item overwrites the
  oldest one.

  Args:
    max_size: maximum number of items kept; must be a positive integer.

  Raises:
    ValueError: if `max_size` is not positive.
  """

  def __init__(self, max_size):
    # A zero/negative capacity would only fail later, inside push(), with an
    # unrelated-looking IndexError or ZeroDivisionError; fail early instead.
    if max_size <= 0:
      raise ValueError(f"max_size must be a positive integer, got {max_size}")
    self.buffer = [None]*max_size
    self.max_size = max_size
    self.index = 0  # slot that the next push() writes to
    self.size = 0   # number of slots currently holding a pushed item

  def push(self, obj):
    """Store `obj`, overwriting the oldest entry when the buffer is full."""
    self.buffer[self.index] = obj
    self.size = min(self.size+1, self.max_size)
    # Wrap back to slot 0 once the write position passes max_size.
    self.index = (self.index+1) % self.max_size

  def sample(self, batch_size):
    """Return `batch_size` distinct stored items chosen uniformly at random.

    Raises:
      ValueError: if `batch_size` exceeds the number of stored items
        (or is negative, as raised by `random.sample`).
    """
    if batch_size > self.size:
      raise ValueError(
        f"cannot sample {batch_size} items from a memory holding {self.size}")
    # Draw batch_size distinct indices among the filled slots only.
    indices = random.sample(range(self.size), batch_size)
    return [self.buffer[index] for index in indices]

  def __len__(self):
    """Number of items currently stored (at most `max_size`)."""
    return self.size

In [23]:
#MLP
class MultiLayerPerceptron(nn.Module):
  """Fully connected network: input_dim -> 16 -> 32 -> 64 -> output_dim.

  Each of the first three linear layers is followed by the hidden activation;
  the final linear layer is followed by the output activation.

  Args:
    input_dim: number of input features.
    output_dim: number of output features.
    hidden_act: name of a class in `torch.nn` (e.g. "ReLU"), instantiated
      with no arguments; the same instance is reused after every hidden layer.
    out_act: name of a class in `torch.nn` (e.g. "Identity") applied to the
      final layer's output.
  """

  def __init__(self, 
               input_dim:int, output_dim:int,
               hidden_act:str, out_act: str):

    super().__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    # Consecutive pairs of widths define the linear layers, built in order.
    widths = [self.input_dim, 16, 32, 64, self.output_dim]
    final_position = len(widths) - 2
    stack = []
    for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
      stack.append(nn.Linear(fan_in, fan_out))
      stack.append(self.out_act if position == final_position else self.hidden_act)
    self.layers = nn.ModuleList(stack)

  def forward(self, xs):
    """Apply every stored module to `xs` in sequence and return the result."""
    out = xs
    for module in self.layers:
      out = module(out)
    return out

In [54]:
#DQN
class DQN(nn.Module):
  """Epsilon-greedy Q-learning agent operating on externally supplied Q-networks.

  The agent owns no network itself: callers pass the network to act with
  (`act_net`) and the network that provides bootstrap targets (`eval_net`).

  Args:
    state_dim: size of a state vector (stored, not otherwise used here).
    action_dim: number of discrete actions to choose from when exploring.
    lr: learning rate for the Adam optimizer(s) created in `update`.
    gamma: discount factor applied to the bootstrapped next-state value.
    epsilon: initial exploration probability, kept in the `epsilon` buffer.
  """

  def __init__(self, 
               state_dim:int,
               action_dim:int,
               lr:float,
               gamma:float,
               epsilon:float):
    
    super(DQN, self).__init__()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.lr = lr
    self.gamma = gamma
    self.criteria = nn.SmoothL1Loss()
    self.register_buffer("epsilon", torch.ones(1)*epsilon)
    # One Adam optimizer per trained network, created lazily by update().
    # Kept across calls so Adam's running moment estimates are not thrown
    # away after every single gradient step.
    self._optimizers = {}

  def _optimizer_for(self, net:nn.Module):
    """Return the optimizer bound to `net`, creating it on first use.

    The learning rate is read from `self.lr` when the optimizer is created;
    later changes to `self.lr` do not affect an existing optimizer.
    """
    opt = self._optimizers.get(net)
    if opt is None:
      opt = torch.optim.Adam(params=net.parameters(), lr=self.lr)
      self._optimizers[net] = opt
    return opt

  def get_action(self, state, act_net:nn.Module):
    """Pick an action for a single state with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is returned,
    otherwise the argmax of `act_net(state)` over the last dimension.

    Args:
      state: input for `act_net`; must describe exactly one state, because
        the greedy result is converted to a single Python int.
      act_net: network mapping a state to per-action values.

    Returns:
      The chosen action index as a Python int.
    """
    explore = np.random.uniform(low=0.0, high=1.0) <= float(self.epsilon)
    if explore:
      return int(np.random.choice(self.action_dim))
    # Action selection needs no gradients; skip building the autograd graph.
    with torch.no_grad():
      q_value = act_net(state)
    return int(q_value.argmax(dim=-1).item())

  def update(self, state, action, reward, next_state, done, 
             eval_net: nn.Module,
             act_net: nn.Module):
    """Run one gradient step on `act_net` towards a one-step TD target.

    target = reward + gamma * max_a eval_net(next_state)[a] * (1 - done)

    Args:
      state, action, reward, next_state, done: batched tensors; `action` is
        used as a `gather` index along dim 1 and `done` must support
        `1 - done` (e.g. a float tensor of 0/1 flags).
      eval_net: network used (without gradients) to compute the target.
      act_net: network whose parameters are optimized.

    Returns:
      The scalar loss of this step as a Python float.
    """
    s, a, r, ns = state, action, reward, next_state

    # The target is treated as a constant: no gradient flows into eval_net.
    with torch.no_grad():
      q_max, _ = eval_net(ns).max(dim=-1, keepdim=True)
      target = r + self.gamma*q_max*(1-done)
    
    # Value predicted by act_net for the action that was actually taken.
    q = act_net(s).gather(1, a)
    loss = self.criteria(q, target)

    opt = self._optimizer_for(act_net)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()

In [35]:
def prepare_training_inputs(sampled_exps, device="cpu"):
  """Collate sampled experiences into five batched tensors.

  Args:
    sampled_exps: sequence of experiences, each indexable as
      (state, action, reward, next_state, done) with tensor entries that can
      be concatenated along dim 0.
    device: device the resulting tensors are moved to.

  Returns:
    Tuple (states, actions, rewards, next_states, dones). Every field except
    `actions` is cast to float; `actions` keeps its original dtype.
  """
  def collate(position, as_float=True):
    # Concatenate one field of every experience along the batch dimension.
    column = torch.cat([exp[position] for exp in sampled_exps], dim=0)
    if as_float:
      column = column.float()
    return column.to(device)

  return (collate(0),
          collate(1, as_float=False),
          collate(2),
          collate(3),
          collate(4))

In [67]:
# Learning rate passed to the DQN agent (used for its Adam optimizer).
lr = 1e-4 * 7
# Number of experiences drawn from replay memory for each update.
batch_size = 256
# Discount factor applied to the bootstrapped next-state value.
gamma = 1.0
# Capacity of the replay memory.
memory_size = 50000
# Number of episodes to run.
total_eps = 3000
# Starting value of the epsilon schedule.
eps_max = 0.08
# Floor of the epsilon schedule (also scales how fast epsilon decays).
eps_min = 0.01
# Training starts only once the replay memory holds at least this many experiences.
sampling_only_until = 3000

In [68]:
# Two identically configured Q-networks, built in order (a, then b); the
# training loop swaps their acting/evaluating roles.
qnet_a, qnet_b = (
  MultiLayerPerceptron(input_dim=s_dim,
                       output_dim=a_dim,
                       hidden_act="ReLU",
                       out_act="Identity")
  for _ in range(2)
)

# Agent starts fully exploratory; the training loop overwrites epsilon each episode.
agent = DQN(state_dim=s_dim, action_dim=a_dim, lr=lr, gamma=gamma, epsilon=1.0)

# Replay buffer shared by all episodes.
memory = ReplayMemory(max_size=memory_size)

In [69]:
print_every = 100  # log one line every this many episodes
alter = 0          # number of updates done so far; its parity decides the network roles

for n_epi in range(total_eps):
  # Epsilon schedule: starts at eps_max and drops by eps_min every 200
  # episodes, never going below eps_min.
  epsilon = max( eps_min, eps_max-eps_min*(n_epi/200) )
  agent.epsilon = torch.tensor(epsilon)
  # Older Gym returns just the observation from reset(); newer releases
  # return an (observation, info) tuple. Accept both.
  reset_out = env.reset()
  s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
  cum_r = 0

  # Target switching: the two networks swap acting/evaluating roles
  # depending on how many updates have been performed.
  if alter % 2 == 0:
    eval_net = qnet_a
    act_net = qnet_b
  else:
    eval_net = qnet_b
    act_net = qnet_a

  while True: 
    s = torch.tensor(s).float().view(1, s_dim)
    a = agent.get_action(s, act_net=act_net)
    # Older Gym returns (obs, reward, done, info); newer releases return
    # (obs, reward, terminated, truncated, info). Merge the two flags so
    # `done` means "episode over" in both cases.
    step_out = env.step(a)
    if len(step_out) == 5:
      ns, r, terminated, truncated, info = step_out
      done = bool(terminated or truncated)
    else:
      ns, r, done, info = step_out
    
    # Reshape everything to a batch of one and build the experience tuple.
    experience = (s, 
                  torch.tensor(a).view(1, 1) ,
                  torch.tensor(r/100).view(1, 1), # reward scaled by 1/100
                  torch.tensor(ns).float().view(1, s_dim), 
                  torch.tensor(done).view(1, 1)) 
    # Store the sample in replay memory.
    memory.push(experience)

    s = ns
    cum_r += r
    if done:
      break
  
  # One update per finished episode, once enough experience is collected.
  if len(memory) >= sampling_only_until:
    alter += 1
    # Train the agent on a random minibatch.
    sampled_exps = memory.sample(batch_size)
    sampled_exps = prepare_training_inputs(sampled_exps)
    agent.update(*sampled_exps,
                 eval_net = eval_net,
                 act_net = act_net)
  
  if n_epi % print_every == 0:
    msg = (n_epi, cum_r, epsilon)
    print("Episode: {:4.0f} | Cum R: {:4.0f} | Epsilon: {:.3f}".format(*msg))

Episode:    0 | Cum R:    9 | Epsilon: 0.080
Episode:  100 | Cum R:    9 | Epsilon: 0.075
Episode:  200 | Cum R:    9 | Epsilon: 0.070
Episode:  300 | Cum R:   10 | Epsilon: 0.065
Episode:  400 | Cum R:   11 | Epsilon: 0.060
Episode:  500 | Cum R:   10 | Epsilon: 0.055
Episode:  600 | Cum R:   12 | Epsilon: 0.050
Episode:  700 | Cum R:   29 | Epsilon: 0.045
Episode:  800 | Cum R:   31 | Epsilon: 0.040
Episode:  900 | Cum R:   31 | Epsilon: 0.035
Episode: 1000 | Cum R:   48 | Epsilon: 0.030
Episode: 1100 | Cum R:   38 | Epsilon: 0.025
Episode: 1200 | Cum R:  158 | Epsilon: 0.020
Episode: 1300 | Cum R:  500 | Epsilon: 0.015
Episode: 1400 | Cum R:    9 | Epsilon: 0.010
Episode: 1500 | Cum R:    9 | Epsilon: 0.010
Episode: 1600 | Cum R:   10 | Epsilon: 0.010
Episode: 1700 | Cum R:    9 | Epsilon: 0.010
Episode: 1800 | Cum R:   10 | Epsilon: 0.010
Episode: 1900 | Cum R:    8 | Epsilon: 0.010
Episode: 2000 | Cum R:    9 | Epsilon: 0.010
Episode: 2100 | Cum R:   10 | Epsilon: 0.010
Episode: 2