In [6]:
! pip install gym



In [7]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import random
import gym

In [8]:
Env = gym.make("Pendulum-v0")
Env

<TimeLimit<PendulumEnv<Pendulum-v0>>>

In [9]:
input_dim = Env.observation_space.shape
output_dim = Env.action_space
print(input_dim, output_dim)

(3,) Box(-2.0, 2.0, (1,), float32)


In [10]:
#(코사인, 사인, 각속도)
Env.reset()

array([ 0.44520689, -0.89542773,  0.44410247])

In [11]:
class OUProcess:
  def __init__(self, mu):
      self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1
      self.mu = mu
      self.x_prev = np.zeros_like(self.mu)

  def __call__(self):
      x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
          self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
      self.x_prev = x
      return x

In [12]:
class ReplayMemory:
  def __init__(self, max_size):
    self.buffer = [None]*max_size
    self.max_size = max_size
    self.index = 0 
    self.size = 0
  
  def push(self, obj):
    self.buffer[self.index] = obj
    self.size = min(self.index+1, self.max_size)
    self.index = (self.index+1) % self.max_size

  def sample(self, batch_size):
    indices = random.sample(range(self.size), batch_size)
    return [self.buffer[index] for index in indices]
  
  def __len__(self):
    return self.size

In [13]:
class actor(nn.Module):
  def __init__(self, 
               input_dim: int, output_dim: int,
               hidden_act: str, out_act: str):
    super().__init__()

    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    self.layers = nn.ModuleList()
    self.layers.append(nn.Linear(input_dim, 16))
    self.layers.append(self.hidden_act)
    self.layers.append(nn.Linear(16, 32))
    self.layers.append(self.hidden_act)
    self.layers.append(nn.Linear(32, 64))
    self.layers.append(self.hidden_act)
    self.layers.append(nn.Linear(64, output_dim))
    self.layers.append(self.out_act)

  def forward(self, x):
    for layer in self.layers:
      x = layer(x)
    return x.clamp(-2.0, 2.0)

In [14]:
class critic(nn.Module):
  def __init__(self, 
               state_dim: int, action_dim: int, output_dim: int, 
               hidden_act: str, out_act: str):
    super().__init__()

    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    self.state_dim = state_dim
    self.action_dim = action_dim

    self.layers = nn.ModuleList()
    self.layers.append(nn.Linear(32, 64))
    self.layers.append(self.hidden_act)
    self.layers.append(nn.Linear(64, 128))
    self.layers.append(self.hidden_act)
    self.layers.append(nn.Linear(128, output_dim))
    self.layers.append(self.out_act)

  def forward(self, s, a):
    s_vec = nn.Linear(in_features=self.state_dim, out_features=16)(s)
    s_vec = self.hidden_act(s_vec)

    a_vec = nn.Linear(in_features=self.action_dim, out_features=16)(a)
    a_vec = self.hidden_act(a_vec)

    x = torch.cat([s_vec, a_vec], dim=1)
    for layer in self.layers:
      x = layer(x)     
    return x

In [50]:
class CDQAgent(nn.Module):
  def __init__(self, 
               criticA_net:nn.Module, criticA_target:nn.Module,
               criticB_net:nn.Module, criticB_target:nn.Module,
               actorA_net:nn.Module, actorA_target:nn.Module,
               criticA_lr:float, criticB_lr:float, 
               actorA_lr:float, discount_factor:float):
    super().__init__()

    self.criticA_net = criticA_net
    self.criticA_target = criticA_target
    self.criticB_net = criticB_net
    self.criticB_target = criticB_target
    self.actorA_net = actorA_net 
    self.actorA_target = actorA_target

    self.criticA_lr = criticA_lr
    self.criticB_lr = criticB_lr
    self.actorA_lr = actorA_lr
    self.discount_factor = discount_factor

    self.criteriaA = nn.MSELoss()
    self.criteriaB = nn.MSELoss()
    self.criticA_opt = torch.optim.Adam(params=criticA_net.parameters(), lr=criticA_lr)
    self.criticB_opt = torch.optim.Adam(params=criticB_net.parameters(), lr=criticB_lr)
    self.actorA_opt = torch.optim.Adam(params=actorA_net.parameters(), lr=actorA_lr)
    
  def get_action(self, state):
    state = torch.tensor(state).float().view(1,-1)
    return self.actorA_net(state).detach() 

  def update(self, state, action, next_state, reward, done):
    s, a, r, ns = state, action, reward, next_state

    with torch.no_grad():
      q_A = self.criticA_target(ns, self.actorA_target(ns))
      q_B = self.criticB_target(ns, self.actorA_target(ns))
      q_max = torch.min(q_A, q_B)
      target = r + self.discount_factor*q_max*(1-done)
    
    inferA = self.criticA_net(s, a)
    inferB = self.criticB_net(s, a)
    criticA_loss = self.criteriaA(inferA, target)
    criticB_loss = self.criteriaB(inferB, target)
    criticA_loss.backward()
    criticB_loss.backward()
    self.criticA_opt.step()
    self.criticB_opt.step()
    self.criticA_opt.zero_grad()
    self.criticB_opt.zero_grad()

    actorA_loss = -self.criticA_net(s, self.actorA_net(s)).mean()
    actorA_loss.backward()
    self.actorA_opt.step()
    self.actorA_opt.zero_grad()

In [51]:
def preprocessing(batch):
  states = []
  actions = []
  next_states = []
  rewards = []
  dones = []
  
  for sample in batch:
    states.append( torch.tensor(sample[0]).float().view(1, -1) )
    actions.append( sample[1].view(1,-1) )
    next_states.append( torch.tensor(sample[2]).float().view(1,-1) )
    rewards.append( sample[3].view(1,-1) )
    dones.append( torch.tensor(sample[4]).float().view(1,-1) )

  states = torch.cat(states, dim=0)
  actions = torch.cat(actions, dim=0)
  next_states = torch.cat(next_states, dim=0)
  rewards = torch.cat(rewards, dim=0)
  dones = torch.cat(dones, dim=0)

  return states, actions, next_states, rewards, dones

In [52]:
#Soft target update
def soft_update(net, net_target, tau):
  for param_target, param in zip(net_target.parameters(), net.parameters()):
    param_target.data.copy_(param_target.data*(1.0-tau) + param.data*tau)

In [61]:
actorA_lr = 0.005
criticA_lr = 0.001
criticB_lr = 0.001
discount_factor = 0.99
batch_size = 256
memory_size = 50000
tau = 0.002
sampling_only_until = 2000
target_update_interval = 10 

In [62]:
criticA_net = critic(state_dim=3, action_dim=1, output_dim=1, hidden_act="ReLU", out_act="Identity")
criticA_target = critic(state_dim=3, action_dim=1, output_dim=1, hidden_act="ReLU", out_act="Identity")
criticB_net = critic(state_dim=3, action_dim=1, output_dim=1, hidden_act="ReLU", out_act="Identity")
criticB_target = critic(state_dim=3, action_dim=1, output_dim=1, hidden_act="ReLU", out_act="Identity")

actorA_net = actor(input_dim=3, output_dim=1, hidden_act="ReLU", out_act="Identity")
actorA_target = actor(input_dim=3, output_dim=1, hidden_act="ReLU", out_act="Identity")

agent = CDQAgent(criticA_net=criticA_net, criticA_target=criticA_target,
                 criticB_net=criticB_net, criticB_target=criticB_target,
                 actorA_net=actorA_net, actorA_target=actorA_target,
                 criticA_lr=criticA_lr, criticB_lr=criticB_lr, actorA_lr=actorA_lr,
                 discount_factor=discount_factor)

In [63]:
memory = ReplayMemory(memory_size)

In [64]:
num_epi = 200
print_every = 10

criticA_net.load_state_dict(criticA_target.state_dict())
criticB_net.load_state_dict(criticB_target.state_dict())
actorA_net.load_state_dict(actorA_target.state_dict())

for epi in range(num_epi):
  ou_noise = OUProcess(mu=np.zeros(1))
  state = Env.reset()
  cum_r = 0 

  while True:
    action = agent.get_action(state) + ou_noise()[0]
    next_state, reward, done, info = Env.step(action)
    
    sample = (state, action, next_state, reward, done)
    memory.push(sample)
  
    state = next_state
    cum_r += reward

    if len(memory) >= sampling_only_until:
      batch = memory.sample(batch_size)
      batch = preprocessing(batch)
      agent.update(*batch)
      soft_update(criticA_net, criticA_target, tau)
      soft_update(actorA_net, actorA_target, tau)

    if done:
      break
  
  if epi % print_every == 0:
      print("Episode: {} | cum_r: {}".format(epi, cum_r))

Episode: 0 | cum_r: tensor([-1335.3964])
Episode: 10 | cum_r: tensor([-967.1556])
Episode: 20 | cum_r: tensor([-1247.2682])
Episode: 30 | cum_r: tensor([-1883.2717])
Episode: 40 | cum_r: tensor([-750.3878])
Episode: 50 | cum_r: tensor([-1190.5750])
Episode: 60 | cum_r: tensor([-1074.7672])
Episode: 70 | cum_r: tensor([-852.1053])
Episode: 80 | cum_r: tensor([-865.8614])
Episode: 90 | cum_r: tensor([-1365.4606])
Episode: 100 | cum_r: tensor([-861.6691])
Episode: 110 | cum_r: tensor([-1263.3657])
Episode: 120 | cum_r: tensor([-1605.4863])
Episode: 130 | cum_r: tensor([-935.4053])
Episode: 140 | cum_r: tensor([-1598.9093])
Episode: 150 | cum_r: tensor([-871.4820])
Episode: 160 | cum_r: tensor([-663.4680])
Episode: 170 | cum_r: tensor([-1044.7020])
Episode: 180 | cum_r: tensor([-838.1151])
Episode: 190 | cum_r: tensor([-1267.9717])
