In [None]:
! pip install gym



In [None]:
import numpy as np
import torch 
import torch.nn as nn
import gym
import random
import matplotlib.pyplot as plt
import numpy as np

from torch.distributions.categorical import Categorical

In [None]:
# Build the Pendulum-v0 environment that the rest of the notebook interacts with.
env = gym.make("Pendulum-v0")
# Bare expression: shows the environment's repr as the cell output.
env

<TimeLimit<PendulumEnv<Pendulum-v0>>>

In [None]:
# Pendulum action space (displayed as the cell output).
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [None]:
# Pendulum state (observation) space (displayed as the cell output).
env.observation_space

Box(-8.0, 8.0, (3,), float32)

In [None]:
# Observation layout: (cosine, sine, angular velocity).
# Resets the environment and displays the initial observation as the cell output.
env.reset()

array([-0.09803687,  0.99518278,  0.35303924])

In [None]:
class OUProcess:
  """Discretised Ornstein-Uhlenbeck process used as temporally-correlated noise.

  Every call advances the internal state by one step:

      x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)

  Args:
    mu: Mean the process reverts to (scalar or array-like). Its shape fixes
      the shape of every sample.
    theta: Mean-reversion rate (default 0.1).
    dt: Time step of the discretisation (default 0.01).
    sigma: Scale of the Gaussian perturbation (default 0.1).
  """

  def __init__(self, mu, theta=0.1, dt=0.01, sigma=0.1):
    self.theta, self.dt, self.sigma = theta, dt, sigma
    # Coerce to an ndarray so that scalars and lists also expose `.shape`.
    self.mu = np.asarray(mu, dtype=float)
    # The process starts at zero, not at mu.
    self.x_prev = np.zeros_like(self.mu)

  def __call__(self):
    """Advance the process by one step and return the new state (same shape as mu)."""
    drift = self.theta * (self.mu - self.x_prev) * self.dt
    diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
    x = self.x_prev + drift + diffusion
    self.x_prev = x
    return x

In [None]:
# Experience replay (used instead of a deque)
class ReplayMemory:
  """Fixed-capacity circular buffer of experiences with uniform random sampling."""

  def __init__(self, max_size):
    self.max_size = max_size
    # Pre-allocate every slot; unused slots hold None until written.
    self.buffer = [None for _ in range(max_size)]
    self.size = 0
    self.index = 0

  def push(self, obj):
    """Write `obj` at the cursor, overwriting the oldest entry once the buffer is full."""
    write_pos = self.index
    self.buffer[write_pos] = obj
    if self.size < self.max_size:
      self.size += 1
    # Wrap the cursor back to 0 after the last slot.
    self.index = (write_pos + 1) % self.max_size

  def sample(self, batch_size):
    """Return `batch_size` distinct stored experiences chosen uniformly at random."""
    # Draw distinct indices among the slots that have been filled so far.
    picked = random.sample(range(self.size), batch_size)
    return [self.buffer[slot] for slot in picked]

  def __len__(self):
    """Number of experiences currently stored."""
    return self.size

In [None]:
def prepare_training_inputs(sampled_exps, device="cpu"):
  """Collate sampled experiences into batched float32 tensors.

  Args:
    sampled_exps: Non-empty sequence of experiences. Each experience is indexable
      and holds, in order, (state, action, reward, next_state, done) tensors
      that can be concatenated along dim 0.
    device: Device the batched tensors are moved to.

  Returns:
    Tuple (states, actions, rewards, next_states, dones), each concatenated
    along dim 0, cast to float32 and placed on `device`.

  Raises:
    ValueError: If `sampled_exps` is empty.
  """
  sampled_exps = list(sampled_exps)
  if not sampled_exps:
    raise ValueError("prepare_training_inputs() needs at least one experience")

  def _collate(field_index):
    parts = [exp[field_index] for exp in sampled_exps]
    # Every field, actions included, is cast to float32 so that float64 actions
    # (e.g. built from NumPy arrays) match the networks' float32 weights.
    return torch.cat(parts, dim=0).float().to(device)

  states, actions, rewards, next_states, dones = (_collate(i) for i in range(5))
  return states, actions, rewards, next_states, dones

In [None]:
class MLP(nn.Module):
  """Fully-connected network: Linear -> activation for each hidden width, then an output layer.

  Args:
    input_dim: Size of the input features.
    output_dim: Size of the output features.
    hidden_act: Name of a class in `torch.nn` used after every hidden layer
      (e.g. "ReLU"); it is instantiated with no arguments.
    out_act: Name of a class in `torch.nn` applied after the output layer
      (e.g. "Identity"); it is instantiated with no arguments.
    hidden_dims: Widths of the hidden layers, in order. Defaults to (128, 64).
  """
  def __init__(self,
               input_dim:int, output_dim:int,
               hidden_act:str, out_act: str,
               hidden_dims=(128, 64)):

    super().__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dims = tuple(hidden_dims)
    self.hidden_act = getattr(nn, hidden_act)()
    self.out_act = getattr(nn, out_act)()

    self.layers = nn.ModuleList()
    in_features = self.input_dim
    for width in self.hidden_dims:
      self.layers.append(nn.Linear(in_features, width))
      # The same activation instance is reused after every hidden layer.
      self.layers.append(self.hidden_act)
      in_features = width
    self.layers.append(nn.Linear(in_features, self.output_dim))
    self.layers.append(self.out_act)

  def forward(self, xs):
    """Apply every layer in order and return the result."""
    for layer in self.layers:
      xs = layer(xs)
    return xs

In [None]:
#Actor, Critic
class Actor(nn.Module):
  """Deterministic policy: maps a 3-dimensional state to one action value in [-2, 2]."""

  def __init__(self):
    super().__init__()

    # 3 state features in, 1 action out; no output non-linearity (clamping happens in forward).
    self.mlp = MLP(input_dim=3,
                   output_dim=1,
                   hidden_act="ReLU",
                   out_act="Identity")

  def forward(self, state):
    """Return the MLP output for `state`, clamped element-wise to [-2.0, 2.0]."""
    raw_action = self.mlp(state)
    return torch.clamp(raw_action, min=-2.0, max=2.0)
  
class Critic(nn.Module):
  """Q(s, a) estimator: embeds state and action separately, then scores the concatenation.

  The state (3 features) and the action (1 feature) are each projected to a
  64-dimensional ReLU embedding; the two embeddings are concatenated into a
  128-dimensional vector that `q_estimator` maps to a single Q-value.
  """
  def __init__(self):
    super(Critic, self).__init__()

    # The embedding layers are created once here so that they are registered as
    # parameters: they then show up in `parameters()` / `state_dict()`, are trained
    # by the optimizer and are copied to the target network. Creating them inside
    # `forward` would give fresh random, untrained weights on every call.
    self.state_encoder = nn.Sequential(nn.Linear(in_features=3, out_features=64),
                                       nn.ReLU())
    self.action_encoder = nn.Sequential(nn.Linear(in_features=1, out_features=64),
                                        nn.ReLU())

    self.q_estimator = MLP(input_dim=128,
                           output_dim=1,
                           hidden_act="ReLU",
                           out_act="Identity")

  def forward(self, s, a):
    """Return Q-values for a batch of states `s` (..., 3) and actions `a` (..., 1)."""
    s_vec = self.state_encoder(s)
    a_vec = self.action_encoder(a)

    emb = torch.cat([s_vec, a_vec], dim=-1)
    return self.q_estimator(emb)

In [None]:
class DDPG(nn.Module):
  """Actor-critic agent holding online/target networks and their optimizers.

  Args:
    critic: Online Q-network, called as `critic(state, action)`.
    critic_target: Target Q-network; its weights are overwritten with the critic's.
    actor: Online policy network, called as `actor(state)`.
    actor_target: Target policy network; its weights are overwritten with the actor's.
    lr_critic: Learning rate of the critic's Adam optimizer.
    lr_actor: Learning rate of the actor's Adam optimizer.
    gamma: Discount factor used in the TD target.
  """
  def __init__(self,
               critic:nn.Module,
               critic_target:nn.Module,
               actor:nn.Module,
               actor_target:nn.Module,
               lr_critic:float = 0.0005,
               lr_actor:float = 0.001,
               gamma:float = 0.99):
    super().__init__()

    # Online networks.
    self.critic = critic
    self.actor = actor

    # Hyperparameters.
    self.gamma = gamma
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic

    # One Adam optimizer per online network.
    self.critic_opt = torch.optim.Adam(critic.parameters(), lr=lr_critic)
    self.actor_opt = torch.optim.Adam(actor.parameters(), lr=lr_actor)

    # Target networks start as exact copies of the online networks.
    critic_target.load_state_dict(critic.state_dict())
    actor_target.load_state_dict(actor.state_dict())
    self.critic_target = critic_target
    self.actor_target = actor_target

    self.criteria = nn.SmoothL1Loss()

  def get_action(self, state):
    """Return the online actor's output for `state`, computed without tracking gradients."""
    with torch.no_grad():
      return self.actor(state)

  def update(self, state, action, reward, next_state, done):
    """Run one critic step followed by one actor step on a batch of transitions."""
    # Critic: regress Q(s, a) onto the TD target built from the target networks.
    with torch.no_grad():
      next_action = self.actor_target(next_state)
      next_q = self.critic_target(next_state, next_action)
      bootstrap = self.gamma * next_q * (1 - done)
      td_target = reward + bootstrap
    q_pred = self.critic(state, action)
    critic_loss = self.criteria(q_pred, td_target)

    self.critic_opt.zero_grad()
    critic_loss.backward()
    self.critic_opt.step()

    # Actor: ascend the critic's estimate of Q(s, actor(s)).
    policy_action = self.actor(state)
    actor_loss = -self.critic(state, policy_action).mean()

    self.actor_opt.zero_grad()
    actor_loss.backward()
    self.actor_opt.step()

In [None]:
#Soft target update
def soft_update(net, net_target, tau):
  """Blend `net`'s parameters into `net_target` in place.

  Each target parameter becomes `target * (1 - tau) + source * tau`; parameters
  are paired in the order returned by `parameters()`. `net` is left untouched.
  """
  keep = 1.0 - tau
  for source, target in zip(net.parameters(), net_target.parameters()):
    blended = target.data * keep + source.data * tau
    target.data.copy_(blended)

In [None]:
# Learning rates and discount factor (named after the DDPG constructor's keyword arguments).
lr_actor = 0.005
lr_critic = 0.001
gamma = 0.99
# Number of experiences drawn from the replay memory for each training step.
batch_size = 256
# Capacity of the replay memory.
memory_size = 50000
# Blend weight used by the soft target-network update.
tau = 0.001
# Training only starts once the replay memory holds at least this many experiences.
sampling_only_until = 2000 

In [None]:
# Online networks plus separate instances that serve as target networks
# (DDPG copies the online weights into the targets on construction).
actor, actor_target = Actor(), Actor()
critic, critic_target = Critic(), Critic()

# Pass the notebook-level hyperparameters explicitly; without these keyword
# arguments DDPG falls back to its own defaults and the values defined in the
# hyperparameter cell have no effect.
agent = DDPG(critic=critic,
             critic_target=critic_target,
             actor=actor,
             actor_target=actor_target,
             lr_critic=lr_critic,
             lr_actor=lr_actor,
             gamma=gamma)

memory = ReplayMemory(max_size=memory_size)

In [None]:
total_eps = 200
print_every = 10

for n_epi in range(total_eps):
  # Fresh exploration noise for every episode.
  ou_noise = OUProcess(mu=np.zeros(1))
  s = env.reset()
  cum_r = 0.0

  while True:
    s = torch.tensor(s).float().view(1,3)
    # Keep the action as a flat (1,) vector, matching the shape of env.action_space.
    # Passing a (1, 1) array makes the environment return array-valued rewards.
    a = agent.get_action(s).cpu().numpy().reshape(-1) + ou_noise()
    # Clip after adding noise so the stored action is the one the environment applies.
    a = np.clip(a, env.action_space.low, env.action_space.high)
    ns, r, done, info = env.step(a)
    r = float(r)

    # An episode cut off by the time limit is not a true terminal state, so the
    # critic should still bootstrap from `ns` in that case.
    terminal = bool(done) and not info.get("TimeLimit.truncated", False)

    # Tensors are created as float32 explicitly so they match the network weights
    # regardless of NumPy's type-promotion rules.
    experience = (s,
                  torch.tensor(a, dtype=torch.float32).view(1,1),
                  torch.tensor(r, dtype=torch.float32).view(1,1),
                  torch.tensor(ns).float().view(1,3),
                  torch.tensor(terminal, dtype=torch.float32).view(1,1))

    memory.push(experience)

    s = ns
    cum_r += r

    # Only collect experience until the replay memory is warm.
    if len(memory) >= sampling_only_until:
      # Train the agent on a random minibatch.
      sampled_exps = memory.sample(batch_size)
      sampled_exps = prepare_training_inputs(sampled_exps)
      agent.update(*sampled_exps)
      # Move the target networks a small step towards the online networks.
      soft_update(agent.actor, agent.actor_target, tau)
      soft_update(agent.critic, agent.critic_target, tau)

    if done:
      break

  if n_epi % print_every == 0:
    msg = (n_epi, cum_r)
    print("Episode: {} | Cumulative Reward: {}".format(*msg))

Episode: 0 | Cumulative Reward: [-1249.9844]
Episode: 10 | Cumulative Reward: [-1671.9213]
Episode: 20 | Cumulative Reward: [-862.99005]
Episode: 30 | Cumulative Reward: [-1395.553]
Episode: 40 | Cumulative Reward: [-1081.5059]
Episode: 50 | Cumulative Reward: [-1434.7241]
Episode: 60 | Cumulative Reward: [-861.8643]
Episode: 70 | Cumulative Reward: [-1078.3242]
Episode: 80 | Cumulative Reward: [-1452.9554]
Episode: 90 | Cumulative Reward: [-670.442]
Episode: 100 | Cumulative Reward: [-1117.2136]
Episode: 110 | Cumulative Reward: [-1069.6384]
Episode: 120 | Cumulative Reward: [-969.6323]
Episode: 130 | Cumulative Reward: [-1319.126]
Episode: 140 | Cumulative Reward: [-1236.5948]
Episode: 150 | Cumulative Reward: [-1058.4697]
Episode: 160 | Cumulative Reward: [-1017.4551]
Episode: 170 | Cumulative Reward: [-1882.1885]
Episode: 180 | Cumulative Reward: [-1513.8538]
Episode: 190 | Cumulative Reward: [-1251.8711]
