In [1]:
import gym
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# Hyperparameters
learning_rate = 0.0005
gamma = 0.98
lmbda = 0.95
eps_clip = 0.1
K_epoch = 3
T_horizon = 20

In [3]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []
        
        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
    
    def pi(self, x, softmax_dim=0):
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
    
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, prob_a, done = transition
            
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_mask = 0 if done else 1
            done_lst.append([done_mask])
            
        s, a, r, s_prime, done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), \
            torch.tensor(a_lst), torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
            torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a
    
    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()
        
        for i in range(K_epoch):
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = td_target - self.v(s)
            delta = delta.detach().numpy()  # fixed delta from training
            
            advantage_lst = []
            advatage = 0.0
            for delta_t in delta[::-1]:     # delta 뒤에서부터 꺼내오기
                advatage = gamma * lmbda * advatage + delta_t[0]    # 뒤에서부터 계산
                advantage_lst.append([advatage])
            advantage_lst.reverse()     # 뒤에서부터 계산했으므로 뒤집기
            advatage = torch.tensor(advantage_lst, dtype=torch.float)
            
            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1, a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))      # a/b == exp(log(a) - log(b))
            
            surr1 = ratio * advatage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advatage   # clamp is clipping funtion
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach())
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        

In [4]:
env = gym.make('CartPole-v1')

In [5]:
model = PPO()

In [6]:
score = 0.0
print_interval = 100


In [7]:
for n_epi in range(10000):
    s, _ = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, truncated, info = env.step(a)
            
            model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done))
            s = s_prime
            
            score += r
            if done:
                break
            
        model.train_net()
    
    if n_epi%print_interval==0 and n_epi!=0:
        print('# of eposide: {}, avg score: {:.1f}'.format(n_epi, score/print_interval))
        score = 0.0
        
env.close()



# of eposide: 100, avg score: 30.2
# of eposide: 200, avg score: 46.7
# of eposide: 300, avg score: 75.9
# of eposide: 400, avg score: 363.7
# of eposide: 500, avg score: 2345.1
# of eposide: 600, avg score: 195.2
# of eposide: 700, avg score: 795.2
# of eposide: 800, avg score: 112.3
# of eposide: 900, avg score: 102.5
# of eposide: 1000, avg score: 114.9
# of eposide: 1100, avg score: 156.5
# of eposide: 1200, avg score: 230.3
# of eposide: 1300, avg score: 235.2
# of eposide: 1400, avg score: 447.6
# of eposide: 1500, avg score: 565.7
# of eposide: 1600, avg score: 124.9
# of eposide: 1700, avg score: 131.3
# of eposide: 1800, avg score: 253.0
# of eposide: 1900, avg score: 382.7
# of eposide: 2000, avg score: 470.5
# of eposide: 2100, avg score: 884.5
# of eposide: 2200, avg score: 439.6
# of eposide: 2300, avg score: 431.4
# of eposide: 2400, avg score: 150.7
# of eposide: 2500, avg score: 203.0
# of eposide: 2600, avg score: 135.5
# of eposide: 2700, avg score: 116.8
# of eposide