In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters
# Step size handed to the Adam optimizer created in ActorCritic.__init__.
learning_rate = 0.0002
# Discount factor applied to the bootstrapped value in the TD target.
gamma = 0.98
# Upper bound on environment steps collected between two train_net() calls.
n_rollout = 10
# Number of episodes main() runs; the trailing 10000 is an alternative value left by the author.
EPISODE = 5000 # 10000

In [2]:
class ActorCritic(nn.Module):
    """Actor-critic network with a shared hidden layer and a built-in rollout buffer.

    A single fully connected layer feeds two heads: ``fc_pi`` (action
    logits) and ``fc_v`` (scalar state value). Transitions are accumulated
    with ``put_data`` and consumed by ``train_net``, which performs one
    optimizer step on the combined policy and value loss.

    Args:
        state_dim: Size of the observation vector (default 4).
        action_dim: Number of discrete actions (default 2).
        hidden_dim: Width of the shared hidden layer (default 256).
        lr: Adam learning rate. ``None`` falls back to the module-level
            ``learning_rate``.
        discount: Discount factor for the TD target. ``None`` falls back to
            the module-level ``gamma`` at training time.
    """

    def __init__(self, state_dim=4, action_dim=2, hidden_dim=256, lr=None, discount=None):
        super(ActorCritic, self).__init__()
        self.data = []  # pending (s, a, r, s_prime, done) transitions
        self.discount = discount

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc_pi = nn.Linear(hidden_dim, action_dim)
        self.fc_v = nn.Linear(hidden_dim, 1)
        self.optimizer = optim.Adam(
            self.parameters(), lr=learning_rate if lr is None else lr
        )

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for state(s) ``x``.

        Args:
            x: Float tensor whose last dimension matches the input size of ``fc1``.
            softmax_dim: Dimension to normalise over; use 0 for a single
                state vector and 1 for a batch of states.
        """
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        """Return the estimated state value for state(s) ``x`` (last dimension is 1)."""
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, item):
        """Append one ``(s, a, r, s_prime, done)`` transition to the buffer."""
        self.data.append(item)

    @staticmethod
    def _stack_states(states):
        """Stack a list of state vectors into one float tensor.

        Converting each state separately and stacking avoids the slow
        ``torch.tensor(list_of_ndarrays)`` path that PyTorch warns about.
        An empty list yields an empty float tensor, as before.
        """
        if not states:
            return torch.tensor([], dtype=torch.float)
        return torch.stack([torch.as_tensor(s, dtype=torch.float) for s in states])

    def make_batch(self):
        """Convert the buffered transitions to tensors and clear the buffer.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)``. ``a``, ``r`` and
            ``done_mask`` have one column per transition row; ``a`` is int64
            so it can be used with ``gather``. Rewards are divided by 100.
            ``done_mask`` is 0.0 where the transition ended the episode and
            1.0 otherwise, so it can be multiplied into the bootstrap term.
        """
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []

        for s, a, r, s_prime, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r / 100.0])
            s_prime_lst.append(s_prime)
            done_lst.append([0.0 if done else 1.0])

        s_batch = self._stack_states(s_lst)
        a_batch = torch.tensor(a_lst, dtype=torch.long)
        r_batch = torch.tensor(r_lst, dtype=torch.float)
        s_prime_batch = self._stack_states(s_prime_lst)
        done_batch = torch.tensor(done_lst, dtype=torch.float)
        self.data = []

        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        """Run one gradient step on the buffered transitions.

        The loss is the policy-gradient term ``-log pi(a|s) * delta`` (with
        the TD error ``delta`` detached) plus a smooth-L1 value loss against
        the detached TD target. Does nothing when the buffer is empty.
        """
        if not self.data:
            # Nothing collected since the last update; the network cannot
            # process an empty batch.
            return

        s, a, r, s_prime, done = self.make_batch()
        discount = gamma if self.discount is None else self.discount

        # Evaluate V(s) once and reuse it for both the TD error and the value loss.
        v_s = self.v(s)
        td_target = r + discount * self.v(s_prime) * done
        delta = td_target - v_s

        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1, a)
        loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(v_s, td_target.detach())

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()


In [3]:
def main():
    """Train an ActorCritic agent on CartPole-v1 and print the average score.

    Runs ``EPISODE`` episodes. Within an episode, up to ``n_rollout`` steps
    are collected before each ``train_net()`` call. After every
    ``print_interval`` completed episodes the mean episode score over that
    window is printed. The environment is closed even if training raises.
    """
    env = gym.make('CartPole-v1')
    model = ActorCritic()
    score = 0.0
    print_interval = 100

    try:
        for n_epi in range(EPISODE):
            # The state s is (cart position, cart velocity, pole angle,
            # pole angular velocity),
            # e.g. 1.1803e-01, 1.3199e+00, -1.5365e-01, -2.0408e+00
            reset_out = env.reset()
            # gym >= 0.26 returns (observation, info); older versions return
            # only the observation.
            s = reset_out[0] if isinstance(reset_out, tuple) else reset_out

            done = False

            while not done:
                for _ in range(n_rollout):
                    # prob holds the action probabilities (left, right).
                    prob = model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)

                    # Sample one action at random from prob; a is 0 or 1.
                    a = m.sample().item()

                    step_out = env.step(a)
                    if len(step_out) == 5:
                        # gym >= 0.26: separate terminated / truncated flags.
                        # Both end the episode, matching the single `done`
                        # flag of the older API.
                        s_prime, r, terminated, truncated, _info = step_out
                        done = terminated or truncated
                    else:
                        s_prime, r, done, _info = step_out
                    model.put_data((s, a, r, s_prime, done))

                    s = s_prime
                    score += r

                    if done:
                        break

                model.train_net()

            # Report after every `print_interval` completed episodes so that
            # each average covers exactly `print_interval` episodes.
            if (n_epi + 1) % print_interval == 0:
                print('# of episode : {}, avg score :  {}'.format(n_epi + 1, score/print_interval))
                score = 0.0
    finally:
        env.close()

In [4]:
# Run the training loop; progress lines are printed to the cell output.
main()

# of episode : 100, avg score :  23.97
# of episode : 200, avg score :  35.13
# of episode : 300, avg score :  48.26
# of episode : 400, avg score :  113.15
# of episode : 500, avg score :  223.05
# of episode : 600, avg score :  252.53
# of episode : 700, avg score :  205.25
# of episode : 800, avg score :  334.65
# of episode : 900, avg score :  306.89
# of episode : 1000, avg score :  394.72
# of episode : 1100, avg score :  434.08
# of episode : 1200, avg score :  461.06
# of episode : 1300, avg score :  162.5
# of episode : 1400, avg score :  233.97
# of episode : 1500, avg score :  347.66
# of episode : 1600, avg score :  335.27
# of episode : 1700, avg score :  237.02
# of episode : 1800, avg score :  324.84
# of episode : 1900, avg score :  275.63
# of episode : 2000, avg score :  262.66
# of episode : 2100, avg score :  246.45
# of episode : 2200, avg score :  296.54
# of episode : 2300, avg score :  261.13
# of episode : 2400, avg score :  327.24
# of episode : 2500, avg scor