In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Training hyperparameters (read as module-level globals by the cells below).
learning_rate = 2e-4  # step size handed to the Adam optimizer
gamma = 0.98          # per-step discount applied when accumulating returns
EPISODE = 5_000       # number of training episodes (50000 was noted as an alternative)

In [2]:
class Policy(nn.Module):
    """Two-layer softmax policy trained with the REINFORCE update.

    The network maps a 4-feature input to a probability distribution over
    2 actions. During an episode the caller stores ``(reward, prob)`` pairs
    with :meth:`put_data`; :meth:`train_net` then turns the stored episode
    into a single optimizer step and clears the buffer.

    Args:
        lr: Learning rate for the Adam optimizer. When ``None`` (default) the
            module-level ``learning_rate`` is used.
    """

    def __init__(self, lr=None):
        super(Policy, self).__init__()
        # Per-episode buffer of (reward, probability of the chosen action).
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        # Fall back to the notebook-level hyperparameter when no rate is given.
        step_size = learning_rate if lr is None else lr
        self.optimizer = optim.Adam(self.parameters(), lr=step_size)

    def forward(self, x):
        """Return action probabilities for ``x``.

        The softmax is taken over the last axis, so the result sums to 1 per
        state both for a single input of shape ``(4,)`` and for a batch of
        shape ``(N, 4)``.
        """
        x = F.relu(self.fc1(x))
        # dim=-1 is the action axis; dim=0 would normalize across the batch
        # whenever a leading batch dimension is present.
        return F.softmax(self.fc2(x), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, prob)`` pair to the episode buffer."""
        self.data.append(item)

    def train_net(self, discount=None):
        """Apply one policy-gradient step from the buffered episode.

        Walks the buffer backwards, accumulating the discounted return ``R``
        and back-propagating ``-R * log(prob)`` for every step, then performs
        a single optimizer step and empties the buffer.

        Args:
            discount: Discount factor for the return. When ``None`` (default)
                the module-level ``gamma`` is used.
        """
        if not self.data:
            # Nothing was collected: skip the optimizer step so stale
            # optimizer state cannot move the weights without any data.
            return

        discount = gamma if discount is None else discount
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + discount * R
            loss = -R * torch.log(prob)
            # Gradients accumulate across steps until optimizer.step().
            loss.backward()

        self.optimizer.step()
        self.data = []

In [3]:
def main():
    """Train a ``Policy`` on CartPole-v1 and print a periodic average score.

    Runs ``EPISODE`` episodes. In each episode actions are sampled from the
    policy's output distribution, the ``(reward, prob_of_chosen_action)``
    pairs are stored on the policy, and ``train_net`` is called once the
    episode ends. Every ``print_interval`` episodes the mean score of the
    episodes accumulated since the previous report is printed.

    Both gym return conventions are accepted: ``reset()`` returning either
    the observation or ``(observation, info)``, and ``step()`` returning
    either 4 values (with ``done``) or 5 values (``terminated``/``truncated``).
    """
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    episodes_since_report = 0
    print_interval = 100

    try:
        for n_epi in range(EPISODE):
            # The state is (cart position, cart velocity, pole angle,
            # pole angular velocity),
            # e.g. 1.1803e-01, 1.3199e+00, -1.5365e-01, -2.0408e+00
            reset_out = env.reset()
            s = reset_out[0] if isinstance(reset_out, tuple) else reset_out
            done = False

            while not done:
                # prob holds the action probabilities (left, right).
                prob = pi(torch.from_numpy(s).float())

                # Categorical distribution over the two actions.
                m = Categorical(prob)

                # Randomly draw one action: tensor(0) or tensor(1).
                a = m.sample()

                step_out = env.step(a.item())
                if len(step_out) == 5:
                    s_prime, r, terminated, truncated, _ = step_out
                    done = terminated or truncated
                else:
                    s_prime, r, done, _ = step_out

                pi.put_data((r, prob[a]))
                s = s_prime
                score += r

            pi.train_net()
            episodes_since_report += 1
            if n_epi % print_interval == 0 and n_epi != 0:
                # Divide by the episodes actually accumulated: the first
                # window covers episodes 0..print_interval inclusive, one
                # more than print_interval.
                print('# of episode : {}, avg score :  {}'.format(n_epi, score/episodes_since_report))
                score = 0.0
                episodes_since_report = 0
    finally:
        # Release the environment even if training raises.
        env.close()
    
    

In [4]:
  # Run the training loop; main() prints the progress lines shown below.
  main()

# of episode : 100, avg score :  22.1
# of episode : 200, avg score :  23.73
# of episode : 300, avg score :  25.19
# of episode : 400, avg score :  34.31
# of episode : 500, avg score :  34.37
# of episode : 600, avg score :  42.75
# of episode : 700, avg score :  45.36
# of episode : 800, avg score :  57.21
# of episode : 900, avg score :  60.64
# of episode : 1000, avg score :  76.2
# of episode : 1100, avg score :  92.44
# of episode : 1200, avg score :  109.04
# of episode : 1300, avg score :  158.84
# of episode : 1400, avg score :  178.97
# of episode : 1500, avg score :  190.35
# of episode : 1600, avg score :  201.84
# of episode : 1700, avg score :  213.87
# of episode : 1800, avg score :  213.38
# of episode : 1900, avg score :  239.81
# of episode : 2000, avg score :  267.11
# of episode : 2100, avg score :  293.01
# of episode : 2200, avg score :  301.23
# of episode : 2300, avg score :  292.94
# of episode : 2400, avg score :  318.37
# of episode : 2500, avg score :  315.