In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Hyperpaarameters
learning_rate = 0.0005
gamma = 0.98
buffer__limit = 50000
batch_size = 32
EPISODE = 5000 # 50000

In [3]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer__limit)
        
    def put(self, transition):
        self.buffer.append(transition)
        
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [],[],[],[],[]
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])
            
        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), torch.tensor(done_mask_lst)
            
    def size(self):
        return len(self.buffer)
                                                    

In [4]:
class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def sample_action(self, obs, epsilon):
        
        # network 학습 진행
        out = self.forward(obs)
        
        coin = random.random()
        if coin < epsilon:
            return random.randint(0, 1)
        else:
            return out.argmax().item()
        

In [5]:
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)
        
        # q network 학습
        # S는 (카트의 위치, 카트의 속도, 막대의 각도, 막대의 각속도)로 구성된 State
        # S -> 1.1803e-01,  1.3199e+00, -1.5365e-01, -2.0408e+00
        # q_out는 (왼쪽 확률, 오른쪽 확률)로 구성된 Action의 확률
        # q_out -> 0.1213, 0.1994
        # a는 왼쪽(0) 또는 오늘쪽(1)의 Action 
        # a -> 0

        q_out = q(s) 
        
#         if(i == 0):
#             print('s -> ', s)
#             print('q_out -> ', q_out)
#             print('a -> ', a)

        # q_a는 q_out에서 a를 적용한 확률
        # q_a -> 0.1213        
        q_a = q_out.gather(1,a)
#         if(i == 0):
#             print('q_a -> ', q_a)        
        
        # q_target network 학습
        # 32 x 2
        target_q = q_target(s_prime)
        
        # Action값중 가장 큰 값을 선택
        # 1 x 32
        max_q = target_q.max(1)
        
        # 재배열
        # 32 x 1
        max_q_prime = max_q[0].unsqueeze(1)
        
#         if(i == 0):
#             print('target_q -> ', target_q)
#             print('max_q -> ', max_q)
#             print('max_q_prime -> ', max_q_prime)            

        
        # q_target과 q사이에 차이값 계산(?)
        target = r + gamma * max_q_prime * done_mask
        
#         if(i == 0):
#             print('done_mask -> ', done_mask)
#             print('target -> ', target)

        # 손실 계산
        # loss -> 0.0027
        loss = F.smooth_l1_loss(q_a, target)
        
#         if(i == 0):
#             print('loss -> ', loss)
        
        # loss에 대한 Gradient 계산
        optimizer.zero_grad()
        
        # Gradient 역전파 수행
        loss.backward()
        
        # weight 값 업데ㅣ트
        optimizer.step()
        

In [6]:
def main():
    env = gym.make('CartPole-v1')
    q = Qnet()
    q_target = Qnet()
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()
    
    print_interval = 100
    score = 0.0
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)
    
    for n_epi in range(EPISODE):
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200))
        
        # Linear annealing from 8% to 1%
        s = env.reset()
        done = False
        
        while not done:
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s, a, r/100.0, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
                
        if memory.size() > 2000:
            train(q, q_target, memory, optimizer)
            
        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print('n_episode : {}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%'.format(
            n_epi, score/print_interval, memory.size(), epsilon*100))
            score = 0.0
            
    env.close()

In [7]:
main()

n_episode : 100, score : 22.8, n_buffer : 2276, eps : 7.5%
n_episode : 200, score : 10.3, n_buffer : 3306, eps : 7.0%
n_episode : 300, score : 11.1, n_buffer : 4420, eps : 6.5%
n_episode : 400, score : 11.0, n_buffer : 5518, eps : 6.0%
n_episode : 500, score : 27.9, n_buffer : 8308, eps : 5.5%
n_episode : 600, score : 73.7, n_buffer : 15681, eps : 5.0%
n_episode : 700, score : 107.4, n_buffer : 26419, eps : 4.5%
n_episode : 800, score : 143.5, n_buffer : 40766, eps : 4.0%
n_episode : 900, score : 102.9, n_buffer : 50000, eps : 3.5%
n_episode : 1000, score : 145.5, n_buffer : 50000, eps : 3.0%
n_episode : 1100, score : 175.9, n_buffer : 50000, eps : 2.5%
n_episode : 1200, score : 190.8, n_buffer : 50000, eps : 2.0%
n_episode : 1300, score : 188.7, n_buffer : 50000, eps : 1.5%
n_episode : 1400, score : 169.8, n_buffer : 50000, eps : 1.0%
n_episode : 1500, score : 163.2, n_buffer : 50000, eps : 1.0%
n_episode : 1600, score : 151.1, n_buffer : 50000, eps : 1.0%
n_episode : 1700, score : 19