In [5]:
import gym
import collections
import random

In [6]:
# cuda version
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [7]:
!nvidia-smi

Wed Oct 25 21:24:20 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.58                 Driver Version: 537.58       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   45C    P0              27W / 285W |    548MiB / 12282MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [9]:
# hyperparameters
learning_rate = 0.0005
gamma = 0.98
buffer_limit = 50000
batch_size = 32

In [24]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
        
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])
        
        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0, 1)
        else:
            return out.argmax().item()

In [25]:
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)
            
        q_out = q(s)
        q_a = q_out.gather(1, a)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
            
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [26]:
env = gym.make('CartPole-v1')

In [27]:
q = Qnet()
q_target = Qnet()

In [28]:
q_target.load_state_dict(q.state_dict())

<All keys matched successfully>

In [29]:
memory = ReplayBuffer()

In [30]:
print_interval = 100
score = 0.0
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

In [31]:
for n_epi in range(10000):
    # linear annealing from 8% to 1%
    epsilon = max(0.01, 0.08 - 0.01 * (n_epi/200))
    s, _ = env.reset()
    done = False
    
    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon)
        s_prime, r, done, truncated, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s, a, r/100.0, s_prime, done_mask))     # for learing properly r is divided by 100
        s = s_prime
        
        score += r
        if done:
            break
        
    if memory.size() > 2000:
        train(q, q_target, memory, optimizer)
        
    if n_epi%print_interval==0 and n_epi !=0:
        q_target.load_state_dict(q.state_dict())
        print('n_episode: {}, score: {:.1f}, n_buffer: {}, eps: {:1f}%'
              .format(n_epi, score/print_interval, memory.size(), epsilon*100))
        score = 0.0
env.close()

n_episode: 100, score: 9.8, n_buffer: 983, eps: 7.500000%
n_episode: 200, score: 9.5, n_buffer: 1936, eps: 7.000000%




n_episode: 300, score: 10.1, n_buffer: 2946, eps: 6.500000%
n_episode: 400, score: 9.8, n_buffer: 3929, eps: 6.000000%
n_episode: 500, score: 10.3, n_buffer: 4963, eps: 5.500000%
n_episode: 600, score: 11.4, n_buffer: 6107, eps: 5.000000%
n_episode: 700, score: 35.8, n_buffer: 9686, eps: 4.500000%
n_episode: 800, score: 61.0, n_buffer: 15785, eps: 4.000000%
n_episode: 900, score: 107.9, n_buffer: 26574, eps: 3.500000%
n_episode: 1000, score: 125.3, n_buffer: 39108, eps: 3.000000%
n_episode: 1100, score: 6268.9, n_buffer: 50000, eps: 2.500000%
n_episode: 1200, score: 140.3, n_buffer: 50000, eps: 2.000000%
n_episode: 1300, score: 199.2, n_buffer: 50000, eps: 1.500000%
n_episode: 1400, score: 280.2, n_buffer: 50000, eps: 1.000000%
n_episode: 1500, score: 200.3, n_buffer: 50000, eps: 1.000000%
n_episode: 1600, score: 101.4, n_buffer: 50000, eps: 1.000000%
n_episode: 1700, score: 111.6, n_buffer: 50000, eps: 1.000000%
n_episode: 1800, score: 141.8, n_buffer: 50000, eps: 1.000000%
n_episode:

KeyboardInterrupt: 