In [1]:
import gym
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
# NOTE(review): nothing in the visible cells moves the model or tensors onto
# `device`, so everything appears to run on the default device — confirm intent.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [3]:
# Hyperparameters
learning_rate = 2e-4  # step size handed to the Adam optimizer in Policy
gamma = 0.98          # discount factor used when accumulating returns

In [13]:
class Policy(nn.Module):
    """Small softmax policy network trained with a REINFORCE-style update.

    The network maps a 4-feature observation to a probability distribution
    over 2 actions through one hidden layer of 128 ReLU units. Besides the
    layers, the module owns its Adam optimizer and a per-episode buffer of
    ``(reward, action_probability)`` pairs.

    Relies on two module-level names: ``learning_rate`` (read once, when the
    instance is constructed) and ``gamma`` (read on every ``train_net`` call).
    """

    def __init__(self):
        super().__init__()
        # Episode buffer filled by put_data() and drained by train_net().
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        # Created after the layers so self.parameters() already contains them.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return action probabilities for observation(s) ``x``.

        Args:
            x: float tensor whose last dimension has size 4; either a single
                observation of shape ``(4,)`` or a batch ``(..., 4)``.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by 2,
            summing to 1 along that last dimension.
        """
        x = F.relu(self.fc1(x))
        # Normalise over the action axis (the last one). For a 1-D observation
        # this equals dim=0; for batched input dim=0 would wrongly normalise
        # across the batch instead of across actions.
        return F.softmax(self.fc2(x), dim=-1)

    def put_data(self, item):
        """Append one ``(reward, action_probability)`` pair to the episode buffer.

        ``action_probability`` must still be attached to the autograd graph,
        because ``train_net`` backpropagates through it.
        """
        self.data.append(item)

    def train_net(self):
        """Run one policy-gradient step over the buffered episode, then clear it.

        Walks the buffer from the last step to the first so the discounted
        return ``R`` can be built incrementally (``R = r + gamma * R``). Each
        step contributes ``-log(prob) * R``; gradients from the individual
        ``backward()`` calls accumulate before the single optimizer step.
        """
        R = 0
        self.optimizer.zero_grad()
        for r, prob in reversed(self.data):
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss.backward()
        self.optimizer.step()
        self.data = []

In [14]:
# Build the CartPole-v1 environment that the training loop resets and steps.
env = gym.make('CartPole-v1')

In [15]:
# Instantiate the policy network; its constructor also creates the Adam optimizer.
pi = Policy()

In [16]:
# Number of episodes between progress reports.
print_interval = 100
# Running total of episode rewards, used for the periodic average.
score = 0.0

In [17]:
# Train the policy one full episode at a time: roll out an episode while
# recording (reward, probability of the sampled action) pairs, then let the
# policy take a single gradient step over that episode.
try:
    for n_epi in range(10000):
        s, _ = env.reset()
        done = False

        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, terminated, truncated, info = env.step(a.item())
            # env.step reports two separate end-of-episode flags; stop on either.
            # Looking only at the first one would keep stepping past a
            # truncation (e.g. an environment time limit).
            done = terminated or truncated
            pi.put_data((r, prob[a]))
            s = s_prime
            score += r

        pi.train_net()

        # Report after every `print_interval` completed episodes, so each
        # window (including the first) averages exactly that many episodes.
        if (n_epi + 1) % print_interval == 0:
            print('# of episodes: {}, avg score: {}'
                  .format(n_epi + 1, score/print_interval))
            # Reset the accumulator so the next report covers only its own window.
            score = 0.0
finally:
    # Release the environment even if training is interrupted part-way.
    env.close()

# of episodes: 100, avg score: 19.93
# of episodes: 200, avg score: 23.6
# of episodes: 300, avg score: 28.38
# of episodes: 400, avg score: 37.79
# of episodes: 500, avg score: 41.48
# of episodes: 600, avg score: 52.21
# of episodes: 700, avg score: 52.6
# of episodes: 800, avg score: 58.86
# of episodes: 900, avg score: 68.9
# of episodes: 1000, avg score: 84.04
# of episodes: 1100, avg score: 96.36
# of episodes: 1200, avg score: 113.67
# of episodes: 1300, avg score: 136.58
# of episodes: 1400, avg score: 172.21
# of episodes: 1500, avg score: 173.56
# of episodes: 1600, avg score: 189.1
# of episodes: 1700, avg score: 237.01
# of episodes: 1800, avg score: 216.79
# of episodes: 1900, avg score: 248.78
# of episodes: 2000, avg score: 300.54
# of episodes: 2100, avg score: 271.23
# of episodes: 2200, avg score: 318.45
# of episodes: 2300, avg score: 278.25
# of episodes: 2400, avg score: 318.14
# of episodes: 2500, avg score: 284.06
# of episodes: 2600, avg score: 348.5
# of episod

KeyboardInterrupt: 