In [1]:
import gym
import ptan
import numpy as np
from torch.utils.tensorboard import SummaryWriter

import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Discount factor: handed to the experience source and used by calc_qvals
# to discount future rewards.
GAMMA = 0.99
# Step size passed to the Adam optimizer.
LEARNING_RATE = 0.01
# Number of complete episodes to collect before each training step.
EPISODE_TO_TRAIN = 4

In [3]:
class PGN(nn.Module):
    """Policy-gradient network: one hidden layer mapping observations to action scores.

    Args:
        input_size: number of features in a single observation.
        n_actions: number of output scores, one per action.

    The forward pass returns the raw output of the last linear layer; no
    softmax is applied inside the module.
    """

    def __init__(self, input_size, n_actions):
        super().__init__()

        hidden_size = 128
        # Layers are created in input -> output order so that parameter
        # initialisation consumes the RNG in the same sequence.
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised action scores for the batch ``x``."""
        scores = self.net(x)
        return scores

In [4]:
def calc_qvals(rewards, gamma=None):
    """Compute the discounted return that follows every step of one episode.

    For step ``t`` the result is ``r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...``
    accumulated up to the end of ``rewards``.

    Args:
        rewards: per-step rewards of a single episode, in the order they
            were received. Any iterable is accepted.
        gamma: discount factor. When omitted (``None``) the module-level
            ``GAMMA`` constant is used, which matches the previous behaviour.

    Returns:
        A list of floats with the same length and order as ``rewards``;
        an empty input yields an empty list.
    """
    if gamma is None:
        gamma = GAMMA

    res = []
    sum_r = 0.0
    # Walk backwards so each step's return is built from the one after it.
    # list() lets callers pass generators, which reversed() rejects.
    for r in reversed(list(rewards)):
        sum_r *= gamma
        sum_r += r
        res.append(sum_r)
    # Results were accumulated last-step-first; restore episode order.
    res.reverse()
    return res

In [5]:
# Environment to solve and a TensorBoard writer tagged with this run's comment.
env = gym.make("CartPole-v0")
writer = SummaryWriter(comment="-cartpole-reinforce")

# Size the policy network from the environment's observation and action spaces.
net = PGN(
    input_size=env.observation_space.shape[0],
    n_actions=env.action_space.n,
)
print(net)

PGN(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
)


In [6]:
# Adam updates every parameter of the policy network.
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

# The agent wraps the network; apply_softmax=True is requested because the
# network itself returns unnormalised scores.
agent = ptan.agent.PolicyAgent(
    net,
    preprocessor=ptan.agent.float32_preprocessor,
    apply_softmax=True,
)

# Source of experience items produced by running the agent in the environment.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env,
    agent,
    gamma=GAMMA,
)

In [7]:
# Run-wide bookkeeping: finished-episode rewards, step counter, episode counter.
total_rewards = []
step_idx = 0
done_episodes = 0

# Accumulators for the batch currently being collected.
batch_episodes = 0
batch_states = []
batch_actions = []
batch_qvals = []

# Per-step rewards of the episode that is still in progress.
cur_rewards = []

In [8]:
# REINFORCE training loop: collect whole episodes, and once EPISODE_TO_TRAIN
# of them have been gathered run one policy-gradient update on the batch.
# The try/finally guarantees the TensorBoard writer is closed even when the
# cell is interrupted or raises.
try:
    for step_idx, exp in enumerate(exp_source):
        batch_states.append(exp.state)
        batch_actions.append(int(exp.action))
        cur_rewards.append(exp.reward)

        # A missing last_state is treated as the end of an episode: turn the
        # episode's rewards into discounted returns and start a fresh episode.
        if exp.last_state is None:
            batch_qvals.extend(calc_qvals(cur_rewards))
            cur_rewards.clear()
            batch_episodes += 1

        # Handle finished episodes reported by the experience source.
        # NOTE(review): entries look like (total_reward, steps) pairs; only the
        # reward of the first entry is used - confirm against ptan.
        new_rewards = exp_source.pop_rewards_steps()
        if new_rewards:
            done_episodes += 1
            reward = new_rewards[0][0]
            total_rewards.append(reward)
            mean_rewards = float(np.mean(total_rewards[-100:]))
            print("%d: reward: %6.2f, mean_100: %6.2f, episodes: %d"
                  % (step_idx, reward, mean_rewards, done_episodes))
            writer.add_scalar("reward", reward, step_idx)
            writer.add_scalar("reward_100", mean_rewards, step_idx)
            writer.add_scalar("episodes", done_episodes, step_idx)
            if mean_rewards > 195:
                print("Solved in %d steps and %d episodes!"
                      %(step_idx, done_episodes))
                break

        # Keep collecting until enough complete episodes have passed since
        # the last training step.
        if batch_episodes < EPISODE_TO_TRAIN:
            continue

        # Convert states, actions and Q-values into PyTorch tensors.
        optimizer.zero_grad()
        # Stack the per-step observations into one ndarray first: building a
        # tensor directly from a list of ndarrays is very slow and makes
        # PyTorch emit a warning.
        states_v = torch.as_tensor(np.array(batch_states, dtype=np.float32))
        batch_actions_t = torch.LongTensor(batch_actions)
        batch_qvals_v = torch.FloatTensor(batch_qvals)

        # Loss: network output -> log-softmax -> pick the log-probability of
        # the action actually taken at each step -> scale by that step's
        # Q-value -> negate the mean so that minimising the loss raises the
        # probability of high-return actions.
        logits_v = net(states_v)
        log_prob_v = F.log_softmax(logits_v, dim=1)
        log_prob_actions_v = batch_qvals_v * log_prob_v[range(len(batch_states)), batch_actions_t]
        loss_v = -log_prob_actions_v.mean()

        loss_v.backward()
        optimizer.step()

        # Start a new batch.
        batch_episodes = 0
        batch_states.clear()
        batch_actions.clear()
        batch_qvals.clear()
finally:
    writer.close()

15: reward:  15.00, mean_100:  15.00, episodes: 1
62: reward:  47.00, mean_100:  31.00, episodes: 2
88: reward:  26.00, mean_100:  29.33, episodes: 3
103: reward:  15.00, mean_100:  25.75, episodes: 4
131: reward:  28.00, mean_100:  26.20, episodes: 5
176: reward:  45.00, mean_100:  29.33, episodes: 6
197: reward:  21.00, mean_100:  28.14, episodes: 7
211: reward:  14.00, mean_100:  26.38, episodes: 8
226: reward:  15.00, mean_100:  25.11, episodes: 9
259: reward:  33.00, mean_100:  25.90, episodes: 10
296: reward:  37.00, mean_100:  26.91, episodes: 11
312: reward:  16.00, mean_100:  26.00, episodes: 12
345: reward:  33.00, mean_100:  26.54, episodes: 13
385: reward:  40.00, mean_100:  27.50, episodes: 14
409: reward:  24.00, mean_100:  27.27, episodes: 15
449: reward:  40.00, mean_100:  28.06, episodes: 16
491: reward:  42.00, mean_100:  28.88, episodes: 17
522: reward:  31.00, mean_100:  29.00, episodes: 18
534: reward:  12.00, mean_100:  28.11, episodes: 19
571: reward:  37.00, mea