In [1]:
import os
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
import matplotlib.pyplot as plt

In [2]:
def plot_learning_curve(x, scores):
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')

In [3]:
class PPOMemory:
  def __init__(self, batch_size):
    self.states = []
    self.probs = []
    self.vals = []
    self.actions = []
    self.rewards = []
    self.dones = []

    self.batch_size = batch_size

  def generate_batches(self):
    n_states = len(self.states)
    batch_start = np.arange(0, n_states, self.batch_size)
    indices = np.arange(n_states, dtype = np.int64)
    np.random.shuffle(indices)
    batches = [indices[i:i+self.batch_size] for i in batch_start]

    return np.array(self.states),np.array(self.actions),np.array(self.probs),np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches

  def store_memory(self, state, action, probs, vals, reward, done):
    self.states.append(state)
    self.actions.append(action)
    self.probs.append(probs)
    self.vals.append(vals)
    self.rewards.append(reward)
    self.dones.append(done)

  def clear_memory(self):
    self.states = []
    self.probs = []
    self.actions = []
    self.rewards = []
    self.dones = []
    self.vals = []

In [4]:
class ActorNetwork(nn.Module):
    def __init__(self, n_actions, input_dims, alpha,
            fc1_dims=256, fc2_dims=256, chkpt_dir='tmp/ppo'):
        super(ActorNetwork, self).__init__()

       # self.checkpoint_file = os.path.join(chkpt_dir, 'actor_torch_ppo')
        self.actor = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, n_actions),
                nn.Softmax(dim=-1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        dist = self.actor(state)
        dist = Categorical(dist)
        
        return dist

   # def save_checkpoint(self):
   #     T.save(self.state_dict(), self.checkpoint_file)

   # def load_checkpoint(self):
       # self.load_state_dict(T.load(self.checkpoint_file))


In [5]:
class CriticNetwork(nn.Module):
    def __init__(self, input_dims, alpha, fc1_dims=256, fc2_dims=256,
            chkpt_dir='tmp/ppo'):
        super(CriticNetwork, self).__init__()

       # self.checkpoint_file = os.path.join(chkpt_dir, 'critic_torch_ppo')
        self.critic = nn.Sequential(
                nn.Linear(*input_dims, fc1_dims),
                nn.ReLU(),
                nn.Linear(fc1_dims, fc2_dims),
                nn.ReLU(),
                nn.Linear(fc2_dims, 1)
        )

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        value = self.critic(state)

        return value

    #def save_checkpoint(self):
        #T.save(self.state_dict(), self.checkpoint_file)

   # def load_checkpoint(self):
        s#elf.load_state_dict(T.load(self.checkpoint_file))

In [6]:
class Agent:
    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003, gae_lambda=0.95,
            policy_clip=0.2, batch_size=64, n_epochs=10):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda

        self.actor = ActorNetwork(n_actions, input_dims, alpha)
        self.critic = CriticNetwork(input_dims, alpha)
        self.memory = PPOMemory(batch_size)
       
    def remember(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    #def save_models(self):
       # print('... saving models ...')
        #self.actor.save_checkpoint()
        #self.critic.save_checkpoint()

    #def load_models(self):
     #   print('... loading models ...')
      #  self.actor.load_checkpoint()
       # self.critic.load_checkpoint()

    def choose_action(self, observation):
        state = T.tensor([observation], dtype=T.float).to(self.actor.device)

        dist = self.actor(state)
        value = self.critic(state)
        action = dist.sample()

        probs = T.squeeze(dist.log_prob(action)).item()
        action = T.squeeze(action).item()
        value = T.squeeze(value).item()

        return action, probs, value

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\
                            (1-int(dones_arr[k])) - values[k])
                    discount *= self.gamma*self.gae_lambda
                advantage[t] = a_t
            advantage = T.tensor(advantage).to(self.actor.device)

            values = T.tensor(values).to(self.actor.device)
            for batch in batches:
                states = T.tensor(state_arr[batch], dtype=T.float).to(self.actor.device)
                old_probs = T.tensor(old_prob_arr[batch]).to(self.actor.device)
                actions = T.tensor(action_arr[batch]).to(self.actor.device)

                dist = self.actor(states)
                critic_value = self.critic(states)

                critic_value = T.squeeze(critic_value)

                new_probs = dist.log_prob(actions)
                prob_ratio = new_probs.exp() / old_probs.exp()
                #prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = T.clamp(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantage[batch]
                actor_loss = -T.min(weighted_probs, weighted_clipped_probs).mean()

                returns = advantage[batch] + values[batch]
                critic_loss = (returns-critic_value)**2
                critic_loss = critic_loss.mean()

                total_loss = actor_loss + 0.5*critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()               

In [7]:
import gym
import numpy as np

In [None]:
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    N = 20
    batch_size = 5
    n_epochs = 4
    alpha = 0.0003
    agent = Agent(n_actions=env.action_space.n, batch_size=batch_size, 
                    alpha=alpha, n_epochs=n_epochs, 
                    input_dims=env.observation_space.shape)
    n_games = 300

    figure_file = 'plots/cartpole.png'

    best_score = env.reward_range[0]
    score_history = []

    learn_iters = 0
    avg_score = 0
    n_steps = 0

    for i in range(n_games):
        observation = env.reset()
        env.render()
        done = False
        score = 0
        while not done:
            action, prob, val = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
            agent.remember(observation, action, prob, val, reward, done)
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation_
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])

        if avg_score > best_score:
            best_score = avg_score
            #agent.save_models()

        print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
                'time_steps', n_steps, 'learning_steps', learn_iters)
    x = [i+1 for i in range(len(score_history))]
    plot_learning_curve(x, score_history)

  state = T.tensor([observation], dtype=T.float).to(self.actor.device)


episode 0 score 11.0 avg score 11.0 time_steps 11 learning_steps 0
episode 1 score 43.0 avg score 27.0 time_steps 54 learning_steps 2
episode 2 score 34.0 avg score 29.3 time_steps 88 learning_steps 4
episode 3 score 19.0 avg score 26.8 time_steps 107 learning_steps 5
episode 4 score 19.0 avg score 25.2 time_steps 126 learning_steps 6
episode 5 score 16.0 avg score 23.7 time_steps 142 learning_steps 7
episode 6 score 15.0 avg score 22.4 time_steps 157 learning_steps 7
episode 7 score 11.0 avg score 21.0 time_steps 168 learning_steps 8
episode 8 score 11.0 avg score 19.9 time_steps 179 learning_steps 8
episode 9 score 18.0 avg score 19.7 time_steps 197 learning_steps 9
episode 10 score 22.0 avg score 19.9 time_steps 219 learning_steps 10
episode 11 score 22.0 avg score 20.1 time_steps 241 learning_steps 12
episode 12 score 17.0 avg score 19.8 time_steps 258 learning_steps 12
episode 13 score 21.0 avg score 19.9 time_steps 279 learning_steps 13
episode 14 score 41.0 avg score 21.3 time_s

episode 114 score 200.0 avg score 115.2 time_steps 11837 learning_steps 591
episode 115 score 200.0 avg score 117.0 time_steps 12037 learning_steps 601
episode 116 score 200.0 avg score 118.9 time_steps 12237 learning_steps 611
episode 117 score 184.0 avg score 120.5 time_steps 12421 learning_steps 621
episode 118 score 25.0 avg score 120.7 time_steps 12446 learning_steps 622
episode 119 score 111.0 avg score 121.5 time_steps 12557 learning_steps 627
episode 120 score 200.0 avg score 123.3 time_steps 12757 learning_steps 637
episode 121 score 41.0 avg score 123.4 time_steps 12798 learning_steps 639
episode 122 score 200.0 avg score 125.0 time_steps 12998 learning_steps 649
episode 123 score 200.0 avg score 125.5 time_steps 13198 learning_steps 659
episode 124 score 200.0 avg score 127.2 time_steps 13398 learning_steps 669
episode 125 score 200.0 avg score 128.8 time_steps 13598 learning_steps 679
episode 126 score 162.0 avg score 129.6 time_steps 13760 learning_steps 688
episode 127 sc

episode 222 score 200.0 avg score 173.1 time_steps 30310 learning_steps 1515
episode 223 score 200.0 avg score 173.1 time_steps 30510 learning_steps 1525
episode 224 score 200.0 avg score 173.1 time_steps 30710 learning_steps 1535
episode 225 score 200.0 avg score 173.1 time_steps 30910 learning_steps 1545
episode 226 score 200.0 avg score 173.5 time_steps 31110 learning_steps 1555
episode 227 score 200.0 avg score 174.2 time_steps 31310 learning_steps 1565
episode 228 score 200.0 avg score 175.1 time_steps 31510 learning_steps 1575
episode 229 score 200.0 avg score 175.1 time_steps 31710 learning_steps 1585
episode 230 score 200.0 avg score 175.1 time_steps 31910 learning_steps 1595
episode 231 score 200.0 avg score 175.1 time_steps 32110 learning_steps 1605
episode 232 score 200.0 avg score 175.1 time_steps 32310 learning_steps 1615
episode 233 score 54.0 avg score 173.7 time_steps 32364 learning_steps 1618
episode 234 score 200.0 avg score 173.7 time_steps 32564 learning_steps 1628
