In [None]:
from unityagents import UnityEnvironment

import random
from collections import namedtuple, deque
from agent import Agent

In [None]:
# Launch the Reacher simulator from the local Windows build.
env = UnityEnvironment(
    file_name='Reacher_Windows_x86_64/Reacher.exe'
)

# Work with the first brain the environment lists.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode and keep the info object returned for that brain.
env_info = env.reset(train_mode=True)[brain_name]

## Taking Random Actions

In [None]:
# numpy is imported in this cell because it is used below, while the only other
# `import numpy` in the notebook sits in the later training cell. Without this
# line a fresh "Restart & Run All" stops here with a NameError on `np`.
import numpy as np

# Problem dimensions, read from the environment info and brain obtained above.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
print ("Action Size is :", action_size)
states = env_info.vector_observations
state_size = states.shape[1]

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
for step in range(100):                                # take at most 100 random steps
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to the environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += rewards                                  # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
# The environment is deliberately left open here: later cells keep using `env`.
# env.close()

## Training the Agent with PPO

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

# --- Hyper-parameters ---------------------------------------------------------
LR = 3e-4               # learning rate
EPSILON = 1e-5          # Adam epsilon
ROLLOUT_LENGTH = 1024   # upper bound on environment steps collected per rollout
n_episodes = 300        # upper bound on the number of training episodes
# NOTE(review): max_t and hidden_size are not referenced by any cell visible in
# this notebook -- confirm whether they are still needed.
max_t = 1e5
hidden_size = 512

# --- Problem dimensions, read from the running environment --------------------
num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

# --- Agent --------------------------------------------------------------------
agent = Agent(num_agents, state_size, action_size)

# CONTINUE TRAINING: uncomment to start from previously saved weights.
# agent.model.load_state_dict(torch.load('ppo_checkpoint.pth'))


def test_agent(env, brain_name):
    env_info = env.reset(train_mode = True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions, _, _= agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += env_info.rewards
        states = next_states
        if np.any(dones):
            break
    return np.mean(scores)
        
# PPO training loop. It stays at cell level (not inside a function) so that
# `scores` remains available to the plotting cell further down.
scores = []                          # test score recorded after every episode
scores_window = deque(maxlen=100)    # the most recent 100 test scores

for i_episode in range(1, n_episodes+1):
    # Each iteration, the parallel actors collect up to ROLLOUT_LENGTH time steps of data
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    rollout = []

    for _ in range(ROLLOUT_LENGTH):
        actions, log_probs, values = agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        dones_ = np.array([1 if d else 0 for d in dones])
        # append tuple ( s, a, p(a|s), r, 1 - done, V(s) )
        rollout.append([states, actions.detach(), log_probs.detach(), rewards, 1 - dones_, values.detach()])
        states = next_states
        if np.any(dones):                                  # exit loop if episode finished
            break

    # last step: only the value estimate of the final states is stored
    # NOTE(review): `states` is passed to the model exactly as returned by the
    # environment -- confirm the model accepts that input type directly.
    values = agent.model(states)[-1].detach()
    rollout.append([states, None, None, None, None, values])

    agent.step(rollout, num_agents)

    # Evaluate the updated agent and track the running average.
    test_mean_reward = test_agent(env, brain_name)
    scores.append(test_mean_reward)
    scores_window.append(test_mean_reward)
    window_mean = np.mean(scores_window)

    if window_mean > 30.0:
        torch.save(agent.model.state_dict(), "ppo_checkpoint.pth")
        # Report the same running average that triggered the stop condition.
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, window_mean))
        break

    print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(i_episode, test_mean_reward, min(i_episode, 100), window_mean) )

In [None]:
torch.save(agent.model.state_dict(), f"ppo_checkpoint.pth")
# PLOT THE SCORES
import matplotlib.pyplot as plt
%matplotlib inline

score = scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(score)), score)
plt.ylabel('Score')
plt.xlabel('Episode Number')
plt.show()

## Run a trained agent

In [None]:
# Build a fresh agent and restore the weights stored in the checkpoint file.
agent = Agent(num_agents, state_size, action_size)
saved_weights = torch.load('ppo_checkpoint.pth')
agent.model.load_state_dict(saved_weights)

# Run one evaluation pass with the restored agent and show its mean score.
score = test_agent(env, brain_name)
print(score)

In [None]:
# Close the environment once the cells above no longer need it.
env.close()