In [4]:
import sys
import random
from collections import namedtuple, deque

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from unityagents import UnityEnvironment
from IPython.display import clear_output

from agent import Agent

In [5]:
# Launch the Reacher build and select the first brain it exposes.
env = UnityEnvironment(file_name="Reacher_Windows_x86_64/Reacher.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset in training mode; env_info is the reset result for the selected brain.
env_info = env.reset(train_mode=True)[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


## Taking Random Actions

In [3]:
# Problem dimensions, read from the most recent environment info and the brain.
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
print("Action Size is :", action_size)
state_size = env_info.vector_observations.shape[1]

RANDOM_STEPS = 100  # upper bound on the number of random steps taken below

env_info = env.reset(train_mode=False)[brain_name]      # reset the environment
states = env_info.vector_observations                   # current state (for each agent)
scores = np.zeros(num_agents)                           # running score (for each agent)
for _step in range(RANDOM_STEPS):
    # Sample one Gaussian action vector per agent and clip it into [-1, 1].
    actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)
    env_info = env.step(actions)[brain_name]            # send all actions to the environment
    scores += env_info.rewards                          # accumulate reward (for each agent)
    states = env_info.vector_observations               # roll over states to next time step
    if np.any(env_info.local_done):                     # stop as soon as any agent reports done
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
# The environment is intentionally left open: the following cells reuse `env`.

Action Size is : 4
Total score (averaged over agents) this episode: 0.04349999902769923


## Training the Agent with PPO

In [13]:
# --- Dimensions read from the environment ---------------------------------
num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

# --- Training schedule ------------------------------------------------------
n_episodes = 300          # number of outer training iterations
ROLLOUT_LENGTH = 2048     # environment steps collected per iteration
max_t = 1e5

# --- Optimiser / network settings -------------------------------------------
# NOTE(review): LR, EPSILON, hidden_size and max_t are not referenced by any
# other visible cell and are not passed to Agent below -- presumably Agent
# uses its own defaults; confirm before relying on these values.
LR = 3e-4                 # learning rate
EPSILON = 1e-5            # Adam epsilon
hidden_size = 512

agent = Agent(num_agents, state_size, action_size)


def test_agent(env, brain_name):
    env_info = env.reset(train_mode = True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions, _, _= agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += env_info.rewards
        states = next_states
        if np.any(dones):
            break
    return np.mean(scores)
        
# PPO training loop.
# Every iteration collects ROLLOUT_LENGTH steps from all agents, hands the
# rollout to agent.step() for an update, then scores the policy with one
# evaluation episode via test_agent().
all_scores = []                          # evaluation score of every iteration
all_scores_window = deque(maxlen=100)    # most recent (up to 100) evaluation scores

for i_episode in range(n_episodes):
    # Each iteration, N parallel actors collect T time steps of data
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations

    rollout = []
    agents_rewards = np.zeros(num_agents)    # reward accumulated per agent since its last `done`
    episode_rewards = []                     # totals of agent episodes that finished during this rollout

    for _ in range(ROLLOUT_LENGTH):
        actions, log_probs, values = agent.act(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = np.array([1 if t else 0 for t in env_info.local_done])
        agents_rewards += rewards

        # Bank the total of every agent that just finished and restart its counter.
        for j in np.flatnonzero(dones):
            episode_rewards.append(agents_rewards[j])
            agents_rewards[j] = 0

        # Tensors are stored detached; the last field is 0 for agents that just
        # finished and 1 otherwise.
        rollout.append([states, values.detach(), actions.detach(), log_probs.detach(), rewards, 1 - dones])

        states = next_states

    # Last element of the model output for the state reached after the final
    # step -- presumably the value estimate used to bootstrap returns; confirm
    # against agent.model.
    pending_value = agent.model(states)[-1]
    returns = pending_value.detach()
    rollout.append([states, pending_value, None, None, None, None])

    agent.step(rollout, returns, num_agents)

    test_mean_reward = test_agent(env, brain_name)

    all_scores.append(test_mean_reward)
    all_scores_window.append(test_mean_reward)
    window_mean = np.mean(all_scores_window)

    # NOTE(review): this fires as soon as the running mean exceeds 30, even when
    # the window holds fewer than 100 scores -- confirm that is the intended
    # stopping rule.
    if window_mean > 30.0:
        torch.save(agent.model.state_dict(), "ppo_checkpoint.pth")
        # i_episode is 0-based, so i_episode + 1 iterations have completed.
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode + 1, window_mean))
        break

    print('Episode {}, Total score this episode: {}, Last {} average: {}'.format(i_episode + 1, test_mean_reward, min(i_episode + 1, 100), window_mean))

Episode 1, Total score this episode: 0.13849999690428377, Last 1 average: 0.13849999690428377
Episode 2, Total score this episode: 0.12649999717250465, Last 2 average: 0.1324999970383942
Episode 3, Total score this episode: 0.18849999578669668, Last 3 average: 0.15116666328782838
Episode 4, Total score this episode: 0.20299999546259642, Last 4 average: 0.1641249963315204
Episode 5, Total score this episode: 0.23649999471381306, Last 5 average: 0.17859999600797893


In [None]:
# Write the current model weights to ppo_checkpoint.pth.
torch.save(agent.model.state_dict(), "ppo_checkpoint.pth")

## Run a trained agent

In [15]:
# Restore the saved weights into the existing model, then play one evaluation episode.
checkpoint_state = torch.load('ppo_checkpoint.pth')
agent.model.load_state_dict(checkpoint_state)

score = test_agent(env, brain_name)
print(score)

0.44299999009817836


In [16]:
# Close the Unity environment; no later cell in this notebook uses it.
env.close()