In [1]:
import sys
import random
from collections import namedtuple, deque

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from unityagents import UnityEnvironment
from IPython.display import clear_output

from model import PPOPolicyNetwork
from agent import PPOAgent

import tqdm

In [2]:
env = UnityEnvironment(file_name='Reacher_Windows_x86_64/Reacher.exe')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
num_agents = len(env_info.agents)
action_size = brain.vector_action_space_size
print ("Action Size is :", action_size)
states = env_info.vector_observations
state_size = states.shape[1]

env_info = env.reset(train_mode=False)[brain_name]     # reset the environment    
states = env_info.vector_observations                  # get the current state (for each agent)
scores = np.zeros(num_agents)                          # initialize the score (for each agent)
for i in range(100):
    actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
    actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
    env_info = env.step(actions)[brain_name]           # send all actions to tne environment
    next_states = env_info.vector_observations         # get next state (for each agent)
    rewards = env_info.rewards                         # get reward (for each agent)
    dones = env_info.local_done                        # see if episode finished
    scores += env_info.rewards                         # update the score (for each agent)
    states = next_states                               # roll over states to next time step
    if np.any(dones):                                  # exit loop if episode finished
        break
print('Total score (averaged over agents) this episode: {}'.format(np.mean(scores)))
# env.close()

Action Size is : 4
Total score (averaged over agents) this episode: 0.0


In [None]:
config = {
    'environment': {
        'state_size':  env_info.vector_observations.shape[1],
        'action_size': brain.vector_action_space_size,
        'number_of_agents': len(env_info.agents)
    },
    'pytorch': {
        'device': torch.device("cpu")
    },
    'hyperparameters': {
        'discount_rate': 0.99,
        'tau': 0.95,
        'gradient_clip': 5,
        'rollout_length': 2048,
        'optimization_epochs': 10,
        'ppo_clip': 0.2,
        'log_interval': 2048,
        'max_steps': 1e5,
        'mini_batch_number': 32,
        'entropy_coefficent': 0.01,
        'episode_count': 250,
        'hidden_size': 512,
        'adam_learning_rate': 3e-4,
        'adam_epsilon': 1e-5
    }
}

In [7]:
num_agents = len(env_info.agents)
max_t = 1e5
n_episodes = 5
LR = 3e-4          # learning rate
EPSILON = 1e-5      # Adam epsilon

state_size = env_info.vector_observations.shape[1]
hidden_size = 512
action_size = brain.vector_action_space_size

new_policy = PPOPolicyNetwork(state_size, hidden_size, action_size)
policy = new_policy

def play_around(env, brain_name, policy):
    env_info = env.reset(train_mode = True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions, _, _, _ = policy(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += env_info.rewards
        states = next_states
        if np.any(dones):
            break
    return np.mean(scores)
        

if True:
    env.info = env.reset(train_mode = True)[brain_name]
    optimizier = optim.Adam(policy.parameters(), LR, eps = EPSILON)
    agent = PPOAgent(env, brain_name, policy, optimizier, num_agents)
    all_scores = []
    averages = []
    last_max = 30.0
        
    for i in tqdm.tqdm(range(n_episodes)):
        agent.step(num_agents)
        last_mean_reward = play_around(env, brain_name, policy)
        last_average = np.mean(np.array(all_scores[-100:])) if len(all_scores) > 100 else np.mean(np.array(all_scores))
        all_scores.append(last_mean_reward)
        averages.append(last_average)
        if i % 5 == 0:
            # torch.save(policy.state_dict(), f"ppo-max-hiddensize-{hidden_size}.pth")
            pass
        if last_average > last_max:
            torch.save(policy.state_dict(), f"ppo-max-hiddensize-{hidden_size}.pth")
            last_max = last_average
            clear_output(True)
        print('Episode: {} Total score this episode: {} Last {} average: {}'.format(i + 1, last_mean_reward, min(i + 1, 100), last_average))

print (all_scores, averages)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Episode: 1 Total score this episode: 0.24199999459087848 Last 1 average: nan


 20%|████████████████▊                                                                   | 1/5 [00:41<02:46, 41.59s/it]

Episode: 2 Total score this episode: 0.2849999936297536 Last 2 average: 0.24199999459087848


 40%|█████████████████████████████████▌                                                  | 2/5 [01:21<02:02, 41.00s/it]

Episode: 3 Total score this episode: 0.5389999879524112 Last 3 average: 0.26349999411031605


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [02:01<01:21, 40.73s/it]

Episode: 4 Total score this episode: 0.6759999848902225 Last 4 average: 0.3553333253910145


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [02:41<00:40, 40.54s/it]

Episode: 5 Total score this episode: 0.9244999793358147 Last 5 average: 0.4354999902658165


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:20<00:00, 40.22s/it]


[0.24199999459087848, 0.2849999936297536, 0.5389999879524112, 0.6759999848902225, 0.9244999793358147] [nan, 0.24199999459087848, 0.26349999411031605, 0.3553333253910145, 0.4354999902658165]


In [None]:
torch.save(policy.state_dict(), f"ppo-max-hiddensize-{config['hyperparameters']['hidden_size']}.pth")

In [None]:
env.close()

In [11]:
num_agents = len(env_info.agents)
max_t = 1e5
n_episodes = 250

hidden_size = 512

def play_round(env, brain_name, policy):
    env_info = env.reset(train_mode = True)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions, _, _, _ = policy(states)
        env_info = env.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += env_info.rewards
        states = next_states
        if np.any(dones):
            break
    return np.mean(scores)

def ppo(env, brain_name, policy, train):
    if train:
        optimizier = optim.Adam(policy.parameters(), LR, eps = EPSILON)
        agent = PPOAgent(env, brain_name, policy, optimizier, num_agents)
        all_scores = []
        averages = []
        last_max = 30.0
        
        for i in tqdm.tqdm(range(num_episodes)):
            agent.step()
            last_mean_reward = play_round(env, brain_name, policy)
            last_average = np.mean(np.array(all_scores[-100:])) if len(all_scores) > 100 else np.mean(np.array(all_scores))
            all_scores.append(last_mean_reward)
            averages.append(last_average)
            if last_average > last_max:
                torch.save(policy.state_dict(), f"models/ppo-max-hiddensize-{hidden_size}.pth")
                last_max = last_average
            clear_output(True)
            print('Episode: {} Total score this episode: {} Last {} average: {}'.format(i + 1, last_mean_reward, min(i + 1, 100), last_average))
        return all_scores, averages
    else:
        score = play_round(env, brain_name, policy)
        return [score], [score]
    


policy = PPOPolicyNetwork(state_size, hidden_size, action_size)
policy.load_state_dict(torch.load('ppo-max-hiddensize-512.pth'))
_, _ = ppo(env, brain_name, policy, train=False)
print (_)

[34.549499227758496]


In [12]:
env.close()