# Unity ML Agents
## Proximal Policy Optimization (PPO)
Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).

In [2]:
import numpy as np
import os
import tensorflow as tf
import random

from pprint import pprint

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [3]:
#Change these two names when you work on a new project
project_name = "Steering" # The sub-directory name for model and summary statistics or work on a new build 
env_name =     "v0" # Name of the training environment file.

load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.

model_number_to_load = 0 # which model to load
                         #If this value is 0, load the last model in the given version.
    
load_best_model = False # You can either load the checkpoint with the best value or load the last checkpoint.

max_steps = 5e10 # Set maximum number of steps to run environment.
summary_freq = 2500 # Frequency at which to save training statistics.
save_freq = 25000 # Frequency at which to save model.

curriculum_file = None

### Setting up the necesary directory structure and naming conventions
project_path = "Projects/{}/Builds/{}".format(project_name, env_name)
env_path = "Projects/{}/{}".format(project_name, env_name)

if not os.path.exists(env_path):
    os.makedirs(env_path)

if not load_model:
    model_count = len([i for i in os.listdir(env_path) if os.path.isdir(env_path)]) + 1
    print("This is the {}th model".format(model_count))
elif load_model:
    if model_number_to_load == 0:
        model_number_to_load =  len([i for i in os.listdir(env_path) if os.path.isdir(env_path)])
    print("This is loading the {}th model with the following values:".format(model_number_to_load))
    model_count = model_number_to_load

env_summary_path = 'Projects/{}/{}/m{}/Summaries'.format(project_name, env_name, model_count)
env_model_path = 'Projects/{}/{}/m{}/Models'.format(project_name, env_name, model_count)

if not os.path.exists(env_model_path):
    os.makedirs(env_model_path)

if not os.path.exists(env_summary_path):
    os.makedirs(env_summary_path)

### Algorithm-specific parameters for tuning
gamma = 0.995 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization
num_epoch = 3 # Number of gradient descent steps per batch of experiences.
num_layers = 1 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.1 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 2048 * 4 # How large the experience buffer should be before gradient descent.
learning_rate = 5e-5 # Model learning rate.
hidden_units = 32 # Number of units in hidden layer.
batch_size = 64 # How many experiences per gradient descent update step.
normalize = False

### Logging dictionary for hyperparameters
hyperparameter_dict = {'max_steps':max_steps, 'projet_name':project_name, 'env_name':env_name,
    'curriculum_file':curriculum_file, 'gamma':gamma, 'lambd':lambd, 'time_horizon':time_horizon,
    'beta':beta, 'num_epoch':num_epoch, 'epsilon':epsilon, 'buffer_size':buffer_size,
    'leaning_rate':learning_rate, 'hidden_units':hidden_units, 'batch_size':batch_size}

documentation = \
'''
state.Add(transform.position.x);
        state.Add(transform.position.z);

        state.Add(transform.rotation.eulerAngles.y / 180.0f - 1.0f);

        state.Add(Target.transform.position.x - transform.position.x);
        state.Add(Target.transform.position.z - transform.position.z);

        Vector3[]  obs_points = FindClosestTargetPoints("Obstacle", 1);
        foreach(Vector3 obs_point in obs_points) {
            if(DebugMode) Debug.DrawLine(transform.position, obs_point);
            state.Add(obs_point.x - transform.position.x);
            state.Add(obs_point.z - transform.position.z);
        }

        Vector3[]  wall_points = FindClosestTargetPoints("Wall", 2);
        foreach (Vector3 wall_point in wall_points) {
            if (DebugMode) Debug.DrawLine(transform.position, wall_point);
            state.Add(wall_point.x - transform.position.x);
            state.Add(wall_point.z - transform.position.z);
        }
        
    private void OnCollisionEnter(Collision collision) {

        if (collision.collider.gameObject.tag == "Target" && hitting_target_rewards_one) {
            reward += 1;
            done = true;
        }
        action = (int)act[0];
        if (action == 0) {
            car_rb.AddForce(transform.forward * speed_mult, ForceMode.VelocityChange);
        }
        if (action == 1) {
            car_rb.AddForce(-transform.forward * speed_mult, ForceMode.VelocityChange);
        }
        if (action == 2) {
            car_rb.AddTorque(0f, turn_mult, 0f, ForceMode.VelocityChange);
        }
        if (action == 3) {
            car_rb.AddTorque(0f, -turn_mult, 0f, ForceMode.VelocityChange);
        }

        if (car_rb.velocity.magnitude > maxSpeed) car_rb.velocity = car_rb.velocity * slowing_down_constant;
       
            case RewardStyle.SimpleProgress:
                hitting_target_rewards_one = true;
                float this_distance = Vector3.Distance(transform.position, Target.transform.position);
                float delta = last_distance - this_distance;
                reward += delta / 10f;
                last_distance = this_distance;
                //reward -= MinusRewardStep;
    }
'''
if True:
    pprint(hyperparameter_dict)
    
#Saving the model details in the summary.
model_name = "{}_m{}".format(env_name, model_count)    
with open('./Projects/{}/{}/m{}/Summaries/{}.txt'.format(project_name, env_name, model_count, model_name), 'w') as file:
    file.write("gamma = {}\n".format(gamma))
    file.write("lambd = {}\n".format(lambd))
    file.write("time_horizon = {}\n".format(time_horizon))
    file.write("beta = {}\n".format(beta))
    file.write("num_epoch = {}\n".format(num_epoch))
    file.write("epsilon = {}\n".format(epsilon))
    file.write("buffer_size = {}\n".format(buffer_size))
    file.write("learning_rate = {}\n".format(learning_rate))
    file.write("hidden_units = {}\n".format(hidden_units))
    file.write("batch_size = {}\n\n".format(batch_size))
    file.write("documentation = {}\n".format(documentation))


This is the 1th model
{'batch_size': 64,
 'beta': 0.001,
 'buffer_size': 8192,
 'curriculum_file': None,
 'env_name': 'v0',
 'epsilon': 0.1,
 'gamma': 0.995,
 'hidden_units': 32,
 'lambd': 0.95,
 'leaning_rate': 5e-05,
 'max_steps': 50000000000.0,
 'num_epoch': 3,
 'projet_name': 'Steering',
 'time_horizon': 2048}


### Load the environment

In [4]:
# When the environment crashes it take a while for the socket to get freed up. This random port selection within a range
# helps with that.
port_save = random.randint(0,100)
print(project_path)
env = UnityEnvironment(file_name=project_path, curriculum=curriculum_file, base_port = 6083 + port_save)
print(str(env))
brain_name = env.external_brain_names[0]

Projects/Steering/Builds/v0


INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 11
        Action space type: discrete
        Action space size (per agent): 4
        Memory space size (per agent): 0
        Action descriptions: , , , 


### Train the Agent(s)

In [None]:
tf.reset_default_graph()

if curriculum_file == "None":
    curriculum_file = None


def get_progress():
    if curriculum_file is not None:
        if env._curriculum.measure_type == "progress":
            return steps / max_steps
        elif env._curriculum.measure_type == "reward":
            return last_reward
        else:
            return None
    else:
        return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps, 
                               normalize=normalize, num_layers=num_layers)

is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

print("Training started.")
with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])    
    summary_writer = tf.summary.FileWriter(env_summary_path)
    info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
    trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
    if train_model:
        trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Write training statistics to tensorboard.
            trainer.update_best(sess, saver=saver, model_path=env_model_path, steps=steps)
            trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Save Tensorflow model
            save_model(sess, model_path=env_model_path, steps=steps, saver=saver)
        steps += 1
        sess.run(ppo_model.increment_step)
        if len(trainer.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(trainer.stats['cumulative_reward'])
            sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
            last_reward = sess.run(ppo_model.last_reward)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
export_graph(env_model_path, env_name)

Training started.


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Updated best model checkpoint!
Step: 10000. Mean Reward: 1.0. Std of Reward: 0.0.


### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [None]:
export_best_graph(trainer, env_model_path, model_name)
export_graph(env_model_path, env_name)



### Test the Trained Model

In [None]:
port_save = random.randint(0,100)
env = UnityEnvironment(file_name = project_path, base_port = 7013 + port_save)
print(str(env))
brain_name = env.brain_names[0]

In [None]:
load_best = False;
print("The play session has started.")
with tf.Session() as sess:
    if load_best:
        model_path_to_load = env_model_path + "/model_bests"
        if not os.path.exists(model_path_to_load):
            print("Model bests does not exist!")
            model_path_to_load = model_path
    else:
        model_path_to_load = env_model_path
    ckpt = tf.train.get_checkpoint_state(model_path_to_load)
    saver.restore(sess, ckpt.model_checkpoint_path)
    
    steps = sess.run(ppo_model.global_step)
    info = env.reset(train_mode=False)[brain_name]
    trainer = Trainer( ppo_model, sess, info, is_continuous, use_observations, use_states, training=False)
    
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=False)[brain_name]
        new_info = trainer.take_action(info, env, brain_name, steps = 0, normalize = False)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        steps += 1
        sess.run(ppo_model.increment_step)