# Unity ML Agents
## Proximal Policy Optimization (PPO)
Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).

In [103]:
import numpy as np
import os
import tensorflow as tf
import random

from pprint import pprint

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [128]:
#Change these two names when you work on a new project
project_name = "Steering" # The sub-directory name for model and summary statistics or work on a new build 
env_name =     "v2.c" # Name of the training environment file.

load_model = False # Whether to load a saved model.
train_model = True # Whether to train the model.

model_number_to_load = 0 # which model to load
                         #If this value is 0, load the last model in the given version.
    
load_best_model = False # You can either load the checkpoint with the best value or load the last checkpoint.

max_steps = 5e10 # Set maximum number of steps to run environment.
summary_freq = 2500 # Frequency at which to save training statistics.
save_freq = 25000 # Frequency at which to save model.



### Setting up the necesary directory structure and naming conventions
project_path = "Projects/{}/Builds/{}".format(project_name, env_name)
curiculum_path = "Projects/{}/Builds/{}.json".format(project_name, env_name)

curriculum_file = curiculum_path

env_path = "Projects/{}/{}".format(project_name, env_name)

if not os.path.exists(env_path):
    os.makedirs(env_path)

if not load_model:
    model_count = len([i for i in os.listdir(env_path) if os.path.isdir(env_path)]) + 1
    print("This is the {}th model".format(model_count))
elif load_model:
    if model_number_to_load == 0:
        model_number_to_load =  len([i for i in os.listdir(env_path) if os.path.isdir(env_path)])
    print("This is loading the {}th model with the following values:".format(model_number_to_load))
    model_count = model_number_to_load

env_summary_path = 'Projects/{}/{}/m{}/Summaries'.format(project_name, env_name, model_count)
env_model_path = 'Projects/{}/{}/m{}/Models'.format(project_name, env_name, model_count)

if not os.path.exists(env_model_path):
    os.makedirs(env_model_path)

if not os.path.exists(env_summary_path):
    os.makedirs(env_summary_path)

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 2 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 2048 # How large the experience buffer should be before gradient descent.
learning_rate = 5e-5 # Model learning rate.
hidden_units = 64 # Number of units in hidden layer.
batch_size = 64 # How many experiences per gradient descent update step.
normalize = False

### Logging dictionary for hyperparameters
hyperparameter_dict = {'max_steps':max_steps, 'projet_name':project_name, 'env_name':env_name,
    'curriculum_file':curriculum_file, 'gamma':gamma, 'lambd':lambd, 'time_horizon':time_horizon,
    'beta':beta, 'num_epoch':num_epoch, 'epsilon':epsilon, 'buffer_size':buffer_size,
    'leaning_rate':learning_rate, 'hidden_units':hidden_units, 'batch_size':batch_size}

documentation = \
'''
NUM ENV = 1

STATES:
        state.Add(transform.position.x);
        state.Add(transform.position.z);

        state.Add(transform.rotation.eulerAngles.y / 180.0f - 1.0f);

        state.Add(car_rb.velocity.x);
        state.Add(car_rb.velocity.z);

        state.Add(car_rb.angularVelocity.y);


        Vector3[] target_points = FindClosestTargetPoints("Target", 1);
        foreach (Vector3 target_point in target_points) {
            if (DebugMode) Debug.DrawLine(transform.position, target_point);
            state.Add(target_point.x - transform.position.x);
            state.Add(target_point.z - transform.position.z);
        }

        Vector3[]  obs_points = FindClosestTargetPoints("Obstacle", 1);
        foreach(Vector3 obs_point in obs_points) {
            if(DebugMode) Debug.DrawLine(transform.position, obs_point);
            state.Add(obs_point.x - transform.position.x);
            state.Add(obs_point.z - transform.position.z);
        }

        Vector3[]  wall_points = FindClosestTargetPoints("Wall", 2);
        foreach (Vector3 wall_point in wall_points) {
            if (DebugMode) Debug.DrawLine(transform.position, wall_point);
            state.Add(wall_point.x - transform.position.x);
            state.Add(wall_point.z - transform.position.z);
        }
        //Debug.Log(string.Format("Collect state and cummulative reward is:{0}", CumulativeReward));
        return state;
        
    private void OnCollisionEnter(Collision collision) {

COLLISION:
        if (collision.collider.gameObject.tag == "Target" && hitting_target_rewards_one) {
            reward += 1;
            done = true;
            //Debug.Log(string.Format("OnCollisionEnter cummulative reward is:{0}", CumulativeReward));
        }
        
AGENT STEP:
        
        action = (int)act[0];
        if (action == 0) {
            car_rb.AddForce(transform.forward * speed_mult, ForceMode.VelocityChange);
        }
        if (action == 1) {
            car_rb.AddForce(-transform.forward * speed_mult, ForceMode.VelocityChange);
        }
        if (action == 2) {
            car_rb.AddTorque(0f, turn_mult, 0f, ForceMode.VelocityChange);
        }
        if (action == 3) {
            car_rb.AddTorque(0f, -turn_mult, 0f, ForceMode.VelocityChange);
        }

        if (car_rb.velocity.magnitude > maxSpeed) car_rb.velocity = car_rb.velocity * slowing_down_constant;

            case RewardStyle.SimpleProgress:
                hitting_target_rewards_one = true;

                float this_distance = Vector3.Distance(transform.position, Target.transform.position);
                float delta = last_distance - this_distance;
                last_distance = this_distance;

                reward += delta / 10f;
                reward -= MinusRewardStep;

                break;
        }
'''
if True:
    pprint(hyperparameter_dict)
    
#Saving the model details in the summary.
model_name = "{}_m{}".format(env_name, model_count)    
with open('./Projects/{}/{}/m{}/Summaries/{}.txt'.format(project_name, env_name, model_count, model_name), 'w') as file:
    file.write("gamma = {}\n".format(gamma))
    file.write("lambd = {}\n".format(lambd))
    file.write("time_horizon = {}\n".format(time_horizon))
    file.write("beta = {}\n".format(beta))
    file.write("num_epoch = {}\n".format(num_epoch))
    file.write("epsilon = {}\n".format(epsilon))
    file.write("buffer_size = {}\n".format(buffer_size))
    file.write("learning_rate = {}\n".format(learning_rate))
    file.write("hidden_units = {}\n".format(hidden_units))
    file.write("batch_size = {}\n\n".format(batch_size))
    file.write("documentation = {}\n".format(documentation))


This is the 2th model
{'batch_size': 64,
 'beta': 0.001,
 'buffer_size': 2048,
 'curriculum_file': 'Projects/Steering/Builds/v2.c.json',
 'env_name': 'v2.c',
 'epsilon': 0.2,
 'gamma': 0.99,
 'hidden_units': 64,
 'lambd': 0.95,
 'leaning_rate': 5e-05,
 'max_steps': 50000000000.0,
 'num_epoch': 5,
 'projet_name': 'Steering',
 'time_horizon': 2048}


### Load the environment

In [127]:
# When the environment crashes it take a while for the socket to get freed up. This random port selection within a range
# helps with that.
port_save = random.randint(0,100)
print(project_path)
env = UnityEnvironment(file_name=project_path, curriculum=curriculum_file, base_port = 6083 + port_save)
print(str(env))
brain_name = env.external_brain_names[0]

Projects/Steering/Builds/v2.c


UnityTimeOutException: The Unity environment took too long to respond. Make sure Projects/Steering/Builds/v2.c does not need user interaction to launch and that the Academy and the external Brain(s) are attached to objects in the Scene.

### Train the Agent(s)

In [124]:
tf.reset_default_graph()

if curriculum_file == "None":
    curriculum_file = None


def get_progress():
    if curriculum_file is not None:
        if env._curriculum.measure_type == "progress":
            return steps / max_steps
        elif env._curriculum.measure_type == "reward":
            return last_reward
        else:
            return None
    else:
        return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps, 
                               normalize=normalize, num_layers=num_layers)

is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

print("Training started.")
with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(env_model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])    
    summary_writer = tf.summary.FileWriter(env_summary_path)
    info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
    trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
    if train_model:
        trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Write training statistics to tensorboard.
            trainer.update_best(sess, saver=saver, model_path=env_model_path, steps=steps)
            trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Save Tensorflow model
            save_model(sess, model_path=env_model_path, steps=steps, saver=saver)
        steps += 1
        sess.run(ppo_model.increment_step)
        if len(trainer.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(trainer.stats['cumulative_reward'])
            sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
            last_reward = sess.run(ppo_model.last_reward)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
export_graph(env_model_path, env_name)

Training started.
Loading Model...
INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1c/m6/Models\model-750000.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1c/m6/Models\model-750000.cptk
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Saved Model
Updated best model checkpoint!
Step: 752500. Mean Reward: 1.5291530745000004. Std of Reward: 0.00011110243021643268.
Updated best model checkpoint!
Step: 755000. Mean Reward: 1.5291655745000001. Std of Reward: 2.220446049250313e-16.
Step: 757500. Mean Reward: 1.5291655745000001. Std of Reward: 2.220446049250313e-16.
Step: 760000. Mean Reward: 1.5115054351756099. Std of Reward: 0.1588301807248924.
Step: 762500. Mean Reward: 1.5291655745000001. Std of Reward: 2.220446049250313e-16.
Step: 765000. Mean Reward: 1.5291655745000001. Std of Reward: 2.220446049250313e-16.


INFO:unityagents:
Lesson changed. Now in Lesson 1 : 	target_position_z -> 10


Updated best model checkpoint!
Step: 767500. Mean Reward: 1.7646641824793654. Std of Reward: 0.47471826219302937.
Updated best model checkpoint!
Step: 770000. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 772500. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 775000. Mean Reward: 2.4236425330461544. Std of Reward: 0.3908920739499417.
Saved Model
Step: 777500. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 780000. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 782500. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 785000. Mean Reward: 2.425228467505. Std of Reward: 0.38610204671325815.
Step: 787500. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.
Step: 790000. Mean Reward: 2.487079911400001. Std of Reward: 4.440892098500626e-16.


INFO:unityagents:
Lesson changed. Now in Lesson 2 : 	target_position_z -> 20


Updated best model checkpoint!
Step: 792500. Mean Reward: 2.884770702583873. Std of Reward: 0.7143390986801341.
Updated best model checkpoint!
Step: 795000. Mean Reward: 3.473911396900001. Std of Reward: 4.440892098500626e-16.
Step: 797500. Mean Reward: 3.4736445214240006. Std of Reward: 0.0013074174821248394.
Step: 800000. Mean Reward: 3.392930521676924. Std of Reward: 0.4047044217327254.
Saved Model
Step: 802500. Mean Reward: 3.4723407407772813. Std of Reward: 0.007694612124084712.
Step: 805000. Mean Reward: 3.473911396900001. Std of Reward: 4.440892098500626e-16.
Step: 807500. Mean Reward: 3.3912925769961544. Std of Reward: 0.4102449063290584.
Step: 810000. Mean Reward: 3.473911396900001. Std of Reward: 4.440892098500626e-16.
Step: 812500. Mean Reward: 3.473911396900001. Std of Reward: 4.440892098500626e-16.
Step: 815000. Mean Reward: 3.473911396900001. Std of Reward: 4.440892098500626e-16.


INFO:unityagents:
Lesson changed. Now in Lesson 3 : 	target_position_z -> 30


Updated best model checkpoint!
Step: 817500. Mean Reward: 4.017759826042857. Std of Reward: 0.733045613536915.
Updated best model checkpoint!
Step: 820000. Mean Reward: 4.43816258485. Std of Reward: 0.0061643250567661595.
Updated best model checkpoint!
Step: 822500. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 825000. Mean Reward: 4.218724347649999. Std of Reward: 0.9646248636490786.
Saved Model
Step: 827500. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 830000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 832500. Mean Reward: 4.218724347649999. Std of Reward: 0.9646248636490786.
Step: 835000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 837500. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 840000. Mean Reward: 4.438600670310525. Std of Reward: 0.0062639377619719255.
Step: 842500. Mean Reward: 4.229264954647618. Std of Reward: 0.9425569680312351.
Step: 845000. Mean Reward: 4.4393208908999995. Std of Reward: 0.00320830058537

Step: 1085000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 1087500. Mean Reward: 4.218724347649999. Std of Reward: 0.9646248636490786.
Step: 1090000. Mean Reward: 4.439725871315789. Std of Reward: 0.0014901141958553882.
Step: 1092500. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 1095000. Mean Reward: 4.218724347649999. Std of Reward: 0.9646248636490786.
Step: 1097500. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 1100000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Saved Model
Step: 1102500. Mean Reward: 4.218518125965. Std of Reward: 0.9645781733798182.
Step: 1105000. Mean Reward: 4.4400270946. Std of Reward: 0.00021794494717691286.
Step: 1107500. Mean Reward: 4.439375267310526. Std of Reward: 0.0029776010136517736.
Step: 1110000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 1112500. Mean Reward: 4.218724347649999. Std of Reward: 0.9646248636490786.
Step: 1115000. Mean Reward: 4.440077094599999. Std of Reward: 0.0.
Step: 1117

KeyboardInterrupt: 

### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [125]:
export_best_graph(trainer, env_model_path, model_name)
export_graph(env_model_path, env_name)

Input checkpoint 'Projects/Steering/v1.1c/m6/Models/model_bests\best_model.cptk' doesn't exist!
INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1c/m6/Models\model-1200000.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1c/m6/Models\model-1200000.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.


### Test the Trained Model

In [101]:
port_save = random.randint(0,100)
env = UnityEnvironment(file_name = project_path, base_port = 7013 + port_save)
print(str(env))
brain_name = env.brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		target_position_z -> 0.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 14
        Action space type: discrete
        Action space size (per agent): 4
        Memory space size (per agent): 0
        Action descriptions: Forward, Backwards, Right, Left


In [102]:
load_best = True;
print("The play session has started.")
with tf.Session() as sess:
    if load_best:
        model_path_to_load = env_model_path + "/model_bests"
        if not os.path.exists(model_path_to_load):
            print("Model bests does not exist!")
            model_path_to_load = model_path
    else:
        model_path_to_load = env_model_path
    ckpt = tf.train.get_checkpoint_state(model_path_to_load)
    saver.restore(sess, ckpt.model_checkpoint_path)
    
    steps = sess.run(ppo_model.global_step)
    info = env.reset(train_mode=False)[brain_name]
    trainer = Trainer( ppo_model, sess, info, is_continuous, use_observations, use_states, training=False)
    
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=False)[brain_name]
        new_info = trainer.take_action(info, env, brain_name, steps = 0, normalize = False)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        steps += 1
        sess.run(ppo_model.increment_step)

The play session has started.
INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1m/m1/Models/model_bests\best_model.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v1.1m/m1/Models/model_bests\best_model.cptk


error: unpack requires a buffer of 4 bytes