# Unity ML Agents
## Proximal Policy Optimization (PPO)
Contains an implementation of PPO as described [here](https://arxiv.org/abs/1707.06347).

In [101]:
import numpy as np
import os
import tensorflow as tf
import random

from pprint import pprint

from ppo.history import *
from ppo.models import *
from ppo.trainer import Trainer
from unityagents import *

### Hyperparameters

In [128]:
#Change these two names when you work on a new project
project_name = "Steering" # The sub-directory name for model and summary statistics or work on a new build 
env_name =     "v8.1cm" # Name of the training environment file.
json_name = "v5.0c"

load_model = True # Whether to load a saved model.
train_model = True # Whether to train the model.

load_model_from_diff_env = True
model_number_to_load = 0 # which model to load
                         #If this value is 0, load the last model in the given version.
                         #If this value is -1, then load the last model of another version
load_model_env_name = "v8.0cm"
    
load_best_model = False # You can either load the checkpoint with the best value or load the last checkpoint.

max_steps = 5e10 # Set maximum number of steps to run environment.
summary_freq = 2500 # Frequency at which to save training statistics.
save_freq = 25000 # Frequency at which to save model.



### Setting up the necesary directory structure and naming conventions
project_path = "Projects/{}/Builds/{}".format(project_name, env_name)
curiculum_path = "Projects/{}/Builds/{}.json".format(project_name, json_name)

curriculum_file = curiculum_path

env_path = "Projects/{}/{}".format(project_name, env_name)
load_model_env_path = "Projects/{}/{}".format(project_name, load_model_env_name)

if not os.path.exists(env_path):
    os.makedirs(env_path)


if not load_model:
    model_count = len([i for i in os.listdir(env_path) if os.path.isdir(env_path)]) + 1
    print("This is the {}th model".format(model_count))
elif load_model:
    if model_number_to_load == 0:
        model_number_to_load =  len([i for i in os.listdir(env_path) if os.path.isdir(env_path)])
    elif load_model_from_diff_env and model_number_to_load == 0:
        model_number_to_load =  len([i for i in os.listdir(load_model_env_path) if os.path.isdir(load_model_env_path)])
        
    print("This is loading the {}th model with the following values:".format(model_number_to_load))
    
    model_count = model_number_to_load

env_summary_path = 'Projects/{}/{}/m{}/Summaries'.format(project_name, env_name, model_count)
if load_model_from_diff_env: 
    load_model_env_path = 'Projects/{}/{}/m{}/Models'.format(project_name, load_model_env_name, model_count)
    print("Loading mode to train from {}".format(load_model_env_path))
env_model_path = 'Projects/{}/{}/m{}/Models'.format(project_name, env_name, model_count)

if not os.path.exists(env_model_path):
    os.makedirs(env_model_path)

if not os.path.exists(env_summary_path):
    os.makedirs(env_summary_path)

### Algorithm-specific parameters for tuning
gamma = 0.99 # Reward discount rate.
lambd = 0.95 # Lambda parameter for GAE.
time_horizon = 2048 # How many steps to collect per agent before adding to buffer.
beta = 1e-3 # Strength of entropy regularization
num_epoch = 5 # Number of gradient descent steps per batch of experiences.
num_layers = 3 # Number of hidden layers between state/observation encoding and value/policy layers.
epsilon = 0.2 # Acceptable threshold around ratio of old and new policy probabilities.
buffer_size = 2048 * 2 # How large the experience buffer should be before gradient descent.
learning_rate = 5e-5 # Model learning rate.
hidden_units = 512 # Number of units in hidden layer.
batch_size = 64 * 2 # How many experiences per gradient descent update step.
normalize = False

### Logging dictionary for hyperparameters
hyperparameter_dict = {'max_steps':max_steps, 'projet_name':project_name, 'env_name':env_name,
    'curriculum_file':curriculum_file, 'gamma':gamma, 'lambd':lambd, 'time_horizon':time_horizon,
    'beta':beta, 'num_epoch':num_epoch, 'epsilon':epsilon, 'buffer_size':buffer_size,
    'leaning_rate':learning_rate, 'hidden_units':hidden_units, 'batch_size':batch_size}

documentation = \
'''
V7
No rotation, static obstacles. 5 wall seekers
No noop
NUM ENV = 1

STATES:

        List<float> state = new List<float>();

        state.Add(transform.rotation.eulerAngles.y / 180.0f - 1.0f);

        state.Add(car_rb.velocity.x);
        state.Add(car_rb.velocity.z);

        state.Add(car_rb.angularVelocity.y);

        state.Add(Vector3.Distance(transform.position, target.transform.position));
        state.Add(target.transform.position.x - transform.position.x);
        state.Add(target.transform.position.z - transform.position.z);

        Vector3[]  wall_points = FindClosestTargetPoints("Wall", 5);
        foreach (Vector3 wall_point in wall_points) {
            if (DebugMode) Debug.DrawLine(transform.position, wall_point);
            state.Add(Vector3.Distance(transform.position, wall_point));
            state.Add(wall_point.x - transform.position.x);
            state.Add(wall_point.z - transform.position.z);
        }

        
    private void OnCollisionEnter(Collision collision) {

COLLISION:
        if(rewardStyle == RewardStyle.ProgressAvoidObs) {
            if (collision.collider.gameObject.tag == "Wall" || collision.collider.gameObject.tag == "Obstacle") {
                reward -= 1f;
                cumulReward = +1;
                done = true;
                //Debug.Log(string.Format("OnCollisionEnter cummulative reward is:{0}", CumulativeReward));
            }
        }
        
AGENT STEP:
        action = (int)act[0];
        if (action == 0) {
            car_rb.AddForce(transform.forward * speed_mult, ForceMode.VelocityChange);
        }
        if (action == 1) {
            car_rb.AddTorque(0f, turn_mult, 0f, ForceMode.VelocityChange);
        }
        if (action == 2) {
            car_rb.AddTorque(0f, -turn_mult, 0f, ForceMode.VelocityChange);
        }
        car_rb.velocity = car_rb.velocity.magnitude * transform.forward;

        if (car_rb.velocity.magnitude > maxSpeed) car_rb.velocity = car_rb.velocity * slowing_down_constant;

            case RewardStyle.ProgressAvoidObs:
                hitting_target_rewards_one = true;

                this_distance = Vector3.Distance(transform.position, Target.transform.position);
                delta = last_distance - this_distance;
                last_distance = this_distance;

                reward += delta / 10f;
                reward -= MinusRewardStep;
                break;

        }
        }
'''
if True:
    pprint(hyperparameter_dict)
    
#Saving the model details in the summary.
model_name = "{}_m{}".format(env_name, model_count)    
with open('./Projects/{}/{}/m{}/Summaries/{}.txt'.format(project_name, env_name, model_count, model_name), 'w') as file:
    file.write("gamma = {}\n".format(gamma))
    file.write("lambd = {}\n".format(lambd))
    file.write("time_horizon = {}\n".format(time_horizon))
    file.write("beta = {}\n".format(beta))
    file.write("num_epoch = {}\n".format(num_epoch))
    file.write("epsilon = {}\n".format(epsilon))
    file.write("buffer_size = {}\n".format(buffer_size))
    file.write("learning_rate = {}\n".format(learning_rate))
    file.write("hidden_units = {}\n".format(hidden_units))
    file.write("batch_size = {}\n\n".format(batch_size))
    file.write("documentation = {}\n".format(documentation))


This is loading the 1th model with the following values:
Loading mode to train from Projects/Steering/v8.0cm/m1/Models
{'batch_size': 128,
 'beta': 0.001,
 'buffer_size': 4096,
 'curriculum_file': 'Projects/Steering/Builds/v5.0c.json',
 'env_name': 'v8.1cm',
 'epsilon': 0.2,
 'gamma': 0.99,
 'hidden_units': 512,
 'lambd': 0.95,
 'leaning_rate': 5e-05,
 'max_steps': 50000000000.0,
 'num_epoch': 5,
 'projet_name': 'Steering',
 'time_horizon': 2048}


### Load the environment

In [129]:
# When the environment crashes it take a while for the socket to get freed up. This random port selection within a range
# helps with that.
port_save = random.randint(0,100)
print(project_path)
env = UnityEnvironment(file_name=project_path, curriculum=curriculum_file, base_port = 6083 + port_save)
print(str(env))
brain_name = env.external_brain_names[0]

Projects/Steering/Builds/v8.1cm


INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		target_position_z -> 0.0
		num_obstacles -> 1.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 22
        Action space type: discrete
        Action space size (per agent): 3
        Memory space size (per agent): 0
        Action descriptions: Forward, Right, Left


### Train the Agent(s)

In [130]:
tf.reset_default_graph()

if curriculum_file == "None":
    curriculum_file = None


def get_progress():
    if curriculum_file is not None:
        if env._curriculum.measure_type == "progress":
            return steps / max_steps
        elif env._curriculum.measure_type == "reward":
            return last_reward
        else:
            return None
    else:
        return None

# Create the Tensorflow model graph
ppo_model = create_agent_model(env, lr=learning_rate,
                               h_size=hidden_units, epsilon=epsilon,
                               beta=beta, max_step=max_steps, 
                               normalize=normalize, num_layers=num_layers)

is_continuous = (env.brains[brain_name].action_space_type == "continuous")
use_observations = (env.brains[brain_name].number_observations > 0)
use_states = (env.brains[brain_name].state_space_size > 0)

init = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep = 10000)

print("Training started for {}".format(env_name))
with tf.Session() as sess:
    # Instantiate model parameters
    if load_model:
        if load_model_from_diff_env:
            print('Loading Model from different env with path {}'.format(load_model_env_path))
            ckpt = tf.train.get_checkpoint_state(load_model_env_path)
        else:
            print('Loading Model from {}'.format(env_model_path))
            ckpt = tf.train.get_checkpoint_state(env_model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    steps, last_reward = sess.run([ppo_model.global_step, ppo_model.last_reward])    
    summary_writer = tf.summary.FileWriter(env_summary_path)
    info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
    trainer =  Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states, train_model)
    if train_model:
        trainer.write_text(summary_writer, 'Hyperparameters', hyperparameter_dict, steps)
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        # Decide and take an action
        new_info = trainer.take_action(info, env, brain_name, steps, normalize)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        if len(trainer.training_buffer['actions']) > buffer_size and train_model:
            # Perform gradient descent with experience buffer
            trainer.update_model(batch_size, num_epoch)
        if steps % summary_freq == 0 and steps != 0 and train_model:
            # Write training statistics to tensorboard.
            trainer.update_best(sess, saver=saver, model_path=env_model_path, steps=steps)
            trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
        if steps % save_freq == 0 and steps != 0 and train_model:
            # Save Tensorflow model
            save_model(sess, model_path=env_model_path, steps=steps, saver=saver)
        steps += 1
        sess.run(ppo_model.increment_step)
        if len(trainer.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(trainer.stats['cumulative_reward'])
            sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
            last_reward = sess.run(ppo_model.last_reward)
    # Final save Tensorflow model
    if steps != 0 and train_model:
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
export_graph(env_model_path, env_name)

Training started for v8.1cm
Loading Model from different env with path Projects/Steering/v8.0cm/m1/Models
INFO:tensorflow:Restoring parameters from Projects/Steering/v8.0cm/m1/Models\model-350000.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v8.0cm/m1/Models\model-350000.cptk
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Saved Model
Updated best model checkpoint!
Step: 352500. Mean Reward: 0.9487370953874343. Std of Reward: 4.170482148704712.
Step: 355000. Mean Reward: 0.8626797599519022. Std of Reward: 4.1228445194690195.
Updated best model checkpoint!
Step: 357500. Mean Reward: 1.0778588325465333. Std of Reward: 4.259773578895024.
Step: 360000. Mean Reward: 0.9736731350666801. Std of Reward: 4.168640159369156.
Step: 362500. Mean Reward: 0.9170957120048122. Std of Reward: 4.18961609236072.
Step: 365000. Mean Reward: 0.9804547800246014. Std of Reward: 4.18683847832272.
Step: 367500. Mean Reward: 1.0667051508534366. Std of Reward: 4.233322035066295.
Step: 370000. Mean Reward: 0.9115353294958478. Std of Reward: 4.1283642245445975.
Updated best model checkpoint!
Step: 372500. Mean Reward: 1.2210780870881661. Std of Reward: 4.352280303310389.
Step: 375000. Mean Reward: 1.1413598361289934. Std of Reward: 4.269855842807951.
Saved Model
Step: 377500. Mean Reward: 1.0048454966002183. Std of Reward: 4.208316130

Step: 597500. Mean Reward: 0.954992021718731. Std of Reward: 4.2654060786910035.
Step: 600000. Mean Reward: 1.0952545219566774. Std of Reward: 4.3787410851775785.
Saved Model
Step: 602500. Mean Reward: 0.8100816117016347. Std of Reward: 4.181896072770293.
Step: 605000. Mean Reward: 1.0672393339055077. Std of Reward: 4.380172732017683.
Step: 607500. Mean Reward: 0.9066652231202156. Std of Reward: 4.206320305166082.
Step: 610000. Mean Reward: 0.6923070191422529. Std of Reward: 4.083419646691783.
Step: 612500. Mean Reward: 0.7119825375337558. Std of Reward: 4.1281570921864175.
Step: 615000. Mean Reward: 0.9672147830074264. Std of Reward: 4.3289116904904015.
Step: 617500. Mean Reward: 1.2993181364771795. Std of Reward: 4.570207455819326.
Step: 620000. Mean Reward: 0.9295799628524068. Std of Reward: 4.292352299958648.
Step: 622500. Mean Reward: 1.1198695457126395. Std of Reward: 4.424604065835456.
Step: 625000. Mean Reward: 0.892660628251767. Std of Reward: 4.249042451131072.
Saved Model
St

Saved Model
Step: 852500. Mean Reward: 1.2009882349414724. Std of Reward: 4.842159125557519.
Step: 860000. Mean Reward: -0.7126476479916524. Std of Reward: 6.07362671704991.
Step: 862500. Mean Reward: -2.0466139141708286. Std of Reward: 1.0877786156498996.
Step: 865000. Mean Reward: -1.5018800277324669. Std of Reward: 0.4463317147233784.
Step: 867500. Mean Reward: -1.035549069721745. Std of Reward: 5.429814708072409.
Step: 870000. Mean Reward: 1.6421028863938574. Std of Reward: 5.254045093129843.
Step: 872500. Mean Reward: -2.0090807235748467. Std of Reward: 1.7631239338987694.
Step: 875000. Mean Reward: -2.068927813231185. Std of Reward: 4.616400757701751.
Saved Model
Step: 877500. Mean Reward: 0.21059732001599343. Std of Reward: 3.669534254653536.
Step: 882500. Mean Reward: -4.1007778068262155. Std of Reward: 2.311115083395315.
Step: 885000. Mean Reward: -0.7116983750072716. Std of Reward: 5.290462863112912.
Step: 887500. Mean Reward: -3.148722101085263. Std of Reward: 1.701307400593

Step: 1317500. Mean Reward: -8.187288058373777. Std of Reward: 0.020317524945039744.
Saved Model
Step: 1327500. Mean Reward: -8.199890597333622. Std of Reward: 0.02830212924676973.
Step: 1335000. Mean Reward: -8.193341296667056. Std of Reward: 0.024746072768442335.
Step: 1342500. Mean Reward: -8.19683414736585. Std of Reward: 0.021451135806979146.
Step: 1350000. Mean Reward: -8.18941566697821. Std of Reward: 0.025978829453764604.
Saved Model
Step: 1360000. Mean Reward: -8.198374826254677. Std of Reward: 0.023105353276510535.
Step: 1367500. Mean Reward: -8.188401681602661. Std of Reward: 0.022028740399871946.
Step: 1375000. Mean Reward: -8.1818220917804. Std of Reward: 0.019253404066350528.
Saved Model
Step: 1385000. Mean Reward: -8.195146791091508. Std of Reward: 0.025685281112680917.
Step: 1392500. Mean Reward: -8.192873193350415. Std of Reward: 0.027459169383644247.
Step: 1400000. Mean Reward: -8.191256677168422. Std of Reward: 0.01774603374222038.
Saved Model
Step: 1407500. Mean Rew

Saved Model
Step: 2080000. Mean Reward: -8.199489141059932. Std of Reward: 0.03416807000039318.
Step: 2087500. Mean Reward: -8.187995495507808. Std of Reward: 0.020370871694462354.
Step: 2097500. Mean Reward: -8.178441549388825. Std of Reward: 0.03107109875558689.
Saved Model
Step: 2105000. Mean Reward: -6.536791041438215. Std of Reward: 5.229098770477962.
Step: 2112500. Mean Reward: -8.205877122617704. Std of Reward: 0.013996429742515007.
Step: 2122500. Mean Reward: -8.204116939914597. Std of Reward: 0.0234025318247967.
Saved Model
Step: 2130000. Mean Reward: -8.191088756926092. Std of Reward: 0.02909264445131662.
Step: 2137500. Mean Reward: -8.175593724569017. Std of Reward: 0.02012793897418219.
Step: 2145000. Mean Reward: -8.194619757175289. Std of Reward: 0.033031000674988306.
Saved Model
Step: 2155000. Mean Reward: -8.19058163189402. Std of Reward: 0.03369382180296558.
Step: 2162500. Mean Reward: -8.199571843224149. Std of Reward: 0.030135964687826217.
Step: 2170000. Mean Reward: 

Step: 2842500. Mean Reward: -8.184598466742642. Std of Reward: 0.021034760227872616.
Step: 2850000. Mean Reward: -8.204175035021652. Std of Reward: 0.023520385109492427.
Saved Model
Step: 2860000. Mean Reward: -8.19146885088247. Std of Reward: 0.02270566628614662.
Step: 2867500. Mean Reward: -7.53461652178247. Std of Reward: 2.0662390107477564.
Step: 2875000. Mean Reward: -8.183125540274027. Std of Reward: 0.01986075070314788.
Saved Model
Step: 2882500. Mean Reward: -6.546913957037631. Std of Reward: 5.231650112200076.
Step: 2892500. Mean Reward: -8.192281161424013. Std of Reward: 0.026223809472764474.
Step: 2900000. Mean Reward: -8.191492845021992. Std of Reward: 0.025611111637478708.
Saved Model
Step: 2907500. Mean Reward: -8.184398652938935. Std of Reward: 0.018811630524704998.
Step: 2917500. Mean Reward: -8.187071649266572. Std of Reward: 0.020205501293990753.
Step: 2925000. Mean Reward: -8.190474511506196. Std of Reward: 0.02552282564496973.
Saved Model
Step: 2932500. Mean Reward:

Saved Model
Step: 3605000. Mean Reward: -8.191451684827955. Std of Reward: 0.0251542181872885.
Step: 3612500. Mean Reward: -8.200062524399295. Std of Reward: 0.016902044024247092.
Step: 3620000. Mean Reward: -8.200629159747756. Std of Reward: 0.021544598166888928.
Saved Model
Step: 3630000. Mean Reward: -8.184345857544931. Std of Reward: 0.022217100825529305.
Step: 3637500. Mean Reward: -8.185221254382972. Std of Reward: 0.022509149619746815.
Step: 3645000. Mean Reward: -8.188151704587801. Std of Reward: 0.02897740257558819.
Saved Model
Step: 3655000. Mean Reward: -8.187356950668267. Std of Reward: 0.02234440689526263.
Step: 3662500. Mean Reward: -8.198151055853405. Std of Reward: 0.024652917386424025.
Step: 3670000. Mean Reward: -8.193132745214575. Std of Reward: 0.027259822923556328.
Saved Model
Step: 3677500. Mean Reward: -8.183471147570332. Std of Reward: 0.01842235308562292.
Step: 3687500. Mean Reward: -8.195764581973565. Std of Reward: 0.0246328299552228.
Step: 3695000. Mean Rewa

Step: 4367500. Mean Reward: -8.18970607906699. Std of Reward: 0.023255355452998904.
Step: 4375000. Mean Reward: -8.207856599223888. Std of Reward: 0.021386295504218316.
Saved Model
Step: 4382500. Mean Reward: -8.189644471678971. Std of Reward: 0.022457081400609077.
Step: 4392500. Mean Reward: -8.189349939101922. Std of Reward: 0.026632917645919477.
Step: 4400000. Mean Reward: -5.160234467086824. Std of Reward: 6.77906828021954.
Saved Model
Step: 4407500. Mean Reward: -8.19246318678474. Std of Reward: 0.024928494051816123.
Step: 4415000. Mean Reward: -8.18530674147561. Std of Reward: 0.027613383157664685.
Step: 4425000. Mean Reward: -8.190428773481663. Std of Reward: 0.022245010106848786.
Saved Model
Step: 4432500. Mean Reward: -8.187782746953424. Std of Reward: 0.019955009044010524.
Step: 4440000. Mean Reward: -8.195397912582518. Std of Reward: 0.02377667080471225.
Step: 4447500. Mean Reward: -8.199237977482616. Std of Reward: 0.020438168757597783.
Saved Model
Step: 4457500. Mean Rewar

Saved Model
Step: 5130000. Mean Reward: -8.193519098104606. Std of Reward: 0.024956952083114874.
Step: 5137500. Mean Reward: -8.19318874495143. Std of Reward: 0.025498404902970975.
Step: 5145000. Mean Reward: -8.187063905493842. Std of Reward: 0.028303096040513887.
Saved Model
Step: 5152500. Mean Reward: -8.195677110413746. Std of Reward: 0.021979001196299922.
Step: 5162500. Mean Reward: -8.19752720077299. Std of Reward: 0.029592333389381248.
Step: 5170000. Mean Reward: -8.205126459285724. Std of Reward: 0.018498149879659156.
Saved Model
Step: 5177500. Mean Reward: -8.198354074668218. Std of Reward: 0.02386819259363952.
Step: 5187500. Mean Reward: -8.18781044202606. Std of Reward: 0.02108800430573785.
Step: 5195000. Mean Reward: -8.191152460072441. Std of Reward: 0.022581771764237313.
Saved Model
Step: 5202500. Mean Reward: -8.193308870736368. Std of Reward: 0.02444921389373064.
Step: 5210000. Mean Reward: -8.197925722596564. Std of Reward: 0.01953960592863326.
Step: 5220000. Mean Rewa

Step: 5890000. Mean Reward: -8.201140023669005. Std of Reward: 0.025423824505954772.
Step: 5900000. Mean Reward: -8.182146493541298. Std of Reward: 0.023270437400185303.
Saved Model
Step: 5907500. Mean Reward: -8.188368112115274. Std of Reward: 0.028952559605040395.
Step: 5915000. Mean Reward: -8.19733360401176. Std of Reward: 0.02159179555147837.
Step: 5925000. Mean Reward: -8.191697312784676. Std of Reward: 0.022512698088566808.
Saved Model
Step: 5932500. Mean Reward: -8.19767696543384. Std of Reward: 0.024738045530704633.
Step: 5940000. Mean Reward: -8.183834649763263. Std of Reward: 0.02342451287036885.
Step: 5947500. Mean Reward: -8.203791581703225. Std of Reward: 0.018620777407804705.
Saved Model
Step: 5957500. Mean Reward: -8.196718026521356. Std of Reward: 0.025261928808606907.
Step: 5965000. Mean Reward: -8.184285241674012. Std of Reward: 0.024804186146202403.
Step: 5972500. Mean Reward: -8.195201188636407. Std of Reward: 0.026755018368800083.
Saved Model
Step: 5980000. Mean R

Step: 6645000. Mean Reward: -8.19297998052583. Std of Reward: 0.01698084517587697.
Saved Model
Step: 6652500. Mean Reward: -8.183310586695082. Std of Reward: 0.013633147358928906.
Step: 6662500. Mean Reward: -8.205978776500032. Std of Reward: 0.024864977344454312.
Step: 6670000. Mean Reward: -8.196723939515957. Std of Reward: 0.024344279619340676.
Saved Model
Step: 6677500. Mean Reward: -6.539022219429044. Std of Reward: 5.229823100896806.
Step: 6685000. Mean Reward: -8.210462080203298. Std of Reward: 0.018790191602975438.
Step: 6695000. Mean Reward: -8.206144067309832. Std of Reward: 0.021384400610463327.
Saved Model
Step: 6702500. Mean Reward: -8.18916500257018. Std of Reward: 0.026966635280191224.
Step: 6710000. Mean Reward: -8.204404870655043. Std of Reward: 0.024369792228822195.
Step: 6720000. Mean Reward: -8.191230737184101. Std of Reward: 0.025516649698638205.
Saved Model
Step: 6727500. Mean Reward: -8.18841209583445. Std of Reward: 0.0283528057105001.
Step: 6735000. Mean Reward

MemoryError: 

### Export the trained Tensorflow graph
Once the model has been trained and saved, we can export it as a .bytes file which Unity can embed.

In [131]:
export_best_graph(trainer, env_model_path, model_name)
export_graph(env_model_path, env_name)

INFO:tensorflow:Restoring parameters from Projects/Steering/v8.1cm/m1/Models/model_bests\best_model.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v8.1cm/m1/Models/model_bests\best_model.cptk


INFO:tensorflow:Froze 4 variables.


INFO:tensorflow:Froze 4 variables.


Converted 4 variables to const ops.
INFO:tensorflow:Restoring parameters from Projects/Steering/v8.1cm/m1/Models\model-6825000.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v8.1cm/m1/Models\model-6825000.cptk


INFO:tensorflow:Froze 5 variables.


INFO:tensorflow:Froze 5 variables.


Converted 5 variables to const ops.


### Test the Trained Model

In [123]:
port_save = random.randint(0,100)
env = UnityEnvironment(file_name = project_path, base_port = 7013 + port_save)
print(str(env))
brain_name = env.brain_names[0]

INFO:unityagents:
'Academy' started successfully!


Unity Academy name: Academy
        Number of brains: 1
        Reset Parameters :
		target_position_z -> 0.0
		num_obstacles -> 1.0
Unity brain name: Brain
        Number of observations (per agent): 0
        State space type: continuous
        State space size (per agent): 22
        Action space type: discrete
        Action space size (per agent): 3
        Memory space size (per agent): 0
        Action descriptions: Forward, Right, Left


In [124]:
load_best = True;
print("The play session has started.")
with tf.Session() as sess:
    if load_best:
        model_path_to_load = env_model_path + "/model_bests"
        if not os.path.exists(model_path_to_load):
            print("Model bests does not exist!")
            model_path_to_load = model_path
    else:
        model_path_to_load = env_model_path
    ckpt = tf.train.get_checkpoint_state(model_path_to_load)
    saver.restore(sess, ckpt.model_checkpoint_path)
    
    steps = sess.run(ppo_model.global_step)
    info = env.reset(train_mode=False)[brain_name]
    trainer = Trainer( ppo_model, sess, info, is_continuous, use_observations, use_states, training=False)
    
    while steps <= max_steps:
        if env.global_done:
            info = env.reset(train_mode=False)[brain_name]
        new_info = trainer.take_action(info, env, brain_name, steps = 0, normalize = False)
        info = new_info
        trainer.process_experiences(info, time_horizon, gamma, lambd)
        steps += 1
        sess.run(ppo_model.increment_step)

The play session has started.
INFO:tensorflow:Restoring parameters from Projects/Steering/v8.0cm/m1/Models/model_bests\best_model.cptk


INFO:tensorflow:Restoring parameters from Projects/Steering/v8.0cm/m1/Models/model_bests\best_model.cptk


error: unpack requires a buffer of 4 bytes